microsoft/AI-For-Beginners

Public

mirrored from https://github.com/microsoft/AI-For-Beginners Available

Watch 0 Fork 0 Star 0

Code Commits Issues Pull requests Actions Insights Security

9055907df3fb7071d169ef87a2340566e0f176e6

Find a branch or tag

Branches

9055907df3fb7071d169ef87a2340566e0f176e6

Clone

HTTPS

Download ZIP

AI-For-Beginners/lessons/5-NLP/17-GenerativeNetworks

lessons/5-NLP/17-GenerativeNetworks/torchnlp.py

111 lines · mode code

Raw Download

Latest commit unavailable.

unknown

1	`import builtins`
2	`import torch`
3	`import torchtext`
4	`import collections`
5	`import os`
6
# Run on the GPU when CUDA reports itself as available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Module-wide vocabulary; stays None until load_dataset() assigns it.
vocab = None
# Default tokenizer used when building the vocabulary and encoding text.
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
11
def load_dataset(ngrams=1, min_freq=1):
    """Load AG_NEWS, build a vocabulary from its training split and return both.

    The vocabulary is also stored in the module-level ``vocab`` so that later
    calls to the encoding helpers can use it without passing it explicitly.

    Args:
        ngrams: n-gram size passed to ``torchtext.data.utils.ngrams_iterator``.
        min_freq: minimum token frequency passed to ``torchtext.vocab.vocab``.

    Returns:
        A tuple ``(train_dataset, test_dataset, classes, vocab)`` where both
        datasets are materialised as lists and ``classes`` is the list of the
        four class names.
    """
    global vocab
    print("Loading dataset...")
    train_iter, test_iter = torchtext.datasets.AG_NEWS(root='./data')
    # Materialise the splits so they can be traversed more than once.
    train_dataset = list(train_iter)
    test_dataset = list(test_iter)
    classes = ['World', 'Sports', 'Business', 'Sci/Tech']
    print('Building vocab...')
    token_counts = collections.Counter()
    for _label, text in train_dataset:
        tokens = tokenizer(text)
        token_counts.update(
            torchtext.data.utils.ngrams_iterator(tokens, ngrams=ngrams)
        )
    vocab = torchtext.vocab.vocab(token_counts, min_freq=min_freq)
    return train_dataset, test_dataset, classes, vocab
25
# Cache of string-to-index tables keyed by vocabulary object, so that
# get_stoi() is called only once per vocabulary.
stoi_hash = {}


def encode(x, voc=None, unk=0, tokenizer=tokenizer):
    """Convert text ``x`` into a list of vocabulary indices.

    Args:
        x: text to encode; it is split into tokens with ``tokenizer``.
        voc: vocabulary to use; when None the module-level ``vocab`` is used.
        unk: index emitted for tokens missing from the vocabulary.
        tokenizer: callable turning ``x`` into an iterable of tokens.

    Returns:
        A list with one index per token produced by ``tokenizer(x)``.
    """
    active_vocab = voc if voc is not None else vocab
    lookup = stoi_hash.get(active_vocab)
    if lookup is None:
        # First use of this vocabulary: build its table and remember it.
        lookup = active_vocab.get_stoi()
        stoi_hash[active_vocab] = lookup
    return [lookup.get(token, unk) for token in tokenizer(x)]
36
def train_epoch(net, dataloader, lr=0.01, optimizer=None, loss_fn=torch.nn.CrossEntropyLoss(), epoch_size=None, report_freq=200):
    """Train ``net`` for one pass over ``dataloader``.

    Args:
        net: model called as ``net(features)``. Its output is handed to
            ``loss_fn`` together with the labels, and the arg-max along
            dimension 1 is taken as the predicted class.
        dataloader: iterable yielding ``(labels, features)`` tensor pairs.
        lr: learning rate of the Adam optimizer that is created when
            ``optimizer`` is None.
        optimizer: optional ready-made optimizer to use instead.
        loss_fn: loss module; it is moved to the module-level ``device``.
        epoch_size: when truthy, training stops as soon as more than this
            many samples have been processed.
        report_freq: running accuracy is printed every ``report_freq``
            minibatches.

    Returns:
        ``(loss, accuracy)`` as Python floats: the sum of the per-batch loss
        values divided by the number of samples seen, and the fraction of
        correctly predicted samples. ``(0.0, 0.0)`` is returned when no
        sample was processed.
    """
    if optimizer is None:
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss_fn = loss_fn.to(device)
    net.train()
    total_loss, acc, count = 0, 0, 0
    for i, (labels, features) in enumerate(dataloader, start=1):
        optimizer.zero_grad()
        features, labels = features.to(device), labels.to(device)
        out = net(features)
        loss = loss_fn(out, labels)
        loss.backward()
        optimizer.step()
        # Detach before accumulating: summing the attached loss would keep
        # every batch's autograd history alive for the whole epoch.
        total_loss += loss.detach()
        _, predicted = torch.max(out, 1)
        acc += (predicted == labels).sum()
        count += len(labels)
        if i % report_freq == 0:
            print(f"{count}: acc={acc.item()/count}")
        if epoch_size and count > epoch_size:
            break
    if count == 0:
        # Nothing was processed; avoid dividing by zero.
        return 0.0, 0.0
    # float() accepts both 0-dim tensors and the initial Python zeros.
    return float(total_loss) / count, float(acc) / count
59
def padify(b, voc=None, tokenizer=tokenizer):
    """Collate a minibatch into a label tensor and a zero-padded index tensor.

    Args:
        b: list of ``(label, text)`` tuples, one per sample in the minibatch.
        voc: vocabulary forwarded to ``encode``.
        tokenizer: tokenizer forwarded to ``encode``.

    Returns:
        A tuple ``(labels, features)``: ``labels`` holds each label minus one
        (presumably turning 1-based dataset labels into 0-based class
        indices), and ``features`` stacks the encoded sequences after
        right-padding each with zeros to the longest one in the minibatch.
    """
    # Vectorise every text in the minibatch.
    encoded = [encode(sample[1], voc=voc, tokenizer=tokenizer) for sample in b]
    # Every sequence is padded up to the longest one in this minibatch.
    longest = max(len(seq) for seq in encoded)
    labels = torch.LongTensor([sample[0] - 1 for sample in b])
    padded = [
        torch.nn.functional.pad(
            torch.tensor(seq), (0, longest - len(seq)), mode='constant', value=0
        )
        for seq in encoded
    ]
    return labels, torch.stack(padded)
72
def offsetify(b, voc=None):
    """Collate a minibatch into labels, one flat index tensor and offsets.

    Args:
        b: list of ``(label, text)`` tuples, one per sample in the minibatch.
        voc: vocabulary forwarded to ``encode``.

    Returns:
        A tuple ``(labels, text, offsets)``: ``labels`` holds each label minus
        one, ``text`` is the concatenation of all encoded sequences, and
        ``offsets[i]`` is the position in ``text`` where sample ``i`` starts.
    """
    # Encode every text into its own index tensor.
    sequences = [torch.tensor(encode(sample[1], voc=voc)) for sample in b]
    # A sample starts where all previous samples end, so the offsets are the
    # running sum of the lengths shifted right by one (the first is 0).
    shifted_lengths = ([0] + [len(seq) for seq in sequences])[:-1]
    offsets = torch.tensor(shifted_lengths).cumsum(dim=0)
    labels = torch.LongTensor([sample[0] - 1 for sample in b])
    text = torch.cat(sequences)
    return labels, text, offsets
84
def train_epoch_emb(net, dataloader, lr=0.01, optimizer=None, loss_fn=torch.nn.CrossEntropyLoss(), epoch_size=None, report_freq=200, use_pack_sequence=False):
    """Train ``net`` for one pass over a dataloader yielding text and offsets.

    Args:
        net: model called as ``net(text, off)``. Its output is handed to
            ``loss_fn`` together with the labels, and the arg-max along
            dimension 1 is taken as the predicted class.
        dataloader: iterable yielding ``(labels, text, off)`` tensor triples.
        lr: learning rate of the Adam optimizer that is created when
            ``optimizer`` is None.
        optimizer: optional ready-made optimizer to use instead.
        loss_fn: loss module; it is moved to the module-level ``device``.
        epoch_size: when truthy, training stops as soon as more than this
            many samples have been processed.
        report_freq: running accuracy is printed every ``report_freq``
            minibatches.
        use_pack_sequence: when true, ``off`` is kept on the CPU instead of
            being moved to ``device``.

    Returns:
        ``(loss, accuracy)`` as Python floats: the sum of the per-batch loss
        values divided by the number of samples seen, and the fraction of
        correctly predicted samples. ``(0.0, 0.0)`` is returned when no
        sample was processed.
    """
    if optimizer is None:
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss_fn = loss_fn.to(device)
    net.train()
    total_loss, acc, count = 0, 0, 0
    for i, (labels, text, off) in enumerate(dataloader, start=1):
        optimizer.zero_grad()
        labels, text = labels.to(device), text.to(device)
        off = off.to('cpu') if use_pack_sequence else off.to(device)
        out = net(text, off)
        loss = loss_fn(out, labels)
        loss.backward()
        optimizer.step()
        # Detach before accumulating: summing the attached loss would keep
        # every batch's autograd history alive for the whole epoch.
        total_loss += loss.detach()
        _, predicted = torch.max(out, 1)
        acc += (predicted == labels).sum()
        count += len(labels)
        if i % report_freq == 0:
            print(f"{count}: acc={acc.item()/count}")
        if epoch_size and count > epoch_size:
            break
    if count == 0:
        # Nothing was processed; avoid dividing by zero.
        return 0.0, 0.0
    # float() accepts both 0-dim tensors and the initial Python zeros.
    return float(total_loss) / count, float(acc) / count
111
112

microsoft/AI-For-Beginners

Branches

Tags

Clone