microsoft/AI-For-Beginners

Public

mirrored from https://github.com/microsoft/AI-For-Beginners Available

Watch 0 Fork 0 Star 0

Code Commits Issues Pull requests Actions Insights Security

5d97797124da3144ff63633bbe57da7df800b6bc

Find a branch or tag

Branches

5d97797124da3144ff63633bbe57da7df800b6bc

Clone

HTTPS

Download ZIP

AI-For-Beginners/lessons/5-NLP/14-Embeddings

lessons/5-NLP/14-Embeddings/torchnlp.py

104lines · modecode

Raw Download

Latest commit unavailable.

unknown

1	`import builtins`
2	`import torch`
3	`import torchtext`
4	`import collections`
5	`import os`
6
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Module-wide vocabulary; remains None until load_dataset() assigns it.
vocab = None

# Default tokenizer, used by load_dataset() and as the default for encode()/padify().
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
11
def load_dataset(ngrams=1, min_freq=1, root='./data'):
    """Load the AG_NEWS dataset and build the module-level vocabulary.

    Args:
        ngrams: Forwarded to ``torchtext.data.utils.ngrams_iterator`` as its
            ``ngrams`` argument when counting tokens of the training split.
        min_freq: Forwarded to ``torchtext.vocab.Vocab`` as ``min_freq``.
        root: Directory passed to ``torchtext.datasets.AG_NEWS`` as ``root``.
            Defaults to ``'./data'``, the value that used to be hard-coded.

    Returns:
        A tuple ``(train_dataset, test_dataset, classes, vocab)`` where both
        datasets have been materialised into lists, ``classes`` is the list of
        the four class names and ``vocab`` is the vocabulary built from the
        training split.

    Side effects:
        Rebinds the module-level ``vocab`` (used by ``encode`` when no
        vocabulary is passed explicitly) and prints progress messages.
    """
    # Only `vocab` is rebound here; `tokenizer` is merely read, so it needs
    # no `global` declaration.
    global vocab
    print("Loading dataset...")
    train_dataset, test_dataset = torchtext.datasets.AG_NEWS(root=root)
    # Materialise the datasets so they can be iterated more than once.
    train_dataset = list(train_dataset)
    test_dataset = list(test_dataset)
    classes = ['World', 'Sports', 'Business', 'Sci/Tech']
    print('Building vocab...')
    counter = collections.Counter()
    for _label, line in train_dataset:
        counter.update(
            torchtext.data.utils.ngrams_iterator(tokenizer(line), ngrams=ngrams)
        )
    vocab = torchtext.vocab.Vocab(counter, min_freq=min_freq)
    return train_dataset, test_dataset, classes, vocab
25
def encode(x, voc=None, unk=0, tokenizer=tokenizer):
    """Convert the text ``x`` into a list of vocabulary indices.

    Args:
        x: Text to encode; it is passed to ``tokenizer``.
        voc: Vocabulary object exposing a ``stoi`` mapping. When ``None`` the
            module-level ``vocab`` is used instead.
        unk: Index returned for tokens missing from ``stoi``.
        tokenizer: Callable that splits ``x`` into tokens.

    Returns:
        A list with one index per token produced by ``tokenizer(x)``.
    """
    if voc is None:
        mapping = vocab
    else:
        mapping = voc
    indices = []
    for token in tokenizer(x):
        indices.append(mapping.stoi.get(token, unk))
    return indices
29
def train_epoch(net, dataloader, lr=0.01, optimizer=None, loss_fn=torch.nn.CrossEntropyLoss(), epoch_size=None, report_freq=200):
    """Train ``net`` for one pass over ``dataloader``.

    Args:
        net: Model called as ``net(features)``; it is switched to training mode.
        dataloader: Iterable yielding ``(labels, features)`` batches.
        lr: Learning rate for the Adam optimizer created when ``optimizer``
            is not supplied.
        optimizer: Optimizer to use; when ``None`` a new ``torch.optim.Adam``
            over ``net.parameters()`` is created.
        loss_fn: Loss module called as ``loss_fn(out, labels)``; it is moved
            to the module-level ``device``.
        epoch_size: When truthy, stop once more than this many samples have
            been processed.
        report_freq: Print the running accuracy every ``report_freq`` batches.

    Returns:
        A tuple ``(loss, accuracy)``: the sum of the per-batch loss values
        divided by the number of processed samples, and the fraction of
        correctly predicted samples. ``(0.0, 0.0)`` is returned when no
        sample was processed (for example, an empty ``dataloader``).
    """
    if optimizer is None:
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss_fn = loss_fn.to(device)
    net.train()
    total_loss, acc, count = 0, 0, 0
    for i, (labels, features) in enumerate(dataloader, start=1):
        optimizer.zero_grad()
        features, labels = features.to(device), labels.to(device)
        out = net(features)
        loss = loss_fn(out, labels)
        loss.backward()
        optimizer.step()
        # Detach before accumulating so the running total does not stay
        # attached to the autograd history of every batch.
        total_loss += loss.detach()
        _, predicted = torch.max(out, 1)
        acc += (predicted == labels).sum()
        count += len(labels)
        if i % report_freq == 0:
            print(f"{count}: acc={acc.item()/count}")
        if epoch_size and count > epoch_size:
            break
    if count == 0:
        # Nothing was processed: the accumulators are still plain ints and
        # dividing by the sample count would fail.
        return 0.0, 0.0
    return total_loss.item() / count, acc.item() / count
52
def padify(b, voc=None, tokenizer=tokenizer):
    """Collate a minibatch into a label tensor and a zero-padded feature tensor.

    Args:
        b: Non-empty list of tuples of length batch_size, where the first
            element of each tuple is the label and the second is the text.
        voc: Vocabulary forwarded to ``encode``.
        tokenizer: Tokenizer forwarded to ``encode``.

    Returns:
        A tuple of two tensors ``(labels, features)``. ``labels`` holds each
        label minus one; ``features`` stacks the encoded sequences,
        right-padded with 0 to the longest sequence in this minibatch.
    """
    # Build the vectorized sequences.
    encoded = [encode(sample[1], voc=voc, tokenizer=tokenizer) for sample in b]
    # Longest sequence in this minibatch determines the padded width.
    max_len = max(map(len, encoded))
    labels = torch.LongTensor([sample[0] - 1 for sample in b])
    features = torch.stack([
        torch.nn.functional.pad(
            # Pin the dtype: an empty token list would otherwise be inferred
            # as a floating-point tensor and no longer match the other rows.
            torch.tensor(ids, dtype=torch.long),
            (0, max_len - len(ids)),
            mode='constant',
            value=0,
        )
        for ids in encoded
    ])
    return labels, features
65
def offsetify(b, voc=None):
    """Collate a minibatch into labels, one concatenated text tensor and offsets.

    Args:
        b: List of tuples whose first element is the label and whose second
            element is the text.
        voc: Vocabulary forwarded to ``encode``.

    Returns:
        A tuple ``(labels, text, offsets)``: ``labels`` holds each label minus
        one, ``text`` is the concatenation of all encoded sequences and
        ``offsets[i]`` is the index in ``text`` where sequence ``i`` starts.
        NOTE(review): this layout looks intended for an EmbeddingBag-style
        model called as ``net(text, offsets)`` - confirm against the caller.
    """
    # Encode every sequence. The dtype is pinned because an empty token list
    # would otherwise be inferred as a floating-point tensor.
    sequences = [
        torch.tensor(encode(sample[1], voc=voc), dtype=torch.long) for sample in b
    ]
    # Offsets are the cumulative sum of the lengths of all preceding sequences.
    lengths = [0] + [len(seq) for seq in sequences]
    offsets = torch.tensor(lengths[:-1], dtype=torch.long).cumsum(dim=0)
    return (
        torch.LongTensor([sample[0] - 1 for sample in b]),  # labels
        torch.cat(sequences),  # text
        offsets,
    )
77
def train_epoch_emb(net, dataloader, lr=0.01, optimizer=None, loss_fn=torch.nn.CrossEntropyLoss(), epoch_size=None, report_freq=200, use_pack_sequence=False):
    """Train ``net`` for one pass over a dataloader yielding text with offsets.

    Args:
        net: Model called as ``net(text, off)``; it is switched to training mode.
        dataloader: Iterable yielding ``(labels, text, off)`` batches.
        lr: Learning rate for the Adam optimizer created when ``optimizer``
            is not supplied.
        optimizer: Optimizer to use; when ``None`` a new ``torch.optim.Adam``
            over ``net.parameters()`` is created.
        loss_fn: Loss module called as ``loss_fn(out, labels)``; it is moved
            to the module-level ``device``.
        epoch_size: When truthy, stop once more than this many samples have
            been processed.
        report_freq: Print the running accuracy every ``report_freq`` batches.
        use_pack_sequence: When true, ``off`` is kept on the CPU instead of
            being moved to ``device``.
            NOTE(review): presumably because the model packs sequences with
            CPU-side lengths - confirm against the model.

    Returns:
        A tuple ``(loss, accuracy)``: the sum of the per-batch loss values
        divided by the number of processed samples, and the fraction of
        correctly predicted samples. ``(0.0, 0.0)`` is returned when no
        sample was processed (for example, an empty ``dataloader``).
    """
    if optimizer is None:
        optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    loss_fn = loss_fn.to(device)
    net.train()
    total_loss, acc, count = 0, 0, 0
    for i, (labels, text, off) in enumerate(dataloader, start=1):
        optimizer.zero_grad()
        labels, text = labels.to(device), text.to(device)
        off = off.to('cpu') if use_pack_sequence else off.to(device)
        out = net(text, off)
        loss = loss_fn(out, labels)
        loss.backward()
        optimizer.step()
        # Detach before accumulating so the running total does not stay
        # attached to the autograd history of every batch.
        total_loss += loss.detach()
        _, predicted = torch.max(out, 1)
        acc += (predicted == labels).sum()
        count += len(labels)
        if i % report_freq == 0:
            print(f"{count}: acc={acc.item()/count}")
        if epoch_size and count > epoch_size:
            break
    if count == 0:
        # Nothing was processed: the accumulators are still plain ints and
        # dividing by the sample count would fail.
        return 0.0, 0.0
    return total_loss.item() / count, acc.item() / count
104
105

microsoft/AI-For-Beginners

Branches

Tags

Clone