microsoft/AI-For-Beginners

Public

mirrored fromhttps://github.com/microsoft/AI-For-BeginnersAvailable

CodeCommitsIssuesPull requestsActionsInsightsSecurity
d20f5c1a431af491240e15b671bd8e2930788117

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

lessons/5-NLP/17-GenerativeNetworks/torchnlp.py

111lines · modecode

1import builtins
2import torch
3import torchtext
4import collections
5import os
6
# Run on the GPU when CUDA reports itself available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Module-wide vocabulary; stays None until load_dataset() assigns it.
vocab = None
# Shared tokenizer, used as the default tokenizer by encode() and padify().
tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
11
def load_dataset(ngrams=1,min_freq=1):
    """Load AG_NEWS and build a vocabulary from its training split.

    Both splits are materialised into lists. Every training text is
    tokenized with the module-level ``tokenizer``, expanded through
    torchtext's ``ngrams_iterator`` (called with ``ngrams``), and counted.
    The counts and ``min_freq`` are handed to ``torchtext.vocab.vocab``;
    the result is stored in the module-level ``vocab`` as well as returned.

    Args:
        ngrams: forwarded to ``ngrams_iterator``.
        min_freq: forwarded to ``torchtext.vocab.vocab``.

    Returns:
        Tuple ``(train_dataset, test_dataset, classes, vocab)``.
    """
    global vocab
    print("Loading dataset...")
    train_split, test_split = torchtext.datasets.AG_NEWS(root='./data')
    train_split, test_split = list(train_split), list(test_split)
    class_names = ['World', 'Sports', 'Business', 'Sci/Tech']
    print('Building vocab...')
    make_ngrams = torchtext.data.utils.ngrams_iterator
    # Counting from one flat stream keeps the same counts and the same
    # first-seen key order as updating the counter once per training line.
    token_counts = collections.Counter(
        gram
        for _label, text in train_split
        for gram in make_ngrams(tokenizer(text), ngrams=ngrams)
    )
    vocab = torchtext.vocab.vocab(token_counts, min_freq=min_freq)
    return train_split, test_split, class_names, vocab
25
# Cache: vocabulary object -> its string-to-index dict, so that get_stoi()
# is called only once per vocabulary.
stoi_hash = {}
def encode(x,voc=None,unk=0,tokenizer=tokenizer):
    """Convert text ``x`` into a list of vocabulary indices.

    Args:
        x: text to encode; it is passed to ``tokenizer``.
        voc: vocabulary to use; when None the module-level ``vocab`` is used.
        unk: index emitted for tokens missing from the vocabulary.
        tokenizer: callable that splits ``x`` into tokens.

    Returns:
        A list with one index per token produced by ``tokenizer(x)``.
    """
    active_vocab = voc if voc is not None else vocab
    if active_vocab not in stoi_hash:
        # First time this vocabulary is seen: fetch and remember its mapping.
        stoi_hash[active_vocab] = active_vocab.get_stoi()
    mapping = stoi_hash[active_vocab]
    return [mapping.get(token, unk) for token in tokenizer(x)]
36
def train_epoch(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200):
    """Train ``net`` for one pass over ``dataloader`` (or a prefix of it).

    Args:
        net: model called as ``net(features)``; its output is fed to
            ``loss_fn`` and to ``torch.max(out, 1)`` for the prediction.
        dataloader: iterable yielding ``(labels, features)`` batches.
        lr: learning rate for the Adam optimizer created when ``optimizer``
            is not supplied.
        optimizer: optimizer to use; when falsy, Adam over
            ``net.parameters()`` is created.
        loss_fn: loss module; moved to the module-level ``device``.
        epoch_size: when truthy, stop once more than this many samples
            have been processed.
        report_freq: print the running accuracy every this many batches.

    Returns:
        ``(summed batch loss / samples seen, correct predictions / samples
        seen)``. When no samples were seen, ``(0.0, 0.0)`` is returned
        instead of failing.
    """
    optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
    loss_fn = loss_fn.to(device)
    net.train()
    total_loss, correct, count = 0.0, 0, 0
    for step, (labels, features) in enumerate(dataloader, start=1):
        optimizer.zero_grad()
        features, labels = features.to(device), labels.to(device)
        out = net(features)
        loss = loss_fn(out, labels)
        loss.backward()
        optimizer.step()
        # Accumulate a detached copy: adding `loss` itself would chain the
        # autograd history of every batch onto the running total.
        total_loss += loss.detach()
        _, predicted = torch.max(out, 1)
        correct += (predicted == labels).sum()
        count += len(labels)
        if step % report_freq == 0:
            print(f"{count}: acc={correct.item()/count}")
        if epoch_size and count > epoch_size:
            break
    if count == 0:
        # Nothing was processed (e.g. an empty dataloader): the accumulators
        # are still plain numbers and there is nothing to average.
        return 0.0, 0.0
    return total_loss.item()/count, correct.item()/count
59
def padify(b,voc=None,tokenizer=tokenizer):
    """Collate a minibatch into a label tensor and a zero-padded index tensor.

    Args:
        b: list of tuples, one per sample; element 0 is the label and
            element 1 is the text sequence.
        voc: vocabulary forwarded to ``encode``.
        tokenizer: tokenizer forwarded to ``encode``.

    Returns:
        Tuple ``(labels, features)``: ``labels`` is a LongTensor holding each
        label minus one; ``features`` stacks the encoded sequences, each
        right-padded with 0 to the longest sequence in the minibatch.
    """
    # Encode every text of the minibatch into a list of indices.
    encoded = [encode(x[1],voc=voc,tokenizer=tokenizer) for x in b]
    # Length of the longest sequence in this minibatch.
    longest = max(map(len, encoded))
    labels = torch.LongTensor([t[0]-1 for t in b])
    features = torch.stack([
        torch.nn.functional.pad(
            # Explicit dtype: torch.tensor([]) would otherwise be inferred as
            # float32 for a text that produced no tokens, mixing dtypes in
            # the stacked batch.
            torch.tensor(seq, dtype=torch.long),
            (0, longest - len(seq)), mode='constant', value=0)
        for seq in encoded
    ])
    return labels, features
72
def offsetify(b,voc=None):
    """Collate a minibatch into labels, one flat index tensor and offsets.

    Args:
        b: list of tuples, one per sample; element 0 is the label and
            element 1 is the text sequence.
        voc: vocabulary forwarded to ``encode``.

    Returns:
        Tuple ``(labels, text, offsets)``: ``labels`` is a LongTensor holding
        each label minus one; ``text`` is the concatenation of all encoded
        sequences; ``offsets[i]`` is the position in ``text`` where the
        i-th sequence starts.
    """
    # Explicit dtype: torch.tensor([]) would otherwise be inferred as float32
    # for a text that produced no tokens.
    x = [torch.tensor(encode(t[1],voc=voc), dtype=torch.long) for t in b]
    # Start offsets are the running sum of the preceding sequence lengths:
    # prepend 0, drop the last length, then accumulate.
    lengths = [0] + [len(t) for t in x]
    o = torch.tensor(lengths[:-1], dtype=torch.long).cumsum(dim=0)
    return (
        torch.LongTensor([t[0]-1 for t in b]), # labels
        torch.cat(x), # text
        o
    )
84
def train_epoch_emb(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200,use_pack_sequence=False):
    """Train ``net`` for one pass over a dataloader of (labels, text, offsets).

    Args:
        net: model called as ``net(text, off)``; its output is fed to
            ``loss_fn`` and to ``torch.max(out, 1)`` for the prediction.
        dataloader: iterable yielding ``(labels, text, off)`` batches.
        lr: learning rate for the Adam optimizer created when ``optimizer``
            is not supplied.
        optimizer: optimizer to use; when falsy, Adam over
            ``net.parameters()`` is created.
        loss_fn: loss module; moved to the module-level ``device``.
        epoch_size: when truthy, stop once more than this many samples
            have been processed.
        report_freq: print the running accuracy every this many batches.
        use_pack_sequence: when true, ``off`` is moved to the CPU instead of
            to ``device`` before calling the model.

    Returns:
        ``(summed batch loss / samples seen, correct predictions / samples
        seen)``. When no samples were seen, ``(0.0, 0.0)`` is returned
        instead of failing.
    """
    optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
    loss_fn = loss_fn.to(device)
    net.train()
    total_loss, correct, count = 0.0, 0, 0
    for step, (labels, text, off) in enumerate(dataloader, start=1):
        optimizer.zero_grad()
        labels, text = labels.to(device), text.to(device)
        off = off.to('cpu') if use_pack_sequence else off.to(device)
        out = net(text, off)
        loss = loss_fn(out, labels)
        loss.backward()
        optimizer.step()
        # Accumulate a detached copy: adding `loss` itself would chain the
        # autograd history of every batch onto the running total.
        total_loss += loss.detach()
        _, predicted = torch.max(out, 1)
        correct += (predicted == labels).sum()
        count += len(labels)
        if step % report_freq == 0:
            print(f"{count}: acc={correct.item()/count}")
        if epoch_size and count > epoch_size:
            break
    if count == 0:
        # Nothing was processed (e.g. an empty dataloader): the accumulators
        # are still plain numbers and there is nothing to average.
        return 0.0, 0.0
    return total_loss.item()/count, correct.item()/count
111
112