```python
# following is an example of something you can only do in a framework that allows
# dynamic graph creation
# use negative sampling: negate the score for negative samples
if negative_sample:
    score = -1 * score
obj = -1 * torch.sum(F.logsigmoid(score))
return obj
```
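Only the tail of the model's loss computation is shown above. For context, here is a minimal sketch of a skip-gram module that is consistent with that snippet and with how the model is constructed later (`WordEmbSkip(nwords, EMB_SIZE)`); the layer names `word_embedding` / `context_embedding` and the exact `forward` signature are assumptions for illustration, not the author's original definition:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class WordEmbSkip(nn.Module):
    """Skip-gram with negative sampling: two embedding tables, one for the
    center word and one for context/negative words (sketch, names assumed)."""
    def __init__(self, nwords, emb_size):
        super(WordEmbSkip, self).__init__()
        self.word_embedding = nn.Embedding(nwords, emb_size)      # center-word embeddings
        self.context_embedding = nn.Embedding(nwords, emb_size)   # context/negative embeddings

    def forward(self, word, context_words, negative_sample=False):
        word_emb = self.word_embedding(word)                  # (1, emb_size)
        context_embs = self.context_embedding(context_words)  # (num_context, emb_size)
        # dot product between the center word and each context/negative word
        score = torch.matmul(context_embs, word_emb.t()).squeeze()
        # negate the score for negative samples so log sigmoid(-score) is maximized
        if negative_sample:
            score = -1 * score
        obj = -1 * torch.sum(F.logsigmoid(score))
        return obj
```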
3. Define some hyperparameters
```python
K = 3           # number of negative samples
N = 2           # length of window on each side (so N=2 gives a total window size of 5, as in t-2 t-1 t t+1 t+2)
EMB_SIZE = 128  # the size of the embedding

embeddings_location = "embeddings.txt"  # the file to write the word embeddings to
labels_location = "labels.txt"          # the file to write the labels to

# We reuse the data reading from the language modeling class
w2i = defaultdict(lambda: len(w2i))

# word counts for negative sampling
word_counts = defaultdict(int)

S = w2i["<s>"]
UNK = w2i["<unk>"]
```
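The `defaultdict(lambda: len(w2i))` idiom assigns a fresh integer id to every previously unseen word at lookup time, which is why `<s>` and `<unk>` end up with ids 0 and 1 here. A tiny self-contained illustration (using a throwaway `vocab` dict and a made-up word so as not to disturb the real `w2i`):

```python
from collections import defaultdict

vocab = defaultdict(lambda: len(vocab))            # same idiom as w2i above
print(vocab["<s>"], vocab["<unk>"], vocab["dog"])  # 0 1 2 -- each new key gets the next free index
print(vocab["dog"])                                # 2 -- repeated lookups reuse the same id
```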
The data-reading function:
```python
def read_dataset(filename):
    with open(filename, "r") as f:
        for line in f:
            line = line.strip().split(" ")
            for word in line:
                word_counts[w2i[word]] += 1
            yield [w2i[x] for x in line]
```
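Each yielded item is the list of word ids for one line, and `word_counts` is updated as a side effect. For intuition (the file name and sentence below are made up for illustration):

```python
# assuming "toy.txt" contains the single line: "the cat sat"
for ids in read_dataset("toy.txt"):
    print(ids)  # e.g. [2, 3, 4] -- the exact ids depend on what w2i has already seen
```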
```python
# Read in the data
train = list(read_dataset("data/ptb/train.txt"))
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("data/ptb/valid.txt"))
i2w = {v: k for k, v in w2i.items()}
nwords = len(w2i)

# take the word counts to the 3/4 power, then normalize
counts = np.array([list(x) for x in word_counts.items()])[:, 1] ** .75
normalizing_constant = sum(counts)
word_probabilities = np.zeros(nwords)
for word_id in word_counts:
    word_probabilities[word_id] = word_counts[word_id] ** .75 / normalizing_constant
```
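Raising the counts to the 3/4 power (as in the original word2vec paper) flattens the unigram distribution used for negative sampling: frequent words are drawn somewhat less often and rare words somewhat more often than their raw frequencies would suggest. A small numerical illustration with made-up counts:

```python
import numpy as np

toy_counts = np.array([1000, 10, 1], dtype=float)   # made-up counts for three words
plain = toy_counts / toy_counts.sum()               # raw unigram distribution
smoothed = toy_counts ** 0.75 / (toy_counts ** 0.75).sum()
print(plain.round(3))     # approximately [0.989 0.01  0.001]
print(smoothed.round(3))  # approximately [0.964 0.03  0.005] -- rare words get a larger share
```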
Write i2w to the labels_location file:
```python
with open(labels_location, 'w') as labels_file:
    for i in range(nwords):
        labels_file.write(i2w[i] + '\n')
```
4. Build the model
```python
# initialize the model
model = WordEmbSkip(nwords, EMB_SIZE)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

type = torch.LongTensor
use_cuda = torch.cuda.is_available()

if use_cuda:
    type = torch.cuda.LongTensor
    model.cuda()
```
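Note that `type` here shadows the Python builtin of the same name; it simply selects between CPU and CUDA LongTensors. On recent PyTorch versions an equivalent pattern is to pick a `torch.device` once and move the model and the index tensors to it. The snippet below is only an alternative sketch, not part of the original code:

```python
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = WordEmbSkip(nwords, EMB_SIZE).to(device)
# index tensors can then be built directly on that device, e.g.:
# pos_words_tensor = torch.tensor(pos_words, dtype=torch.long, device=device)
```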
```python
# Calculate the loss value for the entire sentence
def calc_sent_loss(sent):
    # add padding to the sentence equal to the size of the window
    # as we need to predict the eos as well, the future window at that point is N past it
    all_neg_words = np.random.choice(nwords, size=2*N*K*len(sent), replace=True, p=word_probabilities)

    # Step through the sentence
    losses = []
    for i, word in enumerate(sent):
        # build the list of positive (context) words: positions with x >= 0 are the words to the
        # left of the center word (otherwise the sentinel S), and positions with x < len(sent)
        # are the words to the right (otherwise S)
        pos_words = [sent[x] if x >= 0 else S for x in range(i-N, i)] + \
                    [sent[x] if x < len(sent) else S for x in range(i+1, i+N+1)]
        pos_words_tensor = torch.tensor(pos_words).type(type)
        neg_words = all_neg_words[i*K*2*N:(i+1)*K*2*N]
        neg_words_tensor = torch.tensor(neg_words).type(type)
        target_word_tensor = torch.tensor([word]).type(type)

        # NOTE: technically, one should ensure that the neg words don't contain
        # the context (i.e. positive) words, but it is very unlikely, so we can ignore that
```