Implementing Word2Vec with TensorFlow (finding words similar to a target English word)

These are reading notes, written according to my own understanding.

import  collections
import  math
import  os
import  random
import  zipfile
import  urllib
import  numpy  as  np
import  tensorflow  as  tf

# Define the function for downloading text data
# url = 'http://mattmahoney.net/dc/'
#
# def maybe_download(filename, expected_bytes):
#     if not os.path.exists(filename):
#         filename, _ = urllib.request.urlretrieve(url + filename, filename)
#     statinfo = os.stat(filename)  # Access detailed information about the file.
#     if statinfo.st_size == expected_bytes:  # file size (in bytes)
#         print('Found and verified(verified)',filename)
#     else:
#         print(statinfo.st_size)
#         raise Exception('Failed to verify(验证)' + filename + 'Can you get to it with a browser(浏览器)?')
#     return filename
#
# filename = maybe_download('text8.zip',31344016)
 

filename = './text8.zip'

I have a few Alibaba Cloud lucky coupons to share with you. There will be special surprises for purchasing or upgrading Alibaba Cloud products with the coupons! Take all the lucky coupons for the products you want to buy! Hurry up, it's about to be sold out.


#Unzip the file and convert the data into a list of words
def read_data(filename):
    """Read the first member of a zip archive and return its words.

    Args:
        filename: Path to the zip archive (anything ``zipfile.ZipFile``
            accepts, including a binary file-like object).

    Returns:
        A list of strings obtained by decoding the first archive member as
        UTF-8 and splitting it on whitespace.

    Raises:
        IndexError: If the archive contains no members.
        UnicodeDecodeError: If the member is not valid UTF-8.
    """
    with zipfile.ZipFile(filename) as archive:
        # Only the first entry listed in the archive is read.
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    # A plain UTF-8 decode; no TensorFlow helper is needed for this step.
    return raw_bytes.decode("utf-8").split()

words = read_data(filename)
# print('Data size',len(words))
# print(words)

# Create a vocabulary: keep the 50,000 most frequent words as the dictionary.
vocabulary_size = 50000

def build_dataset(words, max_vocabulary_size=None):
    """Map a word sequence to integer ids using a frequency-ranked vocabulary.

    Id 0 is reserved for the placeholder token 'UNK'; the remaining ids are
    assigned to the most frequent words in descending order of frequency.

    Args:
        words: Iterable of word strings (iterated twice, so pass a list or
            other re-iterable sequence).
        max_vocabulary_size: Total vocabulary size including 'UNK'. When
            None (the default), the module-level ``vocabulary_size`` is used.

    Returns:
        A 4-tuple ``(data, count, dictionary, reverse_dictionary)``:
            data: list of ids, one per input word (0 for words outside the
                vocabulary).
            count: ``[['UNK', unk_count], (word, freq), ...]``.
            dictionary: mapping word -> id.
            reverse_dictionary: mapping id -> word.
    """
    # Resolved lazily so an explicit argument never touches the module global.
    limit = vocabulary_size if max_vocabulary_size is None else max_vocabulary_size

    # The first entry is a mutable list because its count is filled in later.
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(limit - 1))

    # Ids follow the order of `count`: 'UNK' -> 0, most frequent word -> 1, ...
    dictionary = {}
    for word, _ in count:
        dictionary[word] = len(dictionary)

    data = []
    unk_count = 0  # number of words that fall outside the vocabulary
    for word in words:
        index = dictionary.get(word)
        if index is None:
            # Not in the vocabulary: map to the 'UNK' id and tally it.
            index = 0
            unk_count += 1
        data.append(index)

    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))

    return data, count, dictionary, reverse_dictionary

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=324398127&siteId=291194637