1、导入库和数据
import graphlab
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)
people = graphlab.SFrame('people_wiki.gl/')
people.head()
2、word_count
obama文章
obama = people[people['name'] == 'Barack Obama']
clooney = people[people['name'] == 'George Clooney']
obama word_count
obama['word_count'] = graphlab.text_analytics.count_words(obama['text'])
obama_word_count_table = obama[['word_count']].stack('word_count', new_column_name = ['word','count'])
obama_word_count_table.head()
obama_word_count_table.sort('count',ascending=False)
3、TF-IDF
people['word_count'] = graphlab.text_analytics.count_words(people['text'])
people.head()
tfidf = graphlab.text_analytics.tf_idf(people['word_count'])
# Earlier versions of GraphLab Create returned an SFrame rather than a single SArray
# This notebook was created using Graphlab Create version 1.7.1
if graphlab.version <= '1.6.1':
tfidf = tfidf['docs']
tfidf
people['tfidf'] = tfidf
4、word_count和tf_idf拆分
obama = people[people['name'] == 'Barack Obama']
obama[['tfidf']].stack('tfidf',new_column_name=['word','tfidf']).sort('tfidf',ascending=False)
5、cosine distance
clinton = people[people['name'] == 'Bill Clinton']
beckham = people[people['name'] == 'David Beckham']
graphlab.distances.cosine(obama['tfidf'][0],clinton['tfidf'][0])
graphlab.distances.cosine(obama['tfidf'][0],beckham['tfidf'][0])
6、建立模型
knn_model = graphlab.nearest_neighbors.create(people,features=['tfidf'],label='name')
knn_model.query(obama)