Simple jieba Word Segmentation and Word Frequency Counting

The script below reads a text file, strips punctuation, segments the text with jieba, writes the space-separated tokens to an intermediate file, then counts word frequencies and prints the five most frequent words as JSON.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import json
import jieba

filename = "rowss.txt"

# Read the raw text, strip ASCII and full-width punctuation, segment it
# with jieba, and save the space-separated tokens to an intermediate file.
with open(filename, encoding="utf-8") as f:
    mytext = f.read()
mytext = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、|~@#¥%……&*()]+", "", mytext)
mytext = " ".join(jieba.cut(mytext))
with open("row2.txt", "w", encoding="utf-8") as f1:
    f1.write(mytext)

# Count how often each word appears. Each word is counted exactly once;
# the original nested loops re-scanned every previously read line on
# every iteration, inflating the counts.
word_dict = {}
with open("row2.txt", encoding="utf-8") as f2, \
        open("row4.txt", "w", encoding="utf-8") as f3:
    for line in f2:
        for word in line.split(' '):
            word = word.strip()
            if word:
                word_dict[word] = word_dict.get(word, 0) + 1
    # Write every word with its count, then print the five most frequent.
    for key in word_dict:
        print(key, word_dict[key])
        f3.write(key + ' ' + str(word_dict[key]) + '\n')
    sort = sorted(word_dict.items(), key=lambda e: e[1], reverse=True)  # sort is a list
    print(json.dumps(sort[:5], ensure_ascii=False))
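
For comparison, the whole pipeline condenses to a few lines with collections.Counter. This is a minimal sketch, assuming the same rowss.txt input file and skipping the intermediate row2.txt/row4.txt files:

import re
import jieba
from collections import Counter

with open("rowss.txt", encoding="utf-8") as f:
    text = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、|~@#¥%……&*()]+", "", f.read())

# Counter consumes the jieba generator directly, and most_common(5)
# replaces the manual sort-and-slice step.
for word, freq in Counter(jieba.cut(text)).most_common(5):
    print(word, freq)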

Reposted from blog.csdn.net/zhangmary/article/details/80642244