Foreword:
Through this article you will learn:
1. How to use python to achieve word segmentation statistics
2. How to use python to implement the TF_IDF algorithm
3. How to use python to realize word cloud display,
4. Get a data gift package
The data and tutorials in this article have been packaged and placed here. If necessary, you can click to download:
https://download.csdn.net/download/Captain_DUDU/87140215
Follow this account to get the latest information~
Also on the WeChat official account: captain_data
Text begins:
I saw an expert in my social feed reselling his courses, and since I happened to be studying recently, I went to ask about them. Looking at the rich resources and courses, I was stunned — so I gritted my teeth and, with tears in my eyes, spent more than half a year's savings to buy this collection.
I took a look, and the courses are indeed rich — from several well-known training institutions — covering data analysis, algorithms, big data, NLP, autonomous driving, product management, development, front-end, back-end, operations, and more. It has everything, and it was worth more than half a year of my savings. I am so happy~
After getting the course data:
I took a closer look at the course information inside. There are 842 organized in the table, and there are scattered ones at the bottom. It is estimated that there are more than 3,000 learning resources ~ all of which are resources of large institutions.
Let's use nlp word segmentation technology to see and analyze what's in this package:
First come to the conclusion:
The main keywords in this document are as follows. Shout out, which industry is the hottest? ? ? ?
'data'! ! ! !
'data'! ! ! !
'data'! ! ! !
I came to Shenzhen only for 3 things! ! Data data or TM* d data! ! !
Briefly introduce how to implement word segmentation technology
Step 1: Paste the data in the document into the text (there are too many resources, it is inconvenient to save it in a table)
Step 2: Read text data:
Before reading the text data, first import several packages to be used in this analysis: specifically jieba word segmentation and word cloud package
import jieba
# import jieba.posseg as jp #lcut cut 分词,获取词性 i.flag 词性 ,i.word 词
# import jieba.posseg as
import jieba.posseg as jp, jieba
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import datetime
from gensim import corpora, models
import warnings
warnings.filterwarnings("ignore")
# jieba.set_dictionary("./in_files/dict.txt") ## //指定dict.txt加载路径,为了方便部署,使用相对路径。
# # jieba.initialize()
Read text data:
# Read the raw course text; the file was exported as UTF-8.
# (Fix: the `with` body must be indented — the pasted snippet was flattened.)
with open(r'.\in_files\learn_data.txt', 'r', encoding='utf8') as f:
    txt = f.read()
The result after reading is as follows:
Step Three: Simple Cleaning
As you can see, there are a lot of newline characters "\n \t", etc., so we use the replace method to clean it~
txt=txt.replace("\n","").replace("\t","")
Step 4: Segment words and remove stop words, set word cloud and word frequency specifications
# Load the stop-word list; quoting=3 (csv.QUOTE_NONE) keeps quote characters
# that appear inside stop words from confusing the parser.
stopwords = pd.read_table("in_files/stopwords.txt", index_col=False, quoting=3,
                          names=['stopword'], encoding="utf-8")
# Hoist the stop words into a set: O(1) membership test per token instead of
# scanning the whole numpy array (`stopwords.values`) on every word.
stopword_set = set(stopwords['stopword'])
# Tokenize with jieba, drop stop words, then drop single-character tokens and
# literal 'nan' strings.
w_st_2_str_list = [w for w in jieba.lcut(txt) if w not in stopword_set]
w_st_2_str_list_clean = [i for i in w_st_2_str_list if len(i) > 1 and i != 'nan']  # word-frequency format
word_cloud_new = " ".join(w_st_2_str_list_clean)  # word-cloud format: one space-separated string
Step 5: Call the word cloud and word frequency (tfidf = TfidfVectorizer) statistical code to directly count the number and frequency of keywords
A time naming function is used in this export to prevent name duplication~
# w_st_2_str_list_clean=
def path_name_time():
global path_nameTime
timestr = datetime.datetime.now().strftime('%Y%m%d%H%M%S') ###生成当下的时间
path_nameTime = timestr
# os.mkdir('out/汇总数据/' + path_nameTime)
# os.mkdir('out/清洗数据/' + path_nameTime)
return timestr
print("--开始词频统计--")
bow_model=corpora.Dictionary([w_st_2_str_list_clean])
arr_count=np.array(list(bow_model.doc2bow(w_st_2_str_list_clean)))[:,1]
arr_keys=np.array(list(bow_model.items()))
df_bow=pd.DataFrame(arr_keys,columns=['num','keywords'])
df_bow['count_times']=arr_count
print("--开始抽取关键字--")
tfidf = TfidfVectorizer(smooth_idf=True,norm='l1')
d= tfidf.fit_transform([word_cloud_new]) #" ".join(ciyun_ci)
TF_result = pd.DataFrame(d.toarray())
TF_result.columns = tfidf.get_feature_names()
TF_result_T=TF_result.T
TF_result_T.columns=['times_p']
TF_result_T['keywords']=TF_result_T.index
# TF_result_T['times']=TF_result_T['times_p']*(len(TF_result_T))
col_index=['keywords','count_times','times_p']
file_name='./result/TF_result_T'+path_name_time()
TF_result_T=TF_result_T.merge(df_bow, on='keywords', how='left')
TF_result_T=TF_result_T[col_index]
print("--开始排序导出--")
TF_result_T.sort_values(by='count_times',ascending=False,inplace=True)
TF_result_T.to_excel(file_name+ '.xlsx',index=False)
## 词云
print('--生成词云图ing--')
ciyun_obj = WordCloud(font_path='simhei.ttf',width=600,height=200,max_font_size=80,min_font_size=10,max_words=100,collocations = False, background_color='white',mask=plt.imread('in_files/tt.jpg'))
ciyun_obj.generate(word_cloud_new) #生成指定的词云图
plt.figure(dpi=150) #词云图缩放比例
plt.imshow(ciyun_obj) #显示词云图
plt.axis('off') #去掉坐标轴
ciyun_obj.to_file(file_name+".jpg") #保存词云
print('--导出关键字文件名为:', file_name)
result:
The full code is here:
import jieba
# import jieba.posseg as jp #lcut cut 分词,获取词性 i.flag 词性 ,i.word 词
# import jieba.posseg as
import jieba.posseg as jp, jieba
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import datetime
from gensim import corpora, models
import warnings
warnings.filterwarnings("ignore")
# jieba.set_dictionary("./in_files/dict.txt") ## //指定dict.txt加载路径,为了方便部署,使用相对路径。
# # jieba.initialize()
# --- full pipeline: read -> clean -> tokenize -> count / TF-IDF -> word cloud ---
# (Fix: all block bodies re-indented — the pasted snippet was flattened and invalid.)
with open(r'.\in_files\learn_data.txt', 'r', encoding='utf8') as f:
    txt = f.read()
txt = txt.replace("\n", "").replace("\t", "")  # strip newlines / tabs

# Stop words; quoting=3 (QUOTE_NONE) keeps quote characters inside words intact.
stopwords = pd.read_table("in_files/stopwords.txt", index_col=False, quoting=3,
                          names=['stopword'], encoding="utf-8")
stopword_set = set(stopwords['stopword'])  # O(1) membership per token
w_st_2_str_list = [w for w in jieba.lcut(txt) if w not in stopword_set]
w_st_2_str_list_clean = [i for i in w_st_2_str_list if len(i) > 1 and i != 'nan']  # word-frequency format
word_cloud_new = " ".join(w_st_2_str_list_clean)  # word-cloud format


def path_name_time():
    """Return the current time as YYYYmmddHHMMSS (also kept in global path_nameTime)."""
    global path_nameTime
    timestr = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    path_nameTime = timestr
    return timestr


print("--开始词频统计--")
# Bag-of-words: Dictionary maps token -> id, doc2bow yields (id, count) pairs.
bow_model = corpora.Dictionary([w_st_2_str_list_clean])
arr_count = np.array(list(bow_model.doc2bow(w_st_2_str_list_clean)))[:, 1]
arr_keys = np.array(list(bow_model.items()))
df_bow = pd.DataFrame(arr_keys, columns=['num', 'keywords'])
df_bow['count_times'] = arr_count

print("--开始抽取关键字--")
# Single document => uniform idf, so times_p is the l1-normalised term frequency.
tfidf = TfidfVectorizer(smooth_idf=True, norm='l1')
d = tfidf.fit_transform([word_cloud_new])
TF_result = pd.DataFrame(d.toarray())
TF_result.columns = tfidf.get_feature_names_out()  # get_feature_names() removed in sklearn 1.2
TF_result_T = TF_result.T
TF_result_T.columns = ['times_p']
TF_result_T['keywords'] = TF_result_T.index

col_index = ['keywords', 'count_times', 'times_p']
file_name = './result/TF_result_T' + path_name_time()
TF_result_T = TF_result_T.merge(df_bow, on='keywords', how='left')
TF_result_T = TF_result_T[col_index]
print("--开始排序导出--")
TF_result_T.sort_values(by='count_times', ascending=False, inplace=True)
TF_result_T.to_excel(file_name + '.xlsx', index=False)

## word cloud
print('--生成词云图ing--')
ciyun_obj = WordCloud(font_path='simhei.ttf',  # CJK-capable font
                      width=600, height=200,
                      max_font_size=80, min_font_size=10, max_words=100,
                      collocations=False, background_color='white',
                      mask=plt.imread('in_files/tt.jpg'))
ciyun_obj.generate(word_cloud_new)
plt.figure(dpi=150)   # display scale
plt.imshow(ciyun_obj)
plt.axis('off')
ciyun_obj.to_file(file_name + ".jpg")
print('--导出关键字文件名为:', file_name)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2022/11/23 11:48
# @Author : Captain
# @Site :
# @File : nlp_keywords.py
# @Software: PyCharm
import jieba
# import jieba.posseg as jp #lcut cut 分词,获取词性 i.flag 词性 ,i.word 词
# import jieba.posseg as
import jieba.posseg as jp, jieba
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import datetime
from gensim import corpora, models
import warnings
warnings.filterwarnings("ignore")
# jieba.set_dictionary("./in_files/dict.txt") ## //指定dict.txt加载路径,为了方便部署,使用相对路径。
# # jieba.initialize()
def path_name_time():
    """Set the timestamped output path in the global ``file_name`` and
    return the bare ``YYYYmmddHHMMSS`` timestamp.

    (Fix: the function body must be indented — the pasted snippet was flattened.)
    """
    global file_name
    timestr = datetime.datetime.now().strftime('%Y%m%d%H%M%S')  # current timestamp
    file_name = './result/TF_result_T' + timestr
    return timestr
def split_data(any_df):
    """Tokenize the ``words`` column of *any_df* with jieba, dropping stop words.

    Expects columns ``['num_index', 'words']``; returns the frame with an added
    '分词' column holding each row's token list.
    (Fix: body re-indented; stop-word lookup hoisted into a set so each token is
    an O(1) membership test instead of scanning ``stopwords.values``.)
    """
    df_captain_all = any_df
    jieba.load_userdict('in_files/add_keywords.txt')  # domain-specific user dictionary
    col_first = ['num_index', 'words']
    df_captain = df_captain_all[col_first]
    # df_captain = df_captain.drop_duplicates(col_first, keep='first')  # drop duplicate rows
    # load stop words
    stopwords = pd.read_table("in_files/stopwords.txt",
                              index_col=False, quoting=3,
                              names=['stopword'],
                              encoding="utf-8")
    stopword_set = set(stopwords['stopword'])
    # force the column to str so jieba never sees NaN floats
    df_captain[col_first[1]] = df_captain[col_first[1]].astype('str')
    print("--开始分词--")
    df_captain["分词"] = df_captain[col_first[1]].apply(
        lambda x: [w for w in jieba.lcut(x) if w not in stopword_set])
    return df_captain
# serise 打开存储的分词出的列表为字符串
def format_str_list(any_df):
    """Flatten the per-row token lists in column '分词' into analysis inputs.

    Returns ``[clean_token_list, joined_string]``: the first feeds the
    bag-of-words count, the second the TF-IDF / word-cloud steps.
    Tokens of length 1 and literal 'nan' strings are dropped.
    (Fix: body re-indented — the pasted snippet was flattened.)
    """
    joined_rows = any_df["分词"].apply(lambda toks: " ".join(toks))
    all_tokens = ' '.join(joined_rows).split(' ')
    clean_tokens = [t for t in all_tokens if len(t) > 1 and t != 'nan']  # word-frequency format
    word_cloud_new = " ".join(clean_tokens)  # word-cloud / tfidf format
    return [clean_tokens, word_cloud_new]
# print("TF与bow计算 ")
def bow_tfidf_analyze(any_list):
    """Compute word counts (gensim BoW) and TF weights, merge and export.

    any_list: ``[token_list, joined_string]`` as produced by format_str_list.
    Writes ``file_name``.xlsx (global set beforehand by path_name_time()).
    (Fix: body re-indented; get_feature_names() was removed in scikit-learn 1.2.)
    """
    out_list = any_list
    w_Series_2_str_list_clean = out_list[0]
    word_cloud_new = out_list[1]
    # --- bag-of-words counts ---
    print("--开始词频统计--")
    bow_model = corpora.Dictionary([w_Series_2_str_list_clean])
    arr_count = np.array(list(bow_model.doc2bow(w_Series_2_str_list_clean)))[:, 1]
    arr_keys = np.array(list(bow_model.items()))
    df_bow = pd.DataFrame(arr_keys, columns=['num', 'keywords'])
    df_bow['count_times'] = arr_count
    # --- TF weights (single document, so idf is uniform) ---
    print("--开始抽取关键字--")
    tfidf = TfidfVectorizer(smooth_idf=True, norm='l1')
    d = tfidf.fit_transform([word_cloud_new])
    TF_result = pd.DataFrame(d.toarray())
    TF_result.columns = tfidf.get_feature_names_out()
    TF_result_T = TF_result.T
    TF_result_T.columns = ['times_p']
    TF_result_T['keywords'] = TF_result_T.index
    col_index = ['keywords', 'count_times', 'times_p']
    TF_result_T = TF_result_T.merge(df_bow, on='keywords', how='left')
    TF_result_T = TF_result_T[col_index]
    print("--开始排序导出--")
    TF_result_T.sort_values(by='count_times', ascending=False, inplace=True)
    TF_result_T.to_excel(file_name + '.xlsx', index=False)
    print("--导出成功--")
def plt_WordCloud(any_str):
    """Render *any_str* (space-separated tokens) as a word cloud, display it,
    and save it as ``file_name``.jpg next to the Excel export.

    (Fix: body re-indented — the pasted snippet was flattened.)
    """
    ## word cloud
    word_cloud_new = any_str
    print('--生成词云图ing--')
    ciyun_obj = WordCloud(
        font_path='simhei.ttf',   # CJK-capable font, otherwise glyphs render as boxes
        width=600, height=200,
        max_font_size=80,
        min_font_size=10,
        max_words=100,
        collocations=False,       # don't merge words into bigrams
        background_color='white',
        mask=plt.imread('in_files/ttren.png'))
    ciyun_obj.generate(word_cloud_new)  # render the cloud
    plt.figure(dpi=150)   # display scale
    plt.imshow(ciyun_obj)
    plt.axis('off')       # hide axes
    ciyun_obj.to_file(file_name + ".jpg")  # save image
    print('--导出关键字文件名为:', file_name)
if __name__ == '__main__':
    # (Fix: guard body re-indented — the pasted snippet was flattened.)
    path_name_time()  # sets the global output path `file_name`
    path_f = input("输入文件地址:")
    # df_wait_clean = pd.read_csv(path_f, sep='\t')
    # NOTE(review): gbk assumes an Excel-exported Chinese CSV — confirm for other sources.
    df_wait_clean = pd.read_csv(path_f, encoding='gbk')
    df_clean = df_wait_clean[['订单编号', '备注']]
    df_clean.columns = ['num_index', 'words']
    df_any_fc = split_data(df_clean)
    any_list = format_str_list(df_any_fc)
    bow_tfidf_analyze(any_list)
    plt_WordCloud(any_list[1])
Remember to like, follow and collect~ If you need resources, private message the blogger~