# -*- coding: utf-8 -*-
"""
Created on Mon Dec 28 19:00:38 2020
@author: David
"""
##########################################################
'''
Novel-scraping section
'''
import requests
import parsel
from lxml import etree
import requests
import lxml.html
# Request headers: spoof a desktop Chrome user-agent so the site serves the
# page instead of blocking the default python-requests UA.
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}
# Download a page with the given request headers.
def get_page(url, headers, timeout=10):
    """Fetch *url* and return the decoded HTML text, or None on failure.

    Parameters
    ----------
    url : str
        Page URL to download.
    headers : dict
        HTTP request headers (should include a browser user-agent).
    timeout : float, optional
        Seconds to wait for the server (new parameter, default 10; the
        original had no timeout and could hang forever on a dead host).

    Returns
    -------
    str or None
        Decoded page text, or None if the request failed.
    """
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        # The site may not declare its charset; trust requests' sniffer.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        # Best-effort scraper: report the failure and let the caller
        # handle the None result. (Narrowed from bare Exception so real
        # programming errors still propagate.)
        print(e)
        return None
# Fetch the raw HTML of one chapter page.
text = get_page('https://www.bqkan.com/1_1094/5403177.html',headers=headers)
# Parse the HTML into an lxml element tree.
text_html = etree.HTML(text,etree.HTMLParser())
# The chapter body lives in the element with id="content".
p = text_html.xpath('//*[@id="content"]')
# string(.) flattens the text of nested child tags into one string.
p2 = p[0].xpath('string(.)').strip()
print(p2,end='\n')
# Drop trailing site boilerplate and the injected JS call.
# NOTE(review): the -379 offset is specific to this page's footer — confirm
# before reusing on other chapters.
p2 = p2[:-379].replace('app2();read2();',' ')
# Earlier run that cached the cleaned text to disk (kept for reference):
# with open(r'C:\Users\David\Desktop\book.txt','w',encoding='utf-8') as f:
# f.write(p2)
# Re-load the cached chapter text; this OVERWRITES the freshly scraped p2,
# so the rest of the script actually processes the cached file.
with open(r'C:\Users\David\Desktop\book.txt','r',encoding='utf-8') as f:
    p2 = f.read()
p2  # bare expression: only echoes the value in an interactive session
#############################################
#############################################
# Database storage section: insert the chapter text into novel.xiao.
import pymysql

# Connect straight to the `novel` schema — the original also executed a
# redundant `USE novel` even though database= already selects it.
conn = pymysql.connect(host='localhost', user='root', password='root',
                       database='novel', port=3306)
try:
    # pymysql cursors support the context-manager protocol; this closes
    # the cursor even if execute() raises (the original never closed it).
    with conn.cursor() as cursor:
        novel_id = 1  # renamed from `id`, which shadowed the builtin
        title = '他叫白小纯'
        content = p2
        # Parameterized INSERT: pymysql escapes the values, which also
        # protects against SQL injection from the scraped text.
        sql = 'INSERT INTO xiao(id, title, content) values(%s,%s, %s)'
        cursor.execute(sql, (novel_id, title, content))
    conn.commit()
finally:
    # Always release the connection, even when the insert fails.
    conn.close()
########################################
# HanLP natural-language-processing section.
'''
HanLP自然语言处理部分
'''
# As an end user, first load a pretrained model from disk or the network.
# Here LARGE_ALBERT_BASE is a Chinese word-segmentation (tokenizer) model.
import hanlp
tokenizer = hanlp.load('LARGE_ALBERT_BASE')
# Clean the cached chapter text before segmentation:
# remove full-width spaces, NBSP, opening quotation marks and ellipses
# that would otherwise end up in the token stream.
p3 = p2.replace( ' ' , '' )
p4 = p3.replace( '\xa0' , '' ).replace( '“' , '' ).replace( '……' , '' )
# Split into "sentences" on the Chinese full stop.
p5 = p4.split("。")
# Tokenize every sentence: list[str] -> list[list[str]].
cut_word = tokenizer(p5)
# Named-entity recognition: input is a list of token lists, output gives
# entity boundaries and categories.
# English example (for reference):
# recognizer = hanlp.load(hanlp.pretrained.ner.CONLL03_NER_BERT_BASE_UNCASED_EN)
# recognizer(["President", "Obama", "is", "speaking", "at", "the", "White", "House"])
# [('Obama', 'PER', 1, 2), ('White House', 'LOC', 6, 8)]
# Each recognized entity is formatted as (entity, type, begin, end).
# Person-name / place-name recognition with the Chinese MSRA BERT model.
recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH)
last_word = recognizer(cut_word)
'''
Background note (from the HanLP docs) on HanLP's HMM role-tagging NER models:
1. Chinese person names (on by default)        -> tag nr
2. Transliterated person names (on by default) -> tag nrf
3. Japanese person names (off by default)      -> tag nrj
4. Place names (off by default)                -> tag ns
5. Organization names (off by default)         -> tag nt
These five come from the HMM-based role-tagging recognizer (fast). The MSRA
BERT model used above emits upper-case tags such as NR / NS instead.
'''
# Drop empty sentences, then split the recognized entities into person
# names (tag "NR") and place names (tag "NS"). Each entity is a tuple of
# the form (entity, type, begin, end); we keep only the surface string.
last_word = [sentence for sentence in last_word if sentence]
nr = [entity[0]
      for sentence in last_word
      for entity in sentence
      if entity[1] == "NR"]
ns = [entity[0]
      for sentence in last_word
      for entity in sentence
      if entity[1] == "NS"]
####################################
# Word-cloud related imports (matplotlib kept for parity with the file).
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np
# WordCloud.generate() expects one whitespace-separated string, so join
# the entity lists with spaces.
name =' '.join(nr)
# Strip stray spaces inside place names before joining.
ns = [i.replace(" ","") for i in ns if len(i) != 0]
place =' '.join(ns)
# NOTE(review): the original also ran jieba.cut(name) / jieba.cut(place)
# here, but the resulting generators were never consumed anywhere in the
# file — dead code removed. The entities are already tokenized.
# Word-cloud rendering. The name cloud and the place cloud were identical
# code apart from input text and output path, so factor it into a helper.
def _save_wordcloud(words, out_path):
    """Render *words* as a circular word cloud and save the image to *out_path*.

    words : str
        Space-separated words to draw.
    out_path : str
        Destination image file path.
    """
    # Circular mask: pixels farther than 130px from the centre of a
    # 300x300 grid are set to 255 (= blocked for WordCloud).
    x, y = np.ogrid[:300, :300]
    mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
    mask = 255 * mask.astype(int)
    wc = WordCloud(font_path="C:/Windows/Fonts/simfang.ttf",
                   background_color="white", repeat=True, mask=mask)
    wc.generate(words).to_file(out_path)

# 姓名 = person names, 地名 = place names.
_save_wordcloud(name, r'C:\Users\David\Desktop\姓名.png')
_save_wordcloud(place, r'C:\Users\David\Desktop\地名.png')
# Adapted from: blog.csdn.net/weixin_44322234/article/details/112005512
# ("python 小说爬取+HanLP分词+词云" — novel scraping + HanLP segmentation + word clouds)
# (The copied web-page chrome that followed the article text was not Python
# and has been converted to this comment so the file can be imported.)