Python novel scraping + HanLP word segmentation + word clouds


# -*- coding: utf-8 -*-
"""
Created on Mon Dec 28 19:00:38 2020

@author: David
"""
##########################################################
'''
    Scraping the novel text
'''

import requests
from lxml import etree

# Request headers
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
}


# Fetch a page using the request headers
def get_page(url,headers):
    try:
        r=requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding=r.apparent_encoding
        return r.text
    except Exception as e:
        print(e)

# Download the chapter page
text = get_page('https://www.bqkan.com/1_1094/5403177.html',headers=headers)

# Parse the HTML
text_html = etree.HTML(text,etree.HTMLParser())

# Extract the novel body text
p = text_html.xpath('//*[@id="content"]')
p2 = p[0].xpath('string(.)').strip()
# string(.) extracts the text of a tag together with all of its nested tags.
print(p2)
p2 = p2[:-379].replace('app2();read2();', ' ')
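
# A minimal, self-contained sketch (with a made-up snippet) of why string(.)
# is used above: node.text only returns the text before the first child tag,
# while string(.) concatenates the text of every nested tag.
demo_node = etree.HTML('<div id="content">第一段<br/>第二段<br/>第三段</div>').xpath('//*[@id="content"]')[0]
print(demo_node.text)                # -> 第一段
print(demo_node.xpath('string(.)'))  # -> 第一段第二段第三段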

# (Optional) save the scraped chapter to disk:
# with open(r'C:\Users\David\Desktop\book.txt','w',encoding='utf-8') as f:
#     f.write(p2)

# Reload the chapter text saved on a previous run
with open(r'C:\Users\David\Desktop\book.txt','r',encoding='utf-8') as f:
    p2 = f.read()
p2


#############################################
'''
    Store the data in MySQL
'''
import pymysql
conn = pymysql.connect(host='localhost',user='root',password='root',database='novel',port=3306)
cursor = conn.cursor()
 
id = 1
title = '他叫白小纯'
content = p2
# Insert statement format: INSERT INTO table_name(columns) VALUES(values)

sql = 'USE novel'
cursor.execute(sql)
conn.commit()


sql = 'INSERT INTO xiao(id, title, content) values(%s,%s, %s)'
# Execute the SQL statement and commit it to the database
cursor.execute(sql, (id,title, content))
conn.commit()
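
# Optional sanity check (a sketch, assuming the xiao table columns used above):
# read the row back before closing the connection.
cursor.execute('SELECT id, title FROM xiao WHERE id = %s', (id,))
print(cursor.fetchone())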
 
# Close the cursor and the database connection
cursor.close()
conn.close()

########################################
'''
  HanLP natural language processing
'''
# As an end user, the first step is to load a pretrained model from disk or the
# network. Here two lines of code load a tokenization model named LARGE_ALBERT_BASE.
import hanlp
tokenizer = hanlp.load('LARGE_ALBERT_BASE')
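
# Quick sanity check (a sketch; the exact segmentation depends on the model
# version): the tokenizer accepts a single sentence string or a list of
# sentences and returns the corresponding token list(s).
print(tokenizer('他叫白小纯'))                 # e.g. ['他', '叫', '白小纯']
print(tokenizer(['他叫白小纯', '商品和服务']))  # one token list per sentence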

# Chinese word segmentation

# Clean the text: strip spaces, non-breaking spaces, opening quotes and
# ellipses, then split it into sentences on the Chinese full stop.
p3 = p2.replace(' ', '')
p4 = p3.replace('\xa0', '').replace('“', '').replace('……', '')
p5 = p4.split("。")

        
# Tokenize every sentence
cut_word = tokenizer(p5)


# Named entity recognition

# The NER module takes a list of words as input and outputs the boundaries and
# categories of the named entities.

# recognizer = hanlp.load(hanlp.pretrained.ner.CONLL03_NER_BERT_BASE_UNCASED_EN)
# recognizer(["President", "Obama", "is", "speaking", "at", "the", "White", "House"])
# [('Obama', 'PER', 1, 2), ('White House', 'LOC', 6, 8)]


# The Chinese NER model works at the character level, so remember to use list()
# to turn a sentence string into a list of characters. Each result has the form
# (entity, type, begin, end).

# Recognize person and place names
recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH)
last_word = recognizer(cut_word)
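
# A sketch of the character-level usage described above (the sentence is made
# up and the exact output depends on the model): wrap list(sentence) in an
# outer list; each result is a list of (entity, type, begin, end) tuples.
demo_ner = recognizer([list('他叫白小纯,住在纽约。')])
print(demo_ner)   # e.g. [[('白小纯', 'NR', 2, 5), ('纽约', 'NS', 8, 10)]]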


'''
HanLP 1.x also ships five HMM role-tagging based recognizers (fast):

1. Chinese person names    (on by default)    tagged nr
2. Transliterated names    (on by default)    tagged nrf
3. Japanese person names   (off by default)   tagged nrj
4. Place names             (off by default)   tagged ns
5. Organization names      (off by default)   tagged nt

Note that the MSRA BERT model loaded above returns uppercase labels
(NR, NS, NT, ...), which is why the filtering below checks "NR" and "NS".
'''

# Filter out empty results, then split the entities into person names (NR) and place names (NS)
last_word = [i for i in last_word if len(i) != 0]

nr = []
ns = []
for i in last_word:
    for j in i:
        if j[1] == "NR":
            nr.append(j[0])
        if j[1] == "NS":
            ns.append(j[0])
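
# Optional: a small sketch using the standard-library Counter to peek at the
# most frequent person and place names before drawing the word clouds.
from collections import Counter
print(Counter(nr).most_common(10))
print(Counter(ns).most_common(10))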
            
####################################
            


# Imports for the word clouds
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import numpy as np 


# Join the name / place lists into space-separated strings
name =' '.join(nr)

ns = [i.replace(" ","") for i in ns if len(i) != 0]
place =' '.join(ns)


# Segment with jieba (these generators are not actually used below;
# WordCloud.generate() splits the space-joined strings on its own)
name_cut = jieba.cut(name)
place_cut = jieba.cut(place)


# Word cloud of person names
x, y = np.ogrid[:300, :300]
mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2   # circular mask
mask = 255 * mask.astype(int)
wc = WordCloud(font_path="C:/Windows/Fonts/simfang.ttf", background_color="white", repeat=True, mask=mask)
wordcloud=wc.generate(name)
wordcloud.to_file(r'C:\Users\David\Desktop\姓名.png')
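
# matplotlib is imported above but not otherwise used; a minimal sketch to
# preview the generated cloud on screen in addition to saving the PNG.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()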



# Word cloud of place names
x, y = np.ogrid[:300, :300]
mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2   # circular mask
mask = 255 * mask.astype(int)
wc = WordCloud(font_path="C:/Windows/Fonts/simfang.ttf", background_color="white", repeat=True, mask=mask)
wordcloud=wc.generate(place)
wordcloud.to_file(r'C:\Users\David\Desktop\地名.png')
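
# Alternative sketch: instead of a space-joined string, feed explicit counts
# via WordCloud.generate_from_frequencies, so word sizes reflect how often
# each place name actually appears (the output filename is illustrative).
from collections import Counter
freq_cloud = wc.generate_from_frequencies(Counter(ns))
freq_cloud.to_file(r'C:\Users\David\Desktop\地名_频次.png')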

Reposted from blog.csdn.net/weixin_44322234/article/details/112005512