- 豆瓣电影
豆瓣评论分析:
1). 获取豆瓣最新上映的所有电影的前10页评论信息;
2). 清洗数据;
3). 将每部电影的评论信息绘制成词云, 保存为png图片, 文件名为: 电影名.png;
import requests
from bs4 import BeautifulSoup
import re
import jieba
import wordcloud
import numpy
from PIL import Image
from concurrent.futures import ThreadPoolExecutor
def get_movie(url):
    """Fetch a movie listing page and extract the movies shown on it.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of dicts, one per ``<li class="list-item">`` element found,
        each with the keys ``'title'`` (the element's ``data-title``
        attribute) and ``'id'`` (the element's ``id`` attribute).

    Raises:
        KeyError: If a matching element lacks ``data-title`` or ``id``.
    """
    # A timeout keeps a stalled connection from blocking the script forever.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    return [
        {'title': item['data-title'], 'id': item['id']}
        for item in soup.find_all('li', class_='list-item')
    ]
def get_info(id, pageNum):
    """Download one page of short comments for a movie.

    Args:
        id: Subject id of the movie, interpolated into the comments URL.
        pageNum: 1-based page number; every page holds 20 comments, so
            page 1 starts at offset 0, page 2 at offset 20, and so on.

    Returns:
        The text of every ``<span class="short">`` element on the page,
        concatenated into a single string (empty if none are found).
    """
    start = 20 * (pageNum - 1)
    url = 'https://movie.douban.com/subject/%s/comments?start=%s&limit=20&sort=new_score&status=P' % (id, start)
    # A timeout keeps a stalled connection from blocking a worker forever.
    content = requests.get(url, timeout=10).text
    soup = BeautifulSoup(content, 'html.parser')
    return ''.join(tag.text for tag in soup.find_all('span', class_='short'))
def word_cloud(comment, name):
    """Render the comment text as a word cloud saved to ``./img/<name>.png``.

    Args:
        comment: Raw comment text for one movie.
        name: Movie title; used as the output file name (without suffix).

    The text is cleaned by keeping only runs of Chinese characters or ASCII
    letters, segmented with jieba, and drawn using ``./image.jpg`` as the
    mask and ``./msyh.ttf`` as the font.
    """
    import os  # local import: only needed to create the output directory

    # Drop punctuation, digits and whitespace; the kept runs are glued
    # together before segmentation.
    pattern = re.compile(r'([\u4e00-\u9fa5]+|[a-zA-Z]+)')
    newComments = ''.join(pattern.findall(comment))
    result = jieba.lcut(newComments)

    # Close the mask image as soon as its pixels are copied into the array.
    with Image.open('./image.jpg') as imageObj:
        cloud_mask = numpy.array(imageObj)

    wc = wordcloud.WordCloud(
        background_color='snow',
        mask=cloud_mask,
        font_path='./msyh.ttf',
        min_font_size=5,
        max_font_size=50,
        width=260,
        height=260,
    )
    wc.generate(','.join(result))

    # Make sure the target directory exists before writing the image.
    os.makedirs('./img', exist_ok=True)
    wc.to_file('./img/%s.png' % (name))
def main():
    """Build one word cloud per now-playing movie from its first 10 comment pages.

    The comment pages of a movie are downloaded concurrently on a shared
    thread pool, joined in page order, and handed to ``word_cloud``.
    """
    url = 'https://movie.douban.com/cinema/nowplaying/xian/'
    movies = get_movie(url)
    pages = range(1, 11)  # pages 1..10 inclusive

    # One pool for the whole run; the ``with`` block shuts it down at the end.
    with ThreadPoolExecutor(max_workers=10) as pool:
        for movie in movies:
            movie_id = movie['id']
            name = movie['title']
            # map() takes the callable plus one iterable per parameter and
            # yields the results in the same order as the pages.
            page_texts = pool.map(get_info, [movie_id] * len(pages), pages)
            word_cloud(''.join(page_texts), name)


main()
- 慕课网
爬取慕课网所有关于python的课程名及描述信息, 并通过词云进行分析展示;
import re
import requests
from bs4 import BeautifulSoup
import jieba
import numpy
from PIL import Image
import wordcloud
def get_html(url):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response text as provided by ``requests``.
    """
    # A timeout keeps a stalled connection from blocking the script forever.
    return requests.get(url, timeout=10).text
def get_name(text):
    """Extract course names and descriptions from a search-result page.

    Args:
        text: HTML source of the page.

    Returns:
        A list of dicts, one per ``<div class="course-item-detail">`` that
        contains Chinese text. ``'name'`` is the first run of Chinese
        characters found in the element's markup and ``'info'`` is the list
        of all remaining runs. Elements without any Chinese text are skipped.
    """
    soup = BeautifulSoup(text, 'html5lib')
    chinese_run = re.compile(r'[\u4E00-\u9FA5]+')
    info_li = []
    for item in soup.find_all('div', class_="course-item-detail"):
        runs = chinese_run.findall(str(item))
        if not runs:
            # Nothing to use as a name; indexing runs[0] would raise.
            continue
        info_li.append({'name': runs[0], 'info': runs[1:]})
    return info_li
def word_cloud(text):
    """Render ``text`` as a word cloud and save it to ``./muke.png``.

    Args:
        text: Text to segment with jieba and draw.

    ``./image.jpg`` is used as the mask and ``./msyh.ttf`` as the font.
    """
    # Close the mask image as soon as its pixels are copied into the array.
    with Image.open('./image.jpg') as imgobj:
        cloud_mask = numpy.array(imgobj)

    result = jieba.lcut(text)
    wc = wordcloud.WordCloud(
        width=500,
        mask=cloud_mask,
        max_font_size=50,
        min_font_size=5,
        background_color='snow',
        font_path = './msyh.ttf',
    )
    wc.generate(','.join(result))
    wc.to_file('./muke.png')
def main():
    """Collect course names and descriptions from search pages 1-2 and draw them.

    Every course contributes its name followed by its description runs
    joined together; all pieces are concatenated and passed to
    ``word_cloud``.
    """
    pieces = []
    for page in range(1, 3):
        url = 'https://www.imooc.com/search/course?words=python&page=%d' % page
        for course in get_name(get_html(url)):
            pieces.extend((course['name'], ''.join(course['info'])))
    word_cloud(''.join(pieces))


main()
执行结果:
- python爬取今日百度热点前10的新闻
from bs4 import BeautifulSoup
from urllib.request import urlopen
def get_html(url):
    """Download ``url`` and return its body decoded as GB2312 text.

    Args:
        url: Address of the resource to fetch.

    Returns:
        The response body decoded with the ``gb2312`` codec.

    Raises:
        UnicodeDecodeError: If the body is not valid GB2312.
    """
    # The context manager closes the connection once the body is read;
    # the timeout keeps a stalled connection from blocking forever.
    with urlopen(url, timeout=10) as response:
        return response.read().decode('gb2312')
def get_info(text):
    """Return the text of the first ten ``<a class="list-title">`` links.

    Args:
        text: HTML source of the page.

    Returns:
        A list with the ``.string`` of each of the first ten matching
        links; shorter if the page contains fewer than ten.
    """
    soup = BeautifulSoup(text, 'html5lib')
    info_li = soup.find_all('a', class_='list-title')
    # Slicing tolerates pages with fewer than ten entries.
    return [tag.string for tag in info_li[:10]]
def main():
    """Print the top news titles from the Baidu buzz page, one per line."""
    url = 'http://top.baidu.com/buzz?b=341'
    # A plain loop: printing is a side effect, so no throwaway list is built.
    for title in get_info(get_html(url)):
        print(title)


main()
执行结果: