Python Advanced Application Design Assignment

I. Thematic Web Crawler Design Plan

1. Name of the thematic web crawler

1.1 Lianjia (Homelink) housing-transaction website crawler

2. Crawled content and data feature analysis

2.1 Crawled content

Listing title, community name, price, floor, agent, unit price, and release time.

2.2 Data feature analysis

2.2.1 Build a word cloud of the floor data and visualize it.

2.2.2 Plot a line graph of the release time.

3. Design overview (implementation approach and technical difficulties)

3.1 Implementation approach

Define a Get class: its get_alldata() method fetches a page and parses all of its information, and its get_detail() method extracts the individual fields from the parsed page and stores each record in a dictionary.

3.2 Technical difficulties

The site has anti-crawling mechanisms, so the crawler needs to simulate the behaviour of an ordinary user; a minimal sketch of one way to do this follows.
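As a minimal sketch of what simulating a user can look like in practice (the header string and the pause lengths below are illustrative choices, not requirements of the site):

import time
import random
import requests

# A browser-like User-Agent so the request does not identify itself as a script.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/61.0.3163.100 Safari/537.36',
}

def polite_get(url):
    # Pause a random 1-3 seconds between requests to mimic a human reader.
    time.sleep(random.uniform(1, 3))
    return requests.get(url, headers=HEADERS, timeout=10)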

II. Structural Analysis of the Subject Pages

1. Page structure and features

Each page carries 30 records and 50 pages are crawled, giving 1,500 records in total. Viewing the page source (F12) shows whether any of the required fields are generated dynamically; comparing an arbitrary data item in the source with the rendered page confirms that all of the required data is static, so it can be crawled directly. A quick way to verify this is sketched below.
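One hedged way to run that check: fetch the raw HTML with requests (no JavaScript executes) and test whether a value copied from the rendered page appears in the response body. The marker string here is illustrative.

import requests

url = 'https://sh.lianjia.com/chengjiao/pg1/'
html = requests.get(url, timeout=10).text  # raw page source only; no JS runs

# If text visible on the rendered page is already present in the raw source,
# the field is static and needs no browser engine to crawl.
marker = '成交'  # replace with any value copied from the rendered page
print(marker in html)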

 

 

 

2. HTML page parsing

[Screenshot: the blocks of the page whose contents are to be crawled.]
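The selectors used in Section III assume listing markup along these lines. The fragment below is a reconstruction for illustration only: the class names are taken from the program's own selectors, but the nesting and text values are invented.

from bs4 import BeautifulSoup

# Illustrative fragment mirroring the classes the crawler selects on.
html = '''
<div class="info">
  <div class="title">某小区 2室1厅 75平米</div>
  <div class="flood">中楼层(共18层) 45000元/平</div>
  <div class="dealCycleeInfo">挂牌350万 成交周期35天</div>
  <div class="agentInfoList">张三 免费咨询</div>
</div>
'''

soup = BeautifulSoup(html, 'lxml')
info = soup.select('div[class="info"]')[0]
print(info.find_all('div', class_='title')[0].get_text())  # -> 某小区 2室1厅 75平米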

 

 

 

III. Web Crawler Program Design

1. Data crawling and collection. The crawler program comprises the parts below; the source code is given with detailed comments, and the output of each part is shown after it.

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import seaborn as sns

# Fixed parts of the listing URL; the page number is inserted between them.
url_title = "https://sh.lianjia.com/chengjiao/pg"
url_end = "/"
Total_Data = {}

# Fetches and holds all of the page information.
class Get:
    # Build the list of all page URLs.
    def get_url(self):
        list1 = []
        # 50 pages in total
        for i in range(1, 51):
            url = url_title + str(i) + url_end  # concatenate the parts into a full URL
            list1.append(url)
        return list1

    # Download one page and parse it.
    def get_alldata(self, url):
        # A browser User-Agent gets past the site's anti-crawling checks.
        user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/61.0.3163.100 Safari/537.36')
        headers = {'User-Agent': user_agent}
        data = requests.get(url, headers=headers)
        soup = BeautifulSoup(data.text, 'lxml')
        return soup

    # Extract the title and the other raw fields and store them in a dictionary.
    def get_detail(self, soup):
        for i in range(0, 30):  # 30 listings per page
            data1 = soup.select('div[class="info"]')[i]
            house_title = data1.find_all("div", class_="title")
            house_flood = data1.find_all("div", class_="flood")
            house_dealCycleeInfo = data1.find_all("div", class_="dealCycleeInfo")
            house_agentInfoList = data1.find_all("div", class_="agentInfoList")

            data = {
                'Title': house_title[0].get_text(),
                'flood': house_flood[0].get_text(),
                'dealCycleeInfo': house_dealCycleeInfo[0].get_text(),
                'agentInfoList': house_agentInfoList[0].get_text(),
            }
            Total_Data[data["Title"]] = data
        return Total_Data

House_data = Get()
House_url = House_data.get_url()
for house_item in House_url:
    house_soup = House_data.get_alldata(house_item)
    data = House_data.get_detail(house_soup)

df_house = pd.DataFrame.from_dict(data)
df_house = df_house.T  # transpose so that each row is one listing
df_house.index = range(len(df_house))  # reindex with consecutive integers

# ====================================================
# Data cleaning
df_house['agent'] = df_house['agentInfoList'].apply(lambda x: x.split("免费")[0])
del df_house['agentInfoList']
df_house['price'] = df_house['dealCycleeInfo'].apply(lambda x: x.split("成交")[0]).apply(lambda x: x.split("挂牌")[1])
df_house['time'] = df_house['dealCycleeInfo'].apply(lambda x: x.split("成交")[-1]).apply(lambda x: x.split("周期")[-1])
del df_house['dealCycleeInfo']
df_house['floor'] = df_house['flood'].apply(lambda x: x.split(' ')[0])
df_house['unitprice'] = df_house['flood'].apply(lambda x: x.split(' ')[-1]).apply(lambda x: x.split("楼")[-1])
del df_house['flood']
df_house['time'] = df_house['time'].apply(lambda x: x.split("天")[0])
df_house.to_excel(r'C:\Users\DATACVG\Desktop\1100\lianjiasale.xlsx')

# Word cloud of the floor field
cut_text = "".join(df_house['floor'])
wordcloud = WordCloud(
    # A Chinese font is needed, otherwise the characters render as boxes;
    # this is the usual Windows font path and any other font can be substituted.
    font_path="C:/Windows/Fonts/simfang.ttf",
    # background colour and canvas size
    background_color="white", width=1000, height=880).generate(cut_text)

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# Word cloud of the agent field
cut_text = "".join(df_house['agent'])
wordcloud = WordCloud(
    font_path="C:/Windows/Fonts/simfang.ttf",
    background_color="white", width=1000, height=880).generate(cut_text)

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# =================================================
# Data visualization
# Bar chart of listing counts by deal-cycle time
s = df_house['time'].value_counts()
sns.barplot(x=s.index, y=s)
plt.show()

  

The run proceeds as follows. [Output screenshots not preserved.]

2. Data cleaning and processing

df_house['agent'] = df_house['agentInfoList'].apply(lambda x: x.split("免费")[0])
del df_house['agentInfoList']
# price is the text between "挂牌" and "成交"; time on market follows "周期"
df_house['price'] = df_house['dealCycleeInfo'].apply(lambda x: x.split("成交")[0]).apply(lambda x: x.split("挂牌")[1])
df_house['time'] = df_house['dealCycleeInfo'].apply(lambda x: x.split("成交")[-1]).apply(lambda x: x.split("周期")[-1])
del df_house['dealCycleeInfo']
df_house['floor'] = df_house['flood'].apply(lambda x: x.split(' ')[0])
df_house['unitprice'] = df_house['flood'].apply(lambda x: x.split(' ')[-1]).apply(lambda x: x.split("楼")[-1])
del df_house['flood']
df_house['time'] = df_house['time'].apply(lambda x: x.split("天")[0])
df_house.to_excel(r'C:\Users\DATACVG\Desktop\1100\lianjiasale.xlsx')
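To make the chained splits above concrete, here is a worked example on a made-up dealCycleeInfo string (the exact format of the live field may differ):

raw = '挂牌350万 成交周期35天'  # illustrative raw field text

price = raw.split('成交')[0].split('挂牌')[1]    # '350万 '
days = raw.split('成交')[-1].split('周期')[-1]   # '35天'
days = days.split('天')[0]                       # '35'
print(price.strip(), days)                       # 350万 35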


3. Text analysis (optional): jieba word segmentation and wordcloud visualization

4. Data analysis and visualization (for example: bar charts, histograms, scatter plots, box plots, distribution plots, regression analysis)

# Word cloud of the floor field
cut_text = "".join(df_house['floor'])
wordcloud = WordCloud(
    # A Chinese font is needed, otherwise the characters render as boxes;
    # this is the usual Windows font path and any other font can be substituted.
    font_path="C:/Windows/Fonts/simfang.ttf",
    # background colour and canvas size
    background_color="white", width=1000, height=880).generate(cut_text)

plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# =================================================
# Data visualization
# Bar chart of listing counts by deal-cycle time
s = df_house['time'].value_counts()
sns.barplot(x=s.index, y=s)
plt.show()
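The design plan (2.2.2) also calls for a line graph of the time data. A minimal sketch, assuming the cleaned df_house from above with 'time' holding day counts as strings:

import pandas as pd
import matplotlib.pyplot as plt

# Line graph of listing counts by deal-cycle length in days.
counts = pd.to_numeric(df_house['time'], errors='coerce').value_counts().sort_index()
plt.plot(counts.index, counts.values, marker='o')
plt.xlabel('deal cycle (days)')
plt.ylabel('number of listings')
plt.show()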


5. Data persistence

Write the cleaned data to a CSV file (the program above already saves an Excel copy via to_excel); a sketch follows.
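A hedged sketch of the CSV variant this part names (the path mirrors the Excel one and is illustrative):

# utf-8-sig keeps the Chinese text readable when the CSV is opened in Excel.
df_house.to_csv(r'C:\Users\DATACVG\Desktop\1100\lianjiasale.csv',
                index=False, encoding='utf-8-sig')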


IV. Conclusions

1. What conclusions can be drawn from the analysis and visualization of the subject data?

1.1 Houses with a lower unit price sell more quickly (a quick way to check this against the data is sketched after this list).

1.2 The agent has a strong influence on whether a house sells.

1.3 Buyers lean toward mid-floor homes.
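A rough, hedged way to check conclusion 1.1 against the crawled data, assuming the cleaned df_house from above with 'unitprice' strings such as '45000元/平' and 'time' in days (neither format is guaranteed by the live site):

import pandas as pd

# Strip the unit suffix, then correlate unit price with days on market.
unit = pd.to_numeric(df_house['unitprice'].str.split('元').str[0], errors='coerce')
days = pd.to_numeric(df_house['time'], errors='coerce')
print(unit.corr(days))  # a positive value supports "cheaper sells faster"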

This assignment brought the crawler and the data analysis together and put the knowledge from the course to use. It was a satisfying exercise, and I look forward to every bit of further progress.
