# Python web crawler + data analysis + data visualization (python爬虫+数据分析+数据可视化)
import csv
import pandas as pd
import numpy as np
import asyncio
import aiohttp
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from lxml import etree
# HTTP request headers sent with every page request made by get_page().
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.4071 SLBChan/30 '
    ),
}
async def get_page(url):
    """Download one listing page and write its listings to the CSV.

    The page is requested with the module-level ``headers``, parsed with
    lxml, and one row per listing is written through the module-level
    ``writer`` (the ``csv.writer`` created in the ``__main__`` block).

    Args:
        url: Address of the listing page to fetch.

    Returns:
        None. Rows are written as a side effect.
    """
    async with aiohttp.ClientSession() as session:
        # session.get() is already an async context manager, so it is
        # entered directly instead of being awaited first.
        async with session.get(url=url, headers=headers) as response:
            page_text = await response.text()

    tree = etree.HTML(page_text)
    info = '//div[@class="property-content-info"]'
    comm = '//div[@class="property-content-info property-content-info-comm"]'
    extra = '//div[@class="property-extra"]'

    titles = tree.xpath('//div[@class="property-content-title"]/h3//text()')
    values = tree.xpath('//p[@class="property-price-total"]/span[1]/text()')

    # The layout text nodes are concatenated (single-space nodes skipped)
    # and then cut into fixed 6-character pieces, one piece per listing.
    # NOTE(review): assumes every layout string is exactly 6 characters
    # long - confirm against the page markup.
    layouts = tree.xpath(info + '/p[1]//text()')
    layout_text = ''.join(part for part in layouts if part != ' ')
    layout = [layout_text[6 * k:6 * k + 6] for k in range(len(layout_text) // 6)]

    mi = tree.xpath(info + '/p[2]//text()')
    location = tree.xpath(info + '/p[3]//text()')
    high = tree.xpath(info + '/p[4]//text()')
    build_times = tree.xpath(info + '/p[5]//text()')
    address = tree.xpath(comm + '/p[1]//text()')

    # Every three consecutive address parts are joined as "a-b-c".
    # Joining each group directly keeps the dashes between the parts of the
    # same listing; the previous list.insert() at a fixed index misplaced
    # the dashes once the working list grew longer than that index.
    specific_address = tree.xpath(comm + '/p[2]//text()')
    new_specific_address = [
        '-'.join(specific_address[3 * k:3 * k + 3])
        for k in range(len(specific_address) // 3)
    ]

    name = tree.xpath(extra + '/span[1]/text()')
    grade = tree.xpath(extra + '/span[2]/text()')
    website = tree.xpath(extra + '/span[3]/text()')
    urls = tree.xpath('//div[@class="property"]/a[1]/@href')

    print(len(build_times))
    print(len(titles))

    # zip() stops at the shortest field list, so a page with a missing
    # field no longer raises IndexError part-way through, and the last
    # listing of the page is no longer skipped.
    columns = zip(titles, values, layout, mi, location, high, build_times,
                  address, new_specific_address, name, grade, website, urls)
    for fields in columns:
        new_data = list(fields)
        new_data[1] = new_data[1] + '万'
        writer.writerow(new_data)
async def main():
    """Scrape listing pages 1 through 8 concurrently.

    Builds the page addresses, schedules one ``get_page`` task per address
    and waits until every task has finished.
    """
    template = 'https://bj.58.com/ershoufang/p%d/'
    page_urls = [template % page_number for page_number in range(1, 9)]
    pending = [asyncio.create_task(get_page(page_url)) for page_url in page_urls]
    await asyncio.wait(pending)
# Print DataFrames without truncating columns or rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Plot font settings; presumably SimHei is chosen so the Chinese labels
# used by the plot functions render - confirm on the target machine.
plt.rcParams.update({"font.sans-serif": [u"SimHei"], "axes.unicode_minus": False})

# NOTE(review): this reads room01.csv while the __main__ block writes
# room02.csv - confirm that analysing a different file is intended.
data = pd.read_csv('room01.csv', encoding='gbk')
print(data.shape)
print(data.dtypes)
print(data.columns)

# Drop rows whose build-time text has nothing at offsets 29..32.
# NOTE(review): the fixed offsets (29, 33, -26) assume the scraped cells
# carry fixed-width surrounding text - verify against the CSV.
index01 = data.loc[data["建造时间"].str.slice(29, 33) == ''].index
data.drop(index=index01, inplace=True)

# Numeric columns derived from the raw text columns.
data['mi'] = data["房子面积"].str.slice(29, -26).astype('float64')
data['price'] = data["¥价格"].str.slice(None, -1).astype('float64')
data['year'] = data["建造时间"].str.slice(29, 33).astype('int')
# Age in months, counted from year 2021 plus a fixed 6 months.
data['months'] = (2021 - data['year']) * 12 + 6

# Keep only ratings whose fourth character is '分', then strip that
# trailing character to get the numeric grade.
index02 = data.loc[data['评分'].str.slice(3, 4) != '分'].index
data.drop(index=index02, inplace=True)
data['grade'] = data['评分'].str.slice(None, -1).astype('float64')
def plot01():
    """Show a bar chart of the number of listings per price range.

    Reads the ``price`` column of the module-level ``data`` frame, groups
    the prices into ranges 100 wide between the observed minimum and
    maximum, prints the intermediate counts and shares, and displays the
    chart with each bar labelled by its percentage share.
    """
    prices = data['price']
    lowest, highest = prices.min(), prices.max()
    # Use only the round edges that lie strictly inside the observed range;
    # pd.cut requires strictly increasing edges, which the fixed list
    # 100..1000 violates when the data does not span that whole range.
    inner_edges = [edge for edge in range(100, 1001, 100) if lowest < edge < highest]
    # include_lowest=True closes the first interval on the left. Without it
    # the cheapest listing belongs to no interval and is left out.
    price_cut = pd.cut(prices, bins=[lowest, *inner_edges, highest],
                       include_lowest=True)
    # value_counts() orders by count; sort_index() restores price order so
    # the bars run from the cheapest range to the most expensive one.
    price_count = price_cut.value_counts().sort_index()
    shares = price_count / price_count.sum()
    for share in shares:
        print(share)
    print(price_count.index)
    X = np.arange(len(price_count))
    print(X)
    Y = price_count
    print(Y)
    plt.figure(figsize=(8, 6))
    plt.bar(X, Y, color='b', alpha=0.5)
    plt.title("二手房价格分布图")
    plt.xlabel("价格区间")
    plt.ylabel("数量")
    plt.xticks(X, price_count.index, rotation=30)
    plt.ylim([0, price_count.max() + 100])
    percents = [str(round(share * 100, 2)) + '%' for share in shares]
    # Write each bar's share just above the bar.
    for x, y, z in zip(X, Y, percents):
        plt.text(x - 0.3, y + 5, z)
    plt.show()
def plot02():
    """Show a line chart of the mean area per price range.

    Reads the ``price`` and ``mi`` columns of the module-level ``data``
    frame. Prices are grouped into left-closed ranges bounded by the
    observed minimum, the round values 100..1000 and the observed maximum;
    the mean area of each range is truncated to an integer and plotted.
    """
    prices = data['price']
    lowest, highest = prices.min(), prices.max()
    edges = [lowest, *range(100, 1001, 100), highest]
    last = len(edges) - 2

    means = []
    x = []
    for position, (low, high) in enumerate(zip(edges, edges[1:])):
        # The final range is closed on the right so the most expensive
        # listing is counted; a strict "< max" test left it out.
        closed = position == last
        upper = (prices <= high) if closed else (prices < high)
        mean_area = data.loc[(prices >= low) & upper, 'mi'].mean()
        # An empty range has a NaN mean, and int(NaN) raises ValueError.
        # Keep NaN so the chart shows a gap there instead of crashing.
        means.append(int(mean_area) if pd.notna(mean_area) else np.nan)
        x.append(f"[{low},{high}" + ("]" if closed else ")"))

    X = np.arange(len(x))
    Y = means
    plt.figure(figsize=(8, 10))
    plt.plot(X, Y, '-..', color='b')
    plt.title('房子价格和面积之间的关系')
    plt.xlabel('价格区间')
    plt.ylabel('平均面积')
    plt.xticks(X, x, rotation=30)
    ax = plt.gca()
    for i, j in zip(X, Y):
        if pd.isna(j):
            # Nothing to annotate for an empty range.
            continue
        ax.text(i + 0.2, j + 4, j, bbox=dict(facecolor='red', alpha=0.3))
    plt.grid(True)
    plt.show()
def plot03():
    """Show a scatter plot of area against age in months.

    Reads the ``mi``, ``months`` and ``price`` columns of the module-level
    ``data`` frame; each marker's size is its price divided by 10.
    """
    marker_sizes = data['price'] / 10
    plt.figure(figsize=(10, 8))
    plt.scatter(data['mi'], data['months'], s=marker_sizes, c='r')
    plt.xlabel("面积")
    plt.ylabel("使用月份")
    plt.show()
def plot04():
    """Show a scatter plot of area against rating.

    Reads the ``mi``, ``grade`` and ``price`` columns of the module-level
    ``data`` frame; each marker's size is its price divided by 10.
    """
    marker_sizes = data['price'] / 10
    plt.figure(figsize=(10, 8))
    plt.scatter(data['mi'], data['grade'], s=marker_sizes, c='r')
    plt.xlabel("面积")
    plt.ylabel("评分")
    plt.show()
if __name__ == '__main__':
    # Column titles of the output CSV, in the order get_page() writes the
    # fields of each row.
    head = ['房子描述', '¥价格', '房子构造', '房子面积', '房子朝向', '楼房层数', '建造时间', '地址', '详细地址', '户主姓名', '评分', '发布公司', '网站地址']
    with open('room02.csv', 'a', encoding='gbk', newline='') as f:
        # get_page() looks this writer up as a module-level name.
        writer = csv.writer(f)
        # An append-mode stream starts at the end of the file, so position
        # 0 means the file is new or empty. Writing the header only then
        # keeps repeated runs from adding header rows between data rows.
        if f.tell() == 0:
            writer.writerow(head)
        # asyncio.run() creates and closes the event loop itself, replacing
        # the deprecated get_event_loop()/run_until_complete() pairing.
        asyncio.run(main())
    # The with-statement has closed the file; no explicit close() needed.
    plot04()