Crawling Ziroom rental listings with a multiprocessing task pool

Multiprocessing can be understood as multitasking: the pool keeps several worker processes running at once, so several listing pages can be fetched and parsed in parallel instead of one after another.
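Before the crawler itself, here is a minimal sketch of how a process pool spreads work across workers (the `square` function below is made up purely for illustration):

from multiprocessing import Pool

def square(n):
    return n * n

if __name__ == '__main__':
    with Pool(4) as pool:                      # at most 4 worker processes
        results = pool.map(square, range(10))  # calls run in parallel across the workers
    print(results)                             # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

`pool.map` blocks until every task has finished; the crawler below uses `apply_async` instead, which submits tasks without waiting and relies on `pool.close()` / `pool.join()` to wait for completion.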

from bs4 import BeautifulSoup
import requests, re
from multiprocessing import Pool
import time
from pm import Proxy
from mysql import Mysql

# p = Proxy('xici.csv')
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
    "Cookie": "CURRENT_CITY_CODE=110000; __utm_source=pinzhuan; __utm_medium=baidu; ZIROOM_PHONE=9; CURRENT_CITY_NAME=%E5%8C%97%E4%BA%AC; mapType=%20; gr_user_id=a8024f14-50b6-416b-b80e-b63205162fa3; Hm_lvt_038002b56790c097b74c818a80e3a68e=1529333034; gr_session_id_8da2730aaedd7628=d19dc070-53e8-42bc-a9bf-7961c5bcc071_true; Hm_lpvt_038002b56790c097b74c818a80e3a68e=1529338968"
}

base_url = 'http://www.ziroom.com/z/nl/z3.html?qwd=%E5%9B%9E%E9%BE%99%E8%A7%82&p={}'
db = Mysql()
def get_message(home_url):
    # print(home_url)
    response = requests.get(home_url, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text
    # print(html)
    html = BeautifulSoup(html, 'lxml')
    message_list = html.select('div.room_detail_right')

    for message in message_list:
        home_name = message.select('div.room_name h2')[0].text.strip()
        # clean up the address text: drop newlines and spaces, the leading label characters and the ']' bracket
        home_site = message.select('span.ellipsis')[0].text.replace('\n', '').strip().replace(' ', '')[3:].replace(']', '')
        home_price = message.select('span.room_price')[0].text[1:]
        print(home_name, home_site, home_price)
        item = {
            'home_name': home_name,
            'home_site': home_site,
            'home_price': home_price
        }

        # every placeholder in a parameterized query must be %s (the original %d is not a valid DB-API placeholder)
        sql = "insert into py09_ziru(home_name,home_site,home_price)" \
              " VALUES(%s,%s,%s)"
        data = (item['home_name'], item['home_site'], item['home_price'])
        print(sql, data)
        db.execute(sql, data)

    # print(message_list)

def get_list(i):
    fullurl = base_url.format(i)
    # print(fullurl)

    response = requests.get(fullurl, headers=headers)
    response.encoding = response.apparent_encoding
    html = response.text
    html = BeautifulSoup(html, 'lxml')
    # print(html)
    home_list = html.select("li.clearfix")
    for home in home_list:
        # the listing href is protocol-relative, so drop the leading '//' and prepend the scheme
        home_url = home.select('a')[0]['href'][2:]
        home_url = 'http://' + home_url
        # print(home_url)
        get_message(home_url)
        # time.sleep(1)

if __name__ == '__main__':
    # the pool runs at most 4 worker processes
    pool = Pool(4)
    print(time.ctime())
    # get_list()
    # submit one crawl task per listing page
    for i in range(1, 20 + 1):
        pool.apply_async(func=get_list, args=(i,))
    pool.close()   # no more tasks will be submitted
    pool.join()    # wait for every worker to finish
    print(time.ctime())
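Note that `apply_async` discards worker results and exceptions unless you keep the `AsyncResult` objects it returns and call `.get()` on them, so a page that fails (network error, page layout change, SQL error) fails silently. A small sketch of an alternative `__main__` block that surfaces such failures, using the same `get_list` as above:

if __name__ == '__main__':
    pool = Pool(4)
    print(time.ctime())
    # keep the AsyncResult handles so errors can be inspected later
    results = [(i, pool.apply_async(func=get_list, args=(i,))) for i in range(1, 20 + 1)]
    pool.close()
    pool.join()
    for i, r in results:
        try:
            r.get()  # re-raises any exception that happened in the worker process
        except Exception as e:
            print('page', i, 'failed:', e)
    print(time.ctime())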


Reposted from www.cnblogs.com/lyxdw/p/9231560.html