'''
http://www.bdwork.com/forum.php?mod=forumdisplay&fid=2&needtype=1&searchtype=14
'''
import random
import time
from urllib.parse import urljoin

import bs4
import requests
class bdwork:
    """Crawler for a filtered forum listing on www.bdwork.com.

    Instantiating the class runs the whole crawl in three steps:
    build the listing-page URLs, collect the thread links found on those
    pages, then fetch every thread and print its title.

    Attributes:
        headers: HTTP headers sent with every request (browser User-Agent).
        url_list: Listing-page URLs produced by ``spider_url``.
        final_href_list: Absolute thread URLs produced by ``href_spider``.
    """

    # Site root; thread links found in the listing are resolved against it.
    BASE_URL = 'http://www.bdwork.com/'
    # Listing URL without the page number, which spider_url() appends.
    LIST_URL = (
        'http://www.bdwork.com/forum.php?mod=forumdisplay&searchtype=14'
        '&needtype=1&searchindustry=&searcharea=&searchkey=&fid=2&page='
    )
    # Seconds to wait for the server before giving up on a request, so a
    # stalled connection cannot hang the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        }
        self.url_list = []
        self.final_href_list = []
        # The constructor drives the complete crawl.
        self.spider_url()
        self.href_spider()
        self.spider()

    @classmethod
    def _absolute_url(cls, href):
        """Resolve a link taken from a listing page against the site root."""
        # urljoin copes with relative, root-relative and already-absolute
        # links, which plain string concatenation does not.
        return urljoin(cls.BASE_URL, href)

    def spider_url(self, first_page=1, last_page=2):
        """Append the listing-page URLs to ``url_list`` and return that list.

        Args:
            first_page: First page number to include (default 1).
            last_page: Last page number to include, inclusive (default 2).

        Returns:
            ``self.url_list``, which now also holds the generated URLs.
        """
        for page in range(first_page, last_page + 1):
            self.url_list.append(self.LIST_URL + str(page))
        print(self.url_list)
        return self.url_list

    def href_spider(self):
        """Collect the thread links found on every listing page.

        Returns:
            ``self.final_href_list``, extended with absolute thread URLs.

        Raises:
            requests.RequestException: If a listing page cannot be fetched,
                including when the request times out.
        """
        for url in self.url_list:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            soup = bs4.BeautifulSoup(response.text, 'html.parser')
            for anchor in soup.select('div[class="subject"] a[class="s xst"]'):
                href = anchor.get('href')
                if not href:
                    # An anchor without an href cannot be followed.
                    continue
                self.final_href_list.append(self._absolute_url(href))
        print(self.final_href_list)
        return self.final_href_list

    def spider(self):
        """Fetch every collected thread page and print its title.

        A page on which the title element is missing is reported instead of
        raising, so one unexpected page does not abort the crawl.

        Raises:
            requests.RequestException: If a thread page cannot be fetched,
                including when the request times out.
        """
        for url in self.final_href_list:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            soup = bs4.BeautifulSoup(response.text, 'html.parser')
            title_tag = soup.select_one('h1[class="ts"] a[id="thread_subject"]')
            # Random pause between requests to avoid hammering the server.
            time.sleep(random.randint(2, 10))
            if title_tag is None:
                print('no title found: ' + url)
            else:
                print(title_tag.get_text())
# Instantiating the class starts the crawl: __init__ calls spider_url(),
# href_spider() and spider() in turn.
bdwork()
# Author's note: the crawl has problems.
# You may also like
# Reposted from blog.csdn.net/weixin_42518256/article/details/87805477
# Today's recommendations
# Weekly ranking