# 爬取有问题 (the crawl has problems)

'''
http://www.bdwork.com/forum.php?mod=forumdisplay&fid=2&needtype=1&searchtype=14
'''
import random
import time
from urllib.parse import urljoin

import bs4
import requests

class bdwork:
    """Crawl thread titles from the bdwork.com forum listing.

    Instantiating the class runs the whole pipeline: build the listing-page
    URLs, collect the thread links found on those pages, then visit every
    thread and print its title.

    Attributes:
        headers: HTTP headers sent with every request (browser User-Agent).
        url_list: Listing-page URLs produced by ``spider_url``.
        final_href_list: Absolute thread URLs produced by ``href_spider``.
    """

    # Root against which relative thread links are resolved.
    BASE_URL = 'http://www.bdwork.com/'
    # Listing URL without the trailing page number.
    LIST_URL = ('http://www.bdwork.com/forum.php?mod=forumdisplay&searchtype=14'
                '&needtype=1&searchindustry=&searcharea=&searchkey=&fid=2&page=')
    # Seconds to wait for a response; without it a stalled server would
    # block the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        }
        self.url_list = []
        self.final_href_list = []
        self.spider_url()
        self.href_spider()
        self.spider()

    @classmethod
    def _absolute_url(cls, href):
        """Resolve *href* against ``BASE_URL``, leaving absolute URLs intact."""
        return urljoin(cls.BASE_URL, href)

    def _fetch_soup(self, url):
        """Download *url* and return its HTML parsed with ``html.parser``.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when no response arrives within ``REQUEST_TIMEOUT`` seconds.
        """
        response = requests.get(url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        return bs4.BeautifulSoup(response.text, 'html.parser')

    def spider_url(self, first_page=1, last_page=2):
        """Append the listing-page URLs for the given page range.

        Args:
            first_page: First page number to include.
            last_page: Last page number to include (inclusive).

        Returns:
            ``self.url_list`` after the new URLs were appended.
        """
        self.url_list.extend(
            self.LIST_URL + str(page)
            for page in range(first_page, last_page + 1)
        )
        print(self.url_list)
        return self.url_list

    def href_spider(self):
        """Collect the thread links found on every listing page.

        Returns:
            ``self.final_href_list`` holding absolute thread URLs.
        """
        for url in self.url_list:
            soup = self._fetch_soup(url)
            for anchor in soup.select('div[class="subject"] a[class="s xst"]'):
                href = anchor.get('href')
                if not href:
                    # An anchor without href would otherwise produce a
                    # TypeError (str + None) or a useless URL.
                    continue
                self.final_href_list.append(self._absolute_url(href))
        print(self.final_href_list)
        return self.final_href_list

    def spider(self):
        """Visit every collected thread, print its title and return them all.

        Returns:
            One entry per visited URL: the title text, or ``None`` when the
            title element was not found on the page.
        """
        titles = []
        for url in self.final_href_list:
            soup = self._fetch_soup(url)
            # NOTE(review): selector kept as-is; if it never matches, verify
            # which element carries id="thread_subject" on the live page.
            node = soup.select_one('h1[class="ts"] a[id="thread_subject"]')
            # Guard against a missing element instead of indexing blindly.
            title = node.get_text(strip=True) if node is not None else None
            titles.append(title)
            print(title)
            # Random pause between requests to avoid hammering the server.
            time.sleep(random.randint(2, 10))
        return titles
# Run the crawl: the constructor itself calls spider_url(), href_spider() and
# spider(), so simply instantiating the class performs all network requests.
bdwork()

# 猜你喜欢

# 转载自blog.csdn.net/weixin_42518256/article/details/87805477
# 今日推荐