# 爬取豆瓣图书Top250并存入xls

import requests
from bs4 import BeautifulSoup
import re
import xlwt

class DouBanBook:
    """Scrape the Douban Books Top 250 list and export it to an .xls file.

    The list is paginated 25 books per page and addressed by a ``start``
    offset (0, 25, ..., 225). Scraped rows accumulate in ``book_list``;
    each row is
    ``[title, author, quote, press, date, price, score, rating_count, url]``,
    matching the nine columns written by :meth:`load`.
    """

    BASE_URL = 'https://book.douban.com/top250'
    PAGE_SIZE = 25      # books per result page
    LAST_OFFSET = 225   # ``start`` offset of the final (10th) page
    MAX_ROWS = 250      # maximum number of data rows written by load()

    # Matches the <div class="pl2"> that wraps the title link of a book cell.
    _TITLE_DIV_CLASS = re.compile(r'pl2')

    def __init__(self, pageIndex=0):
        """Create a scraper starting at offset ``pageIndex`` (default 0)."""
        # Honour the caller-supplied offset (it used to be discarded).
        self.pageIndex = pageIndex
        self.user_agent = 'Mozilla/5.0'
        self.headers = {'User-agent': self.user_agent}
        self.book_list = []

    def _page_url(self):
        """Return the list URL for the current ``pageIndex`` offset."""
        # The site selects the page through the ``start`` query parameter;
        # without the key every request returns the first page.
        return '%s?start=%d' % (self.BASE_URL, self.pageIndex)

    def getPage(self):
        """Download the current page and return its HTML.

        Returns:
            The page text, or ``''`` if the request failed (network error,
            timeout or non-2xx status). Failure is deliberately non-fatal
            so that one bad page does not abort the whole crawl.
        """
        try:
            r = requests.get(self._page_url(), headers=self.headers,
                             timeout=10)
            r.raise_for_status()
        except requests.RequestException:
            return ''
        r.encoding = 'utf-8'
        return r.text

    @staticmethod
    def _text(tag):
        """Return the stripped text of ``tag``, or ``''`` if it is None."""
        return tag.get_text().strip() if tag is not None else ''

    @staticmethod
    def _split_detail(detail):
        """Split a publication line into its fields.

        The line has the form ``author / [translator /] press / date /
        price``. Fields are taken from the right-hand end so that an
        optional (or multi-part) translator section does not shift the
        press/date/price columns.

        Returns:
            A tuple ``(author, translator, press, date, price)``; missing
            fields are returned as ``''``.
        """
        parts = [part.strip() for part in detail.split('/')]
        # With fewer than four fields we cannot tell which one is missing;
        # pad on the right so the present ones keep their left-to-right slot.
        parts += [''] * (4 - len(parts))
        author = parts[0]
        translator = ' / '.join(parts[1:-3])
        press, date, price = parts[-3:]
        return author, translator, press, date, price

    def getBooks(self):
        """Parse the current page and append one row per book to book_list."""
        pageCode = self.getPage()
        if not pageCode:
            # Download failed; nothing to parse on this page.
            return
        soup = BeautifulSoup(pageCode, 'html.parser')
        for book in soup.find_all('td', {'valign': 'top'}):
            # Each table row also has a cover-image cell with valign="top";
            # only the cell containing div.pl2 carries the book details.
            if book.find('div', {'class': self._TITLE_DIV_CLASS}) is None:
                continue
            link = book.a
            if link is None:
                continue
            bookUrl = link.get('href', '').strip()
            title = link.get('title', '').strip()
            detail = self._text(book.find('p', {'class': 'pl'}))
            # The translator is parsed but not stored: the row layout stays
            # at nine columns to match the header written by load().
            author, _translator, press, date, price = self._split_detail(detail)
            score = self._text(book.find('span', {'class': 'rating_nums'}))
            # The rating count is rendered inside parentheses.
            scorenum = self._text(
                book.find('span', {'class': 'pl'})).strip('()').strip()
            # Not every book has a one-line quote; fall back to ''.
            word = self._text(book.find('span', {'class': 'inq'}))
            self.book_list.append(
                [title, author, word, press, date, price,
                 score, scorenum, bookUrl])

    def load(self, datalist):
        """Write ``datalist`` to an .xls workbook in the working directory.

        Args:
            datalist: Sequence of rows, each holding the nine values
                described in the class docstring. At most ``MAX_ROWS``
                rows are written; a shorter list is written in full
                instead of raising ``IndexError``.
        """
        file = xlwt.Workbook()
        sheet = file.add_sheet('豆瓣图书tuTop250', cell_overwrite_ok=True)
        col = (u'图书名字',u'作者',u'概述',u'出版社',u'发行日期',u'价格',u'评分',u'评价标准',u'图书详细链接')
        for i, header in enumerate(col):
            sheet.write(0, i, header)
        for row, data in enumerate(datalist[:self.MAX_ROWS], start=1):
            # zip() limits each row to the declared columns.
            for j, (_header, value) in enumerate(zip(col, data)):
                sheet.write(row, j, value)
        file.save('豆瓣图书uuTop250.xls')

    def start(self):
        """Crawl every page from the current offset through the last one."""
        print('*****开始抓取*****')
        while self.pageIndex <= self.LAST_OFFSET:
            print('正在抓取第%d页' % (self.pageIndex // self.PAGE_SIZE + 1))
            self.getBooks()
            self.pageIndex += self.PAGE_SIZE

# Crawl all ten pages starting from offset 0, then persist the collected
# rows; without the load() call the scraped data is discarded when the
# script exits and no .xls file is ever produced.
book = DouBanBook(0)
book.start()
book.load(book.book_list)

# 猜你喜欢

# 转载自 blog.csdn.net/u010356229/article/details/81005419