import requests
from bs4 import BeautifulSoup
import re
import xlwt
class DouBanBook:
    """Scrape the Douban Book Top250 listing and export it to an .xls workbook.

    Attributes:
        pageIndex: current paging offset (0, 25, ..., 225) used in the URL.
        headers:   HTTP headers (User-Agent) sent with every request.
        book_list: accumulated rows, one 9-column list per book.
    """

    def __init__(self, pageIndex):
        # BUG FIX: the original ignored the pageIndex argument and always
        # hard-coded 0; honor the caller's starting offset instead.
        self.pageIndex = pageIndex
        # BUG FIX: 'Mozila' -> 'Mozilla' so the User-Agent looks legitimate.
        self.user_agent = 'Mozilla/5.0'
        self.headers = {'User-agent': self.user_agent}
        self.book_list = []

    def getPage(self):
        """Fetch one listing page; return its HTML text, or '' on any HTTP failure."""
        try:
            # BUG FIX: the query key 'start=' was missing, so every request
            # silently fetched the first page regardless of pageIndex.
            url = 'https://book.douban.com/top250?start=' + str(self.pageIndex)
            # timeout added so a stalled connection cannot hang the crawl forever
            r = requests.get(url, headers=self.headers, timeout=10)
            r.raise_for_status()
            r.encoding = 'utf-8'
            return r.text
        except requests.RequestException:
            # Narrowed from a bare except: only network/HTTP errors are
            # expected here; anything else should surface as a real bug.
            return ''

    def getBooks(self):
        """Parse the current page and append one row per book to self.book_list."""
        pageCode = self.getPage()
        soup = BeautifulSoup(pageCode, 'html.parser')
        for book in soup.find_all('td', {'valign': 'top'}):
            # Only the right-hand cell (div class 'pl2') carries title/link;
            # the cover-image cell is skipped.
            if book.find('div', {'class': re.compile(r'pl[2]{1}')}) is None:
                continue
            bookUrl = book.a['href'].strip()
            title = book.a['title'].strip()
            # BUG FIX: split('/') — not strip('/') — the info line reads
            # 'author / [translator /] press / date / price'. With strip(),
            # detail[i] indexed single CHARACTERS of the string.
            detail = book.find('p', {'class': 'pl'}).get_text().split('/')
            author = detail[0].strip()
            if len(detail) == 5:
                # Translated work: a translator field is present.
                translator = detail[1].strip()
                press = detail[2].strip()
                date = detail[3].strip()
                price = detail[4].strip()
            else:
                translator = ''
                press = detail[1].strip()
                date = detail[2].strip()
                price = detail[3].strip()
            score = book.find('span', {'class': 'rating_nums'}).get_text().strip()
            scorenum = book.find('span', {'class': 'pl'}).get_text().strip('(').strip(')').strip()
            # Robustness: not every book carries a one-line quote ('inq' span);
            # the original crashed with AttributeError on those entries.
            inq = book.find('span', {'class': 'inq'})
            word = inq.get_text().strip() if inq is not None else ''
            self.book_list.append([title, author, word, press, date, price, score, scorenum, bookUrl])

    def load(self, datalist):
        """Write datalist (rows of 9 columns) to '豆瓣图书uuTop250.xls'."""
        file = xlwt.Workbook()
        sheet = file.add_sheet('豆瓣图书tuTop250', cell_overwrite_ok=True)
        col = (u'图书名字', u'作者', u'概述', u'出版社', u'发行日期', u'价格', u'评分', u'评价标准', u'图书详细链接')
        for i in range(0, 9):
            sheet.write(0, i, col[i])
        # BUG FIX: iterate over what was actually scraped instead of a
        # hard-coded 250 rows (IndexError whenever fewer rows were collected).
        for i, data in enumerate(datalist):
            for j in range(0, 9):
                sheet.write(i + 1, j, data[j])
        file.save('豆瓣图书uuTop250.xls')

    def start(self):
        """Crawl all ten pages (offsets 0..225, step 25), then save the workbook."""
        print('*****开始抓取*****')
        while self.pageIndex <= 225:
            # // keeps the page number an int for the %d format
            print('正在抓取第%d页' % (self.pageIndex // 25 + 1))
            self.getBooks()
            self.pageIndex += 25
        # BUG FIX: the original never called load(), so the scraped data
        # was thrown away and no .xls file was ever produced.
        self.load(self.book_list)
if __name__ == '__main__':
    # Guard the crawl behind __main__ so importing this module does not
    # immediately fire 10 HTTP requests and write an .xls file.
    book = DouBanBook(0)
    book.start()
# 爬取豆瓣图书Top250并存入xls (scrape Douban Book Top250 and save to .xls)
# Adapted from: blog.csdn.net/u010356229/article/details/81005419