爬取豆瓣古典文学(数据库存储)

代码如下(Python 2 脚本,依赖 requests 与 lxml,抓取结果写入 SQLite 数据库 BookInformation.db 的 BookInfo 表):

  1 # coding:utf-8
  2 import cPickle
  3 import random
  4 import requests
  5 from lxml import etree
  6 import time
  7 import re
  8 import sys
  9 import codecs
 10 import sqlite3
 11 
 12 class Spider:
 13     def __init__(self):
 14         self.con = sqlite3.connect(r'BookInformation.db')
 15         self.cur = self.con.cursor()
 16         self.home = 'https://book.douban.com/tag/%E5%8F%A4%E5%85%B8%E6%96%87%E5%AD%A6'
 17         self.Referer = 'https://book.douban.com/'
 18         self.user_agent_list = []
 19         self.books_list = []
 20         with open('user_agent.txt', 'rb') as f:
 21             self.user_agent_list = cPickle.load(f)
 22 
 23     def GetHeaders(self):
 24         UserAgent = random.choice(self.user_agent_list)
 25         headers = {'Referer': self.Referer, 'User-Agent': UserAgent}
 26         return headers
 27 
 28     def SaveBook(self,info):
 29         sql = 'INSERT INTO BookInfo VALUES(?,?,?,?,?)'
 30         info_list = (info["Name"],info["Author"],info["Rating"],info["ContentIntro"],info["AuthorIntro"])
 31         self.cur.execute(sql, info_list)
 32         self.con.commit()
 33 
 34     def Crawl(self):
 35         html = requests.get(self.home,headers=self.GetHeaders()).text
 36         html_tree = etree.HTML(html)
 37         booksList = html_tree.xpath('/html/body/div[3]/div[1]/div/div[1]/div/ul/li')
 38         num = 0
 39         for book in booksList:
 40             time.sleep(1)
 41             bookUrl = book.xpath('div[2]/h2/a')[0].get('href')
 42             pageHtml = requests.get(bookUrl,headers=self.GetHeaders()).text
 43             page_tree = etree.HTML(pageHtml)
 44             book_info = self.GetPage(page_tree)
 45             print book_info['Name']
 46             self.SaveBook(book_info)
 47             # self.books_list.append(book_info)
 48             # f = codecs.open('text.txt','a',encoding='utf-8')
 49             # f.write(book_info['AuthorIntro'])
 50             # f.close()
 51             # print book_info['AuthorIntro']
 52             num = num+1
 53             if num==5:
 54                 break
 55 
 56 
 57     def GetPage(self, page_tree):
 58         book_info = {}
 59         try:
 60             Name = self.GetName(page_tree)
 61             book_info['Name'] = Name
 62         except:
 63             book_info['Name'] = ''
 64         try:
 65             Author = self.GetAuthor(page_tree)
 66             book_info['Author'] = Author
 67         except:
 68             book_info['Author'] = ''
 69         try:
 70             Rating = self.GetRating(page_tree)
 71             book_info['Rating'] = Rating
 72         except:
 73             book_info['Rating'] = ''
 74         try:
 75             ContentIntro = self.GetContentIntro(page_tree)
 76             book_info['ContentIntro'] = ContentIntro
 77         except:
 78             book_info['ContentIntro'] = ''
 79         try:
 80             AuthorIntro = self.GetAuthorIntro(page_tree)
 81             book_info['AuthorIntro'] = AuthorIntro
 82         except:
 83             book_info['AuthorIntro'] = ''
 84 
 85 
 86         return book_info
 87 
 88     def GetName(self, page_tree):
 89         return page_tree.xpath('/html/body/div[3]/h1/span')[0].text
 90 
 91     def GetAuthor(self,page_tree):
 92         author_list = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/span[1]/a')
 93         result = ''
 94         if len(author_list) is not 0:
 95             list = []
 96             for author in author_list:
 97                 list.append(author.text.strip())
 98             result = '/'.join(list)
 99         else:
100             result = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[1]/div[2]/a')[0].text.strip()
101         return re.sub(r'\s+',' ',result)
102 
103 
104     def GetRating(self, page_tree):
105         return page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[1]/div[1]/div[2]/div/div[2]/strong')[0].text.strip()
106 
107     def GetContentIntro(self, page_tree):
108         para_div = page_tree.xpath('//*[@id="link-report"]//div[@class="intro"]')
109         result = ''
110         if len(para_div) is not 0:
111             para_para = para_div[len(para_div)-1].xpath('p')
112             for para in para_para:
113                 result = result+'\t'+para.text+'\n'
114         return result
115 
116     def GetAuthorIntro(self, page_tree):
117         para_div = page_tree.xpath('/html/body/div[3]/div[2]/div/div[1]/div[3]/div[@class="indent "]//div[@class="intro"]')
118         result = ''
119         if len(para_div) is not 0:
120             para_para = para_div[len(para_div) - 1].xpath('p')
121             for para in para_para:
122                 result = result + '\t' + para.text + '\n'
123         return result
124 
125     # def GetCatalogue(self, page_tree):
126     #     pass
127     #
128     # def GetTag(self, page_tree):
129     #     pass
130     #
131     # def GetShortCommentary(self, page_tree):
132     #     pass
133 
if __name__ == '__main__':
    # Script entry point: constructing the spider opens BookInformation.db
    # and loads user_agent.txt; Crawl() then fetches and stores the books.
    s = Spider()
    s.Crawl()

猜你喜欢

转载自www.cnblogs.com/DOLFAMINGO/p/9210568.html
今日推荐