selenium+PhantomJS爬取(豆瓣读书)

	获取关于Python的全部书籍信息;
	经代码测试,使用 requests 携带 'User-Agent' 及 'data' 参数发起请求,均无法完整获取相关信息:返回的部分数据为空,导致获取过程中报错,无法获取全部数据,初步判定豆瓣读书的反爬机制较为严格;改用 selenium 模拟浏览器发起请求后,可以正常获取数据。
#导入需要的模块
from selenium import webdriver
import time
from lxml import etree
import pymysql
import re

#创建一个函数
def my_browers(url, page):
    """Load ``url`` in a PhantomJS browser and hand the page source to the parser.

    Args:
        url: Address of the page to fetch.
        page: Result offset of the page; not used here, kept so existing
            callers keep working.
    """
    # NOTE(review): webdriver.PhantomJS only exists in Selenium 3.x and
    # earlier; confirm the pinned Selenium version before upgrading.
    browers = webdriver.PhantomJS(executable_path=r'd:\Desktop\pythonjs\phantomjs-2.1.1-windows\bin\phantomjs.exe')
    try:
        # Issue the request through the browser.
        browers.get(url)

        # Pause two seconds to keep the request rate low.
        time.sleep(2)

        # Grab the page source as the browser currently holds it.
        html = browers.page_source
    finally:
        # Always shut the browser down; otherwise every call leaves a
        # PhantomJS process running.
        browers.quit()

    # Parse only after the browser has been released.
    parse_html(html)
    
# 解析页面信息
def _first_or_empty(results):
    """Return the first XPath result, or '' when nothing matched."""
    return results[0] if results else ''


def _clean_book_name(book_name):
    """Remove quotes and trailing backslashes from a book title.

    Both are unsafe when the title is placed inside a quoted SQL literal:
    a quote ends the literal early and a trailing backslash escapes the
    closing quote.
    """
    book_name = book_name.replace('"', '').replace("'", '')
    # Only trailing backslashes are a problem, so strip just those instead
    # of chopping the last character whenever a backslash appears anywhere.
    return book_name.rstrip('\\')


def parse_html(html):
    """Extract every book entry from a result page and store each one.

    For each book a dict with the keys ``pic``, ``book_name``, ``book_url``,
    ``score_book`` and ``book_detail`` is built (missing fields become ''),
    printed, and passed to ``insert_mysql``.

    Args:
        html: Page source of a search-result page.
    """
    tree = etree.HTML(html)

    # One element per book entry on the page.
    books = tree.xpath('//div[contains(@class,"sc-bZQynM")]')

    for book in books:
        # The queries below start with ".//" so that they are evaluated
        # relative to the current book; a leading "//" would search the whole
        # document and return the first book's data for every entry.
        book_dict = {}

        # Cover image.
        book_dict['pic'] = _first_or_empty(book.xpath('.//img/@src'))

        # Title.
        book_name = book.xpath('.//div[@class="title"]/a/text()')
        book_dict['book_name'] = _clean_book_name(book_name[0]) if book_name else ''

        # Link to the detail page.
        book_dict['book_url'] = _first_or_empty(
            book.xpath('.//div[@class="title"]/a/@href'))

        # Rating.
        book_dict['score_book'] = _first_or_empty(
            book.xpath('.//span[@class="rating_nums"]/text()'))

        # Publisher / author details; single quotes are removed for the same
        # SQL-literal reason as in the title.
        book_detail = _first_or_empty(
            book.xpath('.//div[@class="meta abstract"]/text()'))
        book_dict['book_detail'] = book_detail.replace("'", '')

        print(book_dict)

        # Persist the record.
        insert_mysql(book_dict)

# 插入数据库
def insert_mysql(book_dict):
    """Insert one book record into the ``python_book`` table.

    Args:
        book_dict: Mapping with the keys ``pic``, ``book_name``,
            ``book_url``, ``score_book`` and ``book_detail``.
    """
    # Keyword arguments: recent PyMySQL releases no longer accept the
    # connection settings positionally.
    conn = pymysql.connect(host='localhost', user='root', password='root',
                           database='test', charset='utf8')
    try:
        # Placeholders let the driver escape the values, so quotes or
        # backslashes in scraped text cannot break or inject into the SQL.
        sql = ("insert into python_book (pic,book_name,book_url,score,book_detail) "
               "VALUES (%s,%s,%s,%s,%s)")
        params = (
            book_dict['pic'],
            book_dict['book_name'],
            book_dict['book_url'],
            book_dict['score_book'],
            book_dict['book_detail'],
        )

        # Execute and commit.
        with conn.cursor() as cursor:
            cursor.execute(sql, params)
        conn.commit()
    finally:
        # Release the connection even if the insert fails.
        conn.close()


if __name__ == '__main__':
    # Walk result pages 1..199; the ``start`` offset in the URL advances by
    # 15 for each page.
    for page_no in range(1, 200):
        print('=================下载第{}页========================'.format(page_no))
        offset = (page_no - 1) * 15
        search_url = 'https://book.douban.com/subject_search?search_text=python&cat=1001&start={}'.format(offset)
        my_browers(search_url, offset)

猜你喜欢

转载自blog.csdn.net/douyaoxin_126/article/details/86100205