Python3 爬虫:使用 selenium + PhantomJS 抓取豆瓣图书搜索结果的应用例子

 
 
import requests,ssl
import os,time,json
from selenium import webdriver,common
from lxml import etree
# Output directory for rendered-page screenshots (debug aid for spider()).
root_dir='douban/img'
# makedirs creates the missing parent 'douban' directory as well (os.mkdir
# would raise FileNotFoundError), and exist_ok avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(root_dir, exist_ok=True)

# One headless browser session shared by every spider() call.
# NOTE(review): PhantomJS support was deprecated and later removed from
# Selenium; on current Selenium releases this call needs to be replaced by a
# headless Chrome/Firefox driver -- confirm against the installed version.
driver=webdriver.PhantomJS()
def spider(page):
    """Render one page of Douban book search results for "python" and parse it.

    The page is loaded in the shared module-level ``driver`` browser rather
    than fetched with ``requests``: according to the original author's notes,
    the plain HTTP response carries the result data in an encrypted form that
    is reversible in principle but impractical to decode, so the rendered DOM
    is scraped instead.

    Args:
        page: Zero-based page index. The ``start`` query parameter is
            ``page * 15``.

    Returns:
        None. The rendered HTML is handed to ``content_parse``.
    """
    base_url='https://book.douban.com/subject_search?search_text=python&cat=1001&start=%s'%(page*15)

    driver.get(base_url)
    # Fixed pause, presumably to let the page's scripts finish rendering the
    # result list before the DOM is read.
    time.sleep(3)
    # Debug aid: driver.save_screenshot(root_dir + "/%s.png" % page) captures
    # the rendered page if the parsed output looks wrong.
    content_parse(driver.page_source)
# (record key, XPath relative to one result item, take the node's .text?)
# Order matters: it is both the print order and the key order of each record.
_BOOK_FIELDS = (
    ('book_name', './/div[@class="title"]/a', True),
    ('book_src', './a/img/@src', False),
    ('book_href', './/div[@class="title"]/a/@href', False),
    ('book_score', './/span[@class="rating_nums"]', True),
    ('book_details', './/div[@class="meta abstract"]', True),
)


def _load_saved_records(path):
    """Return the list of records already stored at *path*, or [] if unusable."""
    try:
        with open(path, encoding='utf-8') as f:
            saved = json.load(f)
    except (FileNotFoundError, ValueError):
        # Missing file, or content that is not valid UTF-8 JSON: start fresh.
        return []
    return saved if isinstance(saved, list) else []


def _merge_records(existing, new):
    """Return *existing* plus *new*, replacing records that share a book_href."""
    merged = list(existing)
    position = {}
    for idx, rec in enumerate(merged):
        href = rec.get('book_href') if isinstance(rec, dict) else None
        if isinstance(href, str) and href:
            position[href] = idx
    for rec in new:
        href = rec.get('book_href')
        if isinstance(href, str) and href:
            if href in position:
                merged[position[href]] = rec
                continue
            position[href] = len(merged)
        merged.append(rec)
    return merged


# Parsing function
def content_parse(content):
    """Extract book records from a rendered search page and save them as JSON.

    Every ``div.item-root`` element yields one record with the keys
    ``book_name``, ``book_src``, ``book_href``, ``book_score`` and
    ``book_details``. A field whose XPath matches nothing is stored as
    ``None`` (previously an empty list leaked into the record). Found values
    are echoed to stdout, followed by a separator line per book.

    The records are merged into ``douban/douban_data.json``: entries already
    in the file are kept, and an entry with the same ``book_href`` is replaced
    by the fresh one. Previously each call rewrote the file from scratch, so
    scraping several pages kept only the last page. Note that this also means
    the file accumulates across separate runs of the script.

    Args:
        content: HTML source of the rendered result page.

    Returns:
        The list of record dicts parsed from *content* (this page only).
    """
    tree=etree.HTML(content)
    books=tree.xpath('//div[@class="item-root"]')
    print(len(books))

    data = []
    # Walk the result items and collect each book's fields.
    for book in books:
        data_dict = {}
        for key, path, as_text in _BOOK_FIELDS:
            matches = book.xpath(path)
            if matches:
                value = matches[0].text if as_text else matches[0]
                print(value)
            else:
                value = None
            data_dict[key] = value
        print('~~~~~~~~~~~~~~~~~~~~`')
        data.append(data_dict)

    file_name='douban/douban_data.json'
    merged = _merge_records(_load_saved_records(file_name), data)
    dj=json.dumps(merged,ensure_ascii=False)

    # Make sure the target directory exists even when this function is used
    # without the module-level directory setup having created it.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    with open(file_name,'w',encoding='utf-8') as f:
        f.write(dj)
    print(len(data), type(dj))
    return data

if __name__=='__main__':
    # Scrape the first two result pages. The finally clause shuts the browser
    # down even if a page fails, so no PhantomJS process is left running.
    try:
        for i in range(2):
            spider(i)
    finally:
        driver.quit()

 
 

/Library/Frameworks/Python.framework/Versions/3.6/bin/python3.6 /Users/apple/PycharmProjects/stage4/spider/2018_3_13/03douban.py
15
Head First Python(中文版) : Head First Python
https://img3.doubanio.com/view/subject/l/public/s27262723.jpg
https://book.douban.com/subject/10561367/
7.9
巴里(Barry.P.) / 林琪 等 / 中国电力出版社 / 2012-3-1 / 68.00元
~~~~~~~~~~~~~~~~~~~~`
Python灰帽子 : 黑客与逆向工程师的Python编程之道
https://img3.doubanio.com/view/subject/l/public/s4676930.jpg
https://book.douban.com/subject/6025284/
7.5
[美] Justin Seitz / 丁赟卿 译 / 崔孝晨 审校 / 电子工业出版社 / 2011-3 / 39.00元
~~~~~~~~~~~~~~~~~~~~`
Python网络编程基础 : 使用Python构建网络程序的综合指南
https://img3.doubanio.com/view/subject/l/public/s2604186.jpg
https://book.douban.com/subject/2152386/
7.2
John Goerzen / 莫迟 等 / 电子工业出版社 / 2007 / 68.00元
~~~~~~~~~~~~~~~~~~~~`
Python基础教程
https://img3.doubanio.com/view/subject/l/public/s4387251.jpg
https://book.douban.com/subject/4866934/
8.0
Magnus Lie Hetland / 司维 / 曾军崴 / 谭颖华 / 人民邮电出版社 / 2010-7 / 69.00元
~~~~~~~~~~~~~~~~~~~~`
Python Algorithms : Mastering Basic Algorithms in the Python Language
https://img3.doubanio.com/view/subject/l/public/s6999960.jpg
https://book.douban.com/subject/4915945/
8.9
Magnus Lie Hetland / Apress / 2010-11-24 / USD 49.99
~~~~~~~~~~~~~~~~~~~~`
父与子的编程之旅 : 与小卡特一起学Python
https://img3.doubanio.com/view/subject/l/public/s28825823.jpg
https://book.douban.com/subject/26005639/
8.5
桑德 (Warren Sande) / 桑德 (Carter Sande) / 苏金国 / 易郑超 / 人民邮电出版社 / 2014-10-1 / CNY 69.00
~~~~~~~~~~~~~~~~~~~~`
Python核心编程(第二版)
https://img3.doubanio.com/view/subject/l/public/s3140466.jpg
https://book.douban.com/subject/3112503/
7.7
[美]Wesley J. Chun(陳仲才) / CPUG / 人民邮电出版社 / 2008-06 / 89.00元
~~~~~~~~~~~~~~~~~~~~`
利用Python进行数据分析
https://img3.doubanio.com/view/subject/l/public/s27275372.jpg
https://book.douban.com/subject/25779298/
8.5
Wes McKinney / 唐学韬 / 机械工业出版社 / 2013-11-18 / 89.00
~~~~~~~~~~~~~~~~~~~~`
贝叶斯思维 : 统计建模的Python学习法
https://img3.doubanio.com/view/subject/l/public/s28023092.jpg
https://book.douban.com/subject/26340992/
7.4
[美]Allen B. Downey(艾伦·唐尼) / 许杨毅 / 人民邮电出版社 / 2015-3 / 49.00
~~~~~~~~~~~~~~~~~~~~`
Django Web开发指南 : Python Web Development with Django
https://img3.doubanio.com/view/subject/l/public/s3789820.jpg
https://book.douban.com/subject/3740086/
6.5
Jeff Forcier / Paul Bissex / 徐旭铭 / 机械工业出版社 / 2009-5 / 49.00元
~~~~~~~~~~~~~~~~~~~~`
Python源码剖析 : 深度探索动态语言核心技术
https://img3.doubanio.com/view/subject/l/public/s3435132.jpg
https://book.douban.com/subject/3117898/
8.7
陈儒 / 电子工业出版社 / 2008-6 / 69.80元
~~~~~~~~~~~~~~~~~~~~`
Python学习手册 : (第3版)
https://img1.doubanio.com/view/subject/l/public/s3952568.jpg
https://book.douban.com/subject/3948354/
8.2
Mark Lutz / 侯靖 / 机械工业出版社 / 2009-8 / 89.00元
~~~~~~~~~~~~~~~~~~~~`
Python编程:从入门到实践 : 从入门到实践
https://img3.doubanio.com/view/subject/l/public/s28891775.jpg
https://book.douban.com/subject/26829016/
9.1
[美]埃里克·马瑟斯 / 袁国忠 / 人民邮电出版社 / 2016-7-1 / CNY 89.00
~~~~~~~~~~~~~~~~~~~~`
Python入门经典 : 以解决计算问题为导向的Python编程实践
https://img3.doubanio.com/view/subject/l/public/s11430346.jpg
https://book.douban.com/subject/11610789/
8.2
(美)William F. Punch/Richard Enbody / 张敏 / 机械工业出版社 / 2012-8-1 / 79.00元
~~~~~~~~~~~~~~~~~~~~`
学习Python : Learning Python, Second Edition
https://img3.doubanio.com/view/subject/l/public/s1436455.jpg
https://book.douban.com/subject/1426816/
8.2
Mark Lutz / David Ascher / 东南大学出版社 / 2005年6月 / 68.00元
~~~~~~~~~~~~~~~~~~~~`
15 <class 'str'>
15
python绝技:运用python成为顶级黑客 : 运用Python成为顶级黑客
https://img1.doubanio.com/view/subject/l/public/s28385338.jpg
https://book.douban.com/subject/26702570/
7.6
[美] TJ O'Connor / 崔孝晨 / 武晓音 / 电子工业出版社 / 2016-1 / 79.00元
~~~~~~~~~~~~~~~~~~~~`
Expert Python Programming : Best practices for designing, coding, and distributing your Pyth
https://img3.doubanio.com/view/subject/l/public/s29690103.jpg
https://book.douban.com/subject/3285148/
8.3
Tarek Ziadé / Packt Publishing / 2008-9-26 / USD 44.99
~~~~~~~~~~~~~~~~~~~~`
"笨办法"学Python
https://img1.doubanio.com/view/subject/l/public/s27836847.jpg
https://book.douban.com/subject/26264642/
7.9
肖 (Zed A.Shaw) / 王巍巍 / 人民邮电出版社 / 2014-11-1 / CNY 49.00
~~~~~~~~~~~~~~~~~~~~`
Effective Python : 编写高质量Python代码的59个有效方法
https://img3.doubanio.com/view/subject/l/public/s28384052.jpg
https://book.douban.com/subject/26709315/
8.7
布雷特·斯拉特金(Brett Slatkin) / 爱飞翔 / 机械工业出版社 / 2016-1 / 59
~~~~~~~~~~~~~~~~~~~~`
Python语言及其应用 : Python语言及其应用
https://img3.doubanio.com/view/subject/l/public/s28352586.jpg
https://book.douban.com/subject/26675127/
8.4
[美] Bill Lubanovic / 丁嘉瑞 / 梁 杰 / 禹常隆 / 人民邮电出版社 / 2016-1 / 79.00元
~~~~~~~~~~~~~~~~~~~~`
A Byte of Python
https://img3.doubanio.com/view/subject/l/public/s4612135.jpg
https://book.douban.com/subject/5948760/
8.7
Swaroop C H / Lulu Marketplace / 2008-10-1 / USD 27.98
~~~~~~~~~~~~~~~~~~~~`
Python Cookbook : (第2版)中文版
https://img3.doubanio.com/view/subject/l/public/s4357883.jpg
https://book.douban.com/subject/4828875/
8.6
Alex Martelli / Anna Ravenscroft / David Ascher / 高铁军 / 人民邮电出版社 / 2010-5-1 / 99.00元
~~~~~~~~~~~~~~~~~~~~`
Dive Into Python
https://img3.doubanio.com/view/subject/l/public/s29694522.jpg
https://book.douban.com/subject/1440658/
8.2
Mark Pilgrim / Apress / 2004-11-5 / GBP 31.49
~~~~~~~~~~~~~~~~~~~~`
Python学习手册(第4版)
https://img3.doubanio.com/view/subject/l/public/s4683230.jpg
https://book.douban.com/subject/6049132/
7.9
[美] Mark Lutz / 李军 / 刘红伟 / 机械工业出版社 / 2011-4 / 119.00元
~~~~~~~~~~~~~~~~~~~~`
Flask Web Development : Developing Web Applications with Python
https://img1.doubanio.com/view/subject/l/public/s27205547.jpg
https://book.douban.com/subject/25814739/
8.3
Miguel Grinberg / O'Reilly Media / 2014-5-25 / USD 24.99
~~~~~~~~~~~~~~~~~~~~`
可爱的Python
https://img1.doubanio.com/view/subject/l/public/s3901817.jpg
https://book.douban.com/subject/3884108/
7.4
哲思社区 / 电子工业出版社 / 2009-9 / 55.00元
~~~~~~~~~~~~~~~~~~~~`
流畅的Python
https://img3.doubanio.com/view/subject/l/public/s29434304.jpg
https://book.douban.com/subject/27028517/
9.3
[巴西] Luciano Ramalho / 安道 / 吴珂 / 人民邮电出版社 / 2017-5-15 / 139元
~~~~~~~~~~~~~~~~~~~~`
Python网络数据采集
https://img1.doubanio.com/view/subject/l/public/s29086659.jpg
https://book.douban.com/subject/26740503/
7.7
米切尔 (Ryan Mitchell) / 陶俊杰 / 陈小莉 / 人民邮电出版社 / 2016-3-1 / CNY 59.00
~~~~~~~~~~~~~~~~~~~~`
Effective Python : 59 Specific Ways to Write Better Python
https://img3.doubanio.com/view/subject/l/public/s28008426.jpg
https://book.douban.com/subject/26312313/
8.3
Brett Slatkin / Addison-Wesley Professional / 2015-3-8 / USD 39.99
~~~~~~~~~~~~~~~~~~~~`
Python高级编程
https://img3.doubanio.com/view/subject/l/public/s4163751.jpg
https://book.douban.com/subject/4212921/
7.6
Tarek Ziadé / 姚军 / 夏海轮 / 译 / 人民邮电出版社 / 2010-1 / 45.00元
~~~~~~~~~~~~~~~~~~~~`
15 <class 'str'>


Process finished with exit code 0

猜你喜欢

转载自blog.csdn.net/zbrj12345/article/details/80349218