"""Scrape Douban book search results for "python" and save them as JSON.

According to the original author's notes, the search page delivers its data
in an encrypted form that is hard to decode, so instead of fetching the page
with ``requests`` it is rendered in a PhantomJS browser through selenium and
the resulting page source is parsed with lxml.
"""
import json
import os
import ssl
import time

import requests
from lxml import etree
from selenium import common, webdriver

# Directory for debugging screenshots (see the note in ``spider``).
root_dir = 'douban/img'
# JSON file that receives every book parsed during this run.
DATA_FILE = 'douban/douban_data.json'
# The search URL advances its ``start`` offset by this many results per page.
PAGE_SIZE = 15
# Seconds to wait after loading a page before reading its source.
RENDER_WAIT_SECONDS = 3
# Keys of one book record, in the order they are printed and stored.
BOOK_FIELDS = ('book_name', 'book_src', 'book_href', 'book_score',
               'book_details')

# makedirs also creates the parent ``douban`` directory, which DATA_FILE needs;
# a plain os.mkdir fails when that parent does not exist yet.
os.makedirs(root_dir, exist_ok=True)

# NOTE: PhantomJS support was deprecated in Selenium 3.8 and removed in
# Selenium 4, so this script needs a Selenium 3.x installation (or a port to
# a headless Chrome/Firefox driver).
driver = webdriver.PhantomJS()

# Books collected by every ``content_parse`` call in this process, so that
# writing the JSON file for a later page does not discard earlier pages.
_all_books = []


def spider(page):
    """Load one search-result page in the browser and parse it.

    Args:
        page: Zero-based page index; the result offset is ``page * 15``.

    Returns:
        The list of book records parsed from this page (see
        ``content_parse``).
    """
    base_url = ('https://book.douban.com/subject_search'
                '?search_text=python&cat=1001&start=%s' % (page * PAGE_SIZE))
    driver.get(base_url)
    # Fixed pause before reading the page source, as in the original script.
    time.sleep(RENDER_WAIT_SECONDS)
    # Debug aid: driver.save_screenshot(root_dir + "/%s.png" % page)
    #
    # Author's notes on the plain-HTTP alternative: requests.get() on the same
    # URL (with browser-like headers and cookies) returns encrypted data that
    # is reversible in principle but hard to decode, which is why selenium +
    # PhantomJS is used. The tag listing
    # 'https://book.douban.com/tag/python?start=%s&type=T' % (page * 20)
    # can be crawled directly but returns less data.
    return content_parse(driver.page_source)


def _first_text(nodes):
    """Return the ``.text`` of the first element in *nodes*, or None if empty."""
    return nodes[0].text if nodes else None


def _first_value(values):
    """Return the first item of *values*, or None if the list is empty."""
    return values[0] if values else None


def _parse_book(book):
    """Extract one book record (a dict keyed by BOOK_FIELDS) from *book*.

    A field whose XPath matches nothing is stored as None, so every field is
    either a string or null in the JSON output.
    """
    return {
        'book_name': _first_text(book.xpath('.//div[@class="title"]/a')),
        'book_src': _first_value(book.xpath('./a/img/@src')),
        'book_href': _first_value(
            book.xpath('.//div[@class="title"]/a/@href')),
        'book_score': _first_text(
            book.xpath('.//span[@class="rating_nums"]')),
        'book_details': _first_text(
            book.xpath('.//div[@class="meta abstract"]')),
    }


def content_parse(content):
    """Parse a rendered search page, print its books and save them.

    Each ``div`` with class ``item-root`` is treated as one book. The records
    are printed, appended to the books already collected in this process, and
    the complete collection is written to ``douban/douban_data.json``.

    Args:
        content: HTML source of the rendered search page.

    Returns:
        The list of book dicts parsed from *content* only.
    """
    # Guard against an empty page: there is nothing to query without a tree.
    tree = etree.HTML(content) if content else None
    books = tree.xpath('//div[@class="item-root"]') if tree is not None else []
    print(len(books))

    data = []
    for book in books:
        record = _parse_book(book)
        for field in BOOK_FIELDS:
            print(record[field])
        print('~~~~~~~~~~~~~~~~~~~~`')
        data.append(record)

    # Rewrite the file with everything gathered so far; opening it in 'w' mode
    # with only this page's data would overwrite the previous pages.
    _all_books.extend(data)
    dj = json.dumps(_all_books, ensure_ascii=False)
    with open(DATA_FILE, 'w', encoding='utf-8') as f:
        f.write(dj)
    print(len(data), type(dj))
    return data


if __name__ == '__main__':
    try:
        for i in range(2):
            spider(i)
    finally:
        # Shut the browser down even if a page fails, so no PhantomJS process
        # is left running.
        driver.quit()
/Library/Frameworks/Python.framework/Versions/3.6/bin/python3.6 /Users/apple/PycharmProjects/stage4/spider/2018_3_13/03douban.py
15
Head First Python(中文版) : Head First Python
https://img3.doubanio.com/view/subject/l/public/s27262723.jpg
https://book.douban.com/subject/10561367/
7.9
巴里(Barry.P.) / 林琪 等 / 中国电力出版社 / 2012-3-1 / 68.00元
~~~~~~~~~~~~~~~~~~~~`
Python灰帽子 : 黑客与逆向工程师的Python编程之道
https://img3.doubanio.com/view/subject/l/public/s4676930.jpg
https://book.douban.com/subject/6025284/
7.5
[美] Justin Seitz / 丁赟卿 译 / 崔孝晨 审校 / 电子工业出版社 / 2011-3 / 39.00元
~~~~~~~~~~~~~~~~~~~~`
Python网络编程基础 : 使用Python构建网络程序的综合指南
https://img3.doubanio.com/view/subject/l/public/s2604186.jpg
https://book.douban.com/subject/2152386/
7.2
John Goerzen / 莫迟 等 / 电子工业出版社 / 2007 / 68.00元
~~~~~~~~~~~~~~~~~~~~`
Python基础教程
https://img3.doubanio.com/view/subject/l/public/s4387251.jpg
https://book.douban.com/subject/4866934/
8.0
Magnus Lie Hetland / 司维 / 曾军崴 / 谭颖华 / 人民邮电出版社 / 2010-7 / 69.00元
~~~~~~~~~~~~~~~~~~~~`
Python Algorithms : Mastering Basic Algorithms in the Python Language
https://img3.doubanio.com/view/subject/l/public/s6999960.jpg
https://book.douban.com/subject/4915945/
8.9
Magnus Lie Hetland / Apress / 2010-11-24 / USD 49.99
~~~~~~~~~~~~~~~~~~~~`
父与子的编程之旅 : 与小卡特一起学Python
https://img3.doubanio.com/view/subject/l/public/s28825823.jpg
https://book.douban.com/subject/26005639/
8.5
桑德 (Warren Sande) / 桑德 (Carter Sande) / 苏金国 / 易郑超 / 人民邮电出版社 / 2014-10-1 / CNY 69.00
~~~~~~~~~~~~~~~~~~~~`
Python核心编程(第二版)
https://img3.doubanio.com/view/subject/l/public/s3140466.jpg
https://book.douban.com/subject/3112503/
7.7
[美]Wesley J. Chun(陳仲才) / CPUG / 人民邮电出版社 / 2008-06 / 89.00元
~~~~~~~~~~~~~~~~~~~~`
利用Python进行数据分析
https://img3.doubanio.com/view/subject/l/public/s27275372.jpg
https://book.douban.com/subject/25779298/
8.5
Wes McKinney / 唐学韬 / 机械工业出版社 / 2013-11-18 / 89.00
~~~~~~~~~~~~~~~~~~~~`
贝叶斯思维 : 统计建模的Python学习法
https://img3.doubanio.com/view/subject/l/public/s28023092.jpg
https://book.douban.com/subject/26340992/
7.4
[美]Allen B. Downey(艾伦·唐尼) / 许杨毅 / 人民邮电出版社 / 2015-3 / 49.00
~~~~~~~~~~~~~~~~~~~~`
Django Web开发指南 : Python Web Development with Django
https://img3.doubanio.com/view/subject/l/public/s3789820.jpg
https://book.douban.com/subject/3740086/
6.5
Jeff Forcier / Paul Bissex / 徐旭铭 / 机械工业出版社 / 2009-5 / 49.00元
~~~~~~~~~~~~~~~~~~~~`
Python源码剖析 : 深度探索动态语言核心技术
https://img3.doubanio.com/view/subject/l/public/s3435132.jpg
https://book.douban.com/subject/3117898/
8.7
陈儒 / 电子工业出版社 / 2008-6 / 69.80元
~~~~~~~~~~~~~~~~~~~~`
Python学习手册 : (第3版)
https://img1.doubanio.com/view/subject/l/public/s3952568.jpg
https://book.douban.com/subject/3948354/
8.2
Mark Lutz / 侯靖 / 机械工业出版社 / 2009-8 / 89.00元
~~~~~~~~~~~~~~~~~~~~`
Python编程:从入门到实践 : 从入门到实践
https://img3.doubanio.com/view/subject/l/public/s28891775.jpg
https://book.douban.com/subject/26829016/
9.1
[美]埃里克·马瑟斯 / 袁国忠 / 人民邮电出版社 / 2016-7-1 / CNY 89.00
~~~~~~~~~~~~~~~~~~~~`
Python入门经典 : 以解决计算问题为导向的Python编程实践
https://img3.doubanio.com/view/subject/l/public/s11430346.jpg
https://book.douban.com/subject/11610789/
8.2
(美)William F. Punch/Richard Enbody / 张敏 / 机械工业出版社 / 2012-8-1 / 79.00元
~~~~~~~~~~~~~~~~~~~~`
学习Python : Learning Python, Second Edition
https://img3.doubanio.com/view/subject/l/public/s1436455.jpg
https://book.douban.com/subject/1426816/
8.2
Mark Lutz / David Ascher / 东南大学出版社 / 2005年6月 / 68.00元
~~~~~~~~~~~~~~~~~~~~`
15 <class 'str'>
15
python绝技:运用python成为顶级黑客 : 运用Python成为顶级黑客
https://img1.doubanio.com/view/subject/l/public/s28385338.jpg
https://book.douban.com/subject/26702570/
7.6
[美] TJ O'Connor / 崔孝晨 / 武晓音 / 电子工业出版社 / 2016-1 / 79.00元
~~~~~~~~~~~~~~~~~~~~`
Expert Python Programming : Best practices for designing, coding, and distributing your Pyth
https://img3.doubanio.com/view/subject/l/public/s29690103.jpg
https://book.douban.com/subject/3285148/
8.3
Tarek Ziadé / Packt Publishing / 2008-9-26 / USD 44.99
~~~~~~~~~~~~~~~~~~~~`
"笨办法"学Python
https://img1.doubanio.com/view/subject/l/public/s27836847.jpg
https://book.douban.com/subject/26264642/
7.9
肖 (Zed A.Shaw) / 王巍巍 / 人民邮电出版社 / 2014-11-1 / CNY 49.00
~~~~~~~~~~~~~~~~~~~~`
Effective Python : 编写高质量Python代码的59个有效方法
https://img3.doubanio.com/view/subject/l/public/s28384052.jpg
https://book.douban.com/subject/26709315/
8.7
布雷特·斯拉特金(Brett Slatkin) / 爱飞翔 / 机械工业出版社 / 2016-1 / 59
~~~~~~~~~~~~~~~~~~~~`
Python语言及其应用 : Python语言及其应用
https://img3.doubanio.com/view/subject/l/public/s28352586.jpg
https://book.douban.com/subject/26675127/
8.4
[美] Bill Lubanovic / 丁嘉瑞 / 梁 杰 / 禹常隆 / 人民邮电出版社 / 2016-1 / 79.00元
~~~~~~~~~~~~~~~~~~~~`
A Byte of Python
https://img3.doubanio.com/view/subject/l/public/s4612135.jpg
https://book.douban.com/subject/5948760/
8.7
Swaroop C H / Lulu Marketplace / 2008-10-1 / USD 27.98
~~~~~~~~~~~~~~~~~~~~`
Python Cookbook : (第2版)中文版
https://img3.doubanio.com/view/subject/l/public/s4357883.jpg
https://book.douban.com/subject/4828875/
8.6
Alex Martelli / Anna Ravenscroft / David Ascher / 高铁军 / 人民邮电出版社 / 2010-5-1 / 99.00元
~~~~~~~~~~~~~~~~~~~~`
Dive Into Python
https://img3.doubanio.com/view/subject/l/public/s29694522.jpg
https://book.douban.com/subject/1440658/
8.2
Mark Pilgrim / Apress / 2004-11-5 / GBP 31.49
~~~~~~~~~~~~~~~~~~~~`
Python学习手册(第4版)
https://img3.doubanio.com/view/subject/l/public/s4683230.jpg
https://book.douban.com/subject/6049132/
7.9
[美] Mark Lutz / 李军 / 刘红伟 / 机械工业出版社 / 2011-4 / 119.00元
~~~~~~~~~~~~~~~~~~~~`
Flask Web Development : Developing Web Applications with Python
https://img1.doubanio.com/view/subject/l/public/s27205547.jpg
https://book.douban.com/subject/25814739/
8.3
Miguel Grinberg / O'Reilly Media / 2014-5-25 / USD 24.99
~~~~~~~~~~~~~~~~~~~~`
可爱的Python
https://img1.doubanio.com/view/subject/l/public/s3901817.jpg
https://book.douban.com/subject/3884108/
7.4
哲思社区 / 电子工业出版社 / 2009-9 / 55.00元
~~~~~~~~~~~~~~~~~~~~`
流畅的Python
https://img3.doubanio.com/view/subject/l/public/s29434304.jpg
https://book.douban.com/subject/27028517/
9.3
[巴西] Luciano Ramalho / 安道 / 吴珂 / 人民邮电出版社 / 2017-5-15 / 139元
~~~~~~~~~~~~~~~~~~~~`
Python网络数据采集
https://img1.doubanio.com/view/subject/l/public/s29086659.jpg
https://book.douban.com/subject/26740503/
7.7
米切尔 (Ryan Mitchell) / 陶俊杰 / 陈小莉 / 人民邮电出版社 / 2016-3-1 / CNY 59.00
~~~~~~~~~~~~~~~~~~~~`
Effective Python : 59 Specific Ways to Write Better Python
https://img3.doubanio.com/view/subject/l/public/s28008426.jpg
https://book.douban.com/subject/26312313/
8.3
Brett Slatkin / Addison-Wesley Professional / 2015-3-8 / USD 39.99
~~~~~~~~~~~~~~~~~~~~`
Python高级编程
https://img3.doubanio.com/view/subject/l/public/s4163751.jpg
https://book.douban.com/subject/4212921/
7.6
Tarek Ziadé / 姚军 / 夏海轮 / 译 / 人民邮电出版社 / 2010-1 / 45.00元
~~~~~~~~~~~~~~~~~~~~`
15 <class 'str'>
Process finished with exit code 0