# -*- coding: utf-8 -*-
from multiprocessing.dummy import Pool as ThreadPool
import threading
import requests

# Guards concurrent writes to the shared output file from the pool threads.
write_lock = threading.Lock()
"""Delete content.txt before re-running: the file is opened in append mode, so repeated runs keep appending to it."""
def towrite(contentdict):
    # Serialize writes: spider() runs in two pool threads sharing one file handle.
    with write_lock:
        f.write('Title: ' + str(contentdict['title']) + '\n')
        f.write('Author: ' + str(contentdict['user_name']) + '\n')
        f.write('Created at: ' + str(contentdict['created_at']) + '\n')
        f.write('Category: ' + str(contentdict['category']) + '\n\n')
def spider(url):
    print(url)
    response = requests.get(url, timeout=10)
    # The endpoint returns JSON with an 'articles' list.
    content_field = response.json()
    articles = content_field['articles']
    print(len(articles))
    for each in articles:
        # Build a fresh dict per article instead of mutating one shared dict.
        item = {
            'title': each['title'],
            'user_name': each['user_name'],
            'category': each['category'],
            'created_at': each['created_at'],
        }
        towrite(item)
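# A hedged sketch, not part of the original script: if the API's shown_offset
# parameter pages through the feed (an untested assumption about this endpoint),
# distinct page URLs could be built like this instead of repeating shown_offset=0.
def build_pages(count, page_size=10):
    """Return `count` candidate page URLs, stepping shown_offset by page_size (assumed paging scheme)."""
    base = 'https://www.csdn.net/api/articles?type=more&category=home&shown_offset='
    return [base + str(i * page_size) for i in range(count)]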
if __name__ == '__main__':
    pool = ThreadPool(2)
    f = open('content.txt', 'a', encoding='utf-8')
    # Note: both entries below are the same URL, so the server decides what
    # "more" returns on each call; build_pages() above sketches a paging variant.
    page = []
    for i in range(0, 2):
        newpage = 'https://www.csdn.net/api/articles?type=more&category=home&shown_offset=0'
        page.append(newpage)
    results = pool.map(spider, page)
    pool.close()
    pool.join()
    f.close()
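# Each article lands in content.txt as a four-line block followed by a blank line:
#   Title: ...
#   Author: ...
#   Created at: ...
#   Category: ...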