# -*- coding: utf-8 -*-
from multiprocessing.dummy import Pool as ThreadPool
import threading
import requests

# Guards concurrent writes to the shared output file from the pool threads.
write_lock = threading.Lock()
"""Delete content.txt before re-running: the file is opened in append mode, so repeated runs keep appending to it."""
def towrite(contentdict):
    # Serialize writes: spider() runs in two pool threads sharing one file handle.
    with write_lock:
        f.write('Title: ' + str(contentdict['title']) + '\n')
        f.write('Author: ' + str(contentdict['user_name']) + '\n')
        f.write('Created at: ' + str(contentdict['created_at']) + '\n')
        f.write('Category: ' + str(contentdict['category']) + '\n\n')
def spider(url):
    print(url)
    response = requests.get(url, timeout=10)
    # The endpoint returns JSON with an 'articles' list.
    content_field = response.json()
    articles = content_field['articles']
    print(len(articles))
    for each in articles:
        # Build a fresh dict per article instead of mutating one shared dict.
        item = {
            'title': each['title'],
            'user_name': each['user_name'],
            'category': each['category'],
            'created_at': each['created_at'],
        }
        towrite(item)
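# A hedged sketch, not part of the original script: if the API's shown_offset
# parameter pages through the feed (an untested assumption about this endpoint),
# distinct page URLs could be built like this instead of repeating shown_offset=0.
def build_pages(count, page_size=10):
    """Return `count` candidate page URLs, stepping shown_offset by page_size (assumed paging scheme)."""
    base = 'https://www.csdn.net/api/articles?type=more&category=home&shown_offset='
    return [base + str(i * page_size) for i in range(count)]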
if __name__ == '__main__':
    pool = ThreadPool(2)
    f = open('content.txt', 'a', encoding='utf-8')
    # Note: both entries below are the same URL, so the server decides what
    # "more" returns on each call; build_pages() above sketches a paging variant.
    page = []
    for i in range(0, 2):
        newpage = 'https://www.csdn.net/api/articles?type=more&category=home&shown_offset=0'
        page.append(newpage)
    results = pool.map(spider, page)
    pool.close()
    pool.join()
    f.close()
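# Each article lands in content.txt as a four-line block followed by a blank line:
#   Title: ...
#   Author: ...
#   Created at: ...
#   Category: ...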