# A function that fully automates scraping post data from xueqiu.com (Snowball).

import requests
import json
from pymysql_text import Mysql_text
# url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=111'
# url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=184275&count=15&category=111'
# url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=184086&count=15&category=111'
# url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=183687&count=15&category=111'

#封装成一个函数
def datapq(url):
    """Fetch one page of the Xueqiu public timeline and store each post.

    Each entry in the response's ``list`` carries the post payload as a
    JSON-encoded string under ``data``; it is decoded and the interesting
    fields are inserted into the ``text`` table via ``Mysql_text``.

    :param url: full timeline API URL, including the ``max_id`` paging
        parameter.
    :return: the ``tid`` (timeline id) of the last post on the page —
        suitable as the next request's ``max_id`` — or ``None`` when the
        page contains no posts.
    """
    # The Cookie and User-Agent headers are required; without them the API
    # rejects the request.  NOTE(review): this session cookie is hard-coded
    # and will eventually expire — refresh it when requests start failing.
    headers = {
        'Cookie' : 'aliyungf_tc=AQAAAHV3wnlDrgcAJxJ5ah0CThlP1yWJ; xq_a_token=584d0cf8d5a5a9809761f2244d8d272bac729ed4; xq_a_token.sig=x0gT9jm6qnwd-ddLu66T3A8KiVA; xq_r_token=98f278457fc4e1e5eb0846e36a7296e642b8138a; xq_r_token.sig=2Uxv_DgYTcCjz7qx4j570JpNHIs; _ga=GA1.2.121527135.1534335428; _gid=GA1.2.1591594565.1534335428; _gat_gtag_UA_16079156_4=1; u=681534335428965; device_id=3ad9d628b7f64b88029d0cce211de4c7; Hm_lvt_1db88642e346389874251b5a1eded6e3=1534335430; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1534335430',
        'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    # Decode the JSON envelope directly instead of json.loads(response.text).
    envelope = response.json()

    last_tid = None
    # .get with a default makes an empty / malformed page a no-op instead of
    # a KeyError (the original crashed on pages without a 'list' key).
    for item in envelope.get('list', []):
        # 'data' is itself a JSON-encoded string nested inside the envelope.
        post = json.loads(item['data'])
        # Build a fresh row dict per post rather than mutating one shared dict.
        row = {
            'tid': item['id'],
            'ids': post['id'],
            'title': post['title'],
            'description': post['description'],
            'target': post['target'],
        }
        print(row)
        last_tid = item['id']

        try:
            # Persist the row through the project's DB helper class.
            m = Mysql_text()
            m.sqlzz('insert into text(tid,ids,title,description,target) values (%(tid)s,%(ids)s,%(title)s,%(description)s,%(target)s)', row)
        except Exception:
            # Best effort: skip rows the database rejects (duplicates,
            # encoding problems) instead of aborting the whole page.
            pass
    return last_tid
# s = Mysql_text()
# a=s.sqlzz('select tid from text')
# print(a)

# Seed request: max_id=-1 asks for the newest posts in category 111.
url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=111'

# Page backwards through the timeline, at most 100 requests.
for _ in range(100):
    ss = datapq(url)
    if not ss:
        # Empty page (or no tid returned): nothing left to page through.
        break
    # Use the last tid of the previous page as the next request's max_id.
    url = ('https://xueqiu.com/v4/statuses/public_timeline_by_category.json'
           '?since_id=-1&max_id=' + str(ss) + '&count=15&category=111')






# Reposted from: blog.csdn.net/yangbenhao/article/details/81712985