A keyword-based Zhihu crawler

The Python 2 script below crawls a Zhihu question through the v4 API, collecting answers, their comments, and all author profiles into pandas DataFrames; the keyword-search helper it opens with is left as a stub.

# -*- coding: utf-8 -*-
"""
Created on Mon Dec 25 10:34:27 2017

@author: gzs10227
"""

import json
import re
import time
import urllib
import urllib2

import requests
import pandas as pd

# Skip the Windows registry proxy lookup, which can hang or pick up stale
# system proxy settings.
urllib.getproxies_registry = lambda: {}

# Python 2 hack: make UTF-8 the default encoding so Chinese text survives
# implicit str/unicode conversions. reload(sys) restores setdefaultencoding
# (site.py deletes it at startup); stdout/stderr are saved and restored
# because reload(sys) can reset them under IDEs that redirect them.
import sys
stderr = sys.stderr
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf8')
sys.stderr = stderr
sys.stdout = stdout





# A topic page was the original target; the script below crawls a single
# question instead, so this URL is kept only for reference.
# url = 'https://www.zhihu.com/topic/20105184/top-answers?page=1'


# Headers for the v4 API. The authorization token, Cookie, and X-UDID are
# account-specific and expire; refresh them from a logged-in browser session.
headers = {
    'accept': 'application/json, text/plain, */*',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'authorization': 'Bearer 2|1:0|10:1514185979|4:z_c0|92:Mi4xZzZSRkFnQUFBQUFBZ0VJeThBYTdEQ1lBQUFCZ0FsVk4tdkl0V3dCSHVJTm93T3d6a3lBSmotSjJQR0xzaV82QUJR|fd43b64c754f3e2d34658961f5db3f233ea367c86eb90c299685746fb8e1cbf8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Cookie': '',
    'Host': 'www.zhihu.com',
    'Pragma': 'no-cache',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
    'X-UDID': 'AIBCMvAGuwyPThPGhuYxg9CTUALjTwfhGYg=',
}
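
# A quick sanity check (not in the original script): hit any v4 URL with the
# headers above and treat HTTP 401 as an expired or invalid token.
def check_auth(test_url):
    resp = requests.get(test_url, headers=headers)
    print resp.status_code
    return resp.status_code != 401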






# Target question and a minimal header set for plain HTML requests.
url = 'https://www.zhihu.com/question/31116099'
header2 = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}


def get_question_link(url):
    # Unfinished in the original: fetch the keyword-search page for 荒野行动
    # and return its HTML. Extracting question links is sketched below.
    url = 'https://www.zhihu.com/search?type=content&q=荒野行动'
    return requests.get(url, headers=header2).text
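
# A hedged completion sketch: pull question links out of the search HTML with
# a regex; the /question/<id> link pattern is an assumption about the page.
def extract_question_links(html):
    ids = set(re.findall(r'/question/(\d+)', html))
    return ['https://www.zhihu.com/question/%s' % i for i in ids]
# e.g. links = extract_question_links(get_question_link(None))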


def deal_dict(di):
    # Split one raw answer dict into the fields we keep plus its author record.
    result_df = {}
    author_info = di['author']
    result_df['created_time'] = di['created_time']
    result_df['id'] = di['id']
    result_df['content'] = di['content']
    result_df['comment_count'] = di['comment_count']
    result_df['url'] = di['url']
    return result_df, author_info
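
# Illustrative shape of one element of data['data'] (values are placeholders;
# only the keys deal_dict and the code below actually read are shown):
# {'author': {'id': '...', 'name': '...', 'gender': 1, 'url': '...'},
#  'created_time': 1514185979, 'id': 123456, 'content': '<p>...</p>',
#  'comment_count': 3, 'url': 'http://www.zhihu.com/api/v4/answers/123456'}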
    

def get_zhihu_dict(url):
    # Fetch every answer for a question; return (author_info_df, comment_df).
    time.sleep(1)
    req = urllib2.Request(url, None, header2)
    content = urllib2.urlopen(req).read()
    # The question page embeds its own v4 answers API link; carve the question
    # id and query string out of the raw HTML (fragile, but avoids a parser).
    content2 = content.split('http://www.zhihu.com/api/v4/questions/')[-2].split('limit=')[0].replace('amp;', '')
    apiurl = 'http://www.zhihu.com/api/v4/questions/%s' % content2
    print apiurl
    limit = 20
    offset = 0
    answers_url = '%slimit=%s&offset=%s' % (apiurl, limit, offset)
    print answers_url
    temp = requests.get(answers_url, headers=headers)
    data = json.loads(temp.content)
    totals = data['paging']['totals']
    result = data['data']
    
    # Follow paging['next'] until the API reports the last page.
    while len(result) <= totals and not data['paging']['is_end']:
        time.sleep(1)
        try:
            next_url = data['paging']['next']
            print next_url
            temp = requests.get(next_url, headers=headers)
            data = json.loads(temp.content)
            result.extend(data['data'])
        except Exception:
            break
    # Split each answer into its kept fields and the embedded author record.
    all_result = map(deal_dict, result)
    author_info = []
    comment_info = []
    for comment, author in all_result:
        author_info.append(author)
        comment_info.append(comment)
    author_info_df = pd.DataFrame(author_info)
    comment_df = pd.DataFrame(comment_info)
    # Keep answer ids as strings so exports do not mangle large integers.
    comment_df['id'] = comment_df['id'].astype(str)
    return author_info_df, comment_df


# Crawl the question, then build comment-API URLs for answers with comments.
author_info_df, comment_df = get_zhihu_dict(url)
author_info_df = author_info_df[['id', 'name', 'gender', 'url']]
comment_url = list(comment_df['url'][comment_df['comment_count'] > 0])
# Query string for each answer's v4 comments endpoint (first page, 20 per page).
Str_p = '/comments?include=data%5B*%5D.author%2Ccollapsed%2Creply_to_author%2Cdisliked%2Ccontent%2Cvoting%2Cvote_count%2Cis_parent_author%2Cis_author&order=normal&limit=20&offset=0&status=open'
comment_url = map(lambda i: i + Str_p, comment_url)
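
# URL-decoded, Str_p asks the comments endpoint for these fields per comment:
# include=data[*].author,collapsed,reply_to_author,disliked,content,voting,
#         vote_count,is_parent_author,is_author
# with order=normal, limit=20, offset=0, status=open.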


def get_comment_more(url):
    # Fetch every comment page for one answer; same pagination as above.
    time.sleep(1)
    temp = requests.get(url, headers=headers).content
    data = json.loads(temp)
    result = data['data']
    totals = data['paging']['totals']
    while len(result) <= totals and not data['paging']['is_end']:
        time.sleep(1)
        try:
            next_url = data['paging']['next']
            print next_url
            temp = requests.get(next_url, headers=headers)
            data = json.loads(temp.content)
            result.extend(data['data'])
        except Exception:
            break
    return result
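
# The two crawlers above duplicate the pagination loop; a minimal refactor
# sketch (not in the original) that both could call instead:
def fetch_all_pages(first_url):
    data = json.loads(requests.get(first_url, headers=headers).content)
    result = data['data']
    while not data['paging']['is_end']:
        time.sleep(1)
        data = json.loads(requests.get(data['paging']['next'], headers=headers).content)
        result.extend(data['data'])
    return result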

# Crawl comments for every qualifying answer and flatten into one list.
Comment_list = map(get_comment_more, comment_url)
comment_more = []
for C in Comment_list:
    comment_more.extend(C)
comment_more_df = pd.DataFrame(comment_more)
# Each comment nests its author under ['author']['member']; unpack both levels.
more_anchor = pd.DataFrame(list(comment_more_df['author']))
more_anchor_df = pd.DataFrame(list(more_anchor['member']))
author_df = more_anchor_df[['id', 'name', 'gender', 'url']]
# Merge answer authors with comment authors and deduplicate.
all_author_info = pd.concat([author_info_df, author_df], axis=0)
all_author_info = all_author_info.drop_duplicates()
# Stack comments on top of answers; mismatched columns become NaN.
more_comment_df = comment_more_df[['id', 'url', 'created_time', 'content', 'vote_count']]
all_comment_df = pd.concat([more_comment_df, comment_df])
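
# Persist the results for later analysis; the file names below are
# assumptions, not part of the original script.
all_author_info.to_csv('zhihu_authors.csv', index=False, encoding='utf-8')
all_comment_df.to_csv('zhihu_comments.csv', index=False, encoding='utf-8')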


Reposted from blog.csdn.net/alis_xt/article/details/79131211