Python知乎热榜爬虫

Python知乎热榜爬虫

环境

re
urllib
beautifulsoup4(导入名为 bs4;PyPI 上的 bs4==0.0.1 只是一个指向 beautifulsoup4 的空壳包)
numpy==1.18.5
pandas==0.25.1

热搜URL

  • 找到指定URL可以区别于其他热搜项目的关键值,需要观察并多次测试
  • 转到知乎热榜页面,按Ctrl + Shift + I进入检查页面,切换到Element面板,再按Ctrl+F搜索得到的关键值,可以看到它位于script标签内,且id为js-initialData
  • 以UTF-8格式解码后,标题和摘要都能正确解析
  • 链接没有正确解析
  • 基于得到的内容,设计正确的正则表达式匹配模式
self.titlePattern = re.compile(r'{"titleArea":{"text":"(.*?)"}')
self.excerptPattern = re.compile(r'"excerptArea":{"text":"(.*?)"}')
  • 提取完标题和摘要后,以UTF-8格式编码,再以unicode-escape解码
frame = str(frame).encode('utf-8').decode('unicode-escape')
  • 得到正确解析的链接
  • 基于得到的内容,设计正确的正则表达式匹配模式
self.urlPattern = re.compile(r'"link":{"url":"(.*?)"}', re.S)

核心代码

从抓取到的页面提取信息,利用正则表达式进行提取,对热搜链接的提取需要进行特殊处理

def extractData(self):
    '''
    Extract hot-search entries from the crawled billboard page.

    Returns:
        billboardList: list of [title, excerpt, link] rows, one per
            hot-search entry.
    '''

    page = self.crawlPage()

    beautifulSoup = BeautifulSoup(page, 'html.parser')

    linkList = []
    titleList = []
    excerptList = []

    # NOTE(review): the three lists are reassigned (not extended) on each
    # matched tag, so only the last matching <script> tag's data survives —
    # presumably exactly one 'js-initialData' tag exists on the page.
    for frame in beautifulSoup.find_all('script', id = 'js-initialData'):
        frame = str(frame)
        titleList = re.findall(self.titlePattern, str(frame))
        excerptList = re.findall(self.excerptPattern, str(frame))
        # encode string to utf-8 and decode to unicode-escape to get links
        frame = str(frame).encode('utf-8').decode('unicode-escape')
        linkList = re.findall(self.urlPattern, str(frame))

    billboardList = []

    # generate list of list; assumes all three lists have equal length —
    # an excerpt/link shorter than titleList would raise IndexError here
    for i in range(0, len(titleList)):
        billboardList.append([titleList[i], excerptList[i], linkList[i]])

    return billboardList

pandas导出为excel文件

def export2Excel(self, data, index, columns, path):
    '''
    Export data to an .xlsx file at the given path.

    Args:
        data: rows to export (list of lists).
        index: row index labels for the DataFrame.
        columns: column names for the DataFrame.
        path: destination path of the .xlsx file.
    '''
    dataFrame = pd.DataFrame(data, index = index, columns = columns)
    print("Exporting...")
    dataFrame.to_excel(path)
    print("Export successfully")

实现代码

# -*- coding:utf-8  -*-
import re
import urllib
# 'import urllib' alone does not load the 'request' submodule; import it
# explicitly so urllib.request.* calls below are guaranteed to resolve.
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup

class Spider():
    '''
    Description:
        Spider to crawl page and extract top hot searches from
        https://www.zhihu.com/billboard
    Attributes:
        url: the billboard page URL.
        headers: HTTP headers (desktop browser User-Agent so the
            request is not rejected as a bot).
        urlPattern / titlePattern / excerptPattern: pre-compiled regexes
            matching the JSON embedded in the 'js-initialData' script tag.
    '''

    def __init__(self):
        self.url = 'https://www.zhihu.com/billboard'

        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"}

        # regular expression; re.S lets '.' span newlines inside the link JSON
        self.urlPattern = re.compile(r'"link":{"url":"(.*?)"}', re.S)
        self.titlePattern = re.compile(r'{"titleArea":{"text":"(.*?)"}')
        self.excerptPattern = re.compile(r'"excerptArea":{"text":"(.*?)"}')

    def crawlPage(self):
        '''
        Description:
            crawl page from https://www.zhihu.com/billboard
        Args:
            None
        Returns:
            page:
                the page which contain content of Zhihu top hot searches rank
        '''
        # Local import: guarantees the submodule is loaded even if the
        # module top level only did 'import urllib'.
        import urllib.request

        request = urllib.request.Request(headers=self.headers, url=self.url)

        # 'with' closes the underlying socket deterministically (the
        # original leaked the response object).
        with urllib.request.urlopen(request) as response:
            page = response.read().decode('utf-8')

        return page

    def extractData(self):
        '''
        Description:
            extract data from the page crawled
        Args:
            None
        Returns:
            billboardList:
                the list contain the list of hot-search title, excerpt and link
        '''
        page = self.crawlPage()

        beautifulSoup = BeautifulSoup(page, 'html.parser')

        billboardList = []

        for frame in beautifulSoup.find_all('script', id='js-initialData'):
            text = str(frame)
            titles = self.titlePattern.findall(text)
            excerpts = self.excerptPattern.findall(text)
            # Links are stored with escaped sequences; encode to utf-8 and
            # decode as unicode-escape to recover the readable URLs.
            decoded = text.encode('utf-8').decode('unicode-escape')
            links = self.urlPattern.findall(decoded)
            # Accumulate across all matching tags (the original kept only
            # the last tag's matches) and zip defensively so a mismatch in
            # list lengths cannot raise IndexError.
            billboardList.extend(self._buildRows(titles, excerpts, links))

        return billboardList

    @staticmethod
    def _buildRows(titles, excerpts, links):
        '''Combine parallel lists into [title, excerpt, link] rows,
        truncating to the shortest list.'''
        return [[t, e, l] for t, e, l in zip(titles, excerpts, links)]

    def export2Excel(self, data, index, columns, path):
        '''
        Description:
            export data to .xlsx format into given path according to  given index and column name
        Args:
            data:
                the data to be exported
            index:
                the index of data
            columns:
                the names of columns
            path:
                path to save the .xlsx file
        Returns:
            None
        '''
        dataFrame = pd.DataFrame(data, index=index, columns=columns)
        print("Exporting...")
        dataFrame.to_excel(path)
        print("Export successfully")

if __name__ == "__main__":
    spider = Spider()
    billboardList = spider.extractData()
    index = [i for i in range(1, len(billboardList) + 1)]
    columns = ['title', 'excerpt', 'link']
    path = './ZhihuBillboard.xlsx'

    spider.export2Excel(billboardList, index, columns, path)

测试结果

最后

  • 由于博主水平有限,不免有疏漏之处,欢迎读者随时批评指正,以免造成不必要的误解!

猜你喜欢

转载自blog.csdn.net/qq_44486439/article/details/108176852