Python: AppStore comment capture and word frequency analysis (1)

Overview

My teacher asked me to help him scrape App Store reviews, which is how this article came about. I will not go into detail on the crawler itself, because ready-made code already exists on CSDN; however, its output was garbled, so I debugged the original blogger's code.

Reference statement

Disclaimer: The code is quoted from the original article of the CSDN blogger "Trisyp" and follows the CC 4.0 by-sa copyright agreement.
Original link: https://blog.csdn.net/Trisyp/article/details/78602783

Code

# -*- coding: utf-8 -*-
import requests
import re

def getHTMLText(url, timeout=10):
    """Fetch ``url`` with HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The decoded response body, or ``''`` if the request fails or the
        server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Best effort: report a failed download as an empty string
        # instead of raising.
        return ''
    # ``apparent_encoding`` is a property, not a method. Calling it raises
    # TypeError, so it must be read as a plain attribute.
    r.encoding = r.apparent_encoding
    # The body is returned without being echoed to stdout: printing it can
    # raise UnicodeEncodeError on consoles that are not UTF-8.
    return r.text

def printAPPName(html):
    """Return the app-name matches found in ``html`` as a labelled string.

    Args:
        html: Page text to search; it is converted with ``str()`` first.

    Returns:
        ``'APPName:'`` followed by the ``str()`` of the list of matches
        (so ``'APPName:[]'`` when nothing matches), or ``''`` if anything
        goes wrong while matching.
    """
    try:
        # Without re.S, "." stops at every newline, so a match can never
        # span lines. With re.S the whole text is treated as one string
        # and "\n" is matched like any other character.
        name_regex = re.compile(r'{"im:name":{"label":(.*?)}, "rights"', re.S)
        matches = name_regex.findall(str(html))
        return 'APPName:' + str(matches)
    except BaseException:
        # Any failure is reported as an empty string.
        return ''

def fillUnivlist(titles, comments, stars, html):
    """Extract review titles, bodies and star ratings from ``html``.

    Each extracted value is appended to the matching output list with a
    ``'title:'``, ``'content:'`` or ``'star:'`` prefix. The three lists
    always grow by the same number of items: if the page yields different
    numbers of titles, bodies and ratings, only the leading complete
    triples are kept.

    Args:
        titles: List that receives the prefixed review titles.
        comments: List that receives the prefixed review bodies.
        stars: List that receives the prefixed star ratings.
        html: Page text to search; it is converted with ``str()`` first.

    Returns:
        None. The results are delivered through the three list arguments.
    """
    text = str(html)
    title_marker = '"title":{"label":'

    # re.S lets "." match newlines so a value may span several lines.
    found_titles = re.findall(r'"title":{"label":(.*?)}, "content"', text, re.S)
    found_bodies = re.findall(
        r'"content":{"label":(.*?), "attributes":{"type":"text"}}', text, re.S)
    found_stars = re.findall(r'"im:rating":{"label":(.*?)}, "id"', text, re.S)

    print(len(found_titles))
    triples = zip(found_titles, found_bodies, found_stars)
    for index, (title, body, rating) in enumerate(triples):
        if index == 0:
            # The first match starts at the first title key in the text and
            # runs to the first '}, "content"', so it can contain an earlier
            # title plus a second marker. Keep only what follows that
            # marker, and only when it is really present: an unconditional
            # slice based on find() == -1 would cut off the start of the
            # title.
            _, found, remainder = title.partition(title_marker)
            if found:
                title = remainder
        titles.append('title:' + title)
        comments.append('content:' + body)
        stars.append('star:' + rating)

def writeText(titleText, fpath):
    """Append ``titleText`` and a blank line to the file at ``fpath``.

    Args:
        titleText: Value to write; it is converted with ``str()``.
        fpath: Path of the UTF-8 text file to append to.

    Returns:
        None on success, or ``''`` if the file cannot be opened or written.
    """
    try:
        # The ``with`` block closes the file on exit.
        with open(fpath, mode='a', encoding='utf-8') as out:
            out.writelines([str(titleText) + '\n', '\n'])
    except BaseException:
        # Any failure is reported as an empty string.
        return ''

def writeUnivlist(titles, comments, stars, fpath, num):
    """Append the first ``num`` reviews to the UTF-8 text file ``fpath``.

    Every review is written as its star rating, its title and its body,
    each on its own line and each followed by a separator line of 10, 50
    and 100 asterisks respectively.

    Args:
        titles: Review titles, indexed from 0.
        comments: Review bodies, indexed from 0.
        stars: Star ratings, indexed from 0.
        fpath: Path of the file to append to.
        num: Number of reviews to write.
    """
    short_rule = '*' * 10 + '\n'
    medium_rule = '*' * 50 + '\n'  # row of asterisks under the title
    long_rule = '*' * 100 + '\n'

    def record_lines():
        # Lines are produced lazily and in output order, so each one is
        # written before the next value is looked up.
        for idx in range(num):
            yield str(stars[idx]) + '\n'
            yield short_rule
            yield str(titles[idx]) + '\n'
            yield medium_rule
            yield str(comments[idx]) + '\n'
            yield long_rule

    # The ``with`` block closes the file on exit.
    with open(fpath, 'a', encoding='utf-8') as out:
        out.writelines(record_lines())

def main():
    """Download ten pages of reviews and append them to the output file.

    The first page is fetched once to write the app-name line. Pages 1 to
    10 are then fetched in turn, their reviews are appended to the same
    file, and a progress percentage is printed after each page.
    """
    output_file = 'D:/Comments.txt'  # file that receives the final text

    # URL of the first page, used only to extract the app name.
    url = 'https://itunes.apple.com/rss/customerreviews/page=1/id=982191521/sortby=mostrecent/json?l=en&&cc=cn'
    writeText(printAPPName(getHTMLText(url)), output_file)

    for page in range(1, 11):
        url = 'https://itunes.apple.com/rss/customerreviews/page=' + str(page) + '/id=982191521/sortby=mostrecent/json?l=en&&cc=cn'
        # Fresh lists for every page so each page is written exactly once.
        titles, comments, stars = [], [], []
        fillUnivlist(titles, comments, stars, getHTMLText(url))
        writeUnivlist(titles, comments, stars, output_file, len(titles))
        # "\r" returns to the start of the line so the progress figure is
        # overwritten in place.
        print("\r当前进度: {:.2f}%".format(page * 100 / 10), end="")

# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

Step 1: Get the ID of the target App

Find the app yourself at https://apps.apple.com/cn/genre/ios/id36 and copy the ID from the URL bar once you have found it. Replace the ID in the code with the target ID (note that it appears in two places, and both need to be changed).
You can also search Baidu for "XXX on AppStore" to find the app's page.

Step 2: Find out the encoding method of the original website

Below is the beginning of the App Store page's HTML source, which clearly declares UTF-8 (see the meta charset tag):

<html lang="zh-cn" prefix="og: http://ogp.me/ns#">
  <head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, 

So it is clear what needs to change: set the response encoding to UTF-8 explicitly.
Just replace getHTMLText with the following code:

def getHTMLText(url, timeout=10):
    """Fetch ``url`` with HTTP GET and return the body decoded as UTF-8.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The response body decoded as UTF-8, or ``''`` if the request fails
        or the server answers with an error status.
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Best effort: report a failed download as an empty string
        # instead of raising.
        return ''
    # Set the encoding explicitly instead of relying on detection.
    r.encoding = "utf-8"
    # The body is returned without being echoed to stdout: printing it can
    # raise UnicodeEncodeError on consoles that are not UTF-8.
    return r.text

Guess you like

Origin blog.csdn.net/Kaiser_syndrom/article/details/99766071