Overview
The teacher asked me to help him catch the comments, so I have this article. The crawler part does not go into details-there are ready-made on CSDN, but it is garbled, I help the original blogger Debug.
Reference statement
Disclaimer: The code is quoted from the original article of the CSDN blogger "Trisyp" and follows the CC 4.0 by-sa copyright agreement.
Original link: https://blog.csdn.net/Trisyp/article/details/78602783
Code
# -*- coding: utf-8 -*-
import requests
import re
def getHTMLText(url):
try:
r = requests.get(url)
r.raise_for_status()
r.encoding = r.apparent_encoding()
print(r.text)
return r.text
except:
return ''
def printAPPName(html):
try:
pattern = re.compile(r'{"im:name":{"label":(.*?)}, "rights"', re.S)
#如果不使用re.S参数,则只在每一行内进行匹配,如果一行没有,就换下一行重新开始,不会跨行。
#而使用re.S参数以后,正则表达式会将这个字符串作为一个整体,将“\n”当做一个普通的字符加入到这个字符串中,在整体中进行匹配
APPName = re.findall(pattern, str(html))
return 'APPName:' + str(APPName)
except:
return ''
def fillUnivlist(titles, comments, stars, html):
try:
pattern = re.compile(r'"title":{"label":(.*?)}, "content"', re.S) #提取标题
nbaInfo = re.findall(pattern, str(html)) #提取title
# findStr = '"title":{"label":'
# nbaInfo = nbaInfo1[nbaInfo1.find(findStr)+len(findStr):]
patternFloor = re.compile(r'"content":{"label":(.*?), "attributes":{"type":"text"}}', re.S) #提取content
floorText = re.findall(patternFloor, str(html))
patternStar = re.compile(r'"im:rating":{"label":(.*?)}, "id"', re.S) # 提取星级
star = re.findall(patternStar, str(html))
# print(str(star))
number = len(nbaInfo)
print(number)
for i in range(number):
Info = nbaInfo[i] #利用Tools类移除不想要的格式字符
if i==0:Info = Info[Info.find('"title":{"label":')+len('"title":{"label":'):]
# print(Info)
Info1 = floorText[i]
Info2 = star[i]
# print(Info2+"hello")
titles.append('title:' + Info)
comments.append('content:' + Info1)
stars.append('star:' + Info2)
except:
return ''
def writeText(titleText, fpath):
try:
with open(fpath, 'a', encoding='utf-8') as f:
f.write(str(titleText)+'\n')
f.write('\n')
f.close()
except:
return ''
def writeUnivlist(titles, comments, stars, fpath, num):
with open(fpath, 'a', encoding='utf-8') as f:
for i in range(num):
f.write(str(stars[i]) + '\n')
f.write('*' * 10 + '\n')
f.write(str(titles[i]) + '\n')
f.write('*' * 50 + '\n') #输入一行*号
f.write(str(comments[i]) + '\n')
f.write('*' * 100 + '\n')
f.close()
def main():
count = 0
url = 'https://itunes.apple.com/rss/customerreviews/page=1/id=982191521/sortby=mostrecent/json?l=en&&cc=cn' #要访问的网址
output_file = 'D:/Comments.txt' #最终文本输出的文件
html = getHTMLText(url) #获取HTML
APPName = printAPPName(html)
# print(html)
writeText(APPName, output_file)
for i in range(10):
i = i + 1
titles = []
comments = []
stars = []
url = 'https://itunes.apple.com/rss/customerreviews/page=' + str(i) + '/id=982191521/sortby=mostrecent/json?l=en&&cc=cn'
html = getHTMLText(url)
fillUnivlist(titles, comments, stars, html)
writeUnivlist(titles, comments, stars, output_file, len(titles))
# print(html)
count = count + 1
print("\r当前进度: {:.2f}%".format(count * 100 / 10), end="")
if __name__ == '__main__':
main()
Step 1: Get the ID of the target App
Find it by yourself at https://apps.apple.com/cn/genre/ios/id36, and copy the ID in the URL bar after you find it. Replace the ID in the code with the target ID (note that there are two codes that need to be changed)
and search for "XXX on AppStore" on Baidu
Step 2: Find out the encoding method of the original website
Below is the beginning part of the AppStore Html source code, which is obviously UTF-8
<html lang="zh-cn" prefix="og: http://ogp.me/ns#">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width,
So everyone knows where to change
, just replace getHTMLText with the following code
def getHTMLText(url):
try:
r = requests.get(url)
r.raise_for_status()
r.encoding = "utf-8"
print(r.text)
return r.text
except:
return ''