python爬虫实例——阿里云云栖社区博文

爬取这个网站:https://yq.aliyun.com/articles/

# -*- coding: utf-8 -*-

import os
import re
import time

import requests
from parsel import Selector

# Crawl Alibaba Cloud Yunqi community search results for KEY and save each
# matching article as an HTML file under OUT_DIR.

KEY = "Python"
BASE_URL = "https://yq.aliyun.com"
SEARCH_URL = "https://yq.aliyun.com/search/articles/"
PER_PAGE = 15          # the site shows 15 results per search page
OUT_DIR = "./aliyun"   # where article HTML files are written

# Browser-like User-Agent, sent on EVERY request (the original only sent it
# on the first request, leaving all later fetches with requests' default UA).
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"}

# Compile the scraping patterns once instead of on every loop iteration.
_COUNT_RE = re.compile('<div class="_search-info">找到(.*?)条关于', re.S)
_CONTENT_RE = re.compile(
    '<div class="content-detail unsafe markdown-body">(.*)</div>', re.S)


def page_count(total, per_page=PER_PAGE):
    """Return how many result pages are needed for `total` hits.

    Uses true ceiling division. The original `total // 15 + 1` over-counted
    by one page whenever `total` was an exact multiple of 15 (e.g. 30 hits
    produced 3 pages instead of 2), and 0 hits produced 1 page.
    """
    return -(-total // per_page)


def safe_filename(title):
    """Sanitize an article title for use as a filename.

    Replaces characters that are illegal in filenames (notably on Windows)
    with '_' and falls back to 'untitled' when the title is missing. The
    original used the raw title, which crashed on '/' or ':' and on None.
    """
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', title or 'untitled')
    return cleaned.strip() or 'untitled'


def main():
    """Fetch every search-result page for KEY and save each article's HTML."""
    # Original crashed with FileNotFoundError when ./aliyun/ did not exist.
    os.makedirs(OUT_DIR, exist_ok=True)

    # First request: read the total hit count to compute the page count.
    response = requests.get(SEARCH_URL, headers=HEADERS, params={"q": KEY})
    response.encoding = response.apparent_encoding
    hits = _COUNT_RE.findall(response.text)
    if not hits:
        # Guard the bare [0] of the original: layout change -> clean exit
        # instead of IndexError.
        print("result count not found -- page layout may have changed")
        return
    allpage = page_count(int(hits[0]))

    for page in range(1, allpage + 1):
        print("----正在爬第" + str(page) + "页------")
        data = requests.get(SEARCH_URL, headers=HEADERS,
                            params={"q": KEY, "p": str(page)}).text
        links = Selector(data).xpath(
            "//div[@class='media-body text-overflow']/a/@href").getall()
        for href in links:
            thisurl = BASE_URL + href
            thisdata = requests.get(thisurl, headers=HEADERS).text
            title = Selector(thisdata).xpath(
                "//p[@class='hiddenTitle']/text()").get()
            print(title)
            matches = _CONTENT_RE.findall(thisdata)
            if not matches:
                continue  # skip articles whose body we cannot locate
            content = matches[0]
            print(content)
            path = os.path.join(OUT_DIR, safe_filename(title) + '.html')
            with open(path, 'w', encoding='utf8') as f:
                f.write((title or '') + "<br /><br />" + content)
            # Be polite to the server; `time` was imported but unused before.
            time.sleep(0.5)


if __name__ == "__main__":
    main()

猜你喜欢

转载自www.cnblogs.com/douzujun/p/12288591.html