Exercise 1 -- Use Python to fetch the first three pages of Baidu search results (the number of pages can be changed)

1. Code

import requests
import os
from re import findall,DOTALL,search
from bs4 import BeautifulSoup
from urllib import parse

#1. Get the URLs from the first three pages of Baidu results for a keyword
    # Parameter: keyword; returns a list of URLs
#2. Crawl each URL and collect the hrefs with the required extension from that page
    # Parameters: url, extension_word; returns a list of URLs with the required extension found on that page
#3. Check whether each URL is reachable
#4. Write the results to a txt file, one URL per line

headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
}
num = 0

# Fetch a Baidu search-results page (base_url) and collect the result URLs from it
def parse_baidu_url(url):
    global headers,num
    url_list = []
    response = requests.get(url=url,headers=headers)
    response = response.content.decode("utf-8")
    soup = BeautifulSoup(response,"lxml")
    h3_labels = soup.find_all("h3",attrs={"class":"t"})
    for h3_label in h3_labels:
        a_labels = h3_label.find_all("a")
        for a_label in a_labels:
            href = a_label['href']
            # Verify that the URL from the search result is reachable
            try:
                response = requests.get(href,headers=headers,timeout=3)
                try:
                    if response.status_code == 200:
                        test_url = response.url
                        url_list.append(test_url)
                        # Progress counter
                        num = num + 1
                        print(num)
                    elif response.status_code == 302:
                        # Note: requests follows redirects by default, so this
                        # branch is rarely reached; response.url above already
                        # holds the final address after any redirect.
                        test_url = response.headers['Location']
                        url_list.append(test_url)
                        # Progress counter
                        num = num + 1
                        print(num)
                except Exception as e:
                    pass
            except Exception as e:
                pass
    return url_list

#1. Get the URLs from the first three pages of Baidu results for a keyword
# Parameter: keyword; returns a list of URLs (one sub-list per page)
def get_baidu_url(keyword):
    url_list = []
    base_url = "https://www.baidu.com/s?wd={}&pn={}&ie=utf-8"
    for page in range(1,4):
        pn = (page - 1)*10
        # Format into a new variable so the template is not overwritten,
        # otherwise pages 2 and 3 would re-fetch page 1
        page_url = base_url.format(keyword,pn)
        url_list.append(parse_baidu_url(page_url))
    return url_list

#2. Crawl each URL and collect the hrefs with the required extension from that page
# Parameters: url, keyword; returns a list of URLs with the required extension found on that page
def get_keyword_url(url,keyword):
    # (the keyword argument is currently unused; the extensions are hard-coded below)
    global headers
    response = requests.get(url=url,headers=headers).text
    hrefs = findall('<a.*?href=(\".*?\").*?>.*?</a>',response,DOTALL)
    # Deduplicate while preserving order
    hrefs = list(dict.fromkeys(hrefs))
    print("[+] Deduplication finished")
    print(hrefs)
    # Keep only the hrefs whose path ends with one of the wanted extensions
    url_list = []
    base_Domains = parse.urlparse(url)
    base_Domain = str(base_Domains[0])+"://"+str(base_Domains[1])
    for href in hrefs:
        filename = os.path.basename(href).strip("\"")
        (shotname,extension) = os.path.splitext(filename)
        if extension in ('.action','.jsp','.do'):
            if "http://" in href or "https://" in href:
                # Absolute URL: use it as-is
                result_url = href.strip("\"")
            elif search(r".*?\..*?\.*?/",href):
                # Looks like a protocol-relative or schemeless host: prepend the page's scheme
                result_url = str(base_Domains[0])+":"+href.strip("\"")
            else:
                # Relative path: join it to the page's scheme and host
                result_url = base_Domain+"/"+href.strip("\"")
            url_list.append(result_url)
    print("[+] Keyword URL extraction finished")
    print(url_list)
    return url_list

#3. Check that each URL is reachable
def check_url(list0):
    # Recursively flatten the (possibly nested) list of URLs; the list is
    # local so repeated calls do not re-check earlier results
    ls = []
    def getitem(l):
        for item in l:
            if isinstance(item,list):
                getitem(item)
            else:
                ls.append(item)
    getitem(list0)
    print("[+] Recursive flattening finished")
    print(ls)
    # Keep only the URLs that respond with HTTP 200
    list3 = []
    for url in ls:
        try:
            response = requests.get(url=url,headers=headers,timeout=3)
            if response.status_code == 200:
                list3.append(url)
        except:
            pass
    print("[+] Availability check finished")
    return list3


#4. Write the list to a file, one URL per line
def file_write_list(url_list):
    with open("url_list.txt","w",encoding="utf-8") as file:
        for url in url_list:
            file.write(url+"\n")
    print("[+] File write finished")


#5. Main function
def main():
    # Get the search-result URLs for the keyword from Baidu
    url_list1 = get_baidu_url("nihao")
    url_list1 = check_url(url_list1)
    # Extract the URLs with the wanted extensions from each result page
    url_list4 = []
    for url in url_list1:
        url_list3 = get_keyword_url(url=url,keyword=".action")
        url_list4.append(url_list3)
    url_list4 = check_url(url_list4)
    file_write_list(url_list4)


if __name__ == '__main__':
    main()
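
2. Notes

The title notes that the number of pages can be changed. A minimal sketch of isolating the paging logic, assuming a helper named build_baidu_page_urls and a pages parameter (both illustrative, not part of the script above):

from urllib import parse

def build_baidu_page_urls(keyword, pages=3):
    # Baidu's pn parameter is the zero-based offset of the first result on a
    # page, 10 results per page, so page N starts at pn = (N - 1) * 10.
    base_url = "https://www.baidu.com/s?wd={}&pn={}&ie=utf-8"
    return [base_url.format(parse.quote(keyword), page * 10) for page in range(pages)]

# Example: pass each of these URLs to parse_baidu_url() from the script above.
print(build_baidu_page_urls("nihao", pages=5))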

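Since requests follows redirects by default, the 302 branch in parse_baidu_url almost never fires; response.url on the 200 branch already contains the final address. If you do want to read the Location header yourself, a sketch along these lines should work (resolve_redirect is an illustrative name; allow_redirects is a standard requests option):

import requests

headers = {"User-Agent": "Mozilla/5.0"}

def resolve_redirect(href, timeout=3):
    # Fetch without following redirects so the Location header is visible;
    # otherwise return the URL requests ended up on.
    response = requests.get(href, headers=headers, timeout=timeout, allow_redirects=False)
    if response.status_code in (301, 302) and "Location" in response.headers:
        return response.headers["Location"]
    return response.url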

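The scheme/host stitching in get_keyword_url can also be handled by urllib.parse.urljoin, which resolves absolute, protocol-relative, and relative hrefs uniformly; a minimal sketch (the function name resolve_and_filter and its signature are illustrative):

import os
from urllib.parse import urljoin, urlparse

def resolve_and_filter(page_url, hrefs, extensions=(".action", ".jsp", ".do")):
    # Resolve each href against the page it was found on, then keep only the
    # links whose path ends with one of the wanted extensions.
    results = []
    for href in hrefs:
        absolute = urljoin(page_url, href.strip("\""))
        extension = os.path.splitext(urlparse(absolute).path)[1]
        if extension in extensions:
            results.append(absolute)
    return results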

Origin www.cnblogs.com/qianxinggz/p/11415488.html