Crawling Baidu Search URLs with Selenium

A simple Selenium crawler for search-result URLs, which should be handy for automated vulnerability testing. I would rather search with Google, since Google's dork syntax is far more useful than Baidu's, but I have no money for proxies.
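For reference, the script below builds each results-page URL from the URL-encoded dork plus Baidu's pn parameter, which offsets results in steps of 10 per page. A minimal sketch of that construction (the dork and page names here are just illustrative):

from urllib.parse import quote

dork = "inurl: (admin)"    # same dork the script searches for
page = 2                   # second results page
url = "https://www.baidu.com/s?wd=" + quote(dork) + "&pn=" + str((page - 1) * 10)
print(url)    # https://www.baidu.com/s?wd=inurl%3A%20%28admin%29&pn=10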

Code

# -*- coding: utf-8 -*-
"""
Created on Sat May  2 15:17:58 2020

@author: 14504
"""


from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from urllib.parse import quote
from pyquery import PyQuery as pq 
import requests
import time

url_save_path = "./url.txt"
SearchInformation = "inurl: (admin)"   # Baidu dork to search for
startPage = 1   # first results page to crawl
endPage = 1     # last results page to crawl

# Run Chrome headless (no visible browser window)
options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)

# browser = webdriver.Chrome()   # non-headless alternative
wait = WebDriverWait(browser, 10)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
    }



def searchURL(page):
    pageSchema = "&pn=" + str(page)    # pn offsets results, 10 per page
    url = "https://www.baidu.com/s?wd=" + quote(SearchInformation) + pageSchema
    try:
        browser.get(url)
        urlnum = geturl()
        return urlnum

    except TimeoutException:
        print("Request timed out")
        return 0    # keep main()'s running total numeric

def geturl():
    urlnum = 0
    html = browser.page_source
    doc = pq(html)
    items = doc('div#content_left .result.c-container').items()
    for item in items:
        BDurl = item.children('div.f13 > a').attr('href')   # Baidu redirect link
        real_url = urlDecode(BDurl)
        if real_url == "":
            print("none")
        else:
            saveTotxt(real_url)
            urlnum = urlnum + 1
    print("Crawled " + str(urlnum) + " URLs from this page\n")
    return urlnum
    
# Decode a Baidu redirect link: request it without following the
# redirect and read the real target from the Location header
def urlDecode(BDurl):
    try:
        res = requests.get(BDurl, headers=headers, allow_redirects=False)
        Real_url = res.headers['Location']
        return Real_url
    except requests.exceptions.ConnectionError as e:
        print('ConnectionError', e.args)
        return ""

    except requests.exceptions.MissingSchema as e:
        print('Schema is none', e.args)
        return ""

    except Exception:    # e.g. no Location header on a non-redirect response
        return ""
        

def saveTotxt(real_url):
    with open(url_save_path, 'a', encoding='utf-8') as file:
        file.write(real_url)
        file.write("\n")

def main():
    urlsum = 0
    for page in range(startPage - 1, endPage):
        print("Crawling page " + str(page + 1))
        page = page * 10    # Baidu's pn offset: 10 results per page
        urlnum = searchURL(page)
        urlsum = urlnum + urlsum
        time.sleep(1)       # be polite between pages

    print("Crawled " + str(urlsum) + " URLs in total")
    browser.quit()          # close the headless browser


if __name__ == '__main__':
    main()
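The urlDecode() step works because Baidu wraps every result in a redirect link of the form www.baidu.com/link?url=...; requesting it with allow_redirects=False exposes the real target in the 302 response's Location header. A standalone sketch of just that step, with a placeholder redirect link you would copy from an actual results page:

import requests

# Placeholder: paste a real redirect link from a Baidu results page here.
redirect_link = "http://www.baidu.com/link?url=..."

resp = requests.get(redirect_link, allow_redirects=False)
# The real target sits in the Location header of the 302 response.
print(resp.headers.get('Location', 'no Location header'))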

 


Source: blog.csdn.net/weixin_40943540/article/details/105890796