Scraping Baidu Search with a Python Crawler

1. bs4 (BeautifulSoup) implementation
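This version fetches each result page with requests and walks the DOM with BeautifulSoup's CSS selectors, pulling out three kinds of content: the organic results (title, abstract, link), the Baidu Baike encyclopedia card, and the "others are also searching" related links.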

import requests
from urllib import parse
from bs4 import BeautifulSoup
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}

def getPage(url):
    """Fetch one result page and return its HTML, or None on failure."""
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.encoding = resp.apparent_encoding  # guess the page's real encoding
        return resp.text
    except requests.RequestException as e:
        print(e)
        return None

def parsePage(text):
    soup = BeautifulSoup(text, 'lxml')
    # organic results: each one sits in a div carrying the "result" class
    content = soup.select('div .result')
    for item in content:
        try:
            title = item.select('h3 a')[0].text
            href = item.select('h3 a')[0]['href']
            abstract = item.select('.c-abstract')[0].text
            print("{}-{}\n{}".format(title, abstract, href))
        except (IndexError, KeyError):
            # ads and special cards lack some of these elements
            pass
    # Baidu Baike (encyclopedia) card
    try:
        bk = soup.select("div .result-op .op-bk-polysemy-piccontent")[0]
        baike_abstract = bk.select('p')[0].text.strip()
        baike_title = soup.select('.result-op .c-gap-bottom-small a')[0].text.strip()
        baike_href = soup.select('.result-op .c-gap-bottom-small a')[0]['href']
        print("{}-{}\n{}".format(baike_title, baike_abstract, baike_href))
    except (IndexError, KeyError):
        pass

    # "Others are also searching" (related searches)
    try:
        items = soup.select("div .result-op .list_1V4Yg a")
        print("Others are also searching >>>")
        for item in items:
            e_title = item.text
            # these links are relative, so join them with the site root
            e_href = parse.urljoin('http://www.baidu.com', item['href'])
            print(e_title, e_href)
    except (IndexError, KeyError):
        pass

if __name__ == '__main__':
    word = parse.quote(input('Enter a keyword: '))
    pn = int(input('Enter the number of pages to crawl: '))
    for i in range(pn):
        print("Crawling page %d >>>" % (i + 1))
        # pn in the query string is the result offset: 10 results per page
        url = f'http://www.baidu.com/s?wd={word}&pn={i*10}'
        text = getPage(url)
        if text:
            parsePage(text)
        time.sleep(2)  # pause between requests to avoid being blocked
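Baidu's markup changes frequently, so it is worth sanity-checking the selectors offline before hitting the live site. Below is a minimal sketch against a hand-written snippet that merely imitates the class names used above (the real page is far messier):

from bs4 import BeautifulSoup

# hypothetical snippet that only mimics the class names relied on above
sample = '''
<div>
  <div class="result">
    <h3><a href="http://example.com/1">First title</a></h3>
    <div class="c-abstract">First abstract</div>
  </div>
  <div class="result">
    <h3><a href="http://example.com/2">Second title</a></h3>
    <div class="c-abstract">Second abstract</div>
  </div>
</div>
'''

soup = BeautifulSoup(sample, 'lxml')
for item in soup.select('div .result'):  # same selector as parsePage
    link = item.select('h3 a')[0]
    print(link.text, link['href'], item.select('.c-abstract')[0].text)

If the selectors stop matching on the live page, inspect the current markup in the browser and update the class names accordingly.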

2. Regular-expression implementation
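Instead of parsing the DOM, this version lifts the title/url pairs straight out of the JSON-like snippets embedded in the page source, and matches the related-search links by their href pattern and CJK anchor text.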

import requests
import re
from urllib import parse
import time

# reuse the same User-Agent as in the bs4 version; Baidu tends to reject
# the default requests User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}

def getPage(url):
    """Fetch one result page and return its HTML, or None on failure."""
    try:
        # named resp, not re, so the regex module imported above is not shadowed
        resp = requests.get(url, headers=headers, timeout=10)
        resp.encoding = resp.apparent_encoding
        # with open('02_regex_baidu.html', 'w', encoding='utf8') as f:
        #     f.write(resp.text)
        return resp.text
    except requests.RequestException as e:
        print(e)
        return None
def parse_page(html):
    # title/url pairs are embedded in the page source as JSON-like snippets,
    # quoted with either " or '
    content = re.findall(r'{"?title"?:("|\')(.*?)("|\'),"?url"?:("|\')(.*?)("|\')}', html)
    for item in content[:-1]:  # the last match is not an organic result
        print("{}\n{}".format(item[1], item[4]))
    # "Others are also searching" (related searches)
    try:
        # \u2e80-\u9fff covers the CJK characters in the anchor text
        everybody = re.findall(r'href="(/s.*?oq=)">([-_\w\u2e80-\u9fff]+)', html)
        print("Others are also searching...")
        for item in everybody:
            e_href = parse.urljoin("http://www.baidu.com", item[0])
            e_title = item[1]
            print(e_title, e_href)

        # alternative: iterate over match objects instead of tuples
        '''
        everybody2 = re.finditer(r'href="(/s.*?oq=)">([-_\w\u2e80-\u9fff]+)', html)
        print("Others are also searching...")
        for item in everybody2:
            e2_title = item.group(2)
            e2_url = parse.urljoin("http://www.baidu.com", item.group(1))
            print(e2_title, e2_url)
        '''
    except Exception:
        pass

if __name__ == '__main__':
    word = parse.quote(input("Enter a keyword: "))
    pn = int(input("Enter the number of pages to crawl: "))
    for i in range(pn):
        print("Crawling page %d >>>" % (i + 1))
        url = f"http://www.baidu.com/s?wd={word}&pn={i*10}"
        html = getPage(url)
        if html:
            parse_page(html)
        time.sleep(2)  # pause between requests to avoid being blocked
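The main pattern is easier to verify against a fabricated string than against a live page. A minimal sketch, assuming the embedded snippets take one of the two quoting styles the pattern accepts (Baidu may change this format at any time):

import re

pattern = r'{"?title"?:("|\')(.*?)("|\'),"?url"?:("|\')(.*?)("|\')}'

# fabricated examples of the two quoting styles the pattern accepts
sample = '''{"title":"Result one","url":"http://example.com/1"}
{title:'Result two',url:'http://example.com/2'}'''

for m in re.findall(pattern, sample):
    # findall returns 6-tuples: groups 0/2/3/5 are the quote characters,
    # group 1 is the title and group 4 is the url
    print(m[1], m[4])

Both title/url pairs print regardless of quoting style, which is why the pattern hedges every quote as ("|\').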

 


Reposted from blog.csdn.net/KK_2018/article/details/113092634