一、爬取百度贴吧

import re
# Regex patterns applied to a saved Tieba thread-list page (test.html).
# Each pattern has one capture group: the thread title, the thread author
# (from the 主题作者 tooltip) and the text of the span titled 回复 ("reply").
titleR = r'<a rel="noreferrer" href=".*?" title=".*?" target="_blank" class="j_th_tit ">(.*?)</a>'
authorR = r'<span class=".*?" title="主题作者:(.*?)" data-field'
reduR = r'<span class=".*?" title="回复">(.*?)</span>'

with open('test.html', 'r', encoding='utf-8') as f:
    data = f.read()

# Parsing does not need the file handle, so it happens after the file is closed.
title = re.findall(titleR, data)
author = re.findall(authorR, data)
redu = re.findall(reduR, data)

# zip() stops at the shortest list, so a row whose title or reply span failed
# to match no longer raises IndexError the way indexing by len(author) did.
for replies, poster, subject in zip(redu, author, title):
    print(replies + poster + '   ' + subject + '    ')

二、提取小说内容

from lxml import etree
# Read the saved chapter page.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm the encoding of work2.html before pinning one.
with open('work2.html') as src:
    text = src.read()

# Collect the direct text nodes of the element whose id is "content".
html = etree.HTML(text)
result = html.xpath('//*[@id="content"]/text()')

# Write the text nodes back-to-back (no separator) as UTF-8.
with open('斗罗大陆.txt', 'w', encoding='utf-8') as dst:
    dst.writelines(result)

print(result)

三、豆瓣电影

from lxml import etree
# Parse the saved Douban list page and dump one line per list entry:
# "<title> <score> <comment> <time>".
with open('work3.html', 'r', encoding='utf-8') as f:
    text = f.read()
html = etree.HTML(text)

# XPaths relative to a single <li> entry, in output order.
_FIELD_PATHS = (
    './div/div[2]/div[1]/a/span[1]/text()',     # title
    './div/div[2]/div[2]/div/span[2]/text()',   # score
    './div/div[2]/div[2]/p[2]/span/text()',     # comment
    './div/div[2]/div[2]/p[1]/text()[2]',       # time
)

# Iterate over the <li> nodes that are actually present instead of a
# hard-coded range(1, 25): that range only covered entries 1..24 and wrote
# blank rows whenever the page held fewer entries.
rows = []
for entry in html.xpath('//*[@id="content"]/div/div[1]/ol/li'):
    fields = [''.join(entry.xpath(path)) for path in _FIELD_PATHS]
    rows.append(' '.join(fields) + '\n')
allInfo = ''.join(rows)

with open('豆瓣电影.txt', 'w', encoding='utf-8') as f:
    f.write(allInfo)

四、Ajax爬微博

from urllib.parse import urlencode
from pyquery import PyQuery as pq
import requests
# Endpoint prefix; the URL-encoded query string is appended directly after '?'.
base_url = 'https://m.weibo.cn/api/container/getIndex?'

# Request headers sent with every call.
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2360812967',
    # Adjacent string literals are joined at compile time into one value.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/87.0.4280.88 Safari/537.36'
    ),
    'X-Requested-With': 'XMLHttpRequest',
}
def get_page():
    """Fetch the container-index JSON for the hard-coded Weibo user.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise None (non-200 status, connection failure or timeout).
    """
    params = {
        'uid': '2360812967',
        't': '0',
        'luicode': '10000011',
        # Plain '&' here: the earlier '&amp;' was an HTML-escaped ampersand
        # and was being URL-encoded literally into the query string.
        'lfid': '100103type=1&q=李现',
        'type': 'uid',
        'value': '2360812967',
        'containerid': '1076032360812967',
    }
    url = base_url + urlencode(params)
    try:
        # Without a timeout, requests may block indefinitely on a stalled peer.
        response = requests.get(url, headers=headers, timeout=10)
    except (requests.ConnectionError, requests.Timeout) as e:
        print('Error', e.args)
        return None
    if response.status_code == 200:
        return response.json()
    return None

def parse_page(json):
    """Yield one summary dict per post found in a container-index response.

    Args:
        json: The decoded response as a dict (as returned by get_page), or a
            falsy value when the request failed.

    Yields:
        Dicts with the keys 'id', 'text' (markup stripped via PyQuery),
        'attitudes', 'comments' and 'reposts'.
    """
    if not json:
        return
    # Tolerate a missing or null 'data' / 'cards' instead of raising
    # AttributeError on None.
    cards = (json.get('data') or {}).get('cards') or []
    # The first card is always skipped, as in the original code
    # (presumably it is not a regular post -- confirm).
    for card in cards[1:]:
        mblog = card.get('mblog')
        if not mblog:
            # Cards without an 'mblog' payload carry no post to report.
            continue
        yield {
            'id': mblog.get('id'),
            'text': pq(mblog.get('text')).text(),
            'attitudes': mblog.get('attitudes_count'),
            'comments': mblog.get('comments_count'),
            'reposts': mblog.get('reposts_count'),
        }


if __name__ == '__main__':
    # range(1, 2) yields a single value, so exactly one request is made;
    # get_page() takes no page argument.
    for _page in range(1, 2):
        payload = get_page()
        for weibo in parse_page(payload):
            print(weibo)

五、多线程爬淘宝

from selenium import webdriver
import time
import threading


def workthis(name):
    """Search Taobao for ``name`` in a fresh Chrome window and print results.

    The function types the keyword into the search box, submits it, fills in
    the login form that follows, then prints the title and price for result
    slots 1..47, one per line, separated by a tab.

    Args:
        name: The search keyword to type into the search box.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if an expected
            element is missing. The browser is still closed in that case.
    """
    # Local import keeps the block self-contained. find_element(By..., ...)
    # replaces the find_element_by_* helpers, which Selenium 4 removed.
    from selenium.webdriver.common.by import By

    browser = webdriver.Chrome()
    try:
        browser.get('https://www.taobao.com')
        search_box = browser.find_element(By.ID, 'q')
        search_box.send_keys(name)  # type the keyword into the search box
        time.sleep(1)  # pause to mimic a human and avoid being blocked
        browser.find_element(By.CLASS_NAME, 'btn-search').click()

        # NOTE(review): credentials are hard-coded in source -- consider
        # loading them from configuration instead.
        phone = browser.find_element(By.ID, 'fm-login-id')
        phone.send_keys('18224393018')
        password = browser.find_element(By.ID, 'fm-login-password')
        password.send_keys('***********')
        login = browser.find_element(
            By.XPATH, '//*[@id="login-form"]/div[4]/button')
        login.click()
        time.sleep(3)  # pause 3s to mimic a human and avoid being blocked

        for i in range(1, 48):
            price = browser.find_element(
                By.XPATH,
                '//*[@id="mainsrp-itemlist"]/div/div/div[1]/div[%d]/div[2]/div[1]/div[1]/strong' % (i))
            title = browser.find_element(
                By.XPATH,
                '//*[@id="mainsrp-itemlist"]/div/div/div[1]/div[%d]/div[2]/div[2]' % (i))
            print(title.text + '\t' + price.text)
    finally:
        # Always shut the browser down, even when a lookup above fails,
        # so no Chrome process is left behind.
        browser.quit()
if __name__ == '__main__':
    # One thread (and therefore one browser) per search keyword, started in
    # this order and left to run without being joined.
    for keyword in ('小米手机', '苹果手机', '华为手机'):
        threading.Thread(target=workthis, args=(keyword,)).start()

【Python网络编程】爬取百度贴吧、小说内容、豆瓣电影、Ajax爬微博、多线程爬淘宝

一、爬取百度贴吧

二、提取小说内容

三、豆瓣电影

四、Ajax爬微博

五、多线程爬淘宝

猜你喜欢