Python 爬取虎扑爆照区图片

#coding:utf-8
import requests
import json
import time
import re
import os
from bs4 import BeautifulSoup



# Forum root URL. The functions visible in this file each assign their own
# local `url`, so this module-level value is not read by them.
url = 'https://bbs.hupu.com/'
def get_article_html(page):
    """Download one photo for every thread listed on a selfie listing page.

    Fetches ``https://bbs.hupu.com/selfie-type3-<page>``, collects every
    element whose class is ``truetit``, asks ``get_photo_html`` for an image
    URL for each of them, and passes that URL to ``download_img``.

    Args:
        page: Listing page number; converted with ``str`` and appended to
            the listing URL.

    Raises:
        requests.RequestException: Propagated from ``requests`` when a
            request fails or exceeds the timeout.
    """
    url = 'https://bbs.hupu.com/selfie-type3' + '-' + str(page)
    headers = {'User-Agent':'Mozilla/5.0(Macintosh;Intel Mac OS X 10_11_4) AppleWebKit/537.36(KHTML,like Gecko) Chrome/52.0.2743.116 Safari/537.36'}

    # Without a timeout a single stalled connection would block forever.
    html = requests.get(url, headers=headers, timeout=10).text
    soup = BeautifulSoup(html, 'lxml')

    for link in soup.find_all(attrs={'class': 'truetit'}):
        # .get() skips elements that carry no href instead of raising KeyError.
        href = link.get('href')
        if not href:
            continue

        # e.g. '/24646486.html' -> '24646486'; used as the saved file name.
        photo_name = href.split('.')[0].strip('/')

        result = get_photo_html(url, href)
        if not result:
            # No matching image in this thread.
            continue

        download_img(photo_name, result)
        # Short pause between consecutive downloads.
        time.sleep(0.1)


def download_img(photo_name,url):
    """Fetch an image and save it as ``美女/<photo_name>.jpg``.

    The ``美女`` directory is created in the current working directory when
    it does not exist yet. Responses with a non-success HTTP status are
    reported and skipped rather than written to disk.

    Args:
        photo_name: Base file name (without extension) for the saved image.
        url: Image URL; everything from the first ``?`` onward is dropped
            before the request is made.

    Raises:
        requests.RequestException: Propagated from ``requests`` when the
            request fails or exceeds the timeout.
    """
    # exist_ok=True replaces the exists()/mkdir() pair, which could race
    # with another process creating the directory in between.
    os.makedirs('美女', exist_ok=True)

    # Without a timeout a single stalled connection would block forever.
    response = requests.get(url.split('?')[0], timeout=10)
    if not response.ok:
        # Do not store an HTTP error page under a .jpg name.
        print(photo_name + ' skipped: HTTP %s' % response.status_code)
        return

    # Leaving the with-block closes (and therefore flushes) the file.
    with open('美女/%s.jpg'%(photo_name), 'wb') as h:
        h.write(response.content)
    print(photo_name+'finish')

def get_photo_html(u1,u2):
    """Return the first ``webp`` image URL found in a thread page.

    Args:
        u1: Unused; kept so existing callers keep working.
        u2: Thread path such as ``/24646486.html``; it is appended to
            ``https://bbs.hupu.com/``.

    Returns:
        The ``src`` value of the first ``.quote-content img`` element whose
        ``src`` ends with ``webp``, or ``None`` when no element matches.

    Raises:
        requests.RequestException: Propagated from ``requests`` when the
            request fails or exceeds the timeout.
    """
    # Drop leading slashes from the path so the joined URL does not
    # contain '//' after the host.
    url = 'https://bbs.hupu.com/' + u2.lstrip('/')

    headers = {'User-Agent':'Mozilla/5.0(Macintosh;Intel Mac OS X 10_11_4) AppleWebKit/537.36(KHTML,like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
    # Without a timeout a single stalled connection would block forever.
    html = requests.get(url, headers=headers, timeout=10).text
    soup = BeautifulSoup(html, 'lxml')

    for img in soup.select('.quote-content img'):
        # .get() tolerates <img> tags without a src attribute, where
        # img['src'] would raise KeyError.
        src = img.get('src')
        if src and src.endswith('webp'):
            return src

    # No matching image; the caller treats a falsy result as "skip".
    return None

    
    
#value = get_photo_html("https://bbs.hupu.com","/24646486.html")

if __name__ == '__main__':
    # Process listing pages 1 through 10 in ascending order.
    first_page, last_page = 1, 10
    for page_number in range(first_page, last_page + 1):
        get_article_html(page_number)

猜你喜欢

转载自blog.csdn.net/qq_40771567/article/details/85002395