#coding:utf-8
import requests
import json
import time
import re
import os
from bs4 import BeautifulSoup
# Root address of the Hupu forum. The functions in this file build their
# request addresses from their own literals (each uses a local or parameter
# that is also called `url`), so none of them reads this module-level value.
url = 'https://bbs.hupu.com/'
def get_article_html(page):
    """Save one image for every thread linked from a single listing page.

    Fetches listing page ``page`` of the ``selfie-type3`` board, collects the
    elements whose class is ``truetit``, and for each one asks
    ``get_photo_html`` for an image address. Threads that yield no address
    are skipped; for the others the image is stored through ``download_img``
    under a name derived from the thread's href.

    Args:
        page: Listing page number; it is appended to the board address
            after a ``-``.

    Raises:
        requests.RequestException: if the listing page cannot be fetched,
            including when the request exceeds the timeout.
    """
    listing_url = 'https://bbs.hupu.com/selfie-type3' + '-' + str(page)
    headers = {'User-Agent':'Mozilla/5.0(Macintosh;Intel Mac OS X 10_11_4) AppleWebKit/537.36(KHTML,like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
    # Without a timeout a stalled connection would block the crawl forever.
    html = requests.get(listing_url, headers=headers, timeout=10).text
    soup = BeautifulSoup(html, 'lxml')
    for link in soup.find_all(attrs={'class': 'truetit'}):
        # .get() returns None instead of raising when the element carries
        # no href attribute; such elements cannot be followed, so skip them.
        href = link.get('href')
        if not href:
            continue
        # "/24646486.html" -> "24646486": text before the first dot, with
        # surrounding slashes removed.
        photo_name = href.split('.')[0].strip('/')
        result = get_photo_html(listing_url, href)
        if not result:
            continue
        download_img(photo_name, result)
        # Brief pause after each download (presumably to avoid hammering
        # the server).
        time.sleep(0.1)
def download_img(photo_name, url):
    """Download the image at ``url`` and store it as ``美女/<photo_name>.jpg``.

    The ``美女`` directory is created in the current working directory when
    it does not exist yet. Everything from the first ``?`` onward is dropped
    from ``url`` before the request is made. If the server answers with an
    error status, nothing is written and a short notice is printed instead.

    Args:
        photo_name: File name (without extension) for the saved image.
        url: Address of the image; may carry a query string.

    Raises:
        requests.RequestException: if the request itself fails, including
            when it exceeds the timeout.
        OSError: if the directory or the file cannot be created.
    """
    # exist_ok=True replaces the racy "exists() then mkdir()" pair.
    os.makedirs('美女', exist_ok=True)
    # Without a timeout a stalled connection would block forever.
    response = requests.get(url.split('?')[0], timeout=10)
    if not response.ok:
        # Do not save an HTTP error page under a .jpg name.
        print(photo_name + ' skipped: HTTP %s' % response.status_code)
        return
    # Leaving the with-block closes (and therefore flushes) the file.
    with open('美女/%s.jpg' % (photo_name), 'wb') as image_file:
        image_file.write(response.content)
    print(photo_name + 'finish')
def get_photo_html(u1, u2):
    """Return the first image address in a thread that ends with ``webp``.

    Fetches the thread page, looks at every ``img`` element selected by
    ``.quote-content img`` in document order, and returns the ``src`` of the
    first one whose value ends with ``webp``.

    Args:
        u1: Not used by this function; kept so existing calls keep working.
        u2: Thread path on the forum, e.g. ``/24646486.html``.

    Returns:
        The matching ``src`` string, or ``None`` when no image qualifies.

    Raises:
        requests.RequestException: if the thread page cannot be fetched,
            including when the request exceeds the timeout.
    """
    # The base already ends with "/", so drop any leading slash from the
    # path to avoid requesting "https://bbs.hupu.com//...".
    url = 'https://bbs.hupu.com/' + u2.lstrip('/')
    headers = {'User-Agent':'Mozilla/5.0(Macintosh;Intel Mac OS X 10_11_4) AppleWebKit/537.36(KHTML,like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
    # Without a timeout a stalled connection would block forever.
    html = requests.get(url, headers=headers, timeout=10).text
    soup = BeautifulSoup(html, 'lxml')
    for img in soup.select('.quote-content img'):
        # .get() returns None for an <img> without a src attribute, where
        # img['src'] would raise KeyError and abort the whole lookup.
        src = img.get('src')
        if src and src.endswith('webp'):
            return src
    return None
#value = get_photo_html("https://bbs.hupu.com","/24646486.html")
if __name__ == '__main__':
    # Script entry point: process listing pages 1 through 10 in order.
    first_page, last_page = 1, 10
    for page_number in range(first_page, last_page + 1):
        get_article_html(page_number)
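# Python: scraping images from Hupu's photo-post board (爆照区).
# You may also like
# Reposted from blog.csdn.net/qq_40771567/article/details/85002395
# Today's recommendations
# Weekly ranking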