# XPath (lxml) version
import requests
from lxml import etree
import os
from hashlib import md5
def get_html(url, headers):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to request.
        headers: Mapping of HTTP request headers (e.g. ``User-Agent``).

    Returns:
        The decoded response body (``Response.text``).
    """
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is ``params`` (query-string values), so passing the dict
    # positionally never sent the headers at all.
    response = requests.get(url, headers=headers)
    return response.text
def parse_html(html):
    """Extract thread link targets from a forum list page.

    Args:
        html: Markup of the list page as a string.

    Returns:
        The ``href`` attribute values of the ``<a>`` elements that sit
        directly inside a ``threadlist_title`` div within a ``j_thread_list``
        list item, as returned by the XPath query.
    """
    thread_link_xpath = (
        '//li[contains(@class,"j_thread_list")]'
        '//div[contains(@class,"threadlist_title")]/a/@href'
    )
    tree = etree.HTML(html)
    return tree.xpath(thread_link_xpath)
def parse_image(img_list):
    """Collect image source URLs from a thread page.

    Args:
        img_list: Markup of the thread page as a string (despite the name,
            this is parsed as HTML, not iterated as a list).

    Returns:
        The ``src`` attribute values of every ``<img>`` whose ``class``
        attribute is exactly ``BDE_Image``.
    """
    tree = etree.HTML(img_list)
    return tree.xpath('//img[@class="BDE_Image"]/@src')
def download_image(url, headers):
    """Download one image and save it under ``yangmi/`` named by its hash.

    Args:
        url: Address of the image to fetch.
        headers: Mapping of HTTP request headers to send.

    The target file is ``yangmi/<md5-hex>.jpg``. If a file with that name
    already exists it is left untouched, so identical content is stored once.
    """
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is ``params`` (query string), not ``headers``.
    image_content = requests.get(url, headers=headers).content

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("yangmi", exist_ok=True)

    # The digest is taken over the str() form of the bytes on purpose: it
    # keeps file names identical to those written by earlier runs, so the
    # already-downloaded check below still recognises them.
    digest = md5(str(image_content).encode('utf-8')).hexdigest()
    filename = os.path.join("yangmi", digest + ".jpg")

    if not os.path.exists(filename):
        with open(filename, 'wb') as f:
            f.write(image_content)
def main():
    """Crawl ten list pages of the forum and download every thread image.

    For each list page, the thread links are extracted, each thread page is
    fetched, and every image found on it is passed to ``download_image``.
    """
    # The ``pn`` query value is appended per page below.
    base_url = "http://tieba.baidu.com/f?kw=%E6%9D%A8%E5%B9%82&ie=utf-8&pn="
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    for page in range(10):
        # Build the URL inside the loop. Previously it was built once before
        # the loop, so incrementing pn had no effect and the first page was
        # fetched ten times. pn advances by 50 per page: 0, 50, ..., 450.
        url = base_url + str(page * 50)
        html = get_html(url, headers)
        for href in parse_html(html):
            thread_url = "http://tieba.baidu.com" + href
            img_html = get_html(thread_url, headers)
            # Distinct name so the image URL does not shadow the thread URL.
            for img_src in parse_image(img_html):
                download_image(img_src, headers)


if __name__ == '__main__':
    main()
# BeautifulSoup version
import requests
from bs4 import BeautifulSoup
import os
from hashlib import md5
def get_html(url, headers):
    """Fetch *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to request.
        headers: Mapping of HTTP request headers (e.g. ``User-Agent``).

    Returns:
        The decoded response body (``Response.text``).
    """
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is ``params`` (query-string values), so passing the dict
    # positionally never sent the headers at all.
    response = requests.get(url, headers=headers)
    return response.text
def parse_html(html):
    """Extract thread link targets from a forum list page.

    Args:
        html: Markup of the list page as a string.

    Returns:
        A list with the ``href`` attribute (or ``None`` when absent) of each
        ``<a>`` matched by the CSS selector
        ``.j_thread_list .threadlist_title a``.
    """
    soup = BeautifulSoup(html, 'lxml')
    links = []
    for anchor in soup.select('.j_thread_list .threadlist_title a'):
        links.append(anchor.get('href'))
    return links
def parse_image(img_html):
    """Collect image source URLs from a thread page.

    Args:
        img_html: Markup of the thread page as a string.

    Returns:
        A list with the ``src`` attribute (or ``None`` when absent) of each
        element carrying the ``BDE_Image`` class.
    """
    soup = BeautifulSoup(img_html, 'lxml')
    sources = []
    for image in soup.select('.BDE_Image'):
        sources.append(image.get('src'))
    return sources
def download_img(url, headers):
    """Download one image and save it under ``yangmi2/`` named by its hash.

    Args:
        url: Address of the image to fetch.
        headers: Mapping of HTTP request headers to send.

    The target file is ``yangmi2/<md5-hex>.jpg``. If a file with that name
    already exists it is left untouched, so identical content is stored once.
    """
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is ``params`` (query string), not ``headers``.
    imgcontent = requests.get(url, headers=headers).content

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs("yangmi2", exist_ok=True)

    # The digest is taken over the str() form of the bytes on purpose: it
    # keeps file names identical to those written by earlier runs, so the
    # already-downloaded check below still recognises them.
    digest = md5(str(imgcontent).encode('utf-8')).hexdigest()
    filename = os.path.join("yangmi2", digest + ".jpg")

    if not os.path.exists(filename):
        with open(filename, 'wb') as f:
            f.write(imgcontent)
def main():
    """Crawl ten list pages of the forum and download every thread image.

    For each list page, the thread links are extracted, each thread page is
    fetched, and every image URL found on it is printed and then passed to
    ``download_img``.
    """
    # The ``pn`` query value is appended per page below.
    base_url = "http://tieba.baidu.com/f?kw=%E6%9D%A8%E5%B9%82&ie=utf-8&pn="
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
    }
    for page in range(10):
        # Build the URL inside the loop. Previously it was built once before
        # the loop, so incrementing pn had no effect and the first page was
        # fetched ten times. pn advances by 50 per page: 0, 50, ..., 450.
        url = base_url + str(page * 50)
        html = get_html(url, headers)
        for href in parse_html(html):
            thread_url = "http://tieba.baidu.com" + href
            img_html = get_html(thread_url, headers)
            # Distinct name so the image URL does not shadow the thread URL.
            for img_src in parse_image(img_html):
                print(img_src)
                download_img(img_src, headers)


if __name__ == '__main__':
    main()