前言
提示:以下是本篇文章的正文内容,文中的两个爬虫案例代码可供参考。
一、爬优美图库(使用bs4和requests)
# Crawl the umei.cc desktop-wallpaper section and save every image locally.
# Techniques used:
#   1. requests      - send HTTP requests and fetch page / image bytes
#   2. BeautifulSoup - parse the HTML source of each page
import os

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.umei.cc/"

# Make sure the output directory exists before any file is written.
os.makedirs("picture", exist_ok=True)

# Step 1: request the listing page that links to each wallpaper's detail page.
resp = requests.get("https://www.umei.cc/bizhitupian/diannaobizhi/")
resp.encoding = 'utf-8'  # the site serves UTF-8; fix mojibake in resp.text
# Parse the HTML so we can search it.
main_page = BeautifulSoup(resp.text, "html.parser")

# find() returns the first match; find_all() returns every match.
alist = main_page.find("div", attrs={"class": "TypeList"}).find_all(
    "a", attrs={"class": "TypeBigPics"})

n = 1  # running counter used to number the saved files
for a in alist:
    # Follow each link into the wallpaper's detail (child) page.
    href = BASE_URL + a.get("href")
    resp1 = requests.get(href)
    resp1.encoding = "utf-8"
    child_page = BeautifulSoup(resp1.text, "html.parser")
    # The actual image URL is the <img> inside <div class="ImageBody">.
    src = child_page.find("div", attrs={"class": "ImageBody"}).find("img").get("src")

    # Fetch the image bytes and save them.  mode="wb" because the payload is
    # binary, not text; `with` guarantees the handle is closed (the original
    # code never closed its files).
    with open("picture/tu_%s.jpg" % n, mode="wb") as f:
        f.write(requests.get(src).content)  # .content = raw bytes, not text
    print("恭喜你下好%s张图片" % n)
    n += 1

    # The detail page paginates the set via <div class="NewPages"> links;
    # follow each one and save its image too.
    text = child_page.find("div", attrs={"class": "NewPages"}).find_all("a")
    for b in text:
        href2 = BASE_URL + b.get('href')
        resp2 = requests.get(href2)
        resp2.encoding = "utf-8"
        child_page1 = BeautifulSoup(resp2.text, "html.parser")
        src = child_page1.find("div", attrs={"class": "ImageBody"}).find("img").get("src")
        with open("picture/tu_%s.jpg" % n, mode="wb") as f:
            f.write(requests.get(src).content)
        print("恭喜你下好%s张图片" % n)
        n += 1
二、爬ZOL壁纸网(使用bs4和requests)
# Crawl the ZOL wallpaper gallery: save the first image of every set on the
# listing page, then follow the "next image" button up to 9 more times per set.
import os

import requests
from bs4 import BeautifulSoup

SITE = "https://desk.zol.com.cn/"
# The page embeds a link to ZOL's desktop-client installer; it is not a
# wallpaper page, so it must be skipped wherever it appears.
INSTALLER = SITE + "https://file.cdn.cqttech.com/xzdesktop/XZDesktop_4020_2.0.11.12.exe"

# Ensure the output directory exists before writing any image file.
os.makedirs("ZOL", exist_ok=True)

resp = requests.get("https://desk.zol.com.cn/meinv/")
resp.encoding = 'utf-8'
main_page = BeautifulSoup(resp.text, "html.parser")

# Every wallpaper set is an <a class="pic"> inside <ul class="pic-list2">.
alist = main_page.find("ul", attrs={"class": "pic-list2"}).find_all(
    "a", attrs={"class": "pic"})

n = 1  # sequential counter used to number the saved files
for a in alist:
    href = SITE + a.get("href")
    if href == INSTALLER:
        continue  # not a wallpaper page
    resp1 = requests.get(href)
    resp1.encoding = 'utf-8'
    child_page = BeautifulSoup(resp1.text, "html.parser")
    # The full-size image is the <img> inside <div id="mouscroll">.
    src = child_page.find("div", attrs={"id": "mouscroll"}).find("img").get("src")
    # mode="wb": the payload is binary image bytes, not text.  `with` closes
    # the handle deterministically (the original code leaked file handles).
    with open("ZOL/tu_%s.jpg" % n, mode="wb") as f:
        f.write(requests.get(src).content)
    print("恭喜你下好%s张图片" % n)
    n += 1

    # Walk the "photo-next" button up to 9 times to collect the rest of the set.
    page = child_page
    for i in range(1, 10):
        b = page.find("div", attrs={"id": "mouscroll"}).find(
            "div", attrs={"id": "photo-next"}).find("a")
        href2 = SITE + b.get("href")
        # A dead link means the set is exhausted.  The original used
        # `continue` here, which just re-read the same unchanged page for the
        # remaining iterations; `break` stops the useless spinning.
        if href2 == INSTALLER:
            break
        if href2 == SITE + 'javascript:;':  # "javascript:;" = no next page
            break
        resp2 = requests.get(href2)
        resp2.encoding = 'utf-8'
        new_page = BeautifulSoup(resp2.text, "html.parser")
        src = new_page.find("div", attrs={"id": "mouscroll"}).find("img").get("src")
        with open("ZOL/tu_%s.jpg" % n, mode="wb") as f:
            f.write(requests.get(src).content)
        print("恭喜你下好%s张图片" % n)
        n += 1
        page = new_page  # advance so the next "photo-next" lookup moves forward