爬虫简介
爬虫主要用到requests库和re库,首先导入库,然后利用库中方法
import requests
import re

# Fetch the joke site's front page the way a browser would.
page = requests.get('http://ishuo.cn/')
# Handy while debugging:
#   page.status_code -> 200 on success (301 redirect, 404 page gone)
#   page.encoding    -> utf-8
html = page.text

# Pull every joke body out of the markup: `.` matches any character,
# `*?` is the non-greedy "zero or more" repetition.
res = re.findall('<div class="content">(.*?)</div>', html)
print(res)
段子网爬取
# requests library
##   requests.get(url) -- fetch a page the way a browser would
# re library
import requests
import re

response = requests.get('http://ishuo.cn/')  # fetch the page like a browser
# print(response.status_code)  # 200 = OK; 301 redirect, 404 page missing
# BUG FIX: the original line `print(response.encoding="gd231-8")` was a
# SyntaxError (an attribute cannot be a keyword-argument target) and the
# codec name was garbled.  Per the original comment the page is utf-8,
# so declare that before reading `.text`, then print it.
response.encoding = 'utf-8'
print(response.encoding)  # utf-8
data = response.text
print(data)
# # `.` matches any character, `*?` repeats it non-greedily
# content_res = re.findall('<div class="content">(.*?)</div>', data)
# title_res = re.findall('<a href="/subject/.*?">(.*?)</a>', data)
# # print(title_res.index('活得糊涂的人,容易幸福'))
# # print(title_res.index('购买银行理财产品亏损后如何起诉'))
# title_res = title_res[10:60]
# # print(title_res)
#
#
# title_content_dic = {}
# for i in range(len(title_res)):
#     title_content_dic[title_res[i]] = content_res[i]
# # print(title_content_dic)
#
# # print(title_content_dic)
# for i in title_content_dic.items():
#     # print(str(i)+'\n')
#     print(f'{i[0]:<40} | {i[1]:<1000}')
# Approach 1:
# Sample of the anchor markup the title regex targets -- a bare string
# literal, evaluated and discarded at runtime; kept only as a note.
'<a href="/subject/5898">这个笑话好内涵,你懂了没?</a>'
# Sample CDN hostnames seen in the scraped pages (also just a note; this
# triple-quoted string is evaluated and discarded).
'''
//g.alicdn.com
//img.alicdn.com
//tce.alicdn.com
'''
# Approach 2: capture each <li> item first, then parse the three pieces
# (title, joke body, description) out of that smaller fragment.
import requests
import re

page = requests.get('http://ishuo.cn/')  # fetch the page like a browser
html = page.text

title_content_desc_dic = {}
for item in re.findall('<li class="list_li">(.*?)</li>', html):
    content = re.findall('<div class="content">(.*?)</div>', item)[0]
    title = re.findall('<a href="/subject/.*?">(.*?)</a>', item)[0]
    desc = re.findall('</a>(04月.*?)</div>', item)[0]
    title_content_desc_dic[title] = (content, desc)

# Print "title | (content, desc)" with the title left-padded to 100 cols.
for entry in title_content_desc_dic.items():
    print(f'{entry[0]:<100} | {entry[1]}')
# Image scraping
import requests
import re

response = requests.get('http://www.nipic.com/design/acg/renwu/index.html?page=1&tdsourcetag=s_pcqq_aiomsg')
data = response.text
# print(data)

# Each thumbnail's real image URL sits in a data-src attribute.
img_url_res = re.findall('data-src="(.*?)"', data)
for img_url in img_url_res:
    img_response = requests.get(img_url)
    img_data = img_response.content          # raw image bytes
    img_name = img_url.split('/')[-1]        # last path segment -> filename
    # BUG FIX: the original opened a new handle per image and never closed
    # any of them; `with` guarantees each file is flushed and closed.
    with open(img_name, 'wb') as f:
        f.write(img_data)
import re
import os
import time
import requests
from selenium import webdriver
#########################
###此段代码不需要关心啥意思###
#########################
# Ensure the output directory for downloaded images exists.
# BUG FIX: the original check-then-mkdir pair is race-prone (mkdir raises
# FileExistsError if the directory appears between the two calls);
# makedirs(..., exist_ok=True) is the atomic, idiomatic form.
os.makedirs('百度图片', exist_ok=True)
#####################################
### Capped at ~30 images by Baidu ###
#####################################
# Fetch the image-search result page for the keyword 风景 (landscape).
response = requests.get(
    'http://image.baidu.com/search/index?ct=201326592&cl=2&st=-1&lm=-1&nc=1&ie=utf-8&tn=baiduimage&ipn=r&rps=1&pv=&fm=rs7&word=风景')
data = response.text

# The page embeds its image metadata as a JS call like
#   app.setData(('imgData', {...}));
# Capture the argument text.  Raw string + escaped dot fix the fragile
# non-raw escapes of the original pattern.
img_desc_dics = re.findall(r"app\.setData(\('imgData.*?\));", data, re.S)[0]
# SECURITY NOTE: eval() on text fetched from the network executes
# arbitrary code if the page is hostile or changes; prefer
# ast.literal_eval / a JSON parse.  Kept to preserve behavior -- review
# before reusing this script.
img_desc_dics = eval(str(img_desc_dics))

# Second element of the captured tuple holds the per-image records.
img_datas = img_desc_dics[1]['data']
count = 0
for img_data in img_datas:
    # `os` and `cs` are the per-image tokens the detail page URL needs.
    os_ = img_data.get('os')
    cs_ = img_data.get('cs')
    if os_ and cs_:
        # Fetch the detail page that contains the full-size image URL.
        img_search_url = f'http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E9%A3%8E%E6%99%AF&step_word=&hs=0&pn=1&spn=0&di=195030&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=-1&cs={cs_}&os={os_}'
        img_search_response = requests.get(img_search_url)
        img_search_data = img_search_response.text

        # Extract the full-size image URL from the detail page.
        img_url = re.findall(r'''\('firstSc'\);" src="(.*?)"''', img_search_data)[0]
        img_name = img_url.split('/')[-1]
        img_name = os.path.join('百度图片', img_name)  # e.g. 百度图片/3822951_144045377000_2.jpg

        # Save the image.
        # BUG FIX: use `with` so every file handle is closed (the original
        # leaked one open handle per downloaded image).
        img_response = requests.get(img_url)
        with open(img_name, 'wb') as fw:
            fw.write(img_response.content)

        # Progress report.
        count += 1
        print(f'{img_name}保存成功,成功保存{count}张')
        # Throttle a little so Baidu does not ban the IP.
        time.sleep(0.01)
#########################################################################
###自行百度selenium的用法,使用这一套代码可以无限爬取所有图片,否则将被限制30张###
########################################################################
#
# page_count_end = 2 # 爬取 指定数字(10)* 30 = 300张图片
# chrome = webdriver.Chrome()
#
# try:
#
# chrome.implicitly_wait(10)
# chrome.get(
# 'http://image.baidu.com/search/index?ct=201326592&cl=2&st=-1&lm=-1&nc=1&ie=utf-8&tn=baiduimage&ipn=r&rps=1&pv=&fm=rs7&word=风景')
#
# js_code = '''
# window.scrollTo(0, document.body.scrollHeight);
# var lenOfPage = document.body.scrollHeight;
# return lenOfPage
# '''
#
# # selenium控制爬取页数
# count = 0
# page_count = 0
# while page_count < page_count_end:
# try:
# page_count += 1
# chrome.execute_script(js_code)
# time.sleep(0.3)
# except:
# continue
#
# img_desc_search_urls = re.findall('href="(/search/detail\?.*?)"', chrome.page_source, re.S) # re.S使.可以匹配换行符
#
# # 获取所有图片的数据
# for img_data in img_desc_search_urls:
# try:
# # 获取搜索图片的参数
# os_ = re.findall('os=(.*?)&', img_data)[0]
# cs_ = re.findall('cs=(.*?)&', img_data)[0]
#
# if os_ and cs_:
# # 获取搜索图片的信息
# img_search_url = f'http://image.baidu.com/search/detail?ct=503316480&z=0&ipn=d&word=%E9%A3%8E%E6%99%AF&step_word=&hs=0&pn=1&spn=0&di=195030&pi=0&rn=1&tn=baiduimagedetail&is=0%2C0&istype=0&ie=utf-8&oe=utf-8&in=&cl=2&lm=-1&st=-1&cs={cs_}&os={os_}'
# img_search_response = requests.get(img_search_url)
# img_search_data = img_search_response.text
#
# # 获取图片信息
# img_url = re.findall('''\('firstSc'\);" src="(.*?)"''', img_search_data)[0]
# img_name = img_url.split('/')[-1]
# img_name = os.path.join('百度图片', img_name) # 拼接出图片的地址,如 百度图片/3822951_144045377000_2.jpg
#
# # 保存图片
# img_response = requests.get(img_url)
# img_data = img_response.content
# fw = open(img_name, 'wb')
# fw.write(img_data)
# fw.flush()
#
# # 提示
# count += 1
# print(f'{img_name}保存成功,成功保存{count}张')
#
# # 防止百度禁ip,慢一点
# time.sleep(0.01)
# except:
# continue
#
# except Exception:
# pass
#
# finally:
# chrome.close()
视频爬取
import requests
import re

# Video scraping: walk the listing page, open each detail page, download
# the mp4 it embeds.
response = requests.get('http://www.mod.gov.cn/v/index.htm')
# response.encoding = 'utf8'
data = response.text
# print(data)
# mp4_res1 = re.findall('<a href="(.*?)" class="img">',data)
# for i in mp4_res1:
#     print(i)
mp4_res2 = re.findall('<a href="(.*?)">', data)
for href in mp4_res2:  # type:str
    # BUG FIX: the original indexed [0] unconditionally and raised
    # IndexError on hrefs that are not *.htm detail pages.
    page_matches = re.findall('(.*?htm)', href)
    if not page_matches:
        continue
    detail_url = 'http://www.mod.gov.cn/v/' + page_matches[0]
    detail_data = requests.get(detail_url).text
    # The mp4 address appears after a "//Video " marker, e.g.
    # http://vv.chinamil.com.cn/asset/category3/2019/06/27/asset_357593.mp4
    # (escaped the dot in `\.mp4`; the original `.` matched any character).
    mp4_matches = re.findall(r'//Video (.*?\.mp4)', detail_data)
    if not mp4_matches:
        continue  # detail page without an embedded video
    mp4_url = mp4_matches[0]
    mp4_data = requests.get(mp4_url).content
    # BUG FIX: the original wrote every video to the single file
    # 'test.mp4' (each download overwrote the last) and never closed the
    # handle; name each file after its URL and close via `with`.
    with open(mp4_url.split('/')[-1], 'wb') as f:
        f.write(mp4_data)
    # break
# Sample of the listing-page anchor markup the regexes above target --
# a bare triple-quoted string, evaluated and discarded; kept as a note.
'''
<a href="2019-07/20/content_4846213.htm" class="img"><img src="attachement/jpg/site21/20190720/6c4b9041ab8b1e9ca1be01.jpg" border="0"><em class="video_40x40"></em></a>
'''