requests模块高级.ipynb — getting cookies, proxy usage, proxy pools, scraping Xici free proxy IPs, scraping Xueqiu, simulated login to gushiwen.org, captcha recognition, threads (dummy) inside multiprocessing, coroutines, multi-task, flask_server, single-thread + multi-task async coroutines applied to crawling

- HTTPConnectionPool error:
    - Causes:
        - 1. Too many requests in a short time got the IP banned
        - 2. The connections in the HTTP connection pool were exhausted
    - Fixes:
        - 1. Use a proxy
        - 2. Add Connection: "close" to the headers (see the sketch after this list)
- Proxy: a proxy server accepts your request and forwards it on your behalf.
    - Anonymity levels:
        - Elite (high anonymity): the target site learns nothing
        - Anonymous: the site knows a proxy is used, but not your real IP
        - Transparent: the site knows a proxy is used and also sees your real IP
    - Types:
        - http
        - https
    - Free proxy sources:
        - www.goubanjia.com
        - 快代理 (Kuaidaili)
        - 西刺代理 (Xici)
        - http://http.zhiliandaili.cn/
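A minimal sketch of the two fixes combined (the proxy address is a placeholder): `Connection: "close"` tears the connection down after every request instead of leaving it in the pool, and `proxies` routes the request through a proxy server.

import requests

headers = {
    "User-Agent": "Mozilla/5.0",
    "Connection": "close",  # do not keep the connection alive, so the pool is not exhausted
}
proxies = {"https": "111.231.94.44:8888"}  # placeholder proxy, replace with a live one
response = requests.get("https://www.baidu.com/s?wd=ip", headers=headers, proxies=proxies)
print(response.status_code)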

获取cookie.py

import os
import sqlite3
import win32crypt

# Path of the Chrome cookie database of the current Windows user:
username = os.environ.get('USERNAME')
cookie_file = 'C:/Users/{UserName}/AppData/Local/Google/Chrome/User Data/Default/Cookies'.format(UserName=username)
con = sqlite3.connect(cookie_file)
cursor = con.cursor()
sql = 'SELECT host_key, name, value, encrypted_value FROM cookies WHERE name = "xxxxx" AND host_key = "xxxxx";'
try:
    if cursor.execute(sql):
        for en_value in cursor:
            pwdHash = en_value[3]  # the encrypted_value column
            if pwdHash:
                # decrypt the cookie value with the Windows DPAPI:
                ret = win32crypt.CryptUnprotectData(pwdHash, None, None, None, 0)
                a = bytes.decode(ret[1])
                print(a)
except Exception as e:
    print(e)

代理操作.py

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
url = "https://www.baidu.com/s?wd=ip"
page_text = requests.get(url, headers=headers, proxies={"https": "111.231.94.44:8888"}).text
with open("ip1.html", "w", encoding="utf-8") as fp:
    fp.write(page_text)

代理池.py

import random
import requests

proxy_list = [
    {'https': '111.231.94.44:8888'},
    {'https': '121.231.94.44:8888'},
    {'https': '131.231.94.44:8888'},
]
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
url = "https://www.baidu.com/s?wd=ip"
page_text = requests.get(url, headers=headers, proxies=random.choice(proxy_list)).text
with open("ip1.html", "w", encoding="utf-8") as fp:
    fp.write(page_text)

爬西刺免费代理IP.py

import requests
import random
from lxml import etree

# Request headers that also ask the server to close the connection:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
    "Connection": "close",
}

# Site to crawl (the page number is filled in below):
url = "https://www.xicidaili.com/nn/%d"

# API that hands out proxy IPs; its response is used to build a small proxy pool:
ip_url = "http://ip.11jsq.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=1&fa=0&fetch_key=&groupid=0&qty=50&time=1&pro=&city=&port=1&format=html&ss=5&css=&dt=1&specialTxt=3&specialJson=&usertype=2"
page_text = requests.get(ip_url, headers=headers).text
tree = etree.HTML(page_text)
ip_list = tree.xpath("//body//text()")
print("代理池中的ip有:", ip_list)

# Proxies found on the page, split into HTTP and HTTPS pools:
proxy_list_http = []
proxy_list_https = []

# Walk the first 20 pages and collect ip address + port + protocol:
for page in range(1, 20):
    new_url = format(url % page)
    ip_port = random.choice(ip_list)
    # proxies carries the proxy ip:port; verify=False could be added to skip certificate checks
    page_text = requests.get(new_url, headers=headers, proxies={'https': ip_port}).text
    tree = etree.HTML(page_text)
    tr_list = tree.xpath('//*[@id="ip_list"]//tr')[1:]
    for tr in tr_list:
        ip = tr.xpath('./td[2]/text()')[0]
        port = tr.xpath('./td[3]/text()')[0]
        t_type = tr.xpath('./td[6]/text()')[0]
        ips = ip + ":" + port
        if t_type == "HTTP":
            proxy_list_http.append({"http": ips})
        else:
            proxy_list_https.append({"https": ips})
print(len(proxy_list_http), len(proxy_list_https))  # number of proxies in each pool

# Check which of the collected proxies actually work:
for proxy in proxy_list_https:
    response = requests.get("https://www.sougou.com", headers=headers, proxies=proxy)
    if response.status_code == 200:
        print('检测到了可用ip')
- Cookie handling
    - Manual: copy the cookie into the headers yourself (a sketch of this follows the list)
    - Automatic: use a session object. A session object can send requests just like requests itself; the difference is that any cookie produced while sending requests through the session is stored in the session object automatically and sent back on later requests.
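A minimal sketch of the manual approach (the cookie names and values are placeholders; copy the real Cookie request header from the browser's developer tools). The automatic, session-based approach is what 爬雪球网.py below uses.

import requests

headers = {
    "User-Agent": "Mozilla/5.0",
    # placeholder: paste the Cookie request header captured in the browser here
    "Cookie": "token=xxxxx; u=xxxxx",
}
page_text = requests.get("https://xueqiu.com", headers=headers).text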

爬雪球网.py

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
}
session = requests.Session()
session.get("https://xueqiu.com", headers=headers)  # first request through the session captures the cookies
url = "https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=20365831&count=15&category=-1"
page_text = session.get(url=url, headers=headers).json()
print(page_text)

模拟登陆古诗文网.py

import requests
from lxml import etree
from hashlib import md5


class Chaojiying_Client(object):
    def __init__(self, username, password, soft_id):
        """Username, password and software id of the Chaojiying account"""
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: captcha type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
                          headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: image ID of a wrongly recognised captcha
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()


def tranformImgData(imgPath, t_type):
    """Recognise a captcha image via the Chaojiying API"""
    chaojiying = Chaojiying_Client('17338132275', '17338132275', '903523')  # User centre >> Software ID: generate one and use it in place of 96001
    im = open(imgPath, 'rb').read()  # path of the local image file (replaces a.jpg; on Windows // is sometimes needed)
    return chaojiying.PostPic(im, t_type)["pic_str"]  # captcha type code, see the official price list; on Python 3.4+ print needs parentheses


# Forged request headers:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36',
}

# Session, so the captcha cookie and the login share the same state:
s = requests.Session()

# Login page that carries the captcha:
url = "https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx"
page_text = s.get(url, headers=headers).text
tree = etree.HTML(page_text)

# URL of the captcha image:
img_src = 'https://so.gushiwen.org/' + tree.xpath('//*[@id="imgCode"]/@src')[0]

# Download the captcha image through the same session:
img_data = s.get(img_src, headers=headers).content

# Save it locally:
with open("./code.jpg", "wb") as f:
    f.write(img_data)

print("识别到的验证码为:", tranformImgData("./code.jpg", 1004))

# 动态获取变化的请求参数
__VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
__VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]

# 获取到的验证码为:
code_text = tranformImgData("./code.jpg", 1004)
print("获取到的验证码为", code_text)

# Login URL:
login_url = 'https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx'

# Parameters of the login POST request:
data = {
    "__VIEWSTATE": __VIEWSTATE,
    "__VIEWSTATEGENERATOR": __VIEWSTATEGENERATOR,
    "from": "http://so.gushiwen.org/user/collect.aspx",
    "email": "[email protected]",
    "pwd": "bobo328410948",
    "code": code_text,
    "denglu": "登录",
}
page_text1 = s.post(url=login_url, headers=headers, data=data).text
with open("login.html", "w", encoding="utf-8") as fp:
    fp.write(page_text1)
- Captcha recognition
    - 超级鹰 (Chaojiying): http://www.chaojiying.com/about.html
        - Register (a "user centre" account)
        - Log in:
            - create a software ID: 899370
            - download the sample code
    - 打码兔
    - 云打码
- Dynamically changing request parameters
    - They are usually hidden in the source of the front-end page (see the sketch after this list)
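The hidden-parameter idea can be generalised: instead of hard-coding names such as __VIEWSTATE, every hidden <input> of the page can be collected into the POST data. A minimal sketch (the helper name hidden_form_fields is made up here), assuming the page has already been downloaded into page_text:

from lxml import etree


def hidden_form_fields(page_text):
    """Collect every hidden <input> as {name: value}, so dynamic parameters need not be hard-coded."""
    tree = etree.HTML(page_text)
    fields = {}
    for node in tree.xpath('//input[@type="hidden"]'):
        name = node.xpath('./@name')
        value = node.xpath('./@value')
        if name:
            fields[name[0]] = value[0] if value else ""
    return fields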
### Single thread + multi-task async coroutines
- Coroutine
    - If a ("special") function is defined with the async keyword, calling it returns a coroutine object, and the statements inside the function body are not executed immediately.
- Task object
    - A task object is a further wrapper around a coroutine object; think of it as a higher-level coroutine object around the special function.
    - Task objects must be registered with an event loop object.
    - A callback can be bound to a task object, which is where a crawler does its data parsing (see the sketch after this list).
- Event loop
    - Think of it as a container that holds task objects.
    - Once the event loop is started, it executes the task objects stored inside it asynchronously.
- aiohttp: a module that supports asynchronous network requests
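A minimal sketch of these pieces working together, assuming the /bobo and /jay routes of flask_server.py further down are running locally; parse_callback stands in for real data parsing:

import asyncio
import aiohttp

urls = [
    'http://127.0.0.1:5000/bobo',  # served by flask_server.py below
    'http://127.0.0.1:5000/jay',
]


async def get_request(url):
    # only async-aware modules may be used inside the special function
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()


def parse_callback(task):
    # task.result() is whatever the coroutine returned, i.e. the page text
    page_text = task.result()
    print("parsing", len(page_text), "characters")


tasks = []
for url in urls:
    task = asyncio.ensure_future(get_request(url))  # wrap the coroutine in a task object
    task.add_done_callback(parse_callback)          # bind the parsing callback
    tasks.append(task)

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))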

进程(multiprocessing)中的线程(dummy).py

import time
# threads (dummy) from the multiprocessing package:
from multiprocessing.dummy import Pool
from time import sleep

# start time:
start = time.time()

# list of urls to "download":
urls = [
    "www.1.com",
    "www.2.com",
    "www.3.com",
]


def get_request(url):
    """Simulate downloading one url: blocks for 2 seconds"""
    print("正在下载:", url)
    sleep(2)
    print("下载结束")


# a pool of 3 threads handles the urls concurrently:
pool = Pool(3)
pool.map(get_request, urls)  # get_request is applied to every element of the list

print("总耗时", time.time() - start)
Output:
正在下载: www.1.com
正在下载: www.2.com
正在下载: www.3.com
下载结束
下载结束
下载结束
总耗时 2.011126756668091

server1.py

from flask import Flask
from time import sleep

# create the application:
app = Flask(__name__)


# add routes:
@app.route("/index")
def index():
    """Index page: blocks for 2 seconds before answering"""
    sleep(2)
    return "hello"


@app.route("/index1")
def index1():
    """Second page: also blocks for 2 seconds"""
    sleep(2)
    return "hello1"


if __name__ == '__main__':
    app.run()

server2.py

import time
import requests
# threads (dummy) from the multiprocessing package:
from multiprocessing.dummy import Pool

# start time:
start = time.time()

# urls served by server1.py above:
urls = [
    "http://127.0.0.1:5000/index1",
    "http://127.0.0.1:5000/index",
]


def get_request(url):
    """Fetch one url and print the response body"""
    page_text = requests.get(url).text
    print(page_text)


# a pool of 5 threads handles the requests concurrently:
pool = Pool(5)
pool.map(get_request, urls)  # get_request is applied to every element of the list

print("总耗时", time.time() - start)

协程.py

import asyncio


def callback(task):
    """Callback bound to the task object"""
    print("i am callback and ", task.result())


# a function defined with async becomes a coroutine function:
async def test():
    print("i am test()")
    return "bobo"


c = test()  # c is the returned coroutine object

# wrap it into a task object:
task = asyncio.ensure_future(c)

# bind the callback to the task object:
task.add_done_callback(callback)

# create an event loop object:
loop = asyncio.get_event_loop()

# register the task with the event loop and run it:
loop.run_until_complete(task)

多任务.py

import asyncio
import time

start = time.time()


# async in front of the definition makes it a coroutine function; the body of this
# special function must not use modules that do not support async (hence asyncio.sleep, not time.sleep)
async def get_request(url):
    await asyncio.sleep(2)
    print("下载成功:", url)


urls = [
    "www.1.com",
    "www.2.com",
]

tasks = []  # list of task objects
for url in urls:
    c = get_request(url)
    task = asyncio.ensure_future(c)  # create a task object from the coroutine
    tasks.append(task)

loop = asyncio.get_event_loop()  # create the event loop object
loop.run_until_complete(asyncio.wait(tasks))  # asyncio.wait suspends the tasks and hands them to the event loop
print(time.time() - start)

flask_server.py

from flask import Flask
import time

app = Flask(__name__)


@app.route('/bobo')
def index_bobo():
    time.sleep(2)
    return 'Hello bobo'


@app.route('/jay')
def index_jay():
    time.sleep(2)
    return 'Hello jay'


@app.route('/tom')
def index_tom():
    time.sleep(2)
    return 'Hello tom'


if __name__ == '__main__':
    app.run(threaded=True)

单线程+多任务异步协程在爬虫中的应用.py

import requests
import aiohttp
import time
import asyncio

s = time.time()
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
]


async def get_request(url):
    async with aiohttp.ClientSession() as s:
        async with await s.get(url=url) as response:
            page_text = await response.text()
            print(page_text)
            return page_text


tasks = []
for url in urls:
    c = get_request(url)
    task = asyncio.ensure_future(c)
    tasks.append(task)

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.wait(tasks))

print(time.time() - s)

Reposted from www.cnblogs.com/zhang-da/p/12323932.html