# 不堪回首的爬虫请求：urllib.request 模块

import random
import socket
import urllib.error
import urllib.parse
import urllib.request
"""
 1、request 它是最基本的 HTTP 请求模块，我们可以用它来模拟发送一请求，
 就像在浏览器里输入网址然后敲击回车一样，
 只需要给库方法传入 URL 还有额外的参数，就可以模拟实现这个过程了。
"""
#1.1 urlopen
# urlopen 是 urllib.request 模块中提供的最基本的 HTTP 请求构造方法，利用它可以模拟浏览器的请求发起过程。
# url : 爬取目标的URL；
# data : 请求参数，如果设置该参数，则请求默认为post请求；没有默认为get请求；
# timeout : 用于设置超时时间，单位为秒；
# context : 必须是一个ssl.SSLContext类型,用来指定SSL设置,忽略未认证的CA证书;
def use_urlopen():
    """Fetch the Baidu home page with ``urlopen`` and print response details.

    Prints, in order: the type of the response object, the raw body bytes,
    the decoded body, the status code (via ``status`` and ``getcode()``),
    all response headers, the ``Server`` header and the reason phrase.
    """
    url = "http://www.baidu.com"
    # Use the response as a context manager so the underlying connection is
    # released even if decoding or printing below raises.
    with urllib.request.urlopen(url=url) as response:
        print(type(response))
        # The response is a file-like object; read() returns the whole body
        # as bytes (not str).
        html = response.read()
        # Raw response body (bytes).
        print(html)
        # Response body decoded to text (decode() defaults to UTF-8).
        print(html.decode())
        # Status code, via the attribute and via the accessor method.
        print(response.status)
        print(response.getcode())
        # All response headers.
        print(response.getheaders())
        # The "Server" response header only.
        print(response.getheader('Server'))
        # Reason phrase that accompanies the status code.
        print(response.reason)
def use_urlopen_data():
    """Send a request with a ``data`` payload via ``urlopen`` and print the body.

    Passing ``data`` makes ``urlopen`` issue a POST instead of a GET.
    """
    url = "http://www.baidu.com"
    # Form fields to send as the request body.
    data = {
        'wd': 'csdn'
    }
    # urlencode() turns the dict into a "key=value&..." string; encode()
    # converts it to bytes, which is what urlopen requires for ``data``.
    form_data = urllib.parse.urlencode(data).encode()
    # Context manager guarantees the connection is closed after reading.
    with urllib.request.urlopen(url=url, data=form_data) as response:
        print(response.read())
def use_urlopen_timeout():
    """Demonstrate the ``timeout`` argument of ``urlopen``.

    Uses a deliberately tiny timeout (0.01 s) so the request is expected to
    time out, and prints a message when it does. Other ``URLError`` failures
    are reported instead of being silently dropped.
    """
    url = "http://www.baidu.com"
    try:
        # Context manager closes the connection if the request does succeed.
        with urllib.request.urlopen(url=url, timeout=0.01) as response:
            print(response.read())
    except socket.timeout:
        # A timeout that happens while reading the body is raised directly,
        # not wrapped in URLError, so it needs its own handler.
        print("TIME OUT 超时")
    except urllib.error.URLError as e:
        # A timeout while connecting is wrapped: URLError.reason holds it.
        if isinstance(e.reason, socket.timeout):
            print("TIME OUT 超时")
        else:
            # Surface non-timeout failures rather than swallowing them.
            print(f"URLError: {e.reason}")

# 1.2 Request
# url : 爬取目标的URL，必选参数
# data : 如果是字典，需要用 urllib.parse.urlencode(data).encode() 进行编码
# headers : 用于伪装成浏览器，可以打开浏览器，按 F12，在 Network 中刷新页面，从请求的 Headers 中查找
# method : 'POST'、'GET' 等
def use_Requsert():
    """Build a ``urllib.request.Request`` with headers and form data, send it, and print the body.

    The request carries a browser-like ``User-Agent`` header and a
    URL-encoded form payload, and is sent with the POST method.
    """
    url = "http://www.baidu.com"
    # Browser-like User-Agent so the request looks like it came from Chrome.
    headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    data = {
        'wd': 'csdn'
    }
    # Encode the form fields to bytes, as required for the request body.
    form_data = urllib.parse.urlencode(data).encode()
    # Build the Request object.
    req = urllib.request.Request(url=url, data=form_data, headers=headers, method='POST')
    # Commonly used members of a Request object (Python 3):
    #   req.data                  - the request body; assign to it to set the
    #                               payload after construction
    #   req.get_method()          - the HTTP method string, e.g. 'GET' or 'POST'
    #   req.add_header(key, val)  - add a header; key is the header name
    #   req.get_full_url()        - the full URL of the request
    #   req.host                  - the host part of the request URL
    #   req.set_proxy(host, type) - route through a proxy; host is "ip:port",
    #                               type is the scheme ('http' / 'https')
    # NOTE: add_data(), has_data(), get_data() and get_host() were removed in
    # Python 3.4; use the ``data`` and ``host`` attributes instead.
    # Context manager guarantees the connection is closed after reading.
    with urllib.request.urlopen(req) as response:
        print(response.read())

# 代理
def use_proxy():
    """Send a request through a randomly chosen HTTP proxy and print the decoded body.

    A proxy is picked at random from a hard-coded list, wrapped in a
    ``ProxyHandler`` and installed into a custom opener. The request must be
    sent with ``opener.open()``; plain ``urlopen()`` would not use this proxy.
    """
    url = "http://www.baidu.com"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }
    data = {
        'wd': 'csdn'
    }
    # Candidate proxy servers. These may stop working, in which case the
    # request fails with something like:
    #   urllib.error.URLError: <urlopen error [WinError 10061] ...connection refused>
    # Replace them with fresh proxies when that happens.
    proxy_list = [
        {"http": "121.40.108.76:80"},
        {"http": "218.249.45.162:35586"},
        {"http": "218.27.136.169:8085"},
    ]
    # Pick one proxy at random (headers could be randomised the same way).
    proxy = random.choice(proxy_list)

    # Build a proxy handler from the chosen proxy mapping.
    http_proxy_handler = urllib.request.ProxyHandler(proxy)

    # Create a custom opener that routes requests through that handler.
    opener = urllib.request.build_opener(http_proxy_handler)
    # Encode the form fields to bytes for the request body.
    form_data = urllib.parse.urlencode(data).encode()
    # Build the Request object.
    req = urllib.request.Request(url=url, data=form_data, headers=headers)
    # Only opener.open() goes through the custom proxy; urlopen() does not.
    # The context manager closes the connection once the body has been read.
    with opener.open(req) as response:
        print(response.read().decode())

if __name__ == '__main__':
    # Run every demo in the listed order; an exception in one stops the rest.
    for demo in (
        use_urlopen,
        use_urlopen_data,
        use_urlopen_timeout,
        use_Requsert,
        use_proxy,
    ):
        demo()
# 参考资料：《Python3 网络爬虫开发实战》，崔庆才
魏振东
发布了39 篇原创文章 · 获赞 41 · 访问量 1万+
私信关注
不堪回首的爬虫请求urllib.request模块

猜你喜欢