Python Web Crawler - 3

Building a User-Agent Pool

  • User-agent pool overview
  • Hands-on: building a user-agent pool
# User-Agent Pool Overview
'''
A user-agent pool gathers a number of different user-agent strings into one pool, from which one is drawn at random for each request.
'''
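As a minimal sketch of the idea (this variant attaches the header per request via urllib.request.Request; the hands-on code below installs a global opener instead):
'''
import random
import urllib.request

uapools = [
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]

# Draw a random user agent and attach it to this single request.
req = urllib.request.Request("http://blog.csdn.net",
                             headers={"User-Agent": random.choice(uapools)})
data = urllib.request.urlopen(req).read()
'''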
# Hands-On: Building a User-Agent Pool
'''

import urllib.request
import re
import random
uapools = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
def ua(uapools):
    # Draw a random user agent from the pool and install it globally,
    # so every subsequent urlopen() call carries this header.
    thisua = random.choice(uapools)
    print(thisua)
    headers = ('User-Agent', thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)  # install as the global opener
for i in range(0,10):
    url = "http://blog.csdn.net"
    ua(uapools)
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    # Extract outbound links; the pattern is tied to CSDN's homepage markup at the time of writing.
    pat = '<a href="([a-z]{4,5}.*?)" target="_blank" data-report-click=.*?'
    alllink = re.compile(pat).findall(data)
    print(alllink)
'''

IP Proxies and Building an IP Proxy Pool

  • IP proxy overview
  • IP proxy pool construction, approach one
  • IP proxy pool construction, approach two
# IP Proxy Overview
'''
An IP proxy lets the crawler visit websites through a proxy IP instead of its own address.
Foreign IPs tend to stay usable longer, while domestic (Chinese) IPs go stale more quickly.
Xici Proxy (free)
Daxiang Proxy (commercial)

import urllib.request
# IP proxy: route requests through this address
ip = "66.228.54.238:8080"
# map both schemes; with "http" alone, https URLs would bypass the proxy
proxy = urllib.request.ProxyHandler({"http": ip, "https": ip})
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
# browser disguise: present a real browser's User-Agent
headers = ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1")
opener.addheaders = [headers]
urllib.request.install_opener(opener)
url = "https://www.baidu.com"
data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
print(len(data))
print(data)
fh = open("./ip_baidu.html", "w", encoding="utf-8")
fh.write(data)
fh.close()
'''
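Free proxies go stale quickly, so it can help to verify one before use. A minimal sketch, assuming http://httpbin.org/ip (a public IP-echo service) is reachable and using an arbitrary 5-second timeout:
'''
import urllib.request

def proxy_alive(ip, timeout=5):
    # Route a test request through the candidate proxy; any error or
    # timeout is taken to mean the proxy is dead.
    proxy = urllib.request.ProxyHandler({"http": ip, "https": ip})
    opener = urllib.request.build_opener(proxy)
    try:
        opener.open("http://httpbin.org/ip", timeout=timeout)
        return True
    except Exception:
        return False

print(proxy_alive("66.228.54.238:8080"))
'''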
# IP Proxy Pool Construction: Approach One
'''
If the proxy IPs are stable, build the pool directly from a hard-coded list of IPs,
drawing one at random for each request.

Sample output:
107.0.68.29:3128
198934
66.228.54.238:8080
166441
142.54.191.252:8080
198934

import random
import urllib.request
ippools = [
    "107.0.68.29:3128",
    "66.228.54.238:8080",
    "142.54.191.252:8080"
]
def ip(ippools):
    # Draw a random proxy from the pool and install it globally.
    thisip = random.choice(ippools)
    print(thisip)
    proxy = urllib.request.ProxyHandler({"http": thisip, "https": thisip})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    headers = ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1")
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
for i in range(0, 3):
    try:
        ip(ippools)
        url = "https://www.baidu.com"
        data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
        print(len(data))
        fh = open("./ip_baidu_"+str(i)+".html", "w", encoding="utf-8")
        fh.write(data)
        fh.close()
    except Exception as err:
        print(err)
'''
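A natural refinement of this approach (not in the original post) is to drop an IP from the pool once it fails, so dead proxies are never drawn again:
'''
import random
import urllib.request

ippools = [
    "107.0.68.29:3128",
    "66.228.54.238:8080",
    "142.54.191.252:8080"
]

while ippools:
    thisip = random.choice(ippools)
    proxy = urllib.request.ProxyHandler({"http": thisip, "https": thisip})
    opener = urllib.request.build_opener(proxy)
    try:
        data = opener.open("https://www.baidu.com", timeout=5).read()
        print(thisip, len(data))
        break
    except Exception as err:
        print(thisip, "failed:", err)
        ippools.remove(thisip)  # prune the dead proxy and try another
'''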
# IP Proxy Pool Construction: Approach Two
'''
If the proxy IPs are unstable, build the pool by requesting fresh IPs from an API instead.

import urllib.request
def api():
    # When the API is reliable, one IP can be fetched per request:
    #thisip = urllib.request.urlopen("http://tvp.daxiangdaili.com/ip/?tid=559126871522487&num=1&foreign=only").read().decode("utf-8", "ignore")
    # When the API is flaky, fetch a batch of ten and reuse them:
    print("Fetched a fresh batch from the API")
    thisips = urllib.request.urlopen("http://tvp.daxiangdaili.com/ip/?tid=559126871522487&num=10&foreign=only")
    ippools = []
    for item in thisips:
        # strip the trailing newline from each line of the response
        ippools.append(item.decode("utf-8", "ignore").strip())
    return ippools
def ip(ippools, index):
    # Install the index-th proxy from the current batch as the global opener.
    thisip = ippools[index]
    print(thisip)
    proxy = urllib.request.ProxyHandler({"http": thisip, "https": thisip})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    headers = ('User-Agent', "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1")
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
x = 0
for i in range(0, 35):
    try:
        if(x % 10 == 0):
            # Every ten requests, pull a fresh batch from the API.
            index = x % 10
            ippools = api()
            ip(ippools, index)
        else:
            index = x % 10
            ip(ippools, index)
        url = "https://www.baidu.com"
        data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
        print(len(data))
        x += 1
    except Exception as err:
        print(err)
        x += 1
'''
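The x % 10 bookkeeping above is easy to get wrong. A sketch of the same "refresh the batch every ten requests" behavior expressed as a generator; it assumes api() from the block above is defined and the tid is valid:
'''
import urllib.request

def ip_cycle():
    # Yield one proxy at a time; call api() for a fresh batch of ten
    # whenever the current batch is exhausted.
    while True:
        for thisip in api():
            yield thisip

def use_ip(thisip):
    # Install a global opener that routes both schemes through thisip.
    proxy = urllib.request.ProxyHandler({"http": thisip, "https": thisip})
    urllib.request.install_opener(urllib.request.build_opener(proxy))

ips = ip_cycle()
for i in range(0, 35):
    try:
        use_ip(next(ips))
        data = urllib.request.urlopen("https://www.baidu.com", timeout=5).read()
        print(len(data))
    except Exception as err:
        print(err)
'''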
# Scraping Product Images from an Online Mall
'''
Taobao does not allow product search without logging in, so this walkthrough uses JD instead.

import urllib.request
import re
import random
import os

os.makedirs("./imgs", exist_ok=True)  # the download loop below expects this directory
keyname = "华为"  # search keyword (Huawei)
key = urllib.request.quote(keyname)  # URL-encode the Chinese keyword
uapools = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
]
def ua(uapools):
    # Draw a random user agent from the pool and install it globally.
    thisua = random.choice(uapools)
    print(thisua)
    headers = ('User-Agent', thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)  # install as the global opener
for i in range(1, 2):
    #url = "https://s.taobao.com/search?q="+key+"&s="+str((i-1)*44)
    url = "https://search.jd.com/Search?keyword="+key+"&enc=utf-8&page="+str(2*i-1)  # JD's page parameter runs over odd numbers
    ua(uapools)
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    #print(data)
    # Matches JD's lazy-loaded product thumbnails; tied to JD's markup at the time of writing.
    pat = '<img width="220" height="220" class="err-product" data-img="1" source-data-lazy-img="(.*?)"'
    imglist = re.compile(pat).findall(data)
    #print(imglist)
    for j in range(0, len(imglist)):
        thisimg = imglist[j]
        thisurl = "https:"+thisimg  # the scraped attribute holds a protocol-relative URL
        localfile = "./imgs/"+str(i)+str(j)+".jpg"
        urllib.request.urlretrieve(thisurl, filename=localfile)
'''
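One fragile spot in the download loop above: a single broken image URL aborts the whole page. A sketch of a safer inner loop, reusing imglist and i from the block above:
'''
for j in range(0, len(imglist)):
    thisurl = "https:" + imglist[j]
    localfile = "./imgs/" + str(i) + str(j) + ".jpg"
    try:
        urllib.request.urlretrieve(thisurl, filename=localfile)
    except Exception as err:
        print("skipped", thisurl, err)  # one bad image no longer stops the crawl
'''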
# Using the User-Agent Pool and the IP Proxy Pool Together
'''
break: exits the innermost enclosing loop and moves on to the next stage of the program (the statements after the whole loop).
continue: ends only the current iteration and jumps to the next one; the code after continue is not executed.
Sample output:
107.0.68.29:3128
166179

def ua_ip(myurl):
    import urllib.request
    import random
    uapools = [
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
    ]
    ippools = [
        "107.0.68.29:3128",
        "66.228.54.238:8080",
        "142.54.191.252:8080"
    ]
    def api(ippools, uapools):
        # Draw a random UA and a random proxy IP, then install both globally.
        thisua = random.choice(uapools)
        print(thisua)
        thisip = random.choice(ippools)
        print(thisip)
        proxy = urllib.request.ProxyHandler({"http": thisip, "https": thisip})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        headers = ('User-Agent', thisua)
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)
    datas = []
    for i in range(0, 3):
        try:
            api(ippools, uapools)
            url = myurl
            data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
            datas.append(data)
            print(len(data))
            break  # first success ends the retry loop
        except Exception as err:
            print(err)
    return datas
ua_ip("https://www.baidu.com")
'''
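Note the design of ua_ip: the for loop allows up to three attempts, each with a freshly drawn UA/IP pair; break ends the loop as soon as one attempt succeeds, so datas holds at most one copy of the page, while a failed attempt just prints the error and tries again.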