用户代理池构建
'''
所谓用户代理池,即将不同的用户代理组建成为一个池子,随后随机调用。
'''
'''
import urllib.request
import re
import random

# Pool of desktop User-Agent strings to rotate through on each request.
uapools = [
    "Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
    "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1",
    "Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11"
]

def ua(uapools):
    """Pick a random User-Agent from the pool and install it on a global opener.

    Subsequent urllib.request.urlopen() calls will carry the chosen header.
    """
    thisua = random.choice(uapools)
    print(thisua)
    headers = ('User-Agent', thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)  # install as the global opener

# Fetch the page 10 times, switching to a fresh random UA before each request.
for i in range(0, 10):
    url = "http://blog.csdn.net"
    ua(uapools)
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    pat = '<a href="([a-z]{4,5}.*?)" target="_blank" data-report-click=.*?'
    alllink = re.compile(pat).findall(data)
    print(alllink)
'''
IP代理与IP代理池的构建
- IP代理概述
- IP代理池构建的方式一
- IP代理池构建的方式二
'''
IP代理指的是让爬虫使用代理IP去爬网站。
国外IP更为可用,国内IP更易失效
西刺代理(免费)
大象代理(专业)
import urllib.request

# Proxy in "host:port" form; all plain-HTTP traffic is routed through it.
ip = "66.228.54.238:8080"
proxy = urllib.request.ProxyHandler({"http": ip})
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
# Browser camouflage: present a desktop User-Agent instead of urllib's default.
headers = ('User-Agent', "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1")
opener.addheaders = [headers]
urllib.request.install_opener(opener)
url = "https://www.baidu.com"
data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
print(len(data))
print(data)
# Context manager + explicit encoding: the file is always closed, and the
# UTF-8 page text writes correctly regardless of the platform default codec.
with open("./ip_baidu.html", "w", encoding="utf-8") as fh:
    fh.write(data)
'''
'''
代理IP稳定的话,使用直接IP构建IP池
随机调用法实现IP代理池的构建
打印结果:
107.0.68.29:3128
198934
66.228.54.238:8080
166441
142.54.191.252:8080
198934
import random
import urllib.request

# Static pool of proxy IPs ("host:port"); suitable when the proxies are stable.
ippools = [
    "107.0.68.29:3128",
    "66.228.54.238:8080",
    "142.54.191.252:8080"
]

def ip(ippools):
    """Pick a random proxy from the pool and install it on a global opener."""
    thisip = random.choice(ippools)
    print(thisip)
    proxy = urllib.request.ProxyHandler({"http": thisip})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    headers = ('User-Agent', "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1")
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

# Try three fetches, each through a freshly chosen random proxy; a dead proxy
# only fails its own iteration thanks to the try/except.
for i in range(0, 3):
    try:
        ip(ippools)
        url = "https://www.baidu.com"
        data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
        print(len(data))
        # Context manager + explicit encoding so the file is closed even on a
        # write error and UTF-8 content is written portably.
        with open("./ip_baidu_" + str(i) + ".html", "w", encoding="utf-8") as fh:
            fh.write(data)
    except Exception as err:
        print(err)
'''
'''
代理IP不稳定的话,使用接口请求方式获取IP构建IP池
import urllib.request

def api():
    """Fetch a batch of 10 proxy IPs from the vendor API and return them as a list."""
    # When the API endpoint is stable, one IP per call is enough:
    #thisip = urllib.request.urlopen("http://tvp.daxiangdaili.com/ip/?tid=559126871522487&num=1&foreign=only").read().decode("utf-8", "ignore")
    # When the endpoint is flaky, fetch 10 at once and reuse them locally.
    print("本次调用了接口")
    thisips = urllib.request.urlopen("http://tvp.daxiangdaili.com/ip/?tid=559126871522487&num=10&foreign=only")
    # The response is one IP per line; decode each line into the pool.
    ippools = [item.decode("utf-8", "ignore") for item in thisips]
    return ippools

def ip(ippools, time):
    """Install ippools[time] as the proxy on a global opener (with a fixed UA)."""
    thisip = ippools[time]
    print(thisip)
    proxy = urllib.request.ProxyHandler({"http": thisip})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    headers = ('User-Agent', "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1")
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

# x counts attempts; every 10th attempt refreshes the pool from the API.
x = 0
for i in range(0, 35):
    try:
        time = x % 10            # index into the current batch of 10 proxies
        if time == 0:
            ippools = api()      # refresh the pool every 10 attempts
        ip(ippools, time)
        url = "https://www.baidu.com"
        data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
        print(len(data))
        x += 1
    except Exception as err:
        print(err)
        x += 1
'''
'''
由于tb未登录不能搜索商品,所以学习案例用jd
import urllib.request
import re
import random
import os

keyname = "华为"
key = urllib.request.quote(keyname)  # URL-encode the CJK search keyword
uapools = [
    "Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
    "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1",
    "Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11"
]

def ua(uapools):
    """Pick a random User-Agent from the pool and install it on a global opener."""
    thisua = random.choice(uapools)
    print(thisua)
    headers = ('User-Agent', thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)  # install as the global opener

# urlretrieve raises FileNotFoundError if the target directory is missing.
os.makedirs("./imgs", exist_ok=True)
for i in range(1, 2):
    #url = "https://s.taobao.com/search?q="+key+"&s="+str((i-1)*44)
    url = "https://search.jd.com/Search?keyword="+key+"&enc=utf-8&page="+str(2*i-1)
    ua(uapools)
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    #print(data)
    # Lazy-loaded product thumbnails carry the real image URL in this attribute.
    pat = '<img width="220" height="220" class="err-product" data-img="1" source-data-lazy-img="(.*?)"'
    imglist = re.compile(pat).findall(data)
    #print(imglist)
    for j, thisimg in enumerate(imglist):
        thisurl = "https:" + thisimg  # attribute value is protocol-relative
        localfile = "./imgs/" + str(i) + str(j) + ".jpg"
        urllib.request.urlretrieve(thisurl, filename=localfile)
'''
'''
break: 终止最近的循环,并进行程序的下一阶段(整个循环后面的语句)
continue: 终止本次循环,进入下一次循环,continue后面的代码不会执行
打印结果:
107.0.68.29:3128
166179
def ua_ip(myurl):
    """Fetch *myurl* through a randomly chosen (User-Agent, proxy IP) pair.

    Makes up to three attempts, each with a fresh random pairing; stops at
    the first success. Returns a list holding the decoded page text on
    success, or an empty list if every attempt failed.
    """
    import urllib.request
    import random
    uapools = [
        "Mozilla/5.0(Macintosh;U;IntelMacOSX10_6_8;en-us)AppleWebKit/534.50(KHTML,likeGecko)Version/5.1Safari/534.50",
        "Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1",
        "Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11"
    ]
    ippools = [
        "107.0.68.29:3128",
        "66.228.54.238:8080",
        "142.54.191.252:8080"
    ]

    def api(ippools, uapools):
        # Draw one UA and one proxy at random and install them globally.
        thisua = random.choice(uapools)
        print(thisua)
        thisip = random.choice(ippools)
        print(thisip)
        proxy = urllib.request.ProxyHandler({"http": thisip})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        opener.addheaders = [('User-Agent', thisua)]
        urllib.request.install_opener(opener)

    datas = []
    for _attempt in range(3):
        try:
            api(ippools, uapools)
            data = urllib.request.urlopen(myurl).read().decode("utf-8", "ignore")
            datas.append(data)
            print(len(data))
            break  # first success ends the retry loop
        except Exception as err:
            print(err)
    return datas

ua_ip("https://www.baidu.com")
'''