免费代理:免费代理具有地域性和时效性,所以代理列表需要频繁更新。
import random
import requests
import re
# Browser-like request headers; the single entry is a desktop Chrome/Sogou
# User-Agent string, split across adjacent literals only for line length.
normalHeaders = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.36 "
        "SE 2.X MetaSr 1.0"
    ),
}
class GetProxyServer:
    """Scrape ``(ip, port)`` proxy candidates from list pages and probe them.

    Page sources are fetched with ``requests`` from ``defaultHttpURL`` /
    ``defaultHttpsURL`` (a page number is appended to the base URL), the
    IP/port table cells are extracted with regular expressions, and each
    candidate is checked by requesting ``httpbin.org/ip`` through it.
    """

    # Base URLs of the list pages; the page number is appended directly.
    defaultHttpsURL = "http://www.xicidaili.com/wn/"
    defaultHttpURL = "http://www.xicidaili.com/wt/"

    # Seconds passed as ``timeout`` when downloading a list page, so a stalled
    # server cannot block the generators forever.
    pageTimeout = 10

    # An IPv4 address in one <td> followed by a numeric port in the next <td>.
    ipAndPortRegex = r"<td>(([1]?\d?\d|[2][0-4]\d|[2][5][0-5])\.){3}" \
                     r"([1]?\d?\d|[2][0-4]\d|[2][5][0-5])</td>\s*<td>\d+</td>"
    # A bare IPv4 address.
    ipRegex = r"\b(([1]?\d?\d|[2][0-4]\d|[2][5][0-5])\.){3}([1]?\d?\d|[2][0-4]\d|[2][5][0-5])\b"
    # A bare run of digits.
    portRegex = r"\b\d+\b"

    ipAndPortPatter = re.compile(ipAndPortRegex)
    ipPatter = re.compile(ipRegex)
    portPatter = re.compile(portRegex)

    @staticmethod
    def _pageUrls(baseURL):
        """Yield ``baseURL + "1"``, ``baseURL + "2"``, ... without end."""
        # The counter lives OUTSIDE the loop; resetting it on every iteration
        # would request page 1 over and over.
        index = 1
        while True:
            yield baseURL + str(index)
            index += 1

    def _fetchPages(self, baseURL):
        """Yield the text of each successive list page under ``baseURL``."""
        for pageURL in self._pageUrls(baseURL):
            resp = requests.get(pageURL, headers=normalHeaders,
                                timeout=self.pageTimeout)
            yield resp.text

    def getHttpsSrcPage(self, httpsURL=None):
        """Generate the HTML of successive HTTPS-proxy list pages.

        :param httpsURL: optional base URL; when truthy it replaces
            ``self.defaultHttpsURL`` on this instance before fetching starts.
        :return: an endless generator of page sources (page 1, 2, 3, ...).
        """
        if httpsURL:
            self.defaultHttpsURL = httpsURL
        yield from self._fetchPages(self.defaultHttpsURL)

    def getHttpSrcPage(self, httpURL=None):
        """Generate the HTML of successive HTTP-proxy list pages.

        :param httpURL: optional base URL; when truthy it replaces
            ``self.defaultHttpURL`` on this instance before fetching starts.
        :return: an endless generator of page sources (page 1, 2, 3, ...).
        """
        if httpURL:
            self.defaultHttpURL = httpURL
        yield from self._fetchPages(self.defaultHttpURL)

    def getAllIpAndPortSpan(self, strSrc):
        """Return an iterator of matches of ``ipAndPortPatter`` in ``strSrc``."""
        return self.ipAndPortPatter.finditer(strSrc)

    def getIpAndPortTuple(self, srcStr):
        """Split one matched IP/port fragment into an ``(ip, port)`` tuple.

        :param srcStr: text containing an IPv4 address followed by a port.
        :return: ``(ip, port)``, both as strings.
        :raises AttributeError: if ``srcStr`` holds no IP or no later number.
        """
        ipMatch = self.ipPatter.search(srcStr)
        ip = ipMatch.group()
        # Search for the port only in the text after the IP, so that an octet
        # of the address is never mistaken for the port.
        remainder = srcStr[ipMatch.span()[1]:]
        port = self.portPatter.search(remainder).group()
        return ip, port

    def getProxyServer(self, srcStrPage):
        """Yield an ``(ip, port)`` tuple for every IP/port pair in a page."""
        for match in self.getAllIpAndPortSpan(srcStrPage):
            yield self.getIpAndPortTuple(match.group())

    @staticmethod
    def getHttpProxyServer():
        """Fetch the first HTTP list page and return a generator of its proxies."""
        gps = GetProxyServer()
        httpYiedSrcPage = gps.getHttpSrcPage()
        return gps.getProxyServer(next(httpYiedSrcPage))

    @staticmethod
    def getHttpsProxyServer():
        """Fetch the first HTTPS list page and return a generator of its proxies."""
        gps = GetProxyServer()
        httpsYiedSrcPage = gps.getHttpsSrcPage()
        return gps.getProxyServer(next(httpsYiedSrcPage))

    @staticmethod
    def getAvailableProxy(proxyType, proxSrc):
        """Return the first candidate from ``proxSrc`` that actually proxies.

        Each candidate is used to request ``httpbin.org/ip``; it is accepted
        when its own IP appears in the response body. When ``proxSrc`` runs
        dry, a fresh list page is downloaded and probing continues.

        :param proxyType: ``"http"`` or ``"https"``.
        :param proxSrc: iterator yielding ``(ip, port)`` string pairs.
        :return: the working ``(ip, port)`` item, or ``None`` (after printing
            a message) when ``proxyType`` is not recognised.
        :raises RuntimeError: if a freshly downloaded list page yields no
            candidates at all, instead of refetching in an endless loop.
        """
        if proxyType == "http":
            url = "http://httpbin.org/ip"
            refill = GetProxyServer.getHttpProxyServer
        elif proxyType == "https":
            url = "https://httpbin.org/ip"
            refill = GetProxyServer.getHttpsProxyServer
        else:
            print("proxyType参数错误!")
            return None

        banner = "###############################################################"
        times = 0
        justRefilled = False
        while True:
            print(banner)
            times += 1
            print("第", times, "个IP!")
            try:
                candidate = next(proxSrc)
            except StopIteration:
                if justRefilled:
                    # The page we just downloaded contained nothing usable;
                    # retrying immediately would only hammer the site.
                    raise RuntimeError("代理列表页面未解析出任何代理服务器!") from None
                proxSrc = refill()
                justRefilled = True
            else:
                justRefilled = False
                ip, port = candidate[0], candidate[1]
                print(proxyType.upper(), "正在尝试连接远程代理服务器:", ip, "......")
                try:
                    resp = requests.get(url=url,
                                        proxies={proxyType: ip + ":" + port},
                                        timeout=3, headers=normalHeaders)
                    respTxt = resp.text
                except Exception:
                    # Deliberately broad: free proxies fail in many different
                    # ways and any failure simply means "try the next one".
                    print(proxyType.upper(), "远程代理服务器:", ip, "无效!")
                else:
                    if ip in respTxt:
                        print("找到一个有效", proxyType.upper(), "类型代理服务器:", candidate)
                        return candidate
            print(banner, end="\n\n")

    # The method callers are ultimately expected to use.
    @staticmethod
    def getProxyRule(num):
        """Build ``num`` proxy mappings usable as ``requests`` ``proxies=``.

        :param num: number of mappings to produce.
        :return: list of ``{"http": "ip:port", "https": "ip:port"}`` dicts,
            each pairing one verified HTTP proxy with one verified HTTPS proxy.
        """
        rules = []
        httpProxy = GetProxyServer.getHttpProxyServer()
        httpsProxy = GetProxyServer.getHttpsProxyServer()
        for _ in range(num):
            tempHttp = GetProxyServer.getAvailableProxy("http", httpProxy)
            tempHttps = GetProxyServer.getAvailableProxy("https", httpsProxy)
            rules.append({
                "http": tempHttp[0] + ":" + tempHttp[1],
                "https": tempHttps[0] + ":" + tempHttps[1],
            })
        return rules
模块测试:
import requests
import random
import GetProxyServer
# Browser-like headers sent with the verification request below.
normalHeaders = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKi"
                  "t/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
}

# ``import GetProxyServer`` binds the *module*; ``getProxyRule`` is a static
# method of the class of the same name inside it, so go through the class.
# NOTE(review): assumes the class is saved as GetProxyServer.py — confirm.
prox = random.choice(GetProxyServer.GetProxyServer.getProxyRule(1))

# Fetch our apparent origin IP through the chosen proxy mapping and print it.
try:
    response = requests.get("http://httpbin.org/ip",
                            proxies=prox,
                            headers=normalHeaders, timeout=6)
    print(response.text)
except Exception:
    # Any failure here is reported the same way: the proxy did not work.
    print("连接代理服务器失败!")
输出情况:
###############################################################
第 1 个IP!
HTTP 正在尝试连接远程代理服务器: 139.224.135.94 ......
HTTP 远程代理服务器: 139.224.135.94 无效!
###############################################################
###############################################################
第 2 个IP!
HTTP 正在尝试连接远程代理服务器: 123.149.163.243 ......
HTTP 远程代理服务器: 123.149.163.243 无效!
###############################################################
###############################################################
第 3 个IP!
HTTP 正在尝试连接远程代理服务器: 61.135.217.7 ......
HTTP 远程代理服务器: 61.135.217.7 无效!
###############################################################
###############################################################
第 4 个IP!
HTTP 正在尝试连接远程代理服务器: 122.114.31.177 ......
HTTP 远程代理服务器: 122.114.31.177 无效!
###############################################################
###############################################################
第 5 个IP!
HTTP 正在尝试连接远程代理服务器: 113.86.221.244 ......
HTTP 远程代理服务器: 113.86.221.244 无效!
###############################################################
###############################################################
第 6 个IP!
HTTP 正在尝试连接远程代理服务器: 114.250.25.19 ......
HTTP 远程代理服务器: 114.250.25.19 无效!
###############################################################
###############################################################
第 7 个IP!
HTTP 正在尝试连接远程代理服务器: 60.175.213.156 ......
HTTP 远程代理服务器: 60.175.213.156 无效!
###############################################################
###############################################################
第 8 个IP!
HTTP 正在尝试连接远程代理服务器: 180.121.163.53 ......
HTTP 远程代理服务器: 180.121.163.53 无效!
###############################################################
###############################################################
第 9 个IP!
HTTP 正在尝试连接远程代理服务器: 114.113.126.86 ......
HTTP 远程代理服务器: 114.113.126.86 无效!
###############################################################
###############################################################
第 10 个IP!
HTTP 正在尝试连接远程代理服务器: 139.129.166.68 ......
HTTP 远程代理服务器: 139.129.166.68 无效!
###############################################################
###############################################################
第 11 个IP!
HTTP 正在尝试连接远程代理服务器: 111.155.116.211 ......
HTTP 远程代理服务器: 111.155.116.211 无效!
###############################################################
###############################################################
第 12 个IP!
HTTP 正在尝试连接远程代理服务器: 49.71.81.131 ......
HTTP 远程代理服务器: 49.71.81.131 无效!
###############################################################
###############################################################
第 13 个IP!
HTTP 正在尝试连接远程代理服务器: 116.22.53.147 ......
HTTP 远程代理服务器: 116.22.53.147 无效!
###############################################################
###############################################################
第 14 个IP!
HTTP 正在尝试连接远程代理服务器: 111.155.116.208 ......
HTTP 远程代理服务器: 111.155.116.208 无效!
###############################################################
###############################################################
第 15 个IP!
HTTP 正在尝试连接远程代理服务器: 182.91.135.139 ......
HTTP 远程代理服务器: 182.91.135.139 无效!
###############################################################
###############################################################
第 16 个IP!
HTTP 正在尝试连接远程代理服务器: 110.73.53.65 ......
HTTP 远程代理服务器: 110.73.53.65 无效!
###############################################################
###############################################################
第 17 个IP!
HTTP 正在尝试连接远程代理服务器: 111.155.116.217 ......
找到一个有效 HTTP 类型代理服务器: ('111.155.116.217', '8123')
###############################################################
第 1 个IP!
HTTPS 正在尝试连接远程代理服务器: 122.72.18.35 ......
HTTPS 远程代理服务器: 122.72.18.35 无效!
###############################################################
###############################################################
第 2 个IP!
HTTPS 正在尝试连接远程代理服务器: 180.173.66.20 ......
找到一个有效 HTTPS 类型代理服务器: ('180.173.66.20', '9797')
{
"origin": "111.155.116.217"
}
HTTP 代理只能代理 HTTP 类型的网站,不能代理 HTTPS 类型的网站。分享一个比较好用的国内 HTTP 类型代理网站:http://cn-proxy.com/
改进:通过 XPath 从代理网页中提取 IP 和端口:
import requests
import lxml.etree as etree
# Browser-like headers for the list-page request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 "
        "SE 2.X MetaSr 1.0"
    ),
}

# Per-row XPath expressions: text of the 2nd cell (IP) and 3rd cell (port).
ip = "td[2]/text()"
port = "td[3]/text()"

# Download the list page and parse it into an element tree.
resp = requests.get("http://www.xicidaili.com/wt/", headers=headers)
page = resp.text
pageEtree = etree.HTML(page)

# Every row of the #ip_list table except the first one
# (presumably the header row — verify against the page markup).
wanted = pageEtree.xpath('//*[@id="ip_list"]/tr[position()>1]')

# xpath() returns a list per row, so each value prints as e.g. ['1.2.3.4'].
for row in wanted:
    row_ip = row.xpath(ip)
    row_port = row.xpath(port)
    print("IP:", row_ip, "端口:", row_port)
输出:
IP: ['111.155.116.247'] 端口: ['8123']
IP: ['203.174.112.13'] 端口: ['3128']
IP: ['114.250.25.19'] 端口: ['80']
IP: ['122.114.31.177'] 端口: ['808']
IP: ['61.135.217.7'] 端口: ['80']
IP: ['139.224.135.94'] 端口: ['80']
IP: ['111.155.116.236'] 端口: ['8123']
IP: ['114.113.126.86'] 端口: ['80']
IP: ['171.38.41.110'] 端口: ['8123']
IP: ['110.90.133.85'] 端口: ['32420']
IP: ['120.37.174.49'] 端口: ['23890']
IP: ['115.210.72.4'] 端口: ['31844']
IP: ['121.31.195.60'] 端口: ['8123']
IP: ['139.129.166.68'] 端口: ['3128']
IP: ['110.189.207.86'] 端口: ['31759']
IP: ['175.11.50.117'] 端口: ['8118']
IP: ['110.73.3.148'] 端口: ['8123']
IP: ['218.74.82.123'] 端口: ['39960']
IP: ['49.82.54.12'] 端口: ['44318']
IP: ['180.120.200.83'] 端口: ['31395']
IP: ['115.46.66.241'] 端口: ['8123']
IP: ['116.213.98.6'] 端口: ['8080']
IP: ['58.216.202.149'] 端口: ['8118']
IP: ['59.56.242.89'] 端口: ['38598']
IP: ['218.20.54.179'] 端口: ['9797']
IP: ['139.198.191.105'] 端口: ['8888']
IP: ['222.182.53.254'] 端口: ['8118']
IP: ['121.31.100.21'] 端口: ['8123']
IP: ['118.254.145.30'] 端口: ['3128']
IP: ['27.46.74.23'] 端口: ['9999']
IP: ['116.3.205.22'] 端口: ['8888']
IP: ['116.231.242.248'] 端口: ['8118']
IP: ['14.211.117.224'] 端口: ['9797']
IP: ['14.153.54.72'] 端口: ['3128']
IP: ['111.155.116.208'] 端口: ['8123']
IP: ['112.228.170.47'] 端口: ['8118']
IP: ['61.178.238.122'] 端口: ['63000']
IP: ['111.155.116.226'] 端口: ['8123']
IP: ['182.90.91.127'] 端口: ['8123']
IP: ['222.183.210.106'] 端口: ['8118']
IP: ['180.122.149.141'] 端口: ['44871']
IP: ['110.73.1.65'] 端口: ['8123']
IP: ['221.7.175.182'] 端口: ['8123']
IP: ['118.254.155.64'] 端口: ['3128']
IP: ['115.215.51.136'] 端口: ['43619']
IP: ['221.229.18.14'] 端口: ['3128']
IP: ['110.73.28.249'] 端口: ['8123']
IP: ['115.221.121.158'] 端口: ['41932']
IP: ['125.109.194.143'] 端口: ['48398']
IP: ['111.155.116.211'] 端口: ['8123']
IP: ['111.155.116.224'] 端口: ['8123']
IP: ['222.71.89.180'] 端口: ['47202']
IP: ['171.11.228.255'] 端口: ['20802']
IP: ['112.124.39.77'] 端口: ['123']
IP: ['222.89.82.113'] 端口: ['22073']
IP: ['113.86.221.244'] 端口: ['808']
IP: ['123.149.163.243'] 端口: ['42989']
IP: ['60.175.213.156'] 端口: ['33188']
IP: ['180.121.163.53'] 端口: ['23441']
IP: ['49.71.81.131'] 端口: ['3128']
IP: ['116.22.53.147'] 端口: ['8118']
IP: ['182.91.135.139'] 端口: ['9999']
IP: ['110.73.53.65'] 端口: ['8123']
IP: ['111.155.116.217'] 端口: ['8123']
IP: ['111.155.116.237'] 端口: ['8123']
IP: ['171.39.40.152'] 端口: ['8123']
IP: ['111.155.116.196'] 端口: ['8123']
IP: ['115.46.78.9'] 端口: ['8123']
IP: ['121.237.137.172'] 端口: ['3128']
IP: ['27.46.36.2'] 端口: ['9797']
IP: ['111.155.116.216'] 端口: ['8123']
IP: ['110.73.32.142'] 端口: ['8123']
IP: ['202.194.14.72'] 端口: ['8118']
IP: ['115.46.65.122'] 端口: ['8123']
IP: ['110.73.52.86'] 端口: ['8123']
IP: ['112.95.89.176'] 端口: ['9999']
IP: ['182.88.177.105'] 端口: ['9797']
IP: ['110.88.31.5'] 端口: ['39451']
IP: ['111.155.116.215'] 端口: ['8123']
IP: ['111.155.116.225'] 端口: ['8123']
IP: ['210.45.125.41'] 端口: ['1080']
IP: ['111.155.116.207'] 端口: ['8123']
IP: ['124.134.31.118'] 端口: ['8118']
IP: ['222.186.45.60'] 端口: ['62222']
IP: ['121.31.103.198'] 端口: ['8123']
IP: ['110.72.35.95'] 端口: ['8123']
IP: ['180.119.65.55'] 端口: ['3128']
IP: ['110.189.207.60'] 端口: ['30392']
IP: ['27.44.197.175'] 端口: ['9797']
IP: ['112.64.76.70'] 端口: ['8118']
IP: ['61.143.19.17'] 端口: ['20772']
IP: ['121.31.145.100'] 端口: ['8123']
IP: ['183.51.122.45'] 端口: ['80']
IP: ['180.76.135.10'] 端口: ['3128']
IP: ['180.119.65.115'] 端口: ['3128']
IP: ['139.208.195.96'] 端口: ['8118']
IP: ['221.8.170.46'] 端口: ['8118']
IP: ['171.39.39.197'] 端口: ['8123']
IP: ['115.46.65.157'] 端口: ['8123']
IP: ['110.72.28.1'] 端口: ['8123']