在一个国内免费代理网站筛选有效代理

免费代理:免费代理具有地域性和时效性,所以要频繁更新。

import random
import requests
import re

# Default request headers: a desktop-browser User-Agent sent with every request
# made through this module.
normalHeaders = {
    "User-Agent": " ".join((
        "Mozilla/5.0",
        "(Windows NT 10.0; WOW64)",
        "AppleWebKit/537.36",
        "(KHTML, like Gecko)",
        "Chrome/58.0.3029.110",
        "Safari/537.36",
        "SE 2.X",
        "MetaSr 1.0",
    )),
}


class GetProxyServer:
    """Scrape candidate proxies from listing pages and probe them for usability.

    Listing pages are downloaded with ``requests``; IP/port pairs are pulled
    out of the HTML with regular expressions, and each candidate is probed by
    requesting httpbin.org/ip through it.
    """

    # Base URLs of the listing pages; the page number is appended to them.
    defaultHttpsURL = "http://www.xicidaili.com/wn/"
    defaultHttpURL = "http://www.xicidaili.com/wt/"
    # Matches an "<td>IP</td> <td>PORT</td>" pair of table cells.
    ipAndPortRegex = r"<td>(([1]?\d?\d|[2][0-4]\d|[2][5][0-5])\.){3}" \
                     r"([1]?\d?\d|[2][0-4]\d|[2][5][0-5])</td>\s*<td>\d+</td>"
    # Matches a dotted-quad IPv4 address on word boundaries.
    ipRegex = r"\b(([1]?\d?\d|[2][0-4]\d|[2][5][0-5])\.){3}([1]?\d?\d|[2][0-4]\d|[2][5][0-5])\b"
    # Matches a run of digits on word boundaries.
    portRegex = r"\b\d+\b"

    ipAndPortPatter = re.compile(ipAndPortRegex)
    ipPatter = re.compile(ipRegex)
    portPatter = re.compile(portRegex)

    # Seconds to wait for a listing page, so an unresponsive site cannot hang
    # the caller indefinitely.
    pageTimeout = 10

    def getHttpsSrcPage(self, httpsURL=None, startIndex=1):
        """Yield the HTML text of successive HTTPS-proxy listing pages.

        Args:
            httpsURL: Optional base URL; when truthy it replaces
                ``defaultHttpsURL`` on this instance.
            startIndex: Page number of the first page to download.

        Yields:
            The response body of each page, one page per ``next()`` call.
        """
        if httpsURL:
            self.defaultHttpsURL = httpsURL
        # The counter lives outside the loop so that every next() advances to
        # the following page instead of downloading the first page again.
        index = startIndex
        while True:
            resp = requests.get(self.defaultHttpsURL + str(index),
                                headers=normalHeaders, timeout=self.pageTimeout)
            yield resp.text
            index += 1

    def getHttpSrcPage(self, httpURL=None, startIndex=1):
        """Yield the HTML text of successive HTTP-proxy listing pages.

        Args:
            httpURL: Optional base URL; when truthy it replaces
                ``defaultHttpURL`` on this instance.
            startIndex: Page number of the first page to download.

        Yields:
            The response body of each page, one page per ``next()`` call.
        """
        if httpURL:
            self.defaultHttpURL = httpURL
        # See getHttpsSrcPage: the counter must survive across iterations.
        index = startIndex
        while True:
            resp = requests.get(self.defaultHttpURL + str(index),
                                headers=normalHeaders, timeout=self.pageTimeout)
            yield resp.text
            index += 1

    def getAllIpAndPortSpan(self, strSrc):
        """Return an iterator of regex matches, one per IP/port cell pair in ``strSrc``."""
        return self.ipAndPortPatter.finditer(strSrc)

    def getIpAndPortTuple(self, srcStr):
        """Split one matched "<td>IP</td> <td>PORT</td>" fragment into ``(ip, port)``.

        Both values are returned as strings. ``srcStr`` is expected to contain
        an IP address followed by a port number; otherwise ``AttributeError``
        is raised because the underlying search returns ``None``.
        """
        ipMatch = self.ipPatter.search(srcStr)
        ip = ipMatch.group()
        # Look for the port only after the IP so that an octet of the address
        # is never mistaken for the port.
        remainder = srcStr[ipMatch.end():]
        port = self.portPatter.search(remainder).group()
        return ip, port

    def getProxyServer(self, srcStrPage):
        """Yield an ``(ip, port)`` tuple for every proxy found in the page HTML."""
        for match in self.getAllIpAndPortSpan(srcStrPage):
            yield self.getIpAndPortTuple(match.group())

    @staticmethod
    def getHttpProxyServer(page=1):
        """Download one HTTP-proxy listing page and return an iterator over its proxies.

        Args:
            page: Page number of the listing page to download.

        Returns:
            A generator of ``(ip, port)`` tuples; it ends when the page has no
            more entries.
        """
        gps = GetProxyServer()
        pages = gps.getHttpSrcPage(startIndex=page)
        return gps.getProxyServer(next(pages))

    @staticmethod
    def getHttpsProxyServer(page=1):
        """Download one HTTPS-proxy listing page and return an iterator over its proxies.

        Args:
            page: Page number of the listing page to download.

        Returns:
            A generator of ``(ip, port)`` tuples; it ends when the page has no
            more entries.
        """
        gps = GetProxyServer()
        pages = gps.getHttpsSrcPage(startIndex=page)
        return gps.getProxyServer(next(pages))

    @staticmethod
    def getAvailableProxy(proxyType, proxSrc):
        """Probe candidates from ``proxSrc`` until one works.

        A candidate counts as working when a request to httpbin.org/ip sent
        through it succeeds and the response text contains the proxy's IP.

        Args:
            proxyType: ``"http"`` or ``"https"``.
            proxSrc: Iterator of ``(ip, port)`` tuples. It is assumed to come
                from the first listing page; when it runs out, the following
                pages are downloaded one after another.

        Returns:
            The first working ``(ip, port)`` tuple, or ``None`` when
            ``proxyType`` is invalid or a newly downloaded listing page
            contains no candidates at all.
        """
        if proxyType == "http":
            url = "http://httpbin.org/ip"
            refill = GetProxyServer.getHttpProxyServer
        elif proxyType == "https":
            url = "https://httpbin.org/ip"
            refill = GetProxyServer.getHttpsProxyServer
        else:
            print("proxyType参数错误!")
            return None

        separator = "###############################################################"
        times = 0
        pageIndex = 1
        justRefilled = False
        while True:
            # Fetch the next candidate outside the probing try-block so that
            # running out of candidates is not confused with a failed probe.
            try:
                candidate = next(proxSrc)
            except StopIteration:
                if justRefilled:
                    # A freshly downloaded page without a single candidate
                    # means there is nothing left to try; stop instead of
                    # requesting pages forever.
                    print(proxyType.upper(), "代理列表中没有更多候选,停止搜索!")
                    return None
                pageIndex += 1
                proxSrc = refill(pageIndex)
                justRefilled = True
                continue
            justRefilled = False

            times += 1
            print(separator)
            print("第", times, "个IP!")
            print(proxyType.upper(), "正在尝试连接远程代理服务器:", candidate[0], "......")
            try:
                resp = requests.get(url=url,
                                    proxies={proxyType: candidate[0] + ":" + candidate[1]},
                                    timeout=3, headers=normalHeaders)
                working = candidate[0] in resp.text
            except Exception:
                # Best effort: any failure while probing just disqualifies
                # this candidate.
                working = False
            if working:
                print("找到一个有效", proxyType.upper(), "类型代理服务器:", candidate)
                return candidate
            print(proxyType.upper(), "远程代理服务器:", candidate[0], "无效!")
            print(separator, end="\n\n")

    # The method callers are ultimately expected to use.
    @staticmethod
    def getProxyRule(num):
        """Collect up to ``num`` proxy mappings usable as ``requests`` ``proxies``.

        Args:
            num: Number of mappings requested.

        Returns:
            A list of ``{"http": "ip:port", "https": "ip:port"}`` dicts. The
            list is shorter than ``num`` when the search for a working proxy
            gives up before enough were found.
        """
        rules = []
        httpProxy = GetProxyServer.getHttpProxyServer()
        httpsProxy = GetProxyServer.getHttpsProxyServer()
        for _ in range(num):
            tempHttp = GetProxyServer.getAvailableProxy("http", httpProxy)
            tempHttps = GetProxyServer.getAvailableProxy("https", httpsProxy)
            if tempHttp is None or tempHttps is None:
                # No further working proxy could be found; return what we have.
                break
            rules.append({
                "http": tempHttp[0] + ":" + tempHttp[1],
                "https": tempHttps[0] + ":" + tempHttps[1],
            })
        return rules

模块测试:

import requests
import random
import GetProxyServer

# Browser-like User-Agent sent with the verification request.
normalHeaders = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKi"
                  "t/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
}

# `import GetProxyServer` binds the module, so the class of the same name has
# to be reached through it before its static method can be called.
rules = GetProxyServer.GetProxyServer.getProxyRule(1)

if not rules:
    # random.choice() raises IndexError on an empty sequence.
    print("未找到可用的代理服务器!")
else:
    prox = random.choice(rules)
    try:
        # Request our apparent origin IP through the selected proxy.
        response = requests.get("http://httpbin.org/ip",
                                proxies=prox,
                                headers=normalHeaders, timeout=6)
        print(response.text)
    except Exception:
        print("连接代理服务器失败!")

输出情况:

###############################################################
第 1 个IP!
HTTP 正在尝试连接远程代理服务器: 139.224.135.94 ......
HTTP 远程代理服务器: 139.224.135.94 无效!
###############################################################

###############################################################
第 2 个IP!
HTTP 正在尝试连接远程代理服务器: 123.149.163.243 ......
HTTP 远程代理服务器: 123.149.163.243 无效!
###############################################################

###############################################################
第 3 个IP!
HTTP 正在尝试连接远程代理服务器: 61.135.217.7 ......
HTTP 远程代理服务器: 61.135.217.7 无效!
###############################################################

###############################################################
第 4 个IP!
HTTP 正在尝试连接远程代理服务器: 122.114.31.177 ......
HTTP 远程代理服务器: 122.114.31.177 无效!
###############################################################

###############################################################
第 5 个IP!
HTTP 正在尝试连接远程代理服务器: 113.86.221.244 ......
HTTP 远程代理服务器: 113.86.221.244 无效!
###############################################################

###############################################################
第 6 个IP!
HTTP 正在尝试连接远程代理服务器: 114.250.25.19 ......
HTTP 远程代理服务器: 114.250.25.19 无效!
###############################################################

###############################################################
第 7 个IP!
HTTP 正在尝试连接远程代理服务器: 60.175.213.156 ......
HTTP 远程代理服务器: 60.175.213.156 无效!
###############################################################

###############################################################
第 8 个IP!
HTTP 正在尝试连接远程代理服务器: 180.121.163.53 ......
HTTP 远程代理服务器: 180.121.163.53 无效!
###############################################################

###############################################################
第 9 个IP!
HTTP 正在尝试连接远程代理服务器: 114.113.126.86 ......
HTTP 远程代理服务器: 114.113.126.86 无效!
###############################################################

###############################################################
第 10 个IP!
HTTP 正在尝试连接远程代理服务器: 139.129.166.68 ......
HTTP 远程代理服务器: 139.129.166.68 无效!
###############################################################

###############################################################
第 11 个IP!
HTTP 正在尝试连接远程代理服务器: 111.155.116.211 ......
HTTP 远程代理服务器: 111.155.116.211 无效!
###############################################################

###############################################################
第 12 个IP!
HTTP 正在尝试连接远程代理服务器: 49.71.81.131 ......
HTTP 远程代理服务器: 49.71.81.131 无效!
###############################################################

###############################################################
第 13 个IP!
HTTP 正在尝试连接远程代理服务器: 116.22.53.147 ......
HTTP 远程代理服务器: 116.22.53.147 无效!
###############################################################

###############################################################
第 14 个IP!
HTTP 正在尝试连接远程代理服务器: 111.155.116.208 ......
HTTP 远程代理服务器: 111.155.116.208 无效!
###############################################################

###############################################################
第 15 个IP!
HTTP 正在尝试连接远程代理服务器: 182.91.135.139 ......
HTTP 远程代理服务器: 182.91.135.139 无效!
###############################################################

###############################################################
第 16 个IP!
HTTP 正在尝试连接远程代理服务器: 110.73.53.65 ......
HTTP 远程代理服务器: 110.73.53.65 无效!
###############################################################

###############################################################
第 17 个IP!
HTTP 正在尝试连接远程代理服务器: 111.155.116.217 ......
找到一个有效 HTTP 类型代理服务器: ('111.155.116.217', '8123')
###############################################################
第 1 个IP!
HTTPS 正在尝试连接远程代理服务器: 122.72.18.35 ......
HTTPS 远程代理服务器: 122.72.18.35 无效!
###############################################################

###############################################################
第 2 个IP!
HTTPS 正在尝试连接远程代理服务器: 180.173.66.20 ......
找到一个有效 HTTPS 类型代理服务器: ('180.173.66.20', '9797')
{
  "origin": "111.155.116.217"
}

HTTP 代理只能代理 HTTP 类型的网站,不能代理 HTTPS 类型的网站。分享一个比较好用的国内 HTTP 类型代理网站:http://cn-proxy.com/

使用 XPath 改进从代理网页中提取 IP 和端口的方式:

import requests
import lxml.etree as etree

# Browser-like User-Agent sent with the listing-page request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKi"
                  "t/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0"
}

# Download the listing page and parse its HTML into an element tree.
listingHtml = requests.get("http://www.xicidaili.com/wt/", headers=headers).text
listingTree = etree.HTML(listingHtml)

# Relative XPath expressions evaluated against each table row: the text of the
# second and third cells, printed below as IP and port.
ipXPath = "td[2]/text()"
portXPath = "td[3]/text()"

# Every row of the element with id "ip_list" except the first one.
for row in listingTree.xpath('//*[@id="ip_list"]/tr[position()>1]'):
    # xpath() returns a list of text nodes, which is printed as-is.
    print("IP:", row.xpath(ipXPath), "端口:", row.xpath(portXPath))

输出:

IP: ['111.155.116.247'] 端口: ['8123']
IP: ['203.174.112.13'] 端口: ['3128']
IP: ['114.250.25.19'] 端口: ['80']
IP: ['122.114.31.177'] 端口: ['808']
IP: ['61.135.217.7'] 端口: ['80']
IP: ['139.224.135.94'] 端口: ['80']
IP: ['111.155.116.236'] 端口: ['8123']
IP: ['114.113.126.86'] 端口: ['80']
IP: ['171.38.41.110'] 端口: ['8123']
IP: ['110.90.133.85'] 端口: ['32420']
IP: ['120.37.174.49'] 端口: ['23890']
IP: ['115.210.72.4'] 端口: ['31844']
IP: ['121.31.195.60'] 端口: ['8123']
IP: ['139.129.166.68'] 端口: ['3128']
IP: ['110.189.207.86'] 端口: ['31759']
IP: ['175.11.50.117'] 端口: ['8118']
IP: ['110.73.3.148'] 端口: ['8123']
IP: ['218.74.82.123'] 端口: ['39960']
IP: ['49.82.54.12'] 端口: ['44318']
IP: ['180.120.200.83'] 端口: ['31395']
IP: ['115.46.66.241'] 端口: ['8123']
IP: ['116.213.98.6'] 端口: ['8080']
IP: ['58.216.202.149'] 端口: ['8118']
IP: ['59.56.242.89'] 端口: ['38598']
IP: ['218.20.54.179'] 端口: ['9797']
IP: ['139.198.191.105'] 端口: ['8888']
IP: ['222.182.53.254'] 端口: ['8118']
IP: ['121.31.100.21'] 端口: ['8123']
IP: ['118.254.145.30'] 端口: ['3128']
IP: ['27.46.74.23'] 端口: ['9999']
IP: ['116.3.205.22'] 端口: ['8888']
IP: ['116.231.242.248'] 端口: ['8118']
IP: ['14.211.117.224'] 端口: ['9797']
IP: ['14.153.54.72'] 端口: ['3128']
IP: ['111.155.116.208'] 端口: ['8123']
IP: ['112.228.170.47'] 端口: ['8118']
IP: ['61.178.238.122'] 端口: ['63000']
IP: ['111.155.116.226'] 端口: ['8123']
IP: ['182.90.91.127'] 端口: ['8123']
IP: ['222.183.210.106'] 端口: ['8118']
IP: ['180.122.149.141'] 端口: ['44871']
IP: ['110.73.1.65'] 端口: ['8123']
IP: ['221.7.175.182'] 端口: ['8123']
IP: ['118.254.155.64'] 端口: ['3128']
IP: ['115.215.51.136'] 端口: ['43619']
IP: ['221.229.18.14'] 端口: ['3128']
IP: ['110.73.28.249'] 端口: ['8123']
IP: ['115.221.121.158'] 端口: ['41932']
IP: ['125.109.194.143'] 端口: ['48398']
IP: ['111.155.116.211'] 端口: ['8123']
IP: ['111.155.116.224'] 端口: ['8123']
IP: ['222.71.89.180'] 端口: ['47202']
IP: ['171.11.228.255'] 端口: ['20802']
IP: ['112.124.39.77'] 端口: ['123']
IP: ['222.89.82.113'] 端口: ['22073']
IP: ['113.86.221.244'] 端口: ['808']
IP: ['123.149.163.243'] 端口: ['42989']
IP: ['60.175.213.156'] 端口: ['33188']
IP: ['180.121.163.53'] 端口: ['23441']
IP: ['49.71.81.131'] 端口: ['3128']
IP: ['116.22.53.147'] 端口: ['8118']
IP: ['182.91.135.139'] 端口: ['9999']
IP: ['110.73.53.65'] 端口: ['8123']
IP: ['111.155.116.217'] 端口: ['8123']
IP: ['111.155.116.237'] 端口: ['8123']
IP: ['171.39.40.152'] 端口: ['8123']
IP: ['111.155.116.196'] 端口: ['8123']
IP: ['115.46.78.9'] 端口: ['8123']
IP: ['121.237.137.172'] 端口: ['3128']
IP: ['27.46.36.2'] 端口: ['9797']
IP: ['111.155.116.216'] 端口: ['8123']
IP: ['110.73.32.142'] 端口: ['8123']
IP: ['202.194.14.72'] 端口: ['8118']
IP: ['115.46.65.122'] 端口: ['8123']
IP: ['110.73.52.86'] 端口: ['8123']
IP: ['112.95.89.176'] 端口: ['9999']
IP: ['182.88.177.105'] 端口: ['9797']
IP: ['110.88.31.5'] 端口: ['39451']
IP: ['111.155.116.215'] 端口: ['8123']
IP: ['111.155.116.225'] 端口: ['8123']
IP: ['210.45.125.41'] 端口: ['1080']
IP: ['111.155.116.207'] 端口: ['8123']
IP: ['124.134.31.118'] 端口: ['8118']
IP: ['222.186.45.60'] 端口: ['62222']
IP: ['121.31.103.198'] 端口: ['8123']
IP: ['110.72.35.95'] 端口: ['8123']
IP: ['180.119.65.55'] 端口: ['3128']
IP: ['110.189.207.60'] 端口: ['30392']
IP: ['27.44.197.175'] 端口: ['9797']
IP: ['112.64.76.70'] 端口: ['8118']
IP: ['61.143.19.17'] 端口: ['20772']
IP: ['121.31.145.100'] 端口: ['8123']
IP: ['183.51.122.45'] 端口: ['80']
IP: ['180.76.135.10'] 端口: ['3128']
IP: ['180.119.65.115'] 端口: ['3128']
IP: ['139.208.195.96'] 端口: ['8118']
IP: ['221.8.170.46'] 端口: ['8118']
IP: ['171.39.39.197'] 端口: ['8123']
IP: ['115.46.65.157'] 端口: ['8123']
IP: ['110.72.28.1'] 端口: ['8123']

猜你喜欢

转载自blog.csdn.net/marvel__dead/article/details/79375427