Too few people read what I write... so I threw together a quick crawler, adapted from my earlier proxy scraper, to fake a few extra page views. Please don't flame me; I'm a bit embarrassed to use it myself. If anyone wants to use it, just swap in your own blog links.
import urllib.request
import random
import zlib
from bs4 import BeautifulSoup
import sqlite3  # unused in this script
import os       # unused in this script
import requests
import time
# Headers that make the request look like it comes from a browser; the Host
# entry matches kuaidaili.com, where the free proxy list is fetched from.
header = {
    "Host": "www.kuaidaili.com",
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}
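# The overall flow: KuaiDaili.get_html pulls one page of the free proxy list,
# ip_list parses out the IP / port / type of every entry, and ScratchIp then
# routes a visit to the blog through each scraped proxy.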
class KuaiDaili(object):

    def get_html(self, page):
        # kuaidaili has two free proxy listings; alternate between them by page parity.
        TARGET_URL1 = "http://www.kuaidaili.com/free/inha/%s" % page
        TARGET_URL2 = "http://www.kuaidaili.com/free/intr/%s" % page
        if page % 2 == 1:
            html = requests.get(url=TARGET_URL1, headers=header, timeout=30).content
        else:
            html = requests.get(url=TARGET_URL2, headers=header, timeout=30).content
        return html.decode("utf-8")
    def ip_list(self, html):
        soup = BeautifulSoup(html, 'lxml')  # parse with the faster lxml parser
        #time.sleep(random.randint(3,6))
        # All <tr> rows of the table inside <div id="list">.
        list_tr = soup.find('div', id='list').find_all('tr')
        # The first <tr> is the table header, not data, so start from the second row.
        for tr in list_tr[1:]:
            # The <td> cells are, in order: IP, port, anonymity, type, ...
            list_td = tr.find_all('td')
            ip = list_td[0].get_text()
            port = list_td[1].get_text()
            anonymous = list_td[2].get_text()
            LthLog(anonymous)
            types = list_td[3].get_text()
            #location = list_td[4].get_text()
            #speed = list_td[5].get_text()
            #verify_time = list_td[6].get_text()
            # In the original scraper this step saved each proxy to an object;
            # here each scraped proxy goes straight to ScratchIp, which visits
            # the blog through it.
            ScratchIp(ip, port, types)
    def batch_insert(self, page=200):  # name left over from the scraper; walks the proxy-list pages
        for i in range(1, page):
            resp = self.get_html(i)
            self.ip_list(resp)
def ReadHtml(html, response):
    # Decompress the body manually if the server sent it gzip- or deflate-encoded.
    encoding = response.info().get('Content-Encoding')
    if encoding == 'gzip':
        # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
        html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
    elif encoding == 'deflate':
        try:
            # -MAX_WBITS means a raw deflate stream with no zlib header.
            html = zlib.decompress(html, -zlib.MAX_WBITS)
        except zlib.error:
            # Some servers send a zlib-wrapped stream despite advertising "deflate".
            html = zlib.decompress(html)
    return html
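# Note: this manual decompression is only needed because the blog pages are
# fetched with urllib, which returns the raw body. requests (used above for
# the proxy list) transparently decompresses gzip/deflate responses itself.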
def LthLog(logfile):
    print(logfile)
def ScratchIp(ip, port, types):
    time.sleep(random.randint(3, 10))
    # Build the proxy mapping directly instead of eval-ing a hand-assembled string.
    proxy = {types.lower(): ip + ":" + port}
    httpproxy_handler = urllib.request.ProxyHandler(proxy)
    LthLog(proxy)
    opener = urllib.request.build_opener(httpproxy_handler)
    try:
        # Pick one of the two article-list pages at random.
        if random.randint(2, 10) % 2:
            response = opener.open("https://blog.csdn.net/liutianheng654/article/list/1", timeout=2)
        else:
            response = opener.open("https://blog.csdn.net/liutianheng654/article/list/2", timeout=2)
    except Exception:
        # Most free proxies are dead or too slow; just skip this one.
        print("error")
        return
    html = response.read()
    html = ReadHtml(html, response)
    htmls = bytes.decode(html)
    soup = BeautifulSoup(htmls, 'lxml')
    # Each article on the list page sits in one of these divs.
    nameTags = soup.findAll('div', {"class": "article-item-box csdn-tracking-statistics"})
    # Visit up to five random articles through the same proxy.
    temp = random.sample(nameTags, min(5, len(nameTags)))
    for i in temp:
        print(i.a['href'])
        time.sleep(random.randint(3, 20))
        if i.a['href'].find('liutianheng') >= 0:
            try:
                # Open the article through the proxy opener, not plain urlopen,
                # so the visit actually comes from the proxy's IP.
                response = opener.open(i.a['href'], timeout=5)
            except Exception:
                print("error")
        else:
            print('nothing')
if __name__ == '__main__':
    daili = KuaiDaili()
    daili.batch_insert()
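One obvious improvement: most free proxies are dead, so almost every ScratchIp call just times out and prints "error". If you want to filter those out earlier, something like the sketch below could be called from ip_list before ScratchIp. This is a minimal sketch, not part of the script above: proxy_alive is a made-up helper name, and httpbin.org/ip is just one convenient echo endpoint, assuming it is reachable from your network.

import requests

def proxy_alive(ip, port, types, timeout=3):
    # Hypothetical helper: True if the proxy answers and forwards traffic.
    proxies = {types.lower(): ip + ":" + port}
    try:
        # httpbin.org/ip echoes the caller's IP, so a 200 response means the
        # proxy both accepted the connection and relayed the request.
        r = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False

In ip_list you would then guard the call: if proxy_alive(ip, port, types): ScratchIp(ip, port, types).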