# Required packages
import requests
from fake_useragent import UserAgent ###随机获取ua
import urllib3
import random ##随机
from requests.adapters import HTTPAdapter ### 重试
# Plain GET request via requests
def sendGetRequest(url, headers=None):
    """Send a plain GET request and return the Response, or None on error.

    :param url: target URL
    :param headers: optional dict of request headers; when omitted a random
        user-agent header is generated (the original referenced
        ``self.headers``, which does not exist in this function)
    :return: requests.Response on success, None when the request raised
    """
    s = requests.Session()
    # Silence the InsecureRequestWarning triggered by verify=False below.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    if headers is None:
        headers = {'user-agent': UserAgent(verify_ssl=False).random}
    try:
        data = s.get(url, headers=headers, verify=False)
    except Exception as e:
        print(e)
        return None
    return data
# GET request carrying a Cookie header
def sendGetByCookie(url, cookie):
    """GET *url* with a random user-agent and the supplied Cookie header.

    :param url: target URL
    :param cookie: cookie string sent as the ``Cookie`` header
    :return: requests.Response on success, None when the request raised
    """
    session = requests.Session()
    # verify=False below would otherwise emit an InsecureRequestWarning
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    request_headers = {
        'user-agent': UserAgent(verify_ssl=False).random,
        'Cookie': cookie,
    }
    try:
        response = session.get(url, headers=request_headers, verify=False)
    except Exception as err:
        print(err)
        return None
    return response
# GET request issued directly through urllib3
def geturl(url, cookie):
    """GET *url* through a urllib3 PoolManager and return the decoded body.

    :param url: target URL
    :param cookie: cookie string sent as the ``Cookie`` header, or None
        (the original ignored this parameter and referenced a non-existent
        ``self.headers``)
    :return: response body decoded with the default codec (utf-8)
    """
    http = urllib3.PoolManager()
    requests.packages.urllib3.disable_warnings()
    headers = {'user-agent': UserAgent(verify_ssl=False).random}
    if cookie is not None:
        headers['Cookie'] = cookie
    r = http.request('GET',
                     url,
                     headers=headers
                     )
    return r.data.decode()
# POST request with a JSON body (header said "get", but the code POSTs)
def postJson(self, url, cookie, data, proxy):
    """POST *data* as a JSON body, retrying via ``self.cycle`` on failure.

    :param url: target URL
    :param cookie: cookie string sent as the ``Cookie`` header
    :param data: object serialised into the JSON request body
    :param proxy: proxies mapping passed straight through to requests
    :return: decoded response text, or None when every attempt failed
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3',
        'Cookie': cookie,
        'Content-Type': 'application/json'
    }
    r = requests.session()
    requests.packages.urllib3.disable_warnings()
    try:
        res = r.post(url, json=data, headers=headers, proxies=proxy, verify=False)
    except Exception as e:
        print(url, e)
        # cycle retries with fresh proxies but may give up and return None
        res = self.cycle(r, headers, url)
    # guard: cycle() can return None, which has no status_code
    if res is None:
        return None
    if res.status_code != 200:
        print('访问状态码', res.status_code)
    res.encoding = 'utf8'
    return res.text
# GET request routed through a proxy, with retries
def sendGetByProxy(url, proxies, cookie, paramDict):
    """GET *url*, optionally through a proxy, with transport-level retries.

    :param url: target URL
    :param proxies: e.g. {'http': 'http://localhost:8888',
                          'https': 'http://localhost:8888'}, or None/empty
    :param cookie: cookie string for the ``Cookie`` header, or None
    :param paramDict: extra header fields merged into the request headers
    :return: requests.Response, or None when every attempt failed
    """
    s = requests.Session()
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    headers = {
        'user-agent': UserAgent(verify_ssl=False).random,
        'Connection': 'close'  # release keep-alive connections promptly
    }
    if cookie is not None:
        headers['Cookie'] = cookie
    if paramDict:
        headers.update(paramDict)
    # transparent retries at the transport level
    s.mount('http://', HTTPAdapter(max_retries=3))
    s.mount('https://', HTTPAdapter(max_retries=3))
    if proxies:
        try:
            # timeout bounds the wait; allow_redirects=False keeps raw 3xx
            data = s.get(url, headers=headers, proxies=proxies,
                         verify=False, allow_redirects=False, timeout=10)
            s.keep_alive = False  # drop idle connections
        except Exception as e:
            print('访问:', url, e)
            # was self.cycle(...), but there is no self in this function;
            # use the module-level helper that rotates to a fresh proxy
            data = cycle(s, headers, url)
        return data
    try:
        data = s.get(url, headers=headers, verify=False)
        s.keep_alive = False
    except Exception as e:
        print(e)
        return None
    return data
# When the proxy IP fails, fetch a new IP automatically and retry
def cycle(s, headers, url, get_proxy=None):
    """Retry a GET up to 5 times, optionally rotating proxies each attempt.

    Fixes two defects in the original: ``times`` was never incremented, so a
    persistently failing URL looped forever; and ``self.getProxy()`` was
    referenced although this function has no ``self``.

    :param s: requests.Session (any object exposing a compatible ``get``)
    :param headers: headers dict used for every attempt
    :param url: target URL
    :param get_proxy: optional zero-argument callable returning a fresh
        proxies mapping per attempt; when None no proxy is used
    :return: the last Response obtained, or None if every attempt raised
    """
    times = 0
    success = False
    data = None
    while times < 5 and not success:
        times += 1  # bound the loop — the original never incremented this
        try:
            # timeout bounds each attempt; allow_redirects=False keeps raw 3xx
            data = s.get(url,
                         headers=headers,
                         proxies=get_proxy() if get_proxy else None,
                         verify=False,
                         allow_redirects=False,
                         timeout=10)
            s.keep_alive = False  # drop idle connections
            success = True
            if data.status_code != 200:
                print(url, data.status_code)
                success = False  # non-200 counts as a failure; retry
        except Exception as e:
            print('访问:', e)
    return data
# Pick a random proxy IP from the pool
def getProxy(proxie):
    """Pick one random proxy from the pool and build a proxies mapping.

    :param proxie: mapping of ip -> port; keys and values may be str or
        bytes (e.g. raw values coming back from redis)
    :return: {'http': 'http://ip:port', 'https': 'https://ip:port'}
    """
    proxies = {}
    # random.sample() no longer accepts dict views (TypeError on 3.11+),
    # so draw from a concrete list instead
    ip = random.choice(list(proxie.keys()))
    port = proxie[ip]
    # decode only when necessary — the original crashed on str inputs
    if isinstance(ip, bytes):
        ip = ip.decode('unicode_escape')
    if isinstance(port, bytes):
        port = port.decode('unicode_escape')
    ip_port = ip + ':' + port
    print('获取到的代理为', ip_port)
    proxies['http'] = 'http://' + ip_port
    proxies['https'] = 'https://' + ip_port
    return proxies
# Open a local file with (headed) Chrome via selenium
def seleniumUtil(chromePath, filePath):
    """Open a local file in (headed) Chrome and return its rendered HTML.

    :param chromePath: path to the chromedriver executable
    :param filePath: file path relative to the current working directory
    :return: page source after a 5 second render wait
    """
    driver = webdriver.Chrome(chromePath)
    try:
        # build a file:// URL from the cwd; a plain http(s) URL also works
        file_Path = "file://" + os.getcwd() + filePath
        driver.get(file_Path)
        sleep(5)  # crude wait for the page to finish rendering
        data = driver.page_source
    finally:
        # quit even when rendering fails, so the browser is not leaked
        driver.quit()
    return data
# Headless Chrome variant
def ChromeOptionsUtil(chromePath, filePath):
    """Render a local file in headless Chrome and return its HTML.

    :param chromePath: path to the chromedriver executable
    :param filePath: file path relative to the current working directory
    :return: page source after a 2 second render wait
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # run Chrome without a window
    file_Path = "file://" + os.getcwd() + filePath
    # first argument: Chrome options; second: chromedriver path
    driver = webdriver.Chrome(chrome_options=options, executable_path=chromePath)
    try:
        driver.get(file_Path)
        sleep(2)  # crude wait for the page to finish rendering
        data = driver.page_source
    finally:
        # ensure the headless browser process is always torn down
        # (the original called close() then quit(); quit() alone suffices)
        driver.quit()
    return data
# Fetch cookies via selenium and format them into a usable cookie string
def test():
    """Log in via selenium, collect cookies, store them in redis, return them.

    :return: cookie string of the form "name=value; name=value; ..."
    """
    driver = webdriver.Chrome('D:/chromedriver')
    try:
        driver.get("https://*****")
        sleep(3)
        driver.find_element(By.CSS_SELECTOR, ' li:nth-child(1) > div > div.live-card-following-info > p.live-card-following-info-user > a').click()
        # list of {'name': ..., 'value': ...} dicts
        cookie = driver.get_cookies()
        sleep(5)
    finally:
        # quit even if the page interaction fails, so Chrome is not leaked
        driver.quit()
    # flatten the cookie dicts into a single header-style string
    cookiestr = '; '.join(i["name"] + "=" + i["value"] for i in cookie)
    ru = redis_Util()
    res = ru.redis_py('127.0.0.1', '6379', '', 1)
    res.hset('cookies', cookiestr, '')
    return cookiestr
# Garbled page encoding: set the encoding on the response before reading it
#     res.encoding = 'gbk'
#     print(res.text)