from random import randint
import time
import requests
from requests.exceptions import RequestException
from requests.cookies import RequestsCookieJar
from selenium.webdriver import Chrome, ChromeOptions
import json
import csv
class Common:
    def wait(self):
        # Sleep for a random 0.7-2.0 s interval: firing requests too often gets the
        # IP banned, and a fixed interval is just as easy to fingerprint as a crawler.
        wait_time = randint(7, 20) * 0.1
        time.sleep(wait_time)
    def requests_get_response(self, url: str, headers: dict, params: dict = None, proxies=None):
        # Return the response on success; log the error and return None on failure.
        try:
            response = requests.get(url, headers=headers, params=params, proxies=proxies)
        except RequestException as e:
            print(f'ERROR: url: {url}, params: {params} {e}')
            return None
        return response
    def selenium_cookiejar(self, driver: Chrome, url: str):
        # Open the site with Selenium and harvest its cookies (some sites require
        # cookies for a correct response even without logging in).
        driver.get(url)
        input()  # pause until Enter is pressed, so any manual step (e.g. a captcha) can be completed first
        cookies = driver.get_cookies()
        cookiejar = RequestsCookieJar()
        for cookie in cookies:
            cookiejar.set(cookie['name'], cookie['value'])
        return cookiejar
    def selenium_options(self):
        # Basic Selenium stealth: hide the "controlled by automated software" infobar,
        # disable the automation extension, and blank out navigator.webdriver.
        option = ChromeOptions()
        option.add_experimental_option(name='excludeSwitches', value=['enable-automation'])
        option.add_experimental_option('useAutomationExtension', False)
        driver = Chrome(options=option)
        driver.execute_cdp_cmd(cmd='Page.addScriptToEvaluateOnNewDocument', cmd_args={
            'source': 'Object.defineProperty(navigator, "webdriver", {get:()=>undefined})'})
        return driver
    def save_json(self, filename: str, content, mode: str = 'a'):
        # Save `content` to a JSON file.
        with open(filename, mode=mode, encoding='utf-8') as file:
            json.dump(content, file, ensure_ascii=False, indent=4)
    def save_csv_header(self, filename: str, fieldnames: list, mode: str = 'a'):
        # Write the CSV header row (the title row when opened in Excel);
        # newline='' stops the csv module from emitting blank rows on Windows.
        with open(filename, mode=mode, encoding='utf-8', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
    def save_csv_content(self, filename: str, fieldnames: list, content: list[dict], mode: str = 'a'):
        # Append the rows in `content` to a CSV file.
        with open(filename, mode=mode, encoding='utf-8', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writerows(content)
    def print_get_info(self, headers: dict = None, params: dict = None, proxies: dict = None):
        # Print what the server sees for a GET request (useful for verifying
        # headers, params and proxies against httpbin's echo endpoint).
        url = 'https://www.httpbin.org/get'
        response = requests.get(url, headers=headers, params=params, proxies=proxies)
        print(response.text)
class Parameter:
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }  # a desktop Chrome UA (note: this string identifies as Linux, not Windows)
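
For context, here is a minimal usage sketch of how the two classes fit together; the target URL, field names and row data are placeholders, not part of the original class:

common = Common()
driver = common.selenium_options()  # stealth-configured Chrome
cookiejar = common.selenium_cookiejar(driver, 'https://example.com')  # press Enter in the console once the page is ready
driver.quit()

# The harvested cookie jar plugs straight into requests:
response = requests.get('https://example.com/list', headers=Parameter.headers, cookies=cookiejar)
rows = [{'title': 'demo', 'url': 'https://example.com/1'}]  # in real use, parsed from response
common.save_csv_header('result.csv', fieldnames=['title', 'url'], mode='w')
common.save_csv_content('result.csv', fieldnames=['title', 'url'], content=rows)
common.wait()  # throttle before the next request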
Reposted from blog.csdn.net/guanxxx/article/details/138713324