Simulating Browser Access to URLs in Python

Required packages

import os
import random                                # random sampling
import requests
import urllib3
from fake_useragent import UserAgent         # random User-Agent strings
from requests.adapters import HTTPAdapter    # transport-level retries
from selenium import webdriver
from selenium.webdriver.common.by import By
from time import sleep

GET request with requests

def sendGetRequest(url):
    s = requests.Session()
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    headers = {'user-agent': UserAgent().random}   # random UA per request
    try:
        data = s.get(url, headers=headers, verify=False)
    except Exception as e:
        print(e)
        return None
    return data
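
A quick usage sketch (the URL below is a placeholder):

resp = sendGetRequest('https://example.com')   # placeholder URL
if resp is not None:
    print(resp.status_code)
    print(resp.text[:200])   # first 200 characters of the body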

GET request with a cookie

def sendGetByCookie(url, cookie):
    s = requests.Session()
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    headers = {
        'user-agent': UserAgent(verify_ssl=False).random,
        'Cookie': cookie
    }
    try:
        data = s.get(url, headers=headers, verify=False)
    except Exception as e:
        print(e)
        return None
    return data
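
For example, with a cookie string captured from a logged-in session (the URL and cookie values are placeholders):

cookie = 'sessionid=abc123; token=xyz'   # placeholder cookie string
resp = sendGetByCookie('https://example.com/profile', cookie)
if resp is not None:
    print(resp.status_code)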

Making the request with urllib3

def geturl(url, cookie):
    http = urllib3.PoolManager()
    urllib3.disable_warnings()
    headers = {
        'user-agent': UserAgent().random,
        'Cookie': cookie
    }
    r = http.request('GET', url, headers=headers)
    return r.data.decode()
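
urllib3 can also handle retries and timeouts itself; a minimal sketch using its Retry helper (the retry count and timeout are illustrative, not from the original):

from urllib3.util.retry import Retry

def geturl_with_retry(url, cookie):
    retry = Retry(total=3, backoff_factor=0.5)             # up to 3 retries with backoff
    http = urllib3.PoolManager(retries=retry, timeout=5.0)
    headers = {'user-agent': UserAgent().random, 'Cookie': cookie}
    r = http.request('GET', url, headers=headers)
    return r.data.decode()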

POST request with a JSON body

def postJson(url, cookie, data, proxy):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3',
        'Cookie': cookie,
        'Content-Type': 'application/json'
    }
    s = requests.Session()
    urllib3.disable_warnings()
    try:
        res = s.post(url, json=data, headers=headers, proxies=proxy, verify=False)
    except Exception as e:
        print(url, e)
        res = cycle(s, headers, url)   # fall back to retrying through fresh proxies
    if res is None:
        return None
    if res.status_code != 200:
        print('response status code:', res.status_code)
    res.encoding = 'utf8'
    return res.text
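
Calling it with a dict body (the URL, cookie, and proxy address are placeholders):

payload = {'page': 1, 'size': 20}   # placeholder JSON body
proxy = {'http': 'http://localhost:8888', 'https': 'http://localhost:8888'}   # placeholder proxy
text = postJson('https://example.com/api/list', 'sessionid=abc123', payload, proxy)
print(text)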

GET request through a proxy

def sendGetByProxy(url, proxies, cookie, paramDict):
    '''
    :param url:
    :param proxies: proxies = {
           'http': 'http://localhost:8888',
           'https': 'http://localhost:8888'}
    :param cookie:
    :param paramDict: extra headers to merge in
    :return:
    '''
    s = requests.Session()
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    headers = {
        'user-agent': UserAgent(verify_ssl=False).random,
        'Connection': 'close'   # ask the server not to keep the connection alive
    }
    if cookie is not None:
        headers['Cookie'] = cookie
    if paramDict:
        headers.update(paramDict)
    # retry up to 3 times at the transport level
    s.mount('http://', HTTPAdapter(max_retries=3))
    s.mount('https://', HTTPAdapter(max_retries=3))

    if proxies:
        try:
            # timeout=10; allow_redirects=False disables following redirects
            data = s.get(url, headers=headers, proxies=proxies, verify=False, allow_redirects=False, timeout=10)
        except Exception as e:
            print('request failed:', url, e)
            data = cycle(s, headers, url)   # rotate to a fresh proxy and retry
            return data
    else:
        try:
            data = s.get(url, headers=headers, verify=False)
        except Exception as e:
            print(e)
            return None
    print(data)
    return data
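
A usage sketch; the proxy address and the extra header are placeholders:

proxies = {'http': 'http://localhost:8888', 'https': 'http://localhost:8888'}
extra = {'referer': 'https://example.com'}   # merged into the request headers via paramDict
resp = sendGetByProxy('https://example.com', proxies, None, extra)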

Rotating to a new proxy IP on failure

def cycle(s, headers, url):
    '''
    If the current proxy IP fails, keep picking a new one (up to 5 attempts).
    :param s: the requests session
    :param headers:
    :param url:
    :return: the response, or None if every attempt failed
    '''
    times = 0
    success = False
    data = None
    while times < 5 and not success:
        times += 1   # count every attempt, or the loop never terminates
        try:
            # timeout=10; allow_redirects=False disables following redirects
            data = s.get(url,
                         headers=headers,
                         # proxy_pool: ip -> port dict, assumed loaded elsewhere
                         # (e.g. from Redis; see getProxy below)
                         proxies=getProxy(proxy_pool),
                         verify=False,
                         allow_redirects=False,
                         timeout=10)
            success = True
            if data.status_code != 200:
                print(url, data.status_code)
                success = False
        except Exception as e:
            print('request failed:', e)
    return data

Picking a proxy IP

def getProxy(proxy_pool):
    '''
    :param proxy_pool: dict mapping ip -> port (bytes values, e.g. read from Redis)
    '''
    proxies = {}
    ip = random.choice(list(proxy_pool))   # random.sample no longer accepts dict keys in Python 3
    port = proxy_pool[ip]
    ip = ip.decode('unicode_escape')       # Redis returns bytes
    port = port.decode('unicode_escape')
    ip_port = ip + ':' + port
    print('picked proxy:', ip_port)
    # a plain-HTTP proxy handles both schemes (matches the docstring in sendGetByProxy)
    proxies['http'] = 'http://' + ip_port
    proxies['https'] = 'http://' + ip_port
    return proxies
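
The ip -> port mapping is typically read straight from Redis, which is why getProxy decodes bytes; a minimal sketch with redis-py (the host, db, and the 'proxies' hash key are assumptions):

import redis

def loadProxyPool():
    r = redis.Redis(host='127.0.0.1', port=6379, db=1)   # assumed Redis instance
    return r.hgetall('proxies')   # dict of bytes ip -> bytes port, as getProxy expects

proxy_pool = loadProxyPool()   # the dict cycle() and getProxy() consume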

Opening a local file with Chrome via Selenium

def seleniumUtil(chromePath, filePath):
    """
    :param chromePath: path to the chromedriver executable
    :param filePath: path of the file, relative to the working directory
    :return:
    """
    driver = webdriver.Chrome(chromePath)

    # os.getcwd() is the current working directory
    file_Path = "file://" + os.getcwd() + filePath
    driver.get(file_Path)   # a normal URL works here too
    sleep(5)
    data = driver.page_source
    driver.quit()
    return data
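
For example, rendering a local HTML file relative to the working directory (both paths are placeholders):

html = seleniumUtil('D:/chromedriver', '/page/index.html')   # placeholder paths
print(html[:200])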

Headless Chrome

def ChromeOptionsUtil(chromePath, filePath):
    """
    Headless Chrome
    :param chromePath: path to the chromedriver executable
    :param filePath: path of the file, relative to the working directory
    :return:
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")   # run Chrome without a window
    # options.add_argument("--disable-gpu")
    file_Path = "file://" + os.getcwd() + filePath
    driver = webdriver.Chrome(chrome_options=options, executable_path=chromePath)   # first argument: Chrome options; second: chromedriver path
    driver.get(file_Path)
    sleep(2)
    data = driver.page_source
    driver.quit()
    return data
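
Note that Selenium 4 removed the executable_path and chrome_options arguments; under that API the equivalent headless setup looks roughly like this (a sketch, not the original code):

from selenium.webdriver.chrome.service import Service

def headlessChrome(chromePath, url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(service=Service(chromePath), options=options)
    driver.get(url)
    data = driver.page_source
    driver.quit()
    return data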

Fetching cookies and turning them into a usable cookie string

def test():
    driver = webdriver.Chrome('D:/chromedriver')
    driver.get("https://*****")
    sleep(3)
    driver.find_element(By.CSS_SELECTOR, 'li:nth-child(1) > div > div.live-card-following-info > p.live-card-following-info-user > a').click()
    # grab the cookies set by the page
    cookie = driver.get_cookies()
    sleep(5)
    driver.quit()
    cookies = []
    # flatten the cookie dicts into a single "name=value; ..." string
    for i in cookie:
        cookies.append(i["name"] + "=" + i["value"])
    cookiestr = '; '.join(cookies)
    # stash the cookie string in Redis (redis_Util is the author's own helper class)
    ru = redis_Util()
    res = ru.redis_py('127.0.0.1', '6379', '', 1)
    res.hset('cookies', cookiestr, '')

    return cookiestr
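
The returned string plugs straight into sendGetByCookie above (the URL is a placeholder):

cookiestr = test()
resp = sendGetByCookie('https://example.com/feed', cookiestr)   # placeholder URL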

Fixing garbled page text

# set the encoding on the response before reading .text
res.encoding = 'gbk'
print(res.text)
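
If you do not know the page encoding up front, requests can guess it from the body:

# let requests detect the encoding instead of hard-coding gbk
res.encoding = res.apparent_encoding
print(res.text)
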
Reposted from blog.csdn.net/qq_38795430/article/details/100550169