百度排名 URL 解析:获取主域名

import requests
import re

# Fetch a Baidu results page, collect the redirector links it contains, and
# print the real target URL behind each one.

# Baidu search URL for the keyword "seo" (wd=seo); the remaining query
# parameters are kept exactly as given.
url = "https://www.baidu.com/s?wd=seo&rsv_spt=1&rsv_iqid=0x86e343c80002c005&issp=1&f=8&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_dl=tb&rsv_sug3=2&rsv_sug1=1&rsv_sug7=100&rsv_sug2=0&rsv_btype=i&inputT=821&rsv_sug4=1460"

# Seconds to wait for each HTTP request. Without a timeout, requests waits
# indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10

payload={}
# NOTE(review): the Cookie value looks like a captured browser session
# (it includes BDUSS) -- presumably it should be replaced with your own and
# not committed to version control; confirm before reuse.
headers = {
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0',
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
  'Connection': 'keep-alive',
  'Cookie': 'BAIDUID=CC826C6100194A69E8DF5F2B876D36FF:FG=1; BIDUPSID=CC826C6100194A69E46991FDCA2F9148; PSTM=1605770717; BD_UPN=13314752; BDUSS=DEzODJSQ0c0bWNvajdnTU14Zjk2UldkM3JQRWVTVk0yV3pxeU5SaVhKeUpQdkJmSVFBQUFBJCQAAAAAAAAAAAEAAACc2Qgdb21hbjk5OTkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAImxyF-Jschfe; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1440_33061_33098_33101_32846_33199_33144_33149; BDSFRCVID=UN4OJexroG3Szrnr1w9Nh-XA_gKK0njTDYLEdsT2v2Mifd_VNw0IEG0PtDRfJqF-0CO2ogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; H_BDCLCKID_SF=JRC8oIK-JCvbfP0k5bofhnDW-qO2aI62aKDs-lu2BhcqJ-ovQTb62RDHKGQfqx63HGbCMb3cWKJ_jxbeWfvpbbt7DlDHW4ch5NOp2t0MWl5nhMJmQh5tyR_zqH3mW5jy523ion3vQpP-OpQ3DRoWXPIqbN7P-p5Z5mAqKl0MLPbtbb0xXj_0-nDSHH_jqTcP; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a03571489433ZgeZpY8qxwvZrN2mZt%2Fxbk6qfqTSdU%2FOwGTdgU9NcnezlFluoWuAgqJi62hMVA0fGWMtBmURhpsYg0N%2BBq3vCPcOIFY8XITZUhk1eRvsdAHc7WW%2Bw8IuGhLWu6VCBek62%2F99X96C3uHll70uCfrRIeVE2EdmCLVTFE%2FDLbHQXdNj4S5hlL5wXCSQRiM6FmEEYrHxpZFMPTzoGGCU9v9JeNriBamUAUaOuuDk5ZBThaiqNy1xqPFl8XqzZD2b0V3bgPBqgm%2BPtMmiBW73K7mdY6geIfXlXFwN1ckclEMpXVXuqrnQ1HoAZ0EXDiepCikJ66795780514342583374798341083894; H_PS_645EC=cc640b%2BojxktnIMWy5o2qfnQrKc0Vet9uGh%2FxYc2ogc1rdk6JlBOVyLb86gqfP8lZRXh; COOKIE_SESSION=213_0_8_9_1_32_0_3_7_5_4_7_0_0_7_0_1607143710_0_1607143703%7C9%2312227_1115_1607079768%7C9; BD_HOME=1; BA_HECTOR=01al84a5802l8401he1fsm4ld0q; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; delPer=0; BD_CK_SAM=1; PSINO=7; BDRCVFR[VjobkFsAYtR]=mk3SLVN4HKm; RT=sl=1&ss=kib81a9x&tt=72&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&z=1&dm=baidu.com&si=e1r0xa88g68&ld=sb; sug=3; sugstore=0; ORIGIN=0; bdime=0; BDSVRTM=147; BDRCVFR[feWj1Vr5u3D]=mk3SLVN4HKm; delPer=0; BD_CK_SAM=1; PSINO=7; BDSVRTM=26; H_PS_PSSID=1440_33061_33098_33101_32846_33199_33144_33149',
  'Upgrade-Insecure-Requests': '1'
}

response = requests.request("GET", url, headers=headers, data=payload, timeout=REQUEST_TIMEOUT)

# Match Baidu redirector links (www.baidu.com/link?url=...) up to the closing
# double quote. The dots are escaped so they match only a literal ".", and
# both http:// and https:// forms are accepted.
r = re.compile(r'https?://www\.baidu\.com/link\?url=[^"]+')
links = set(r.findall(response.text))  # set() drops links that appear more than once

for link in links:
    # A HEAD request is enough: redirects are deliberately not followed, so
    # the redirect response's Location header carries the real target URL.
    # An empty string is printed when no Location header is present.
    real_url = requests.head(
        link,
        headers=headers,
        allow_redirects=False,
        timeout=REQUEST_TIMEOUT,
    ).headers.get('Location', '')
    print(real_url)

可以通过 urlparse 直接解析主域名

# -*- coding: UTF-8 -*-
from urllib.parse import urlparse

# Sample URL whose network location we want to extract. The literal is split
# across lines purely for readability; adjacent string literals are joined
# into one string.
url = (
    "https://zhidao.baidu.com/question/719224624369281125.html"
    "?zsyx=nHmvn1c1rjnLnW6YP1bvn1n3PW-xPH0s"
    "&fid=nHmvn1c1rjnLnW6YP1bvn1n3PW-xnHc3n-tk"
    "&ver=2&mtrender=1"
    "&xst=I1LGC4O-b3ZM-FOb8JxM83OCb2g-WpEM-FOCooW-WlOCo3ZMhCH-BGeCo3ZhVmZMJcEhh2Ob83jb8mWEEFtKIHYzPj03nWT1nf7WpjdBTh78u07z5XpL_XyT_s7G5HD3nz3kPi3knWT8nHDY0g6quHubnWD3uHDsnj0Ynj7Wu7tkrHczg1DznjDsP1nLPWcKT1YkPWm1nWn3n1TzrjfLrHm1n16vrNtknW6zg10KPHfYn10srjmv"
)

# urlparse splits the URL into components; netloc is the part between the
# scheme's "//" and the path.
_parsed = urlparse(url)
dj_url = _parsed.netloc

print(dj_url)

猜你喜欢

转载自blog.csdn.net/haohaomax1/article/details/109821361
今日推荐