百度下拉关键词获取

# -*- coding: utf-8 -*-
#https://www.baidu.com/sugrec?&prod=pc&from=pc_web&wd=%E5%87%8F%E8%82%A5
from threading import Thread
from queue import Queue
import requests
import json
from pymongo import MongoClient

class Xiala(Thread):
    """Worker thread that collects Baidu dropdown (suggestion) keywords.

    Each worker repeatedly takes a seed keyword from ``queue``, asks Baidu's
    ``sugrec`` endpoint for its suggestions, and records every suggestion in
    the MongoDB collection given as ``db_config``, counting how many times
    each suggestion has been seen.
    """

    # Baidu suggestion endpoint; query parameters are supplied per request.
    SUGGEST_URL = 'https://www.baidu.com/sugrec'

    def __init__(self, queue, db_config):
        """Store the work queue and the target MongoDB collection.

        Args:
            queue: ``queue.Queue`` of seed keywords to look up.
            db_config: MongoDB collection used to store suggestion counts.
        """
        super().__init__()
        self.queue = queue
        self.config = db_config
        self.headers = {
            # The header must be spelled "User-Agent" for the server to honor it.
            "User-Agent": "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
        }

    def run(self) -> None:
        """Process keywords from the queue forever (intended for daemon use)."""
        while True:
            # Fetch outside the try block so task_done() is only ever called
            # for an item that was actually taken from the queue.
            wd = self.queue.get()
            try:
                json_res = self.fetch_json(wd)
                res_word = self.parse_json(json_res)
                print(f'{wd}相关词有{len(res_word)}')
                for word in res_word:
                    self.save_mongo(word)
            except Exception as err:
                # Thread-level boundary: one bad keyword must not kill the
                # worker, otherwise Queue.join() in the caller never returns.
                print(f'{wd}处理错误{err}')
            finally:
                self.queue.task_done()

    @staticmethod
    def parse_json(json_res):
        """Extract the set of suggested keywords from a ``sugrec`` response.

        Args:
            json_res: Decoded JSON response. Anything that is not a dict
                (``None`` after a failed request, raw text after a decode
                failure) is treated as "no suggestions".

        Returns:
            A set of suggestion strings; empty when nothing usable is found.
        """
        if not isinstance(json_res, dict):
            return set()
        # "g" may be missing or null when Baidu has no suggestions.
        suggestions = json_res.get('g') or []
        return {
            item['q']
            for item in suggestions
            if isinstance(item, dict) and 'q' in item
        }

    def fetch_json(self, wd):
        """Request the suggestions for ``wd`` from Baidu.

        Args:
            wd: Seed keyword to look up.

        Returns:
            The decoded JSON object on success, the raw response text when
            the body is not valid JSON, or ``None`` when the request failed.
        """
        # Passing params lets requests URL-encode the keyword (spaces, "&",
        # non-ASCII text) instead of splicing it raw into the URL.
        params = {'prod': 'pc', 'from': 'pc_web', 'wd': wd}
        try:
            r = requests.get(
                self.SUGGEST_URL,
                params=params,
                headers=self.headers,
                timeout=10,
            )
        except requests.RequestException as err:
            print(f'request错误{err}')
            return None
        r.encoding = "utf-8"
        try:
            return r.json()
        except ValueError:
            # Every JSON decode error raised by requests is a ValueError
            # subclass, regardless of which JSON backend it uses.
            return r.text

    def save_mongo(self, res_word):
        """Record one occurrence of ``res_word`` in the collection.

        Inserts ``{"word": res_word, "count": 1}`` the first time a word is
        seen and increments ``count`` afterwards. A single ``$inc`` upsert
        keeps this atomic, so concurrent workers cannot lose increments.

        Args:
            res_word: Suggested keyword to store.
        """
        self.config.update_one(
            {'word': res_word},
            {'$inc': {'count': 1}},
            upsert=True,
        )
if __name__ == '__main__':
    # Script entry point: load seed keywords, start the workers, and wait
    # until every keyword has been processed.

    # Number of worker threads pulling keywords from the queue.
    worker_count = 1

    query = Queue()
    with open("keywords.txt", encoding="utf-8") as f:
        # Iterate the file lazily and skip blank lines so that empty
        # keywords never reach the workers.
        for line in f:
            keyword = line.strip()
            if keyword:
                query.put(keyword)

    client = MongoClient()
    db = client['xiala']
    config = db['cc41']
    for _ in range(worker_count):
        xl = Xiala(query, config)
        # Daemon threads let the process exit once the queue is drained,
        # even though each worker loops forever.
        xl.daemon = True
        xl.start()
    # Blocks until task_done() has been called for every queued keyword.
    query.join()
    print('done')


获取下拉搜索词有多种接口:

第一种:

源代码搜索 sugHost
https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su
获取相关搜索词接口
增加?wd=关键词   
即获取 关键词 下拉搜索词
https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su?wd=seo
https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su?wd=seo&json=1 返回json格式

第二种:

浏览器开发者工具:Network - XHR - Headers

https://www.baidu.com/sugrec?&prod=pc&from=pc_web&wd=

猜你喜欢

转载自blog.csdn.net/haohaomax1/article/details/111188308