Python实现爬取google翻译API结果

看了胖喵的博文(http://www.cnblogs.com/by-dream/p/6554340.html)后,想参考着自己写一个。由于对js不是很熟悉,就直接在Python里利用pyexecjs库调用js来获取tk值:用于获取tkk的js代码从网页自动爬取并写入googletranslate.js,再调用大神写的googletranslate_1.js,根据tkk和待翻译文本运算得到tk值。
代码如下,从打印结果来看运行得不错:

import requests
import json
import sys
import urllib
from bs4 import  BeautifulSoup
import re
import execjs
import os



    
class Translate:
    """Fetch a zh-CN -> en translation from the Google Translate web endpoint.

    The request needs a ``tk`` token. It is computed by a local JavaScript
    file (``tk_js_path``) from the query text and a ``TKK`` seed that is
    scraped from the translate home page on every call.

    Args:
        query_string: Text to translate.
        tkk_js_path: Where the scraped TKK JavaScript snippet is written.
        tk_js_path: JavaScript file that defines ``tk(text, TKK)``.
    """

    def __init__(self, query_string,
                 tkk_js_path="D:/googletranslate.js",
                 tk_js_path="D:/googletranslate_1.js"):
        self.api_url = "https://translate.google.cn"
        self.query_string = query_string
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:53.0) Gecko/20100101 Firefox/53.0"
        }
        # Paths used to be hard-coded in the methods; the defaults keep the
        # original locations so existing callers behave the same.
        self.tkk_js_path = tkk_js_path
        self.tk_js_path = tk_js_path

    def get_url_param_data(self):
        """Build, print and return the full request URL (with tk and q)."""
        url_param_part = self.api_url + "/translate_a/single?"
        url_param = url_param_part + "client=t&sl=zh-CN&tl=en&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&source=btn&ssel=3&tsel=3&kc=0&"
        url_get = url_param + "tk=" + str(self.get_tk()) + "&q=" + str(self.get_query_string())
        print(url_get)
        return url_get

    def get_query_string(self):
        """Return the query text percent-encoded for use in a URL."""
        # Import the submodule explicitly: a bare ``import urllib`` does not
        # guarantee that ``urllib.parse`` is loaded.
        from urllib.parse import quote
        return quote(self.query_string)

    def get_tkk(self):
        """Scrape the TKK seed from the translate home page and evaluate it.

        The matched JavaScript snippet is also written to ``tkk_js_path``.

        Returns:
            The value produced by evaluating the scraped snippet.

        Raises:
            IndexError: If no TKK snippet is found in the page (same
                exception type as before, now with a descriptive message).
        """
        tkk_page = requests.get(self.api_url, headers=self.headers)
        tkk_code = BeautifulSoup(tkk_page.content, 'lxml')
        pattern = re.compile(r'(TKK.*?\);)', re.I | re.M)
        matches = pattern.findall(str(tkk_code))
        if not matches:
            raise IndexError("TKK snippet not found in page " + self.api_url)
        print(matches[0])
        js_code = matches[0] + "\n" + "return TKK;"
        # The context manager closes the file; the old ``f.close`` was a no-op.
        with open(self.tkk_js_path, "w", encoding="utf-8") as f:
            f.write(js_code)
        # NOTE(review): the trailing top-level ``return TKK;`` appears to rely
        # on PyExecJS wrapping the source in a function, so that call('eval')
        # yields TKK -- confirm against the PyExecJS runtime in use.
        tkk_value = execjs.compile(js_code).call('eval')
        print(tkk_value)
        return tkk_value

    def get_tk(self):
        """Compute the tk token by calling ``tk(query_string, TKK)`` in JS."""
        with open(self.tk_js_path) as f:
            tk_source = f.read()
        tk_value = execjs.compile(tk_source).call('tk', self.query_string, self.get_tkk())
        print(tk_value)
        return tk_value

    def parse_url(self):
        """Send the translate request and return the decoded response body."""
        response = requests.get(self.get_url_param_data(), headers=self.headers)
        return response.content.decode()

    def get_trans_ret(self, json_response):
        """Extract, print and return the translated text from the response.

        The first element of the response is a list of segments whose first
        item is the translated text. All segments are joined, so input with
        several sentences is no longer cut off after the first one. Entries
        without translated text (first item empty or null) are skipped.

        Args:
            json_response: Response body as a JSON string.

        Returns:
            The concatenated translation, or "" if there are no segments.
        """
        dict_response = json.loads(json_response)
        segments = dict_response[0] or []
        ret = "".join(seg[0] for seg in segments if seg and seg[0])
        print(ret)
        return ret

    def run(self):
        """Run the whole pipeline; print and return the translation."""
        json_response = self.parse_url()
        return self.get_trans_ret(json_response)
       
       
if __name__ == "__main__":
    # Demo: translate one sample Chinese sentence and print the result.
    sample_text = "Google 翻译是谷歌公司提供一项免费的翻译服务,可提供 80 种语言之间的即时翻译,支持任意两种语言之间的字词、句子和网页翻译"
    Translate(sample_text).run()

运行结果见图:


爬取写入后的googletranslate.js的内容:

TKK=eval('((function(){var a\x3d3238994924;var b\x3d-2446116049;return 420013+\x27.\x27+(a+b)})())');
return TKK;
利用tkk和文本内容运算tk值的googletranslate_1.js的内容:
 

/**
 * Apply a sequence of mixing steps to an integer.
 *
 * The second argument is read in groups of three characters:
 *   [0] "+" -> add (result truncated to 32 bits), otherwise XOR
 *   [1] "+" -> unsigned right shift, otherwise left shift
 *   [2] shift amount: a digit, or a letter where "a" means 10, "b" 11, ...
 *
 * @param {number} a Starting value.
 * @param {string} b Operation string, e.g. "+-a^+6".
 * @returns {number} The mixed value.
 */
function b(a, b) {
  var acc = a;
  var ops = b;
  for (var i = 0; i + 2 < ops.length; i += 3) {
    var amountChar = ops.charAt(i + 2);
    var amount = amountChar >= "a" ? amountChar.charCodeAt(0) - 87 : Number(amountChar);
    var shifted = ops.charAt(i + 1) == "+" ? acc >>> amount : acc << amount;
    if (ops.charAt(i) == "+") {
      acc = (acc + shifted) & 4294967295;
    } else {
      acc = acc ^ shifted;
    }
  }
  return acc;
}
  
/**
 * Compute the tk token for a piece of text.
 *
 * @param {string} a   Text to be translated.
 * @param {string} TKK Seed of the form "<int>.<int>".
 * @returns {string} Token of the form "<n>.<n XOR first seed part>".
 */
function tk(a, TKK) {
    var seedParts = TKK.split(".");
    var seed = Number(seedParts[0]) || 0;

    // Encode the text as a list of UTF-8 byte values.
    var bytes = [];
    for (var i = 0; i < a.length; i++) {
        var code = a.charCodeAt(i);
        if (code < 128) {
            bytes.push(code);
            continue;
        }
        if (code < 2048) {
            bytes.push(code >> 6 | 192);
        } else {
            var isSurrogatePair = (code & 64512) == 55296 &&
                i + 1 < a.length &&
                (a.charCodeAt(i + 1) & 64512) == 56320;
            if (isSurrogatePair) {
                // Combine the pair into one code point and consume the low half.
                code = 65536 + ((code & 1023) << 10) + (a.charCodeAt(++i) & 1023);
                bytes.push(code >> 18 | 240);
                bytes.push(code >> 12 & 63 | 128);
            } else {
                bytes.push(code >> 12 | 224);
            }
            bytes.push(code >> 6 & 63 | 128);
        }
        bytes.push(code & 63 | 128);
    }

    // Fold every byte into the accumulator, mixing after each one.
    var acc = seed;
    for (var j = 0; j < bytes.length; j++) {
        acc += bytes[j];
        acc = b(acc, "+-a^+6");
    }
    acc = b(acc, "+-3^+b+-f");
    acc ^= Number(seedParts[1]) || 0;
    if (acc < 0) {
        // Map a negative 32-bit result to its unsigned value.
        acc = (acc & 2147483647) + 2147483648;
    }
    acc %= 1E6;
    return acc.toString() + "." + (acc ^ seed);
}







猜你喜欢

转载自blog.csdn.net/boyheroes/article/details/78681357