python爬虫——爬取知网体育学刊引证论文信息

前言

国庆百无聊赖，然后帮一个小姐姐爬取知网信息，觉得知网算目前处理过的对爬虫稍微有点防范的网站，遂有了这篇博客

目标

爬取知网上2003年体育学刊文献所有论文的引证论文，包括论文名称、作者、发表时间，也就是下面红框所指处

点击click处，点击黑框，红框所指处即为要爬取数据：

分析

知网主体页面使用ASP.NET（URL以.aspx结尾，是一种类似于JSP的服务端模板技术），我需要爬取的页面是后台通过模板渲染后直接返回的html页面，没有可以捕获的json接口，所以只能解析html
知网页面中有iframe，不过没有多层嵌套。iframe可以在一个html中嵌入另一个html，问过前端同学后才知道，这其实是两个html文件，由浏览器嵌套显示。iframe在一定程度上会增加爬取难度：一般iframe的src属性就是被嵌入html文件的url，但是知网的不是，抓包显示实际请求的url并不是src属性的值，可能是经过js处理的
知网会用cookie跟踪用户行为
第一幅图中，click处的url貌似经过js处理，直接请求该url会被重定向

难点

知网采用cookie跟踪用户行为以及部分url经过js处理，所以我决定使用selenium+chromedriver，但是太慢了，而且有点占资源，这里知网有个小漏洞，引证论文页面不需要cookie，所以第一遍使用selenium+chromedriver处理获得所有引证论文url（一共299个），第二遍使用urllib+etree单独处理引证论文（一共6800+）
引证论文存在翻页，但是翻页后大量数据会有重复，并且页面形式不固定，例如某些页面有A、B、C三处有引证论文，有些页面有A、B、C、D、E五处引证论文，具体参见知网页面，这里考虑到程序健壮性，我选择每次插入数据前先查一遍数据是否存在（有点蠢），虽然可以使用数据库的unique字段，但是异常抛出太频繁

代码

获得引证论文url

from selenium import webdriver
import time
from lxml import etree
from urllib import request
from dbmanager_zhiwang import dbmanager_paper

def get_paper(page,index):
    """Print the link texts of every entry in one citation list of a page.

    The page is parsed with ``etree.HTML`` and the ``index``-th ``<ul>``
    directly under ``<div class="essayBox">`` is selected.  For every
    ``<li>`` child, the text of each ``<a>`` child is printed; for the first
    ``<a>`` of an entry the text that follows the tag (its tail) is printed
    as well.

    Args:
        page: HTML source of the page.
        index: Position of the wanted ``<ul>`` among the matched lists.

    Raises:
        IndexError: If fewer than ``index + 1`` lists are found.
    """
    tree=etree.HTML(page)
    target_ul=tree.xpath('//div[@class="essayBox"]/ul')[index]
    for entry in target_ul.findall('li'):
        for position,anchor in enumerate(entry.findall('a')):
            print(anchor.text)
            # Only the first link is followed by text worth printing.
            if position==0:
                print(anchor.tail)

def get_cookie():
    """Run a search for '体育学刊' on the CNKI result page in the browser.

    Uses the module-level ``driver``.  Presumably the point of performing
    the search through the real page is that the browser session picks up
    the cookies the site expects on later requests -- confirm against the
    site behaviour.
    """
    driver.get('http://kns.cnki.net/kns/brief/default_result.aspx')
    # Fixed pause so the page can finish loading before it is used.
    time.sleep(5)

    keyword_box=driver.find_element_by_name('txt_1_value1')
    keyword_box.send_keys('体育学刊')

    # Pick the search-field option whose value is "LY$%=|"
    # (NOTE(review): looks like the "source/journal" field -- confirm).
    field_option=driver.find_element_by_xpath('//select[@id="txt_1_sel"]//option[@value="LY$%=|"]')
    field_option.click()

    search_button=driver.find_element_by_id('btnSearch')
    search_button.click()
    # Fixed pause so the search results can load.
    time.sleep(5)
      
    

#缺少paperid
def get_url(num):
    """Scrape every paper row of the current result page and store it.

    For each row of the result table the title, authors, date and reference
    count are read from the row text.  The paper's link is then clicked, the
    newest browser window is inspected for a citing-literature link
    (``div.yzwx > a``) and, when one exists, the links inside the ``CJFQ``,
    ``CDFD`` and ``CMFD`` spans of that page are collected.  One record per
    paper is written with ``db.insert_info``.

    Relies on the module-level ``driver`` and ``db`` objects.  A row that
    fails for any reason is reported with ``print`` and skipped.

    Args:
        num: Running counter; its string form is the first field of the
            next stored record.

    Returns:
        ``num`` increased by the number of rows that were stored.
    """
    citing_link_xpath='//div[@class="yzwx"]/a'
    # Remember the result-list window so that cleanup never closes it, even
    # when a row fails before any detail window has been opened.
    main_window=driver.current_window_handle
    elements=driver.find_elements_by_xpath('//table[@class="GridTableContent"]//tr[@bgcolor]')
    for element in elements:
        try:
            a=element.find_element_by_xpath('td/a[@class="fz14"]')
            href=a.get_attribute('href')
            print(href)

            # Row text split on blanks: item 1 is the title, the following
            # items are author names up to the item containing the journal
            # name; the date and reference count sit at fixed offsets after
            # it (NOTE(review): offsets taken as-is -- confirm against the
            # live table layout).
            paper_info=element.text.replace('\n',' ').split(' ')
            paper_title=paper_info[1]
            index=2
            author=''
            while '体育学刊' not in paper_info[index]:
                author=author+paper_info[index]
                index=index+1
            date=paper_info[index+1]
            reference=paper_info[index+3]
            insert_info=(str(num),paper_title,author,date,reference,href)

            a.click()
            driver.switch_to.window(driver.window_handles[-1])
            time.sleep(5)

            # Poll for the citing-literature link, pausing between attempts
            # so late-rendered content has a chance to appear.  A hit on the
            # last attempt counts as found.
            found=False
            for attempt in range(5):
                if attempt:
                    time.sleep(1)
                if etree.HTML(driver.page_source).xpath(citing_link_xpath):
                    found=True
                    break

            # The lists stay empty when the paper has no citing literature.
            url0_list=[]
            url1_list=[]
            url2_list=[]
            if found:
                url=driver.find_element_by_xpath(citing_link_xpath).get_attribute('href')
                if url is not None:
                    print(url)
                    driver.get(url)
                    html_parse=etree.HTML(driver.page_source)
                    url0_list=[link.attrib['href'] for link in html_parse.xpath('//span[@id="CJFQ"]//a')]
                    url1_list=[link.attrib['href'] for link in html_parse.xpath('//span[@id="CDFD"]//a')]
                    url2_list=[link.attrib['href'] for link in html_parse.xpath('//span[@id="CMFD"]//a')]
            db.insert_info(insert_info,url0_list,url1_list,url2_list)
            num=num+1

        except Exception as arg:
            # Best effort: report the problem and carry on with the next row.
            print(arg)

        # Close every window except the result list, then return to it.
        for handle in driver.window_handles:
            if handle!=main_window:
                driver.switch_to.window(handle)
                driver.close()
        driver.switch_to.window(main_window)
        time.sleep(5)
    return num


if __name__=="__main__":
    # Chrome preference: notifications set to 2 (Chrome's "block" value) so
    # that permission pop-ups do not get in the way of the crawl.
    options = webdriver.ChromeOptions()
    prefs = {
        'profile.default_content_setting_values' :
            {
            'notifications' : 2
             }
    }
    options.add_experimental_option('prefs',prefs)
    # driver and db are module-level names read by get_cookie() and get_url().
    driver = webdriver.Chrome(chrome_options = options)
    try:
        driver.maximize_window()
        get_cookie()
        db=dbmanager_paper('root','12345','127.0.0.1','zhiwang')
        num=0
        now_page=1
        # Result list restricted to publication year 2003.
        driver.get('http://kns.cnki.net/kns/brief/brief.aspx?ctl=4a7fde68-1a44-4852-8b23-1a70aeb4cf8b&dest=%E5%88%86%E7%BB%84%EF%BC%9A%E5%8F%91%E8%A1%A8%E5%B9%B4%E5%BA%A6%20%E6%98%AF%202003&action=5&dbPrefix=SCDB&PageName=ASP.brief_default_result_aspx&Param=%e5%b9%b4+%3d+%272003%27&SortType=(FFD%2c%27RANK%27)+desc&ShowHistory=1&isinEn=1')

        # Scrape at most 15 result pages.
        while now_page<16:
            num=get_url(num)
            next_link=None
            for a in driver.find_elements_by_xpath('//div[@class="TitleLeftCell"]//a'):
                if a.text=='下一页':
                    next_link=a
                    break
            # Without a "next page" link the same page would be scraped
            # again, so stop instead.
            if next_link is None:
                break
            next_link.click()
            now_page=now_page+1
            time.sleep(5)
    finally:
        # Always shut the browser down, also when the crawl aborts.
        driver.quit()

获得引证论文

from selenium import webdriver
import time
from lxml import etree
from urllib import request
import re
from dbmanager_zhiwang import dbmanager_paper
    

def get_paper(url,paperid):
    """Download one citing-literature page and store the entries on it.

    The page is fetched with the module-level ``header``, lower-cased and
    parsed; every ``<li>`` inside ``<div class="essaybox">`` is flattened to
    text, split on ``.`` and, unless ``db.judge_exist`` already knows its
    first field, written with ``db.insert_paper_info``.

    Relies on the module-level ``header`` and ``db`` objects.

    Args:
        url: Address of the citing-literature page.
        paperid: Identifier of the cited paper; its string form is appended
            as the last field of every stored entry.

    Returns:
        True when the whole page was processed; False as soon as the
        download, the parsing of the page or of an entry, or an insert fails.
    """
    req=request.Request(url,headers=header)
    try:
        # The context manager closes the response; the timeout keeps a
        # stalled connection from blocking the crawl indefinitely.
        with request.urlopen(req,timeout=30) as response:
            raw_page=response.read()
        # Lower-casing the whole page makes the class lookup below
        # case-insensitive; it also lower-cases Latin letters in the data.
        html_page=raw_page.lower().decode('utf-8',errors='ignore')
        html_parse=etree.HTML(html_page)
        ul_list=html_parse.xpath('//div[@class="essaybox"]//ul')
    except Exception as arg:
        # Same reporting style as for a broken entry: log it, signal failure.
        print(arg)
        return False

    for ul in ul_list:
        for li in ul.findall('li'):
            try:
                info_temp=''.join(
                    piece.replace(' ','').replace('\r\n','').replace('&nbsp&nbsp','')
                    for piece in li.itertext()
                )
                info=info_temp.split('.')
                # Comparison kept as "== False" because the return type of
                # the db helper is not visible here.
                if db.judge_exist(info[0])==False:
                    length=len(info)-1
                    if length<3:
                        # Short entries carry everything from the first
                        # digit onwards as the date in their last field;
                        # split that off into a field of its own.
                        deal=info[length]
                        date=re.findall(r'\d.*',deal)[0]
                        workunit=deal.replace(date,'').replace('年','')
                        info[length]=workunit
                        info.append(date)
                    info.append(str(paperid))
                    if db.insert_paper_info(tuple(info))==False:
                        return False
            except Exception as arg:
                print(arg)
                return False
    return True


if __name__=="__main__":
    # header and db are module-level names read by get_paper().
    header={
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
        }
    # NOTE(review): plaintext database password in source, and it differs
    # from the one used by the URL-collecting script -- confirm which one is
    # correct and move it out of the code.
    db=dbmanager_paper('root','Ll41655184165518','127.0.0.1','zhiwang')

    # Work through the pending URLs one at a time until the db has none left.
    while True:
        result=db.get_url()
        if result==[]:
            break
        # Fields are used by position: 0 goes back to set_status, 1 is the
        # paper id passed on to get_paper, 2 is the URL to fetch.
        page_url=result[2]
        print(page_url)
        status='error' if get_paper(page_url,result[1])==False else 'finish'
        db.set_status((status,result[0]))
        # Pause between requests.
        time.sleep(10)

最后把数据库中的数据转换为excel，博客中没有给出数据库结构，详情见gayhub（没有打错）：https://github.com/zhuoyunli/crawler_zhiwang，里面有数据库文件以及处理好的excel文件

python爬虫——爬取知网体育学刊引证论文信息

前言

目标

分析

难点

代码

猜你喜欢