从无到有：一周搞定爬虫（Python3.x + Selenium + Chrome + ChromeDriver）

一周之前没有接触过Python，更不用说Selenium，一把泪！直接上代码，故事稍后。。。
#coding = utf-8
#sudo apt-get install python3-openpyxl


from openpyxl import workbook
from openpyxl import load_workbook

import time
from urllib.request import urlopen
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

def getAllData(url):
    global book     # 全局工作表对象
    PCBH = []      # 普查編號
    SuoSH = []     # 索書號
    CunJuan = ['','','','','','','','','','']   # 存卷
    TMZZ = []      # 題名著者
    BanBen = []    # 版本
    CeShu = []     # 册（件）数    
    DanWei = []    # 单位

    page_number = 14910  # 页数初始为1

    while page_number <= 65914:
        # print("-----------打印第",page_number,"页")
        try:
            link = driver.find_element_by_link_text(str(page_number)) 
        except NoSuchElementException:
    	    print ("solve all page")
    
        link.click()

        # 获取源代码
        soup = BeautifulSoup(urlopen(driver.current_url),"html.parser")
        # print(driver.current_url)
        # 获取各参数值，并且添加到相应的列表中

        for link in soup.find_all(attrs={"data-field":"PUCHABIANHAO"}):
            # print(link.get_text)
            PCBH.append(link.get_text())

        for link in soup.find_all(attrs={"data-field":"SUOSHUHAO"}):
            # print(link.get_text)
            SuoSH.append(link.get_text())

        for link in soup.find_all(attrs={"data-field":"TIMING"}):
            # print(link.get_text)
            TMZZ.append(link.get_text())

        for link in soup.find_all(attrs={"data-field":"BANBEN"}):
            # print(link.get_text)
            BanBen.append(link.get_text())

        for link in soup.find_all(attrs={"data-field":"CESHU"}):
            # print(link.get_text)
            CeShu.append(link.get_text())

        for link in soup.find_all(attrs={"data-field":"DANWEI"}):
            # print(link.get_text)
            DanWei.append(link.get_text())

        # 对存卷这一参数做特殊筛选处理
        count = 0   # 记录循环次数,即统计item数量 
        for tag in soup.find_all(attrs={"class":"item"}):
            title = tag.find_all(attrs={"class":"ot"})
            i = 0  
            for n in title:
                text = n.get_text()
                text = text.replace('\n','')
                if i == 0: # 查找 第一个ot
                    # print('')
                    pass
                elif i == 1: # 查找 第二个ot,即查找存卷是否存在,存在则添加到列表中
                    # print(link.get_text)
                    CunJuan.insert(count,text)              
                i = i + 1
            count = count + 1       

        for i in range(10):
            book.append([PCBH[i],SuoSH[i],TMZZ[i],BanBen[i],CeShu[i],DanWei[i],CunJuan[i]])

        PCBH.clear()
        SuoSH.clear()
        TMZZ.clear()
        BanBen.clear()
        CeShu.clear()
        DanWei.clear()        
        CunJuan.clear()
        CunJuan = ['','','','','','','','','','']

        if page_number % 10 == 0:
            wb.save('getGJPC0614.xlsx')
            print(time.strftime('%m-%d %H:%M:%S', time.localtime(time.time())), "---已向Excle中存入",page_number,"页数据---")
        page_number = page_number + 1
         

if __name__ == '__main__':

    driver = webdriver.Chrome()
    url = 'http://202.96.31.78/xlsworkbench/publish?keyWord=0&orderProperty=PU_CHA_BIAN_HAO&orderWay=asc&cGFnZT0xNDkxMA=='
    driver.get(url)
    #  创建Excel表并写入数据  
    wb = workbook.Workbook()  # 创建Excel对象  
    book = wb.active  # 获取当前正在操作的表对象  
    # 往表中写入标题行,以列表形式写入！  
    book.append(['普查編號', '索書號', '題名著者', '版本', '册（件）数', '单位','存卷'])

    getAllData(url)

    driver.quit()
后台运行记录在txt文件上： nohup python -u test.py >out.txt 2 >&1 &
tail -f out.txt
从无到有：一周搞定爬虫（Python3.x + Selenium + Chrome + ChromeDriver）

猜你喜欢