Scraping Dynamic Websites

Dynamic crawler 1: scraping movie ratings

The HTML downloader

import requests

class HtmlDownloader():
    def cookie(self):
        # Parse cookies saved as "name=value; name=value; ..." in cookie.txt;
        # pass the result as requests' cookies= argument when a login is needed
        with open('cookie.txt','r') as f:
            cookies={}
            for line in f.read().split(';'):
                name,value=line.strip().split('=',1)
                cookies[name]=value 
            return cookies

    def download(self,url):
        if url is None:
            return None
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331',
            'Referer': r'http://movie.mtime.com',
            'Connection': 'keep-alive'
        }
        # The original urllib opener fetched each URL a second time with an
        # empty CookieJar; a single requests call with the headers is enough
        r=requests.get(url,headers=headers)
        if r.status_code==200:
            r.encoding='utf-8'
            return r.text
        return None

The HTML parser

import re
import json

class HtmlParser():
    def parser_url(self,page_url,response):
        pattern=re.compile(r'(http://movie.mtime.com/(\d+)/)')
        urls=pattern.findall(response)
        # findall() returns a (possibly empty) list, never None; deduplicate,
        # and hand the caller an empty list when nothing matched
        return list(set(urls)) if urls else []

    def parser_json(self,page_url,response):
        # The API answers with JSONP; strip the "...=" prefix and trailing ";"
        # to recover the JSON payload
        pattern=re.compile(r'=(.*?);')
        result=pattern.search(response)
        if result is None:
            return None
        try:
            value=json.loads(result.group(1))
            isRelease=value.get('value').get('isRelease')
        except Exception as e:
            print(e)
            return None
        if isRelease:
            if value.get('value').get('hotValue') is None:
                return self._parser_release(page_url,value)
            else:
                return self._parser_no_release(page_url,value,isRelease=2)
        else:
            return self._parser_no_release(page_url,value)

    def _parser_release(self,page_url,value):
        try:
            isRelease=1
            movieRating=value.get('value').get('movieRating')
            boxOffice=value.get('value').get('boxOffice')
            movieTitle=value.get('value').get('movieTitle')
            RPictureFinal=movieRating.get('RPictureFinal')
            RStoryFinal=movieRating.get('RStoryFinal')
            RDirectorFinal=movieRating.get('RDirectorFinal')
            ROtherFinal=movieRating.get('ROtherFinal')
            RatingFinal=movieRating.get('RatingFinal')

            MovieId=movieRating.get('MovieId')
            Usercount=movieRating.get('Usercount')
            AttitudeCount=movieRating.get('AttitudeCount')

            if boxOffice is not None:
                TotalBoxOffice=boxOffice.get('TotalBoxOffice')
                TotalBoxOfficeUnit=boxOffice.get('TotalBoxOfficeUnit')
                TodayBoxOffice=boxOffice.get('TodayBoxOffice')
                TodayBoxOfficeUnit=boxOffice.get('TodayBoxOfficeUnit')

                ShowDays=boxOffice.get('ShowDays')
                # dict.get() never raises, so default a missing Rank explicitly
                Rank=boxOffice.get('Rank') or 0
                return (MovieId,movieTitle,RatingFinal,ROtherFinal,RPictureFinal,RDirectorFinal,RStoryFinal,Usercount,AttitudeCount,
                       TotalBoxOffice+TotalBoxOfficeUnit,TodayBoxOffice+TodayBoxOfficeUnit,Rank,ShowDays,isRelease)

            else:
                Rank=0
                return (MovieId,movieTitle,RatingFinal,ROtherFinal,RPictureFinal,RDirectorFinal,RStoryFinal,Usercount,AttitudeCount,u'无',u'无',Rank,0,isRelease)
        except Exception as e:
            print(e,page_url,value)
            return None

    def _parser_no_release(self,page_url,value,isRelease=0):
        try:
            movieRating=value.get('value').get('movieRating')
            movieTitle=value.get('value').get('movieTitle')

            RPictureFinal=movieRating.get('RPictureFinal')
            RStoryFinal=movieRating.get('RStoryFinal')
            RDirectorFinal=movieRating.get('RDirectorFinal')
            ROtherFinal=movieRating.get('ROtherFinal')
            RatingFinal=movieRating.get('RatingFinal')
            MovieId=movieRating.get('MovieId')
            Usercount=movieRating.get('Usercount')
            AttitudeCount=movieRating.get('AttitudeCount')
            try:
                # hotValue may be None, so guard the chained .get() calls
                Rank=value.get('value').get('hotValue').get('Ranking')
            except Exception:
                Rank=0
            return (MovieId,movieTitle,RatingFinal,ROtherFinal,RPictureFinal,RDirectorFinal,RStoryFinal,Usercount,AttitudeCount,u'无',u'无',Rank,0,isRelease)
        except Exception as e:
            print(e,page_url,value)
            return None

The data store

import sqlite3

class DataOutput():
    def __init__(self):
        self.cx=sqlite3.connect('/home/as/test.db')
        self.create_table('MTime')
        self.datas=[]

    def create_table(self,table_name):
        values='''
            id integer primary key,
            MovieId integer,
            MovieTitle varchar(40) NOT NULL,
            RatingFinal REAL NOT NULL DEFAULT 0.0,
            ROtherFinal REAL NOT NULL DEFAULT 0.0,
            RPictureFinal REAL NOT NULL DEFAULT 0.0,
            RDirectorFinal REAL NOT NULL DEFAULT 0.0,
            RStoryFinal REAL NOT NULL DEFAULT 0.0,
            Usercount integer NOT NULL DEFAULT 0,
            AttitudeCount integer NOT NULL DEFAULT 0,
            TotalBoxOffice varchar(20) NOT NULL,
            TodayBoxOffice varchar(20) NOT NULL,
            Rank integer NOT NULL DEFAULT 0,
            ShowDays integer NOT NULL DEFAULT 0,
            isRelease integer NOT NULL
            '''
        self.cx.execute('CREATE TABLE IF NOT EXISTS %s ( %s ) '%(table_name, values))

    def store_data(self,data):
        if data is None:
            return
        self.datas.append(data)
        if len(self.datas)>10:
            self.output_db('MTime')

    def output_db(self,table_name):
        for data in self.datas:
            self.cx.execute('INSERT INTO %s (MovieId,MovieTitle,RatingFinal,'
                           'ROtherFinal,RPictureFinal,RDirectorFinal,RStoryFinal,'
                           'Usercount,AttitudeCount,TotalBoxOffice,TodayBoxOffice,'
                           'Rank,ShowDays,isRelease) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)'
                           %table_name,data)
        self.cx.commit()
        # Clear the buffer only after the commit; calling remove() while
        # iterating over the same list would skip every other row
        self.datas=[]

    def output_end(self):
        if len(self.datas)>0:
            self.output_db('MTime')
        self.cx.close()

The crawler scheduler

import time

class SpiderMan():
    def __init__(self):
        self.downloader=HtmlDownloader()
        self.parser=HtmlParser()
        self.output=DataOutput()
    def crawl(self,root_url):
        content=self.downloader.download(root_url)
        urls=self.parser.parser_url(root_url,content)
        for url in urls:
            try:
                # timestamp for the API's t parameter (the trailing 3282
                # stands in for the millisecond part)
                t=time.strftime('%Y%m%d%H%M%S3282',time.localtime())
                rank_url='http://service.library.mtime.com/Movie.api'\
                '?Ajax_CallBack=true'\
                '&Ajax_CallBackType=Mtime.Library.Services'\
                '&Ajax_CallBackMethod=GetMovieOverviewRating'\
                '&Ajax_CrossDomain=1'\
                '&Ajax_RequestUrl=%s'\
                '&t=%s'\
                '&Ajax_CallBackArgument0=%s'%(url[0],t,url[1])
                rank_content=self.downloader.download(rank_url)
                data=self.parser.parser_json(rank_url,rank_content)
                self.output.store_data(data)
            except Exception as e:
                print(e)
spider=SpiderMan()
spider.crawl('http://theater.mtime.com/China_Beijing/')

Page operations

1. Page interaction and form filling

2. Dragging elements

3. Switching between windows and page frames

4. Handling pop-up dialogs

5. Browsing history

6. Cookie handling

7. Setting the User-Agent in the PhantomJS request headers (a combined sketch follows below)
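
A minimal sketch covering these operations, in the same older find_element_by_* Selenium style used throughout this post; the page URL, element names, and frame name are hypothetical placeholders:

from selenium import webdriver
from selenium.webdriver import ActionChains

driver = webdriver.Firefox()
driver.get('http://example.com/form')  # hypothetical page

# 1. Page interaction and form filling
box = driver.find_element_by_name('q')  # hypothetical input name
box.clear()
box.send_keys('hotel')
box.submit()

# 2. Dragging one element onto another (both ids are hypothetical)
source = driver.find_element_by_id('draggable')
target = driver.find_element_by_id('droppable')
ActionChains(driver).drag_and_drop(source, target).perform()

# 3. Switching into a frame and back to the main document
driver.switch_to.frame('login_frame')  # hypothetical frame name
driver.switch_to.default_content()

# 4. Accepting a pop-up alert when one is present
driver.switch_to.alert.accept()

# 5. Moving through the browser history
driver.back()
driver.forward()

# 6. Cookie handling
driver.add_cookie({'name': 'key', 'value': 'value'})
print(driver.get_cookies())

driver.quit()

# 7. Setting the User-Agent in the PhantomJS request headers
caps = dict(webdriver.DesiredCapabilities.PHANTOMJS)
caps['phantomjs.page.settings.userAgent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
phantom = webdriver.PhantomJS(desired_capabilities=caps)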

Waits

1. Explicit waits

An explicit wait blocks until one of the following built-in expected conditions is satisfied, or a timeout expires:

Built-in method                          Purpose
title_is                                 the page title matches a given string exactly
title_contains                           the page title contains a given substring
presence_of_element_located              an element has been added to the DOM tree
visibility_of_element_located            a located element is visible
visibility_of                            a given element is visible
presence_of_all_elements_located         at least one matching element is present in the DOM tree
text_to_be_present_in_element            an element's text contains a given string
text_to_be_present_in_element_value      an element's value attribute contains a given string
frame_to_be_available_and_switch_to_it   a frame is available and can be switched to
invisibility_of_element_located          an element is absent from the DOM tree or invisible
element_to_be_clickable                  an element is visible and enabled
staleness_of                             waits until an element is removed from the DOM tree
element_to_be_selected                   an element is selected, typically in a drop-down list
element_located_to_be_selected           a located element is selected, typically in a drop-down list
element_selection_state_to_be            an element's selection state matches the expected state
element_located_selection_state_to_be    a located element's selection state matches the expected state
alert_is_present                         an alert box is present on the page
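
A minimal usage sketch of an explicit wait, reusing the toCity input from the Qunar example below:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get('http://hotel.qunar.com/')
try:
    # Poll every 0.5s (the default) for up to 10s until the destination box
    # is in the DOM; raises TimeoutException if the condition never holds
    ele_toCity = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.NAME, 'toCity'))
    )
    print(ele_toCity.tag_name)
finally:
    driver.quit()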

2. Implicit waits

3. Thread sleep
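
For comparison, a short sketch of the other two strategies: an implicit wait is configured once on the driver, while a thread sleep blocks unconditionally:

import time
from selenium import webdriver

driver = webdriver.Firefox()

# Implicit wait: every subsequent find_element* call polls the DOM for up
# to 10 seconds before raising NoSuchElementException
driver.implicitly_wait(10)

# Thread sleep: blocks the crawler outright, whether or not the page has
# finished rendering -- the bluntest of the three strategies
time.sleep(5)

driver.quit()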

Dynamic crawler 2: scraping Qunar.com

Locate the search form, then fill in the destination city and the check-in/check-out dates:

ele_toCity=driver.find_element_by_name('toCity')
ele_fromDate=driver.find_element_by_id('fromDate')
ele_toDate=driver.find_element_by_id('toDate')
ele_search=driver.find_element_by_class_name('search-btn')
ele_toCity.clear()
ele_toCity.send_keys(to_city)
ele_toCity.click()
ele_fromDate.clear()
ele_fromDate.send_keys(fromdate)
ele_toDate.clear()
ele_toDate.send_keys(todate)

Inside the page-turning loop, wait until the title contains the destination city before parsing (the break exits that loop on timeout):

try:
    WebDriverWait(driver,10).until(
        EC.title_contains(to_city)
    )
except Exception as e:
    print(e)
    break
time.sleep(5)

js="window.scrollTo(0,document.body.scrollHeight);"
driver.execute_script(js)
time.sleep(5)
    htm_const=driver.page_source

Parse the hotel entries out of the snapshot and append them, one line each, to a per-city file:

soup=BeautifulSoup(htm_const,'html.parser')
infos=soup.find_all(class_='item_hotel_info')
f=codecs.open(to_city+fromdate+'.html','a','utf-8')
for info in infos:
    f.write(str(page_num)+'--'*50)
    content=info.get_text().replace(" ","").replace("\t","").strip()
    for line in [ln for ln in content.splitlines() if ln.strip()]:
        f.write(line)
        f.write('\r\n')
f.close()

Finally, wait for the "next page" button to become visible and click it:

next_page=WebDriverWait(driver,10).until(
    EC.visibility_of(driver.find_element_by_css_selector(".item.next"))
)
next_page.click()

Putting it all together:

import codecs
import datetime
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class QunaerSpider():
    def get_hotel(self,driver,to_city,fromdate,todate):
        ele_toCity=driver.find_element_by_name('toCity')
        ele_fromDate=driver.find_element_by_id('fromDate')
        ele_toDate=driver.find_element_by_id('toDate')
        ele_search=driver.find_element_by_class_name('search-btn')
        ele_toCity.clear()
        ele_toCity.send_keys(to_city)
        ele_toCity.click()
        ele_fromDate.clear()
        ele_fromDate.send_keys(fromdate)
        ele_toDate.clear()
        ele_toDate.send_keys(todate)
        ele_search.click()
        page_num=0
        while True:
            try:
                WebDriverWait(driver,10).until(
                    EC.title_contains(to_city)
                )
            except Exception as e:
                print(e)
                break
            time.sleep(5)

            js="window.scrollTo(0,document.body.scrollHeight);"
            driver.execute_script(js)
            time.sleep(5)
            htm_const=driver.page_source
            soup=BeautifulSoup(htm_const,'html.parser')
            infos=soup.find_all(class_='item_hotel_info')
            f=codecs.open(to_city+fromdate+'.html','a','utf-8')
            for info in infos:
                f.write(str(page_num)+'--'*50)
                content=info.get_text().replace(" ","").replace("\t","").strip()
                for line in [ln for ln in content.splitlines() if ln.strip()]:
                    f.write(line)
                    f.write('\r\n')
            f.close()
            try:
                next_page=WebDriverWait(driver,10).until(
                EC.visibility_of(driver.find_element_by_css_selector(".item.next"))
                )
                next_page.click()
                page_num+=1
                time.sleep(10)
            except Exception as e:
                print(e)
                break

    def crawl(self,root_url,to_city):
        today=datetime.date.today().strftime('%Y-%m-%d')
        tomorrow=datetime.date.today()+datetime.timedelta(days=1)
        tomorrow=tomorrow.strftime('%Y-%m-%d')
        driver=webdriver.Firefox()
        driver.set_page_load_timeout(50)
        driver.get(root_url)
        driver.maximize_window()
        driver.implicitly_wait(10)
        self.get_hotel(driver,to_city,today,tomorrow)

spider=QunaerSpider()
spider.crawl('http://hotel.qunar.com/',u'厦门')
