Python 利用 pyquery 库实现爬取豆瓣电影排行 Top250 并存储到 MySQL 数据库中

需要的库:
pyquery
requests
time
re
pymysql

比较简单,所以直接上源码:

from pyquery import PyQuery as pq
import requests
import time
import re
import pymysql

# Request headers passed to requests.get by the fetch functions in this
# script; the User-Agent is a desktop Firefox string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) '
        'Gecko/20100101 Firefox/71.0'
    ),
}





def get_one_page(url, n, timeout=10):
    """Fetch one listing page and crawl every film linked from it.

    Each ``div.hd`` element on the page is treated as one film entry; the
    ``href`` of its child ``<a>`` is handed to ``get_one_film`` together
    with a sequential id.

    Args:
        url: URL of the listing page.
        n: Zero-based page index. Films on the page receive the ids
            ``n * 25 + 1``, ``n * 25 + 2``, ... in document order.
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        None. If the listing page cannot be fetched, a message is printed
        and the page is skipped, mirroring how ``get_one_film`` handles a
        failed request.
    """
    try:
        # Without a timeout a stalled connection would block the crawl
        # indefinitely.
        res = requests.get(url, headers=headers, timeout=timeout)
        # Do not parse an HTTP error page as if it were a listing.
        res.raise_for_status()
    except requests.RequestException as exc:
        print("链接失败,跳过链接", exc)
        return

    doc = pq(res.text)
    for position, item in enumerate(doc('div.hd').items(), start=1):
        fid = n * 25 + position
        one_url = item.children('a').attr('href')
        get_one_film(one_url, fid)
        # Pause between detail-page requests.
        time.sleep(0.8)


# Column order of the ``top250`` table; it matches the column list of the
# INSERT statement in _save_film.
_FILM_COLUMNS = (
    'id', 'title', 'director', 'scriptwriter', 'actor', 'type', 'region',
    'language', 'date', 'duration', 'score', 'intr', 'poster',
)


def _parse_film(doc, html, fid):
    """Build the film record from a parsed detail page.

    Args:
        doc: PyQuery document of the detail page.
        html: Raw HTML text of the same page (used for the regex fields).
        fid: Sequential id assigned to the film by the caller.

    Returns:
        A dict keyed by the names in ``_FILM_COLUMNS``.
    """
    director_links = doc('[rel*="directedBy"]')
    return {
        'id': fid,
        'title': doc('[property*="itemreviewed"]').text(),
        'director': director_links.text(),
        # Located relative to the director link: two parents up, then two
        # siblings forward. [3:] drops the first three characters --
        # presumably the field label such as "编剧:"; confirm against the
        # page markup.
        'scriptwriter': (
            director_links.parent().parent().next().next().text()[3:].strip()
        ),
        # Same three-character prefix removal as for the scriptwriter.
        'actor': doc('#info>.actor').text()[3:].strip(),
        'type': doc('[property="v:genre"]').text(),
        # Region and language have no dedicated attribute to select on, so
        # they are cut out of the raw HTML between the label and the next
        # <br/>. Stripped for consistency with the other text fields.
        'region': pat_findall(r'制片国家/地区:</span>(.*?)<br\/>', html).strip(),
        'language': pat_findall(r'语言:</span>(.*?)<br\/>', html).strip(),
        'date': doc('[property="v:initialReleaseDate"]').text(),
        'duration': doc('[property="v:runtime"]').text(),
        'score': doc('[property="v:average"]').text(),
        'intr': doc('[property="v:summary"]').text(),
        'poster': doc('[class="nbgnbg"]>img').attr('src'),
    }


def _save_film(film):
    """Insert one film record into the ``top250`` table of ``douban``.

    Args:
        film: Dict containing every key listed in ``_FILM_COLUMNS``.

    Returns:
        True if the row was committed, False if connecting or inserting
        failed (the failure is printed, not raised).
    """
    sql = 'INSERT INTO top250(id,title,director,scriptwriter,actor,type,region,language,date,duration,score,intr,poster) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    try:
        # An explicit charset keeps Chinese text from failing to encode on
        # PyMySQL versions whose default connection charset is latin1.
        db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                             password='', db='douban', charset='utf8mb4')
    except pymysql.MySQLError as exc:
        print('写入失败', exc)
        return False

    try:
        with db.cursor() as cursor:
            cursor.execute(sql, tuple(film[col] for col in _FILM_COLUMNS))
        db.commit()
    except Exception as exc:
        # Per-record boundary of a best-effort crawl: report and move on
        # instead of aborting the remaining films.
        db.rollback()
        print('写入失败', exc)
        return False
    finally:
        # Always release the connection, including on failure.
        db.close()

    # Reported only after the commit has actually succeeded.
    print('写入成功')
    return True


def get_one_film(url, fid):
    """Fetch one film detail page, print its fields and store them in MySQL.

    The page is downloaded with the module-level ``headers``, parsed with
    pyquery, printed as a dict, and inserted as one row into ``top250``.
    Download or parse failures are printed and the film is skipped; database
    failures are printed as well. Nothing is raised for either case.

    Args:
        url: URL of the film's detail page.
        fid: Sequential id to store as the row's ``id``.

    Returns:
        None.
    """
    print(fid)
    try:
        # The previous version created a requests session it never used;
        # requests.get already opens and closes its own session per call.
        res = requests.get(url, headers=headers, timeout=10)
        # Skip error or block pages instead of storing an empty record.
        res.raise_for_status()
        html = res.text
        doc = pq(html)
    except Exception as exc:
        # Exception rather than a bare except, so Ctrl-C still stops the
        # crawl while any fetch or parse error skips just this film.
        print("链接失败,跳过链接", exc)
        return

    film = _parse_film(doc, html, fid)
    print(film)
    print('\n')

    _save_film(film)
    
    
def pat_findall(reg, reg_str):
    """Return the first match of ``reg`` in ``reg_str``, or ``"unknow"``.

    The pattern is compiled with ``re.S`` so ``.`` also matches newlines.
    The returned value is the first element of ``findall``: the single
    group's text, a tuple of groups when the pattern has several groups, or
    the whole match when it has none.

    Args:
        reg: Regular expression source.
        reg_str: Text to search.
    """
    matches = re.compile(reg, re.S).findall(reg_str)
    return matches[0] if matches else "unknow"
    

def main():
    """Crawl the ten listing pages at offsets 0, 25, ..., 225.

    Each page URL is built from its offset and passed to ``get_one_page``
    together with the zero-based page index.
    """
    for page_index in range(10):
        offset = str(page_index * 25)
        page_url = "".join(
            ("https://movie.douban.com/top250?start=", offset, "&filter=")
        )
        get_one_page(page_url, page_index)

# Unguarded top-level call: the crawl starts whenever this module is
# executed, and also if it is imported.
main()

猜你喜欢

转载自www.cnblogs.com/codexlx/p/12502131.html