Backing up a Sohu blog with Python

It is no secret that blogs are on their way out, and Sohu Blog could be shut down any day now. The two scripts below use Python to back up the old posts into a local SQLite database.

1. Crawl the entry list pages and save each post's link to the database
# -*- coding:utf-8 -*-

import urllib.request
from urllib import request
from bs4 import BeautifulSoup
import sqlite3

domain = "TTTT"  # change this to your blog's subdomain
url = "http://"+domain+".blog.sohu.com/entry/"
urlFile = urllib.request.urlopen(url)
data = urlFile.read()
urlFile.close()
data = data.decode('utf-8',errors='ignore')
print("get page success")
pre = "var _ebi = \'"
index1 = data.find(pre) + len(pre)
index2 = data.find('\'', index1)

ebi = data[index1:index2]
print("ebi:"+ebi)

pre = "var totalCount = "
index1 = data.find(pre) + len(pre)
index2 = data.find(';', index1)
totalCount = int(data[index1:index2])
print("totalcount:" + str(totalCount))
# each list page holds 20 entries
if totalCount % 20 > 0:
    totalPage = str(totalCount // 20 + 1)
else:
    totalPage = str(totalCount // 20)
print("totalpage:" + totalPage)

    
def getBlogList(pageId):
    url = "http://"+domain+".blog.sohu.com/action/v_frag-ebi_"+ebi+"-pg_"+pageId+"/entry/"

    print("get url:"+url)
    # 1. Fetch the list page HTML
    with request.urlopen(url) as f:
        html_doc = f.read()
        html_doc = html_doc.decode('utf-8', errors='ignore')

    # 2. Parse the page for entry titles and links; the relevant markup is roughly:
    # <div class="newBlog-list-title">
    #     <a href="...entry link..." target="_blank">...entry title...</a>
    # </div>
    soup = BeautifulSoup(html_doc, "html.parser")
    news_array = soup.find_all('div', {'class': 'newBlog-list-title'})
    for news in news_array:
        if news.a:
            print(news.a.get("href"))  # entry link
            save(news.a.get("href"))
        #print(news.a.string)  # entry title

def save(link, title=None):
    if title is None:
        title = ""
    conn = sqlite3.connect('blog.db')
    cursor = conn.cursor()
    # Create the blog table if it does not exist yet
    cursor.execute('create table IF NOT EXISTS blog (id INTEGER PRIMARY KEY, title varchar(100), link varchar(100), content text, postdate varchar(100), status Integer)')
    cursor.execute('select * from blog where link=?', (link,))
    values = cursor.fetchall()
    if len(values) > 0:
        # this link was saved before
        print('link already exists: '+link)
    else:
        # status=0 means the post content has not been fetched yet
        cursor.execute('insert into blog (title, link, status) values (?, ?, 0)', (title, link))
        conn.commit()
        print("save success. "+link)
    # close the cursor, commit, and close the connection
    cursor.close()
    conn.commit()
    conn.close()
    
errorLink = []  # list pages that failed to download
for x in range(1, int(totalPage)+1):  # pages 1..totalPage inclusive
    try:
        getBlogList(str(x))
    except Exception as e:
        print('except:', e)
        errorLink.append(x)
print("errorLink:"+str(errorLink))
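
After this first script finishes, every entry link should be sitting in blog.db with status=0 (the content itself comes in step 2). A quick sanity-check sketch (not part of the original post), assuming the script above has already run and created blog.db in the working directory:

# -*- coding:utf-8 -*-
import sqlite3

# Count the collected links and show a few of them, straight from blog.db.
conn = sqlite3.connect('blog.db')
cursor = conn.cursor()
cursor.execute('select count(*) from blog')
print("saved links:", cursor.fetchone()[0])
cursor.execute('select link from blog limit 5')
for (link,) in cursor.fetchall():
    print(link)
conn.close()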




2. Crawl each entry's content page and save the content into the database
# -*- coding:utf-8 -*-

from bs4 import BeautifulSoup
import urllib.request
# SQLite driver
import sqlite3

	
def updateContent():
    conn = sqlite3.connect('blog.db')
    cursor = conn.cursor()
    # fetch every entry whose content has not been downloaded yet
    cursor.execute('select * from blog where status=0')
    values = cursor.fetchall()

    for line in values:
        # row columns: id, title, link, content, postdate, status
        id = line[0]
        link = line[2]

        soup = getContent(link)

        try:
            title = soup.find('div', {'class': 'item-body'}).h2.span.get_text()
            postdate = soup.find('span', {'class': 'date'}).get_text()
            content = str(soup.find('div', {'class': 'item-content'}))
            # Trim the wrapper markup: skip the opening <div ...> tag
            # (the first 45 characters) and cut before the trailing clear div
            end = "<div class=\"clear\"></div>"
            content = content[45:content.find(end)]

            print(link)
            cursor.execute('update blog set title=?,content=?,status=1,postdate=? where id=?', (title, content, postdate, id))
            conn.commit()
        except Exception as e:
            print('except:', e)
    cursor.close()
    conn.commit()
    conn.close()


# Fetch and parse the page at the given link
def getContent(link):
    # 1. Download the page HTML
    html_doc = ""
    # Build the request headers; at minimum a User-Agent and a Referer,
    # as observed in captured requests to the blog
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0',
               'Referer': link}

    try:
        req = urllib.request.Request(link, None, headers)
        html_doc = urllib.request.urlopen(req).read()
    except Exception as e:
        print('except:', e)

    # 2. Parse the HTML and return the soup for the caller to pick apart
    soup = BeautifulSoup(html_doc, "html.parser")
    return soup

# Go through every entry that has no content yet and fill the content in
updateContent()
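
Once both scripts have run, the complete backup sits in blog.db. If you also want each post as a standalone file, a minimal export sketch (not part of the original post; the backup/ directory and id-based file names are my own choices):

# -*- coding:utf-8 -*-
import os
import sqlite3

# Dump every fetched entry (status=1) into backup/<id>.html.
os.makedirs('backup', exist_ok=True)
conn = sqlite3.connect('blog.db')
cursor = conn.cursor()
cursor.execute('select id, title, content, postdate from blog where status=1')
for id, title, content, postdate in cursor.fetchall():
    path = os.path.join('backup', str(id) + '.html')
    with open(path, 'w', encoding='utf-8') as f:
        f.write('<h1>' + (title or '') + '</h1>\n')
        f.write('<p>' + (postdate or '') + '</p>\n')
        f.write(content or '')
conn.close()
print('export done')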

Reposted from mushme.iteye.com/blog/2278576