Crawl All Blog Posts

Crawl the content of every blog post and convert it to PDF format.

import re

import requests
from bs4 import BeautifulSoup
import pdfkit  # for converting the saved HTML pages to PDF (not called in this listing)


def getPagehtml(url):  # fetch the HTML source of a page
    response = requests.get(url)
    return response.text
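
# Note: if CSDN returns an incomplete page for the default requests User-Agent,
# a hedged variant of getPagehtml with a browser-style header can help
# (the header value below is an assumption, not part of the original post):
# def getPagehtml(url):
#     headers = {'User-Agent': 'Mozilla/5.0'}
#     response = requests.get(url, headers=headers)
#     response.encoding = response.apparent_encoding
#     return response.text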


def createurl(text):  # extract every article URL from the page source
    '''
    Matches article links of the form:
    <a href="https://blog.csdn.net/qq_41911569/article/details/83034422" target="_blank"><span class="article-type type-1">原</span>爬取猫眼电影</a>
    :param text: page source returned by getPagehtml
    :return: list of article URLs
    '''
    pattern = r'<a href="(https://blog.csdn.net/qq_41911569/article/.*?)" target="_blank">'
    return re.findall(pattern,text)

# quick check: fetch the first list page and print the extracted article URLs
url = 'https://blog.csdn.net/qq_41911569'
text = getPagehtml(url)
print(createurl(text))


def get_blog_content(i, url):  # fetch a single article by URL and save its content to a local file
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html5lib')
    # grab the <head> tag so the saved file keeps the original meta/CSS information
    head = soup.head
    # article title
    title = soup.find_all(class_="title-article")[0].get_text()
    # article body
    content = soup.find_all(class_="article_content")[0]
    # write the article to a local HTML file
    with open('/home/kiosk/Desktop/python笔记/python_stack/day26/bs/westos%d.html' % i, 'w', encoding='utf-8') as f:
        f.write(str(head))
        f.write('<h1>%s</h1>\n\n' % title)
        f.write(str(content))

def main():
    # the article list spans 3 pages, e.g. https://blog.csdn.net/qq_41911569/article/list/3
    article_url = []
    for i in range(3):
        url = 'https://blog.csdn.net/qq_41911569/article/list/%d' % (i+1)
        text = getPagehtml(url)
        article_url.append(createurl(text))
    # flatten the per-page URL lists into a single list
    article_url = [j for i in article_url for j in i]

    # print(article_url)
    # de-duplicate the URLs and download each article
    for i, v in enumerate(set(article_url)):
        get_blog_content(i,v)


if __name__ == '__main__':
    main()
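
The listing above only saves each article as a local HTML file; pdfkit is imported but never called. A minimal sketch of the missing HTML-to-PDF step, assuming wkhtmltopdf is installed on the system and the files were written to the directory used above (the glob pattern and the output filename are assumptions):

import glob
import pdfkit

# collect the saved article pages and merge them into one PDF
html_files = sorted(glob.glob('/home/kiosk/Desktop/python笔记/python_stack/day26/bs/westos*.html'))
pdfkit.from_file(html_files, 'blog.pdf')  # pdfkit.from_file accepts a list of input files

pdfkit is only a thin wrapper around the wkhtmltopdf command-line tool, so that binary has to be available on PATH for the conversion to work.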

Result:
[screenshot of the result]

Reposted from blog.csdn.net/qq_41911569/article/details/83044467