A Worked Example: Crawling 文秘站 (cnwmz.com) - Python Crawler (3)

1. Tech Stack


Simple crawling: bs4 + requests + re
Multithreaded crawling: the above, plus the threading and queue modules

2. Crawling Steps


The code in this article is kept basic and implements only the core functionality.

A crawler mostly works like this: spot patterns in the site's URLs, request those URLs to get the page text back, and then parse that text with various techniques to pull out the content you want.
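As a tiny illustration of that fetch-and-parse loop (the URL below is a placeholder and the tags are generic, not this site's actual structure), requests fetches the page and BeautifulSoup extracts the pieces you care about:

import requests
from bs4 import BeautifulSoup

# Placeholder page; swap in a real URL whose structure you have inspected
url = 'http://example.com/list/index.html'
headers = {'user-agent': 'Mozilla/5.0'}
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
r.encoding = 'utf-8'

# Parse the returned HTML and print every link on the page
soup = BeautifulSoup(r.text, 'html.parser')
for a in soup.find_all('a'):
    href = a.get('href')
    if href:
        print(href)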

2.1 Collecting all of the site's article links


First, all of the site's article links are saved to a file; later runs read the links back from that file and fetch the article content.

The reason for this two-phase approach is that the site has far too much content to crawl in a single pass, so every article link is first collected and saved to a file.
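The link file is essentially one URL per line (the script below JSON-encodes each URL, which just wraps it in quotes; the reading side extracts the URL with a regex either way). A minimal sketch of that write-then-read-back pattern, with the file name and sample URL as placeholders:

# Phase 1: append each discovered article URL to the link file, one per line
with open('article_url1.txt', 'a', encoding='utf-8') as fw:
    fw.write('http://www.cnwmz.com/html/1234/567890.html' + '\n')  # sample URL

# Phase 2 (a later run): read the links back and crawl them one by one
with open('article_url1.txt', 'r', encoding='utf-8') as fr:
    for line in fr:
        url = line.strip()
        # ... fetch and parse url here ...

The full script that actually discovers and saves the links follows.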

import requests
import re
from bs4 import BeautifulSoup
import traceback
import json


# Fetch a url and return the page text; returns 404 on any failure
def get_html_text(url):
    try:
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        }
        # The User-Agent header tells the server what client software is making the request
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except:
        return 404


def get_nav_addr(lst, web_url):
    # Fetch the site map page, which links to every navigation section
    html = get_html_text(web_url + '/html/wenmimap.html')
    if html == 404:
        return
    soup = BeautifulSoup(html, "html.parser")
    link_list = soup.find_all('div', attrs={'class': 'list'})
    # Collect every navigation link listed on the site map
    for div in link_list:
        ul_list = div.find('ul')
        a = ul_list.find_all('a')
        for i in a:
            try:
                href = i.attrs['href']
                lst.append(href)
            except:
                traceback.print_exc()


def get_article_url(lst, web_url, path):
    fw = open(path, 'a', encoding='utf-8')
    nav_count = 0
    # Walk every navigation section and collect the link of each article in it
    for nav_link in lst:
        article_count = 0
        print(nav_link)
        html = get_html_text(web_url + nav_link)
        if html == 404:
            continue
        soup = BeautifulSoup(html, "html.parser")
        page = soup.find('h3', attrs={'class': 'list_page'})
        # Skip navigation pages that have no article list
        if page is None:
            continue
        min_page = page.find_all('a', string="首页")
        max_page = page.find_all('a', string="尾页")
        min_num = re.search(r'\d{1,4}', str(min_page[0]))
        max_num = re.search(r'\d{1,4}', str(max_page[0]))
        # With a single page the link may just be /index.html, so there is no page number to extract
        if min_num:
            min_num1 = int(min_num.group(0))
        else:
            min_num1 = 1
        if max_num:
            max_num1 = int(max_num.group(0))
        else:
            max_num1 = 1
        # Take the largest page number and walk every listing page of this section
        num = int(max(min_num1, max_num1))
        try:
            for i in range(num):
                # Listing pages look like http://www.cnwmz.com/html/jiguandanwei_tag/gongqingtuan/List_27.html
                r = get_html_text(web_url + nav_link + 'List_' + str(i + 1) + '.html')
                if r == 404:
                    continue
                soup = BeautifulSoup(r, "html.parser")
                # Each <dt> tag on a listing page holds one article link
                article = soup.find_all('dt')
                for article_list in article:
                    a = article_list.find('a')
                    href = a.attrs['href']
                    article_url = web_url + href
                    article_count += 1
                    # Append the article link to the output file
                    fw.write(json.dumps(article_url, ensure_ascii=False) + '\n')
            nav_count += 1
            print(article_count)
        except:
            traceback.print_exc()
    print(nav_count)
    fw.close()


def main():
    web_url = 'http://www.cnwmz.com'
    lst = []
    flist_url = 'F://wmz/article_url1.txt'
    # Collect the navigation section links from the site map
    get_nav_addr(lst, web_url)
    # Collect every article link and append it to the link file
    get_article_url(lst, web_url, flist_url)


if __name__ == '__main__':
    main()

2.2.1 Extracting the target content


Now that all the links have been collected, the next step is to read them back from the file one by one, extract the wanted content with bs4 + re, and save it to an output file. With that, the basic crawl of the site is essentially complete.
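Each crawled article ends up as one JSON object per line in the output file, with the keys url / title / content / class / tag used by the script below, so the results can later be loaded back record by record. A minimal sketch, with the file name standing in for one of the batch outputs:

import json

# Read the JSON-lines output back: one dict per line
with open('F://wmz/article_info_0_100000.json', 'r', encoding='utf-8') as fr:
    for line in fr:
        record = json.loads(line)
        print(record['title'], record['url'])

The full extraction script is below.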

import json
import re
import traceback
import requests
from bs4 import BeautifulSoup


def get_html_text(url):
    try:
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        }
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except:
        # traceback.print_exc()
        return 404


def get_news_content(fpath, fcontent, fhistory_num):
    info_dict = {'url': '', 'title': '', 'content': '', 'class': '', 'tag': []}
    fh = open(fhistory_num, 'a', encoding='utf-8')
    fw = open(fcontent, 'a', encoding='utf-8')
    with open(fpath, 'r') as fr:
        print("Note:")
        print("\t1. If a run ends abnormally, enter the line number it stopped at plus 1 to continue crawling,")
        print("\t\te.g. if it failed at line 30, enter 31")
        print("\t2. The output file is opened in append mode, so it does not need to be cleaned up between runs")
        print("\t3. The file history_read_num.txt on the F drive records every line already processed; if you forget where you stopped, check its last entry and start from (last entry + 1)")
        num = input("Enter the line number to start from: ")

        n = int(num) - 1
        count = n
        for line in fr.readlines()[n:]:
            url = re.search(r'http.+\.html', line).group(0)
            print(url)
            count += 1
            try:
                r = get_html_text(url)
                if r == 404:
                    continue
                # Store the url in the dict
                info_dict['url'] = url
                soup = BeautifulSoup(r, 'html.parser')
                nav = soup.find('nav')
                nav_a = nav.find_all('a')
                nav_con = '您现位置:'
                for nav_content in nav_a:
                    nav_con += nav_content.string + '>'
                # Store the breadcrumb (category path) in the dict
                info_dict['class'] = nav_con + '>正文'
                article = soup.find('article', attrs={'id': 'con_l'})
                title = article.find('h1')
                # Store the title in the dict
                info_dict['title'] = str(title.string)
                section_tip = soup.find('section', attrs={'id': 'tip'})
                tip = section_tip.find('h2')

                tag = tip.find_all('a')
                tag_con = []
                for t in tag:
                    tag_con.append(t.string)
                # Store the tags in the dict
                info_dict['tag'] = tag_con

                # Start extracting the article body.
                # An article may span several pages, so each page has to be fetched in turn
                web_url = re.search(r'http://www.cnwmz.com/html/\d+/\d+', line)
                content = ''
                for j in range(20):
                    # Pages after the first are named <article>_2.html, <article>_3.html, ...;
                    # stop as soon as one of them cannot be fetched
                    if j != 0:
                        j += 1
                        wu = str(web_url.group(0)) + '_' + str(j) + '.html'
                        wu_text = get_html_text(wu)
                        if wu_text == 404:
                            break
                        soup = BeautifulSoup(wu_text, 'html.parser')
                    article_content = soup.find('section', attrs={'id': 'article'})
                    for desc in article_content.descendants:
                        dc = str(desc)
                        if str(desc.name) == 'a' or str(desc.name) == 'script' or dc == 'wm("arc");':
                            continue
                        if str(desc.name) == 'p' or str(desc.name) == 'br':
                            content += '\n'
                            continue
                        # Crude filter for pagination leftovers such as "上一页"/"下一页"
                        if re.match(r'[{p.*}上一页下一页<b>.*</b>]', dc):
                            continue
                        if dc == '1' or dc == '2' or dc == '3' or dc == '4' or dc == '5' or dc == '6':
                            continue
                        if dc == ' ':
                            continue
                        content += dc

                # Store the article body in the dict and write the record as one JSON line
                info_dict['content'] = content
                fw.write(json.dumps(info_dict, ensure_ascii=False) + '\n')
                fw.flush()
                # Record the line number that was just processed
                fh.write(str(count) + '\n')
                fh.flush()
            except:
                traceback.print_exc()
            print(count)
    fh.close()
    fw.close()


def main():
    fhistory_num = 'F://wmz/history_read_num.txt'
    # The links are processed in batches; batch files are named like XXXX_XXXX.txt
    flist_url = 'F://wmz/0_100000.txt'
    fname = re.search(r'\d+_\d+_?\d*_?\d*', flist_url)
    fcontent = "F://wmz/article_info_" + str(fname.group(0)) + ".json"
    # Extract the target content for this batch
    get_news_content(flist_url, fcontent, fhistory_num)


if __name__ == '__main__':
    main()

Key points:

  1. When parsing articles for specific content, always print progress and error output so you can see which pages fail. Use the failing pages to figure out where the code goes wrong; most of the time different article pages have slightly different structures, so a single parsing strategy will break on some of them and several cases need to be handled.
  2. Manually controlling the line at which the link file is read makes it easy to stop the program at any time and resume crawling from a given position (see the sketch below).
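Point 2 can be automated a little: history_read_num.txt receives one processed line number per line, so its last entry plus one is the line to resume from. A minimal sketch of that idea (assuming the file already exists; this snippet is an illustration, not part of the original script):

# Work out the resume position from the history file kept by the crawler
with open('F://wmz/history_read_num.txt', 'r', encoding='utf-8') as fh:
    entries = [line.strip() for line in fh if line.strip()]
resume_from = int(entries[-1]) + 1 if entries else 1
print('Resume crawling from line', resume_from)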

2.2.2 Optimizing crawl speed


The program above crawls in a single process and is too slow: roughly 100 articles per minute, which means crawling everything would take more than three days. That is clearly too long, so the program needs to be improved.

The version below uses Python multithreading instead, which raises throughput considerably: the whole site can be crawled in about a day. That is still somewhat slow, but it is a big improvement over the program above.
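The core of the multithreaded version is the standard worker pattern: the URLs go into a queue.Queue, and each Thread subclass keeps pulling from it until the queue is empty. Stripped of all the parsing, the skeleton looks roughly like this (Worker and the sample URL are simplified stand-ins; this skeleton drains the queue with get_nowait(), while the full script below checks empty() before get()):

from queue import Empty, Queue
from threading import Thread


class Worker(Thread):
    def __init__(self, url_queue):
        Thread.__init__(self)
        self.url_queue = url_queue

    def run(self):
        # Keep pulling URLs until the shared queue runs dry
        while True:
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break
            # ... fetch and parse url here ...
            print('crawled', url)


url_queue = Queue()
url_queue.put('http://www.cnwmz.com/html/1234/567890.html')  # sample URL

workers = [Worker(url_queue) for _ in range(5)]
for w in workers:
    w.start()
for w in workers:
    w.join()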

import json
import re
import traceback
from threading import Thread
from queue import Queue
import requests
from bs4 import BeautifulSoup


def get_html_text(url):
    try:
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        }
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        if r.status_code == 200:
            return r.text
    except:
        # traceback.print_exc()
        return 404


# Worker thread: pulls URLs off the queue, parses each article and appends it to the output file
class CrawlInfo(Thread):
    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        info_dict = {'url': '', 'title': '', 'content': '', 'class': '', 'tag': []}
        # Output file for this batch (hard-coded here), opened in append mode
        f = open('F://wmz/article_info_730000_737438.json', 'a', encoding='utf-8')
        while not self.url_queue.empty():
            url = self.url_queue.get()
            print(url)
            try:
                r = get_html_text(url)
                if r == 404:
                    continue
                # Store the url in the dict
                info_dict['url'] = url
                soup = BeautifulSoup(r, 'html.parser')
                nav = soup.find('nav')
                nav_a = nav.find_all('a')
                nav_con = '您现位置:'
                for nav_content in nav_a:
                    nav_con += nav_content.string + '>'
                # Store the breadcrumb (category path) in the dict
                info_dict['class'] = nav_con + '>正文'
                article = soup.find('article', attrs={'id': 'con_l'})
                title = article.find('h1')
                # Store the title in the dict
                info_dict['title'] = str(title.string)
                section_tip = soup.find('section', attrs={'id': 'tip'})
                tip = section_tip.find('h2')

                tag = tip.find_all('a')
                tag_con = []
                for t in tag:
                    tag_con.append(t.string)
                # Store the tags in the dict
                info_dict['tag'] = tag_con

                # Start extracting the article body.
                # An article may span several pages, so each page has to be fetched in turn
                web_url = re.search(r'http://www.cnwmz.com/html/\d+/\d+', url)
                content = ''

                for j in range(20):
                    # Pages after the first are named <article>_2.html, <article>_3.html, ...;
                    # stop as soon as one of them cannot be fetched
                    if j != 0:
                        j += 1
                        wu = str(web_url.group(0)) + '_' + str(j) + '.html'
                        wu_text = get_html_text(wu)
                        if wu_text == 404:
                            break
                        soup = BeautifulSoup(wu_text, 'html.parser')
                    article_content = soup.find('section', attrs={'id': 'article'})
                    for desc in article_content.descendants:
                        dc = str(desc)
                        if str(desc.name) == 'a' or str(desc.name) == 'script' or dc == 'wm("arc");':
                            continue
                        if str(desc.name) == 'p' or str(desc.name) == 'br':
                            content += '\n'
                            continue
                        # Crude filter for pagination leftovers such as "上一页"/"下一页"
                        if re.match(r'[{p.*}上一页下一页<b>.*</b>]', dc):
                            continue
                        if dc == '1' or dc == '2' or dc == '3':
                            continue
                        if dc == ' ':
                            continue
                        content += dc

                # Store the article body in the dict and write the record as one JSON line
                info_dict['content'] = content
                f.write(json.dumps(info_dict, ensure_ascii=False) + '\n')
                f.flush()
            except:
                # A page with an unexpected structure should not kill the whole worker thread
                traceback.print_exc()
        f.close()


def main():
    flist_url = 'F://wmz/0_100000.txt'

    # Queue holding the URLs waiting to be crawled
    url_queue = Queue()
    # Queue originally intended for the fetched pages (not used further here)
    html_queue = Queue()

    with open(flist_url, 'r', encoding='utf-8') as fr:
        num = input("Enter the line number to start from: ")
        n = int(num) - 1
        for line in fr.readlines()[n:]:
            print(line)
            url = re.search(r'http.+\.html', line).group(0)
            # Add the url to the queue
            url_queue.put(url)
    crawl_list = []
    # Spawn 50 crawler threads that work through the queue concurrently
    for i in range(0, 50):
        crawler = CrawlInfo(url_queue, html_queue)
        crawl_list.append(crawler)
        crawler.start()

    # Wait for every crawler thread to finish before the main thread exits
    for crawl in crawl_list:
        crawl.join()


if __name__ == '__main__':
    main()
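One design point worth noting in this version: all 50 threads open the same output file in append mode and write to it independently. Whether those buffered appends can ever interleave depends on buffering and the platform, so a safer variation is to share a single file object and guard each write with a threading.Lock. A minimal sketch of that variation (save_record, write_lock, and the demo file name are added names, not part of the original script):

import json
from threading import Lock, Thread

write_lock = Lock()  # one lock shared by every writer thread


def save_record(f, record):
    # Serialize writes so that lines from different threads never interleave
    with write_lock:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')
        f.flush()


# Tiny demo: several threads appending to the same file through the lock
with open('lock_demo.jsonl', 'a', encoding='utf-8') as f:
    threads = [Thread(target=save_record, args=(f, {'n': i})) for i in range(5)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()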

With that, crawling the article content of 文秘站 is complete. Hopefully this example can be adapted to scrape the specific content you need from other sites.


[Further reading]:

  1. How to fix garbled text in output files
  2. The re library and regular expressions - Python
  3. The Beautiful Soup library - Python Crawler (2)
  4. The requests library - Python Crawler (1)

Reposted from blog.csdn.net/qq_36852780/article/details/104505141