以下是一个针对菜鸟教程 Java 修饰符类型页面的爬虫代码,包含主页内容提取和子页面抓取功能:
python
复制
import requests from bs4 import BeautifulSoup import os import time from urllib.parse import urljoin # 配置参数 BASE_URL = 'https://www.runoob.com/java/java-modifier-types.html' OUTPUT_DIR = 'java_modifiers_data' HEADERS = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } def init_directory(): """创建输出目录""" if not os.path.exists(OUTPUT_DIR): os.makedirs(OUTPUT_DIR) def get_page_content(url): """获取页面内容""" try: response = requests.get(url, headers=HEADERS) response.raise