安装python的excel库:
pip install pandas openpyxl
展示:
代码:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
# 设置 Chrome 浏览器选项
chrome_options = Options()
chrome_options.add_argument("--start-maximized") # 启动时最大化窗口
# 指定 ChromeDriver 路径
driver_path = r'D:\software\chromedriver\chromedriver127\chromedriver-win64\chromedriver.exe'
# 定义公共常量
BASE_URL = "http://www.***.com"
TARGET_XPATH = "/html/body/div[6]/ul[1]/li/font[1]/b"
SUCCESS_TEXT = "5"
NEW_PAGE_URL = "http://www.***.com/Home/Category/index/fClass/295.html"
LI_TARGET_XPATH = "//ul[@class='class_gdlist_frm']//li"
# 创建 Service 对象并传入 ChromeDriver 路径
service = Service(executable_path=driver_path)
# 启动浏览器并打开百度首页
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.get(BASE_URL)
# 等待页面加载完成,假设等待某个元素加载完成
# 企查查已陪伴您1428天
WebDriverWait(driver, 600).until(
EC.presence_of_element_located((By.XPATH, TARGET_XPATH))
)
# 获取目标元素
element = driver.find_element(By.XPATH, TARGET_XPATH)
# 获取并打印该元素的文本内容
element_text = element.text
if element_text == SUCCESS_TEXT:
driver.get(NEW_PAGE_URL) # 跳转到新页面
# 等待页面加载完成
time.sleep(2) # 等待 2 秒以确保页面加载
# 查找所有商品的 li 项目
items = driver.find_elements(By.XPATH, LI_TARGET_XPATH)
# 初始化一个空列表,用来存储商品数据
data = []
# 遍历每个 li 项目,提取数据
for item in items:
try:
# 获取标题
title_element = item.find_element(By.XPATH, ".//a[@title]")
title = title_element.get_attribute("title")
# 获取价格
price_element = item.find_element(By.XPATH, ".//span//big")
price = price_element.text
# 获取图片链接
image_element = item.find_element(By.XPATH, ".//img")
image_url = image_element.get_attribute("src")
# 获取商品详情链接
detail_link_element = item.find_element(By.XPATH, ".//a[@title]")
detail_link = detail_link_element.get_attribute("href")
# 将数据添加到列表中
data.append({
'标题': title,
'价格': price,
'图片': image_url,
'超链接': detail_link
})
# 打印提取的数据
print(f"标题: {title}")
print(f"价格: {price}")
print(f"图片: {image_url}")
print(f"超链接: {detail_link}")
print("-" * 50) # 分隔符
except Exception as e:
print(f"Error occurred: {e}")
# 创建 DataFrame,并将数据保存为 Excel 文件
df = pd.DataFrame(data)
# 保存为 Excel 文件
df.to_excel('products.xlsx', index=False, engine='openpyxl')
# 提示保存成功
print("数据已成功保存到 products.xlsx")
# 等待一段时间,保持浏览器开启
input("Press Enter to close the browser...")
# 关闭浏览器
# driver.quit()