引入jar包(Maven依赖):
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>3.141.59</version>
</dependency>
Java代码:
package com.jd.demo;
import org.openqa.selenium.By;
import org.openqa.selenium.JavascriptExecutor;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeOptions;
import java.util.ArrayList;
import java.util.List;
/**
 * Demo crawler: drives Chrome through Selenium WebDriver, searches jd.com for a keyword and
 * prints the product names found on the first {@code PAGE_COUNT} result pages.
 *
 * @author zqq
 * @date 2020/9/2 15:57
 */
public class Demo {

    /**
     * Whether Chrome is started with {@code --headless}. {@code false} keeps the browser window
     * visible; flip to {@code true} to run without a window.
     */
    private static final boolean HEADLESS = false;

    /**
     * Number of result pages to crawl.
     */
    private static final int PAGE_COUNT = 2;

    /**
     * Shared WebDriver; created in the static initializer and quit at the end of {@link #main}.
     */
    private static WebDriver driver = null;

    /**
     * Product names collected so far, in crawl order.
     */
    private static final List<String> gList = new ArrayList<>();

    static {
        // Path to the chromedriver executable (machine-specific).
        System.setProperty("webdriver.chrome.driver", "C:\\Users\\44141\\Desktop\\chromedriver\\chromedriver.exe");
        ChromeOptions chromeOptions = new ChromeOptions();
        if (HEADLESS) {
            // Run Chrome without opening a browser window.
            chromeOptions.addArguments("--headless");
        }
        // Always hand the options to the driver so the HEADLESS flag actually takes effect.
        driver = new ChromeDriver(chromeOptions);
    }

    /**
     * Runs the crawl and always quits the browser afterwards.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            demo1();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (driver != null) {
                driver.quit();
            }
        }
    }

    /**
     * Opens jd.com, searches for the keyword, collects product names from {@code PAGE_COUNT}
     * result pages and prints everything that was collected.
     */
    private static void demo1() {
        driver.get("https://www.jd.com/");
        // Type the search keyword into the search box.
        driver.findElement(By.id("key")).sendKeys("衣服");
        // Click the search button.
        driver.findElement(By.cssSelector("button.button")).click();

        // Pages are crawled sequentially; a failure on one page is reported and the loop goes on.
        for (int page = 0; page < PAGE_COUNT; page++) {
            try {
                getGoods();
                // Only move on when another page is still going to be read.
                if (page < PAGE_COUNT - 1) {
                    clickNextPage();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }

        for (int i = 0; i < gList.size(); i++) {
            System.out.println("爬取商品" + (i + 1) + ":========:" + gList.get(i));
        }
        System.out.println("========end==========");
    }

    /**
     * Collects the product names shown on the current result page into {@code gList}.
     * List items that contain no {@code p-name} element are skipped instead of aborting the page.
     */
    private static void getGoods() {
        // Fixed wait for the page to respond; tune to the actual response time.
        sleep(2);
        // Scroll to the bottom so the remaining items of the page get loaded.
        ((JavascriptExecutor) driver).executeScript("window.scrollTo(0,document.body.scrollHeight)");
        // The lower part of the list can load slowly; tune to the actual network speed.
        sleep(3);

        List<WebElement> items = driver.findElements(By.xpath("//div[@id='J_goodsList']/ul/li"));
        for (WebElement item : items) {
            // findElements returns an empty list (rather than throwing) when nothing matches.
            List<WebElement> names = item.findElements(By.className("p-name"));
            if (names.isEmpty()) {
                continue;
            }
            gList.add(names.get(0).getText());
        }
    }

    /**
     * Clicks the "next page" link of the result list.
     */
    private static void clickNextPage() {
        driver.findElement(By.className("pn-next")).click();
    }

    /**
     * Blocks the current thread to give the page time to respond.
     *
     * @param i number of seconds to sleep
     */
    private static void sleep(int i) {
        try {
            // long arithmetic so large second counts cannot overflow int
            Thread.sleep(i * 1000L);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
        }
    }
}
输出: