实现技术:Jsoup+Httpclient+FileUtils+StringUtils
Jsoup:html解析工具,功能强大,使用方法类似于jquery
一:导入maven依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
<dependency>
<!-- 正确的 commons-io 坐标是 commons-io:commons-io;
     org.apache.commons:commons-io:1.3.2 是一个错误发布的制品,不要使用 -->
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.0</version>
</dependency>
二:爬虫程序入口
package com.debo.crawler;
public class Main {

    /**
     * Application entry point: constructs the crawler and starts the crawl.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the crawl fails unexpectedly
     */
    public static void main(String[] args) throws Exception {
        Cawler crawler = new Cawler();
        crawler.start();
    }
}
三:爬虫代码
package com.debo.crawler;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Cawler {

    /** Listing page whose &lt;img&gt; tags are scraped for downloadable image URLs. */
    private String url = "http://www.umei.cc/weimeitupian/oumeitupian/shuaige.htm";

    /**
     * Crawl entry point: fetches the listing page, extracts every absolute
     * image URL from its &lt;img&gt; tags, and downloads each one to D://download//.
     */
    public void start() {
        // 使用httpclient的get请求获取网页源代码
        String html = doGet(url);
        if (StringUtils.isBlank(html)) {
            // doGet returns null on failure; bail out instead of letting
            // Jsoup.parse(null) throw an NPE.
            System.out.println("页面获取失败:" + url);
            return;
        }
        // jsoup解析源代码
        Document document = Jsoup.parse(html);
        // 获取源代码中所有的img标签
        Elements elements = document.select("img");
        // 遍历,读取图片地址,并下载
        for (Element e : elements) {
            String src = e.attr("src");
            // Only absolute URLs are downloadable. Accept both http and https —
            // the original "http://"-only check silently skipped https images.
            if (StringUtils.isNotBlank(src)
                    && (StringUtils.startsWith(src, "http://")
                        || StringUtils.startsWith(src, "https://"))) {
                System.out.println("下载地址:" + src);
                DownUtils.downFile(src, "D://download//");
            }
        }
        System.out.println("下载完毕");
    }

    /**
     * Issues an HTTP GET and returns the response body decoded as UTF-8.
     *
     * @param url the URL to fetch
     * @return the response body, or {@code null} if the request failed
     */
    public String doGet(String url) {
        HttpGet httpGet = new HttpGet(url);
        // try-with-resources closes both the client and the response —
        // the original leaked both on every call.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
                CloseableHttpResponse response = httpClient.execute(httpGet)) {
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
四:下载工具类DownUtils
package com.debo.crawler;
import java.io.File;
import java.net.URL;
import org.apache.commons.io.FileUtils;
public class DownUtils {

    /** Utility class — not instantiable. */
    private DownUtils() {
    }

    /**
     * Downloads the resource at {@code url} into directory {@code dir},
     * naming the file after the last path segment of the URL.
     * Failures are logged and swallowed (best-effort, matching caller usage).
     *
     * @param url absolute URL of the file to download
     * @param dir target directory; created (including parents) if missing
     */
    public static void downFile(String url, String dir) {
        // Derive the file name from the last path segment, dropping any query
        // string so "a.jpg?x=1" is saved as "a.jpg" rather than an invalid name.
        String fileName = url.substring(url.lastIndexOf("/") + 1);
        int query = fileName.indexOf('?');
        if (query >= 0) {
            fileName = fileName.substring(0, query);
        }
        if (fileName.isEmpty()) {
            // URL ends with "/" — nothing sensible to name the file.
            System.out.println("无法从地址解析文件名:" + url);
            return;
        }
        try {
            URL httpurl = new URL(url);
            File dirfile = new File(dir);
            if (!dirfile.exists()) {
                dirfile.mkdirs();
            }
            // File(parent, child) joins the path safely instead of string concatenation.
            FileUtils.copyURLToFile(httpurl, new File(dirfile, fileName));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
五:运行测试