实现技术:Jsoup+Httpclient+FileUtils+StringUtils
Jsoup:html解析工具,功能强大,使用方法类似于jquery
一:导入maven依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.3</version>
</dependency>
<dependency>
<!-- 正确的 commons-io 坐标是 commons-io:commons-io;
     org.apache.commons:commons-io:1.3.2 是一个错误发布的制品,不要使用 -->
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.0</version>
</dependency>
二:爬虫程序入口
package com.debo.crawler;
public class Main {

    /**
     * Application entry point: constructs the crawler and starts the crawl.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the crawl fails unexpectedly
     */
    public static void main(String[] args) throws Exception {
        Cawler crawler = new Cawler();
        crawler.start();
    }
}
三:爬虫代码
package com.debo.crawler;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Cawler {

    /** Listing page whose &lt;img&gt; tags are scraped for downloadable image URLs. */
    private String url = "http://www.umei.cc/weimeitupian/oumeitupian/shuaige.htm";

    /**
     * Crawl entry point: fetches the listing page, extracts every absolute
     * image URL from its &lt;img&gt; tags, and downloads each one to D://download//.
     */
    public void start() {
        // 使用httpclient的get请求获取网页源代码
        String html = doGet(url);
        if (StringUtils.isBlank(html)) {
            // doGet returns null on failure; bail out instead of letting
            // Jsoup.parse(null) throw an NPE.
            System.out.println("页面获取失败:" + url);
            return;
        }
        // jsoup解析源代码
        Document document = Jsoup.parse(html);
        // 获取源代码中所有的img标签
        Elements elements = document.select("img");
        // 遍历,读取图片地址,并下载
        for (Element e : elements) {
            String src = e.attr("src");
            // Only absolute URLs are downloadable. Accept both http and https —
            // the original "http://"-only check silently skipped https images.
            if (StringUtils.isNotBlank(src)
                    && (StringUtils.startsWith(src, "http://")
                        || StringUtils.startsWith(src, "https://"))) {
                System.out.println("下载地址:" + src);
                DownUtils.downFile(src, "D://download//");
            }
        }
        System.out.println("下载完毕");
    }

    /**
     * Issues an HTTP GET and returns the response body decoded as UTF-8.
     *
     * @param url the URL to fetch
     * @return the response body, or {@code null} if the request failed
     */
    public String doGet(String url) {
        HttpGet httpGet = new HttpGet(url);
        // try-with-resources closes both the client and the response —
        // the original leaked both on every call.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
                CloseableHttpResponse response = httpClient.execute(httpGet)) {
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        } catch (Exception e) {
            e.printStackTrace();
        }
        return null;
    }
}
四:下载工具类DownUtils
package com.debo.crawler;
import java.io.File;
import java.net.URL;
import org.apache.commons.io.FileUtils;
public class DownUtils {

    /** Utility class — not instantiable. */
    private DownUtils() {
    }

    /**
     * Downloads the resource at {@code url} into directory {@code dir},
     * naming the file after the last path segment of the URL.
     * Failures are logged and swallowed (best-effort, matching caller usage).
     *
     * @param url absolute URL of the file to download
     * @param dir target directory; created (including parents) if missing
     */
    public static void downFile(String url, String dir) {
        // Derive the file name from the last path segment, dropping any query
        // string so "a.jpg?x=1" is saved as "a.jpg" rather than an invalid name.
        String fileName = url.substring(url.lastIndexOf("/") + 1);
        int query = fileName.indexOf('?');
        if (query >= 0) {
            fileName = fileName.substring(0, query);
        }
        if (fileName.isEmpty()) {
            // URL ends with "/" — nothing sensible to name the file.
            System.out.println("无法从地址解析文件名:" + url);
            return;
        }
        try {
            URL httpurl = new URL(url);
            File dirfile = new File(dir);
            if (!dirfile.exists()) {
                dirfile.mkdirs();
            }
            // File(parent, child) joins the path safely instead of string concatenation.
            FileUtils.copyURLToFile(httpurl, new File(dirfile, fileName));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
五:运行测试