/*
 * Originally published (original author's copyright notice) at:
 * https://blog.csdn.net/qq_40374604
 * https://blog.csdn.net/qq_40374604/article/details/86029399
 *
 * Jsoup-based crawler: reads seed pages from a file, crawls the entire site
 * reachable from each seed, and saves the fetched pages to disk.
 * Author's note: fine for quick use as-is; if you are studying it, the
 * structure is admittedly a bit messy.
 */
package cn;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Simple whole-site crawler built on Jsoup.
 *
 * <p>Loads seed URLs from a text file, fetches each seed page, extracts every
 * link that belongs to the same site, and saves each linked page's HTML to
 * disk under a directory named after the seed page's {@code <title>}.
 *
 * <p>NOTE(review): despite the thread-pool field below, all crawling here is
 * single-threaded; the pool is never used.
 */
public class CrawlerUtils {
    /** Number of pages saved so far (used only in console log output). */
    public static int count = 0;
    /** Seed URLs loaded from the seeds file, one per line. */
    public static List<String> list = new ArrayList<String>();
    /** Every URL seen so far; guards against fetching the same page twice. */
    public static HashSet<String> hashSet = new HashSet<String>();
    /** Unused; retained only so the existing field interface is unchanged. */
    ExecutorService pool = Executors.newFixedThreadPool(5);
    /** Compiled once (was recompiled on every geturl call): matches href="..." in raw HTML. */
    private static final Pattern HREF_PATTERN = Pattern.compile("href=\"(.*?)\"");

    /**
     * Fetches a page over HTTP and returns its HTML.
     *
     * @param url absolute URL to fetch
     * @return the document's HTML, or {@code null} if the request failed
     */
    public static String gethtml(String url) {
        try {
            Connection con = Jsoup.connect(url);
            con.header("Accept", "text/html, application/xhtml+xml, */*");
            con.header("Content-Type", "application/x-www-form-urlencoded");
            // BUG FIX: the original User-Agent had an unbalanced trailing ')'.
            con.header("User-Agent",
                    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)");
            con.header("Cookie", "");
            return con.get().toString();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Extracts all same-site URLs from raw HTML.
     *
     * <p>Absolute links are kept only when they contain the seed's host part;
     * relative links are resolved by appending them to the seed URL with its
     * last character dropped (the original author's resolution scheme).
     *
     * @param html raw page HTML; must not be {@code null}
     * @param url  the seed/base URL used to filter and resolve links
     * @return same-site absolute URLs, in document order (may contain duplicates)
     */
    public static List<String> geturl(String html, String url) {
        List<String> list = new ArrayList<String>();
        Matcher matcher = HREF_PATTERN.matcher(html);
        while (matcher.find()) {
            // group(1) is the captured text between the quotes.
            String urlline = matcher.group(1);
            // BUG FIX: test "https" BEFORE "http" — the original tested
            // contains("http") first, which also matches https URLs and made
            // the https branch unreachable.
            if (urlline.contains("https")) {
                if (url.contains("https")
                        && urlline.contains(url.replace("https://", ""))) {
                    System.out.println(urlline);
                    list.add(urlline);
                }
            } else if (urlline.contains("http")) {
                if (url.contains("http")
                        && urlline.contains(url.replace("http://", ""))) {
                    System.out.println(urlline);
                    list.add(urlline);
                }
            } else {
                // Relative link: resolve against the base URL.
                String urlString = url.substring(0, url.length() - 1) + urlline;
                System.out.println(urlString);
                list.add(urlString);
            }
        }
        return list;
    }

    /**
     * Appends HTML to a file via Commons IO, creating the file if needed.
     *
     * @param pathname target file path
     * @param html     content to append
     * @param charset  charset name used for encoding
     * @throws IOException on any write failure
     */
    public static void saveFile(String pathname, String html, String charset)
            throws IOException {
        FileUtils.write(new File(pathname), html, charset, true);
    }

    /**
     * Writes a string to a file as raw bytes in the given charset,
     * overwriting any existing content.
     *
     * <p>BUG FIX: try-with-resources so the stream is closed even when
     * {@code write} throws (the original leaked it on failure).
     *
     * @throws IOException on any write failure
     */
    public static void WriteByte(String pathname, String date, String charset)
            throws IOException {
        try (OutputStream outputStream = new FileOutputStream(new File(pathname))) {
            outputStream.write(date.getBytes(charset));
        }
    }

    /**
     * Crawls one page: fetches it, extracts same-site links, and saves every
     * not-yet-seen linked page under the maintitle directory.
     *
     * @param url       page to crawl
     * @param maintitle directory name (the seed page's title)
     */
    public static void mainUtil(String url, String maintitle) {
        try {
            String html = gethtml(url);
            // BUG FIX: gethtml returns null on failure; the original passed
            // that null straight into geturl and relied on the outer catch.
            if (html == null) {
                return;
            }
            System.out.println(html);
            List<String> urlList = geturl(html, url);
            for (String string : urlList) {
                if (!hashSet.add(string)) {
                    continue; // already crawled
                }
                String htmlline = gethtml(string);
                if (htmlline == null) {
                    continue; // fetch failed; skip instead of NPE-ing in parse
                }
                try {
                    String title = Jsoup.parse(htmlline).getElementsByTag("title")
                            .get(0).text();
                    saveFile("E://crawler4j/房地产行业/" + maintitle + "/"
                            + title + System.currentTimeMillis() + ".html",
                            htmlline, "utf8");
                    System.out.println("第" + count++ + "保存文件:" + string);
                } catch (Exception e) {
                    e.printStackTrace();
                    System.out.println("第" + count + "写入失败!!!" + "网址:"
                            + string);
                }
            }
        } catch (Exception e) {
            System.out.println("99999999999");
        }
    }

    /**
     * Entry point: loads seed URLs from the seeds file, then crawls each seed
     * page and every same-site page it links to.
     */
    public static void main(String[] args) {
        // Load seeds, one URL per line.
        // BUG FIX: try-with-resources closes the reader even if reading fails
        // part-way, and readLine()==null is the correct EOF test (the original
        // looped on br.ready(), which may stop before the stream is exhausted).
        try (BufferedReader br = new BufferedReader(
                new FileReader("E://crawler4j/房地产行业seeds.txt"))) {
            String line;
            while ((line = br.readLine()) != null) {
                list.add(line);
            }
        } catch (Exception e1) {
            e1.printStackTrace();
            System.out.println("没有种子页!!");
        }
        for (String url : list) {
            String maintitle = "未命名" + System.currentTimeMillis();
            try {
                maintitle = Jsoup.connect(url).get().getElementsByTag("title")
                        .get(0).text();
            } catch (Exception e) {
                e.printStackTrace();
                continue; // seed unreachable; skip it
            }
            mainUtil(url, maintitle);
            String html = gethtml(url);
            // BUG FIX: the original tested html.equals(null), which can never
            // be true — it throws NPE instead when html is null.
            if (html == null) {
                continue;
            }
            System.out.println(html);
            List<String> urlList = geturl(html, url);
            for (String string : urlList) {
                mainUtil(string, maintitle);
            }
        }
    }
}