使用jsoup爬取了某个网站中的《冰与火之歌》信息,并将其以json格式保存到了文本文件中。
具体执行的代码如下:
/**
 * Program entry point: creates the crawler and runs one full crawl.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if the crawl fails with an I/O error
 */
public static void main(String[] args) throws IOException {
    new TestJsoupBingYuHuo().test();
}
// Base URL of the site being crawled; the root page is fetched from it and chapter hrefs are appended to it.
static String urlPath = "http://www.bingyuhuozhige.cc";
// Local root directory; one sub-directory per volume is created underneath it.
static String srcPath = "D:\\study\\jsoup\\bingyuhuozhige\\";
// Guards every FileUtils call in this file: when false, nothing is created or written on disk.
static boolean writeOnOff = true;
/**
 * Fetches the site's root page and processes every {@code h3} heading found on it
 * (each heading is treated as one volume).
 *
 * @throws IOException if fetching or processing a page fails
 */
public void test() throws IOException {
    Elements volumeHeadings = JsoupUtils.getRoot(urlPath).select("h3");
    this.analysisH3List(volumeHeadings);
}
/**
 * Parses the volume headings and crawls each volume in turn.
 *
 * <p>For every processed heading the volume name is printed, the volume directory is
 * created (when {@code writeOnOff} is set), the volume description is parsed and stored,
 * and finally the chapter links of the volume are followed.
 *
 * @param h3List the {@code h3} elements of the root page, one per volume
 * @return always {@code null}
 * @throws IOException if crawling a chapter fails
 */
private JSONArray analysisH3List(Elements h3List) throws IOException {
    // 1-based number of the first volume to crawl; change it to start from another volume.
    final int firstVolume = 7;
    for (int index = firstVolume - 1; index < h3List.size(); index++) {
        Element h3 = h3List.get(index);

        // The h3 text is the volume name.
        String volumeName = h3.text().trim();
        System.out.println(volumeName);

        String volumeDir = srcPath + volumeName;
        if (writeOnOff) {
            FileUtils.createDir(volumeDir); // create the volume directory
        }

        // The <p> after the heading is the volume description; it is stored inside the volume directory.
        Element description = this.analysisPList(h3, volumeDir);

        // The links inside the following div.row each point to the first page of a chapter.
        this.analysisFirstPageLink(description, volumeDir);
    }
    return null;
}
/**
 * Parses the description of a volume: the first {@code <p>} element among the siblings
 * that follow the volume heading.
 *
 * <p>The description text is printed and, when {@code writeOnOff} is set, written to the
 * file {@code 简述} inside {@code path}.
 *
 * @param h3   the volume heading element
 * @param path directory of the volume
 * @return the {@code <p>} element holding the description
 * @throws IllegalStateException if no {@code <p>} sibling follows the heading
 */
private Element analysisPList(Element h3, String path) {
    Element p = h3.nextElementSibling();
    System.out.println("=");
    // Walk forward through the siblings. Stop at the end of the sibling chain instead of
    // restarting from the heading, which previously made this loop spin forever.
    while (p != null && !p.tagName().equals("p")) {
        p = p.nextElementSibling();
        System.out.println("=");
    }
    if (p == null) {
        throw new IllegalStateException("No <p> description found after heading: " + h3.text());
    }
    String pText = p.text().trim();
    System.out.println(pText);
    if (writeOnOff) {
        FileUtils.writeLine(path + File.separator + "简述", pText);
    }
    return p;
}
/**
 * Parses the first-page URL of every chapter of a volume and crawls each chapter.
 *
 * <p>The chapter links are the {@code a[href]} elements inside the first
 * {@code div.row} sibling that follows the description paragraph. For each link a text
 * file named after the link's {@code title} attribute is started (when
 * {@code writeOnOff} is set) and the chapter's pages are then crawled into it.
 *
 * @param p    the description paragraph of the volume
 * @param path directory of the volume
 * @return always {@code null}
 * @throws IOException           if crawling a chapter fails
 * @throws IllegalStateException if no {@code div.row} sibling follows the paragraph
 */
private JSONArray analysisFirstPageLink(Element p, String path) throws IOException {
    Element divRow = p.nextElementSibling();
    System.out.println("=");
    // Stop at the end of the sibling chain instead of restarting from the paragraph,
    // which previously made this loop spin forever when no div.row existed.
    while (divRow != null && !(divRow.tagName().equals("div") && divRow.hasClass("row"))) {
        divRow = divRow.nextElementSibling();
        System.out.println("=");
    }
    if (divRow == null) {
        throw new IllegalStateException("No div.row with chapter links found after the volume description");
    }
    Elements links = divRow.select("a[href]");
    for (Element link : links) {
        String url = link.attr("href").trim();
        String title = link.attr("title").trim(); // one text file is created per title
        // Special case: titles starting with this prefix are replaced by a fixed title.
        title = title.startsWith("第六十一章 狮鹫的重生") ? "第六十一章 狮鹫的重生(格里夫·琼恩·克林顿)" : title;
        String chapterFile = path + File.separator + title + ".txt";
        if (writeOnOff) {
            FileUtils.writeLine(chapterFile, title);
        }
        System.out.println(title + " = " + url);
        // Next step: resolve all page links of the chapter and fetch the text of each page.
        this.analysisAllPagesLink(url, chapterFile);
    }
    return null;
}
/**
 * Parses the first page of a chapter, determines how many pages the chapter has, and
 * writes the text of every page to the chapter file.
 *
 * <p>The page count is taken from the {@code 尾页} (last page) link of the pagination
 * block, whose href has the form {@code <prefix>_<n>.html}. When pagination links exist
 * but no such link is found, two pages are assumed. When there are no pagination links
 * at all, the chapter is treated as a single page.
 *
 * <p>A short sleep precedes the request to avoid errors caused by network delay.
 *
 * @param firstPageUrl site-relative URL of the chapter's first page
 * @param path         file the chapter text is written to
 * @throws IOException if a page cannot be fetched
 */
private void analysisAllPagesLink(String firstPageUrl, String path) throws IOException {
    try {
        Thread.sleep(50);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers can still observe the interruption.
        Thread.currentThread().interrupt();
    }
    Document document = JsoupUtils.getRoot(urlPath + firstPageUrl);
    Elements paginations = document.select("div.pagination");

    // A page without a pagination block is handled as a single-page chapter
    // instead of failing on get(0).
    int lastNum = 1;
    if (!paginations.isEmpty()) {
        Elements links = paginations.get(0).select("a[href]");
        if (links.size() > 0) {
            lastNum = 2;
            for (Element link : links) {
                String title = link.text().trim();
                if ("尾页".equals(title)) {
                    String url = link.attr("href").trim();
                    lastNum = Integer.parseInt(url.substring(url.indexOf("_") + 1, url.indexOf(".html")));
                }
            }
        }
    }

    // Write the already-fetched first page.
    this.analysisPageText(document, path);

    if (lastNum >= 2) {
        String urlPrefix = firstPageUrl.substring(0, firstPageUrl.indexOf(".html"));
        for (int i = 2; i <= lastNum; i++) {
            // Fetch each following page and append its text to the file.
            this.analysisPageText(urlPath + urlPrefix + "_" + i + ".html", path);
            System.out.println(i);
        }
    }
}
/**
 * Fetches one chapter page by URL after a short pause and writes its text to the
 * chapter file.
 *
 * @param pageUrl absolute URL of the page
 * @param path    file the page text is written to
 * @throws IOException if the page cannot be fetched
 */
private void analysisPageText(String pageUrl, String path) throws IOException {
    try {
        Thread.sleep(50);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers can still observe the interruption.
        Thread.currentThread().interrupt();
    }
    Document document = JsoupUtils.getRoot(pageUrl);
    this.analysisPageText(document, path);
}
/**
 * Extracts the chapter text from an already-fetched page and writes it to the chapter
 * file (when {@code writeOnOff} is set).
 *
 * <p>The text is taken from the inner HTML of the first {@code div.span9} (falling back
 * to {@code div.span12}): everything from the pagination block onward is cut off, then
 * everything up to and including the last remaining {@code </div>} is dropped, and
 * {@code <br>}, {@code <p>} and {@code </p>} are turned into line breaks.
 *
 * @param document the parsed page
 * @param path     file the page text is written to
 * @throws IllegalStateException if the page has neither a {@code div.span9} nor a
 *                               {@code div.span12} element
 */
private void analysisPageText(Document document, String path) {
    Elements containers = document.select("div.span9");
    if (containers.isEmpty()) {
        containers = document.select("div.span12");
    }
    if (containers.isEmpty()) {
        throw new IllegalStateException("No div.span9 or div.span12 content container found in page");
    }
    String text = containers.get(0).html();

    // Cut at the pagination block only when it is present; substring(0, -1) would throw.
    int paginationStart = text.indexOf("<div class=\"pagination\"");
    if (paginationStart >= 0) {
        text = text.substring(0, paginationStart);
    }

    // Skip everything up to the last closing </div>. Without this guard a missing </div>
    // (lastIndexOf == -1) would silently drop the first 5 characters.
    int lastDivClose = text.lastIndexOf("</div>");
    if (lastDivClose >= 0) {
        text = text.substring(lastDivClose + "</div>".length());
    }

    // Literal replacement is enough here; no regular expression is needed.
    // NOTE(review): "\n\r" is LF+CR, the reverse of the Windows "\r\n" sequence — confirm this is intended.
    text = text.replace("<br>", "\n\r").replace("<p>", "\n\r").replace("</p>", "\n\r");
    if (writeOnOff) {
        FileUtils.writeLine(path, text);
    }
}
代码中缺失的其它jar包信息和工具类,见我的另一篇文章,链接如下: