Crawling Website Content with jsoup: A Song of Ice and Fire (《冰与火之歌》)

I used jsoup to crawl the A Song of Ice and Fire pages from a website and saved the extracted text, organised by volume and chapter, to local text files.

The code that was executed is as follows (JsoupUtils and FileUtils are utility classes from the companion article linked at the end):

import java.io.File;
import java.io.IOException;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSONArray; // assumption: fastjson's JSONArray; any JSONArray type compiles here since these methods only ever return null

public class TestJsoupBingYuHuo {

	public static void main(String[] args) throws IOException {
		TestJsoupBingYuHuo tj = new TestJsoupBingYuHuo();
		tj.test();
	}

	static String urlPath = "http://www.bingyuhuozhige.cc";
	static String srcPath = "D:\\study\\jsoup\\bingyuhuozhige\\";
	static boolean writeOnOff = true; // switch that enables/disables writing files to disk
	
	public void test() throws IOException {
		Document document = JsoupUtils.getRoot(urlPath); // load the site's start page
		Elements h3lists = document.select("h3"); // each h3 heading on the start page is one volume of the series
		this.analysisH3List(h3lists);
	}

	// Parse the volume list (each h3 element is one volume)
	private JSONArray analysisH3List(Elements h3List) throws IOException {
		int n = 0;
		for (Element h3 : h3List) {
			n++;
			if (n < 7) { // change this number to choose which volume to start from
				continue;
			}
			// the h3 text is the volume title
			String h3Text = h3.text().trim();
			System.out.println(h3Text);
			if(writeOnOff) {
				FileUtils.createDir(srcPath + h3Text); // create a directory for this volume
			}
			// the p element that follows holds the volume description
			Element p = this.analysisPList(h3, srcPath + h3Text); // parse the description and save it to a file in the volume directory
			// the links inside the following div.row element point to the first page of each chapter
			this.analysisFirstPageLink(p, srcPath + h3Text);
		}
		return null;
	}

	// Parse the volume description: walk forward through the h3's siblings until the first <p> element is found
	private Element analysisPList(Element h3, String path) {
		Element p = null;
		while (p == null || !p.tagName().equals("p")) {
			if (p != null) {
				p = p.nextElementSibling();
			} else {
				p = h3.nextElementSibling();
			}
			System.out.println("=");
		}
		String pText = p.text().trim();
		System.out.println(pText);
		if(writeOnOff) {
			FileUtils.writeLine(path + File.separator + "简述", pText); // "简述" = "brief description" file in the volume directory
		}
		}
		return p;
	}

	// Parse the first-page URL of every chapter: walk forward from the description <p> to the div.row element that lists the chapter links
	private JSONArray analysisFirstPageLink(Element p, String path) throws IOException {
		Element divRow = null;
		while (divRow == null || !divRow.tagName().equals("div") || !divRow.hasClass("row")) {
			if (divRow != null) {
				divRow = divRow.nextElementSibling();
			} else {
				divRow = p.nextElementSibling();
			}
			System.out.println("=");
		}
		Elements links = divRow.select("a[href]");
		for (Element link : links) {
			String url = link.attr("href").trim();
			String title = link.attr("title").trim(); // one text file is created per chapter title
			// special-case rename for this chapter title (adds the POV character's name)
			title = title.startsWith("第六十一章 狮鹫的重生") ? "第六十一章 狮鹫的重生(格里夫·琼恩·克林顿)" : title;
			if(writeOnOff) {
				FileUtils.writeLine(path + File.separator + title + ".txt", title);
			}
			System.out.println(title + " = " + url);
			this.analysisAllPagesLink(url, path + File.separator + title + ".txt"); // next: resolve all pages of this chapter and fetch the text of each page
		}
		return null;
	}
	// Parse the first page of a chapter, collect the links to all of its pages, and fetch the text of each page
	// Sleep briefly between requests to avoid errors caused by network latency
	private void analysisAllPagesLink(String firstPageUrl, String path) throws IOException {
		try {
			Thread.sleep(50);
		} catch (InterruptedException e) {
			e.printStackTrace();
		}
		Document document = JsoupUtils.getRoot(urlPath + firstPageUrl); // the href is relative, so prepend the site root
		Elements paginations = document.select("div.pagination"); // the pagination bar lists the remaining pages of the chapter
		Element pagination = paginations.get(0);
		Elements links = pagination.select("a[href]");
		if(links.size() > 0) {
			int lastNum = 2; // number of the last page, read below from the "尾页" ("last page") link
			for (Element link : links) {
				String title = link.text().trim();
				if("尾页".equals(title)) {
					String url = link.attr("href").trim();
					lastNum = Integer.parseInt(url.substring(url.indexOf("_") + 1, url.indexOf(".html"))); // page URLs look like xxx_N.html
				}
			}
			// parse the content of the first page and write it to the chapter file
			this.analysisPageText(document, path);
			String urlPrefix = firstPageUrl.substring(0, firstPageUrl.indexOf(".html"));
			for(int i = 2 ; i <= lastNum ; i ++) {
				// parse the content of page i and write it to the chapter file
				this.analysisPageText(urlPath + urlPrefix + "_" + i + ".html", path);
				System.out.println(i);
			}
		} else {
			// the chapter has only one page: parse it and write it to the chapter file
			this.analysisPageText(document, path);
		}
	}
	// Fetch a page by URL and delegate to the Document-based overload below
	private void analysisPageText(String pageUrl, String path) throws IOException {
		try {
			Thread.sleep(50);
		} catch (InterruptedException e) {
			e.printStackTrace();
		}
		Document document = JsoupUtils.getRoot(pageUrl);
		this.analysisPageText(document, path);
	}
	
	// Extract the chapter text from a page and append it to the chapter file
	private void analysisPageText(Document document, String path) {
		Elements span9s = document.select("div.span9"); // the chapter text sits inside div.span9 (div.span12 on some pages)
		if(span9s.size() < 1) {
			span9s = document.select("div.span12");
		}
		Element span9 = span9s.get(0);
		String text = span9.html();
		text = text.substring(0, text.indexOf("<div class=\"pagination\"")); // cut off everything from the pagination block onwards
//		System.out.println(text);
		int begin = text.lastIndexOf("</div>") + 6; // keep only what follows the last closing </div> before the pagination block
		text = text.substring(begin);
		text = text.replaceAll("<br>", "\r\n").replaceAll("<p>", "\r\n").replaceAll("</p>", "\r\n"); // turn the remaining tags into line breaks

		if(writeOnOff) {
			FileUtils.writeLine(path, text);
		}
	}
}
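
For readers less familiar with the jsoup calls used above, the small self-contained snippet below illustrates the same select / nextElementSibling / attr pattern on a made-up HTML fragment (the fragment and the class name SelectorDemo are only for illustration and do not come from the target site):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class SelectorDemo {
	public static void main(String[] args) {
		// a tiny fragment shaped like the pages parsed above: h3 -> p -> div.row with chapter links
		String html = "<h3>卷一</h3><p>volume description</p>"
				+ "<div class=\"row\"><a href=\"/chapter1.html\" title=\"Chapter 1\">Chapter 1</a></div>";
		Document doc = Jsoup.parse(html);

		Element h3 = doc.select("h3").first();          // pick the volume heading
		Element p = h3.nextElementSibling();            // its next sibling is the description <p>
		Element divRow = p.nextElementSibling();        // then comes the div.row that lists the chapters
		for (Element link : divRow.select("a[href]")) { // each link is one chapter's first page
			System.out.println(link.attr("title") + " = " + link.attr("href"));
		}
	}
}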

For the other jar dependencies and the utility classes (JsoupUtils and FileUtils) that are not shown in the code above, see my other article, linked below:

Crawling Website Content with jsoup: Bencao Gangmu (《本草纲目》)
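
For reference, here is a minimal sketch of what the two helpers need to do; the real implementations are in the linked article, and the User-Agent, timeout, and append behaviour below are placeholders rather than the settings the real utility classes use:

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

// Minimal stand-ins for the helpers used above; see the linked article for the real versions.
class JsoupUtils {
	// Fetch a page and return its parsed Document; User-Agent and timeout are placeholder values.
	public static Document getRoot(String url) throws IOException {
		return Jsoup.connect(url)
				.userAgent("Mozilla/5.0")
				.timeout(10000)
				.get();
	}
}

class FileUtils {
	// Create the directory (and any missing parents) if it does not exist yet.
	public static void createDir(String path) {
		new File(path).mkdirs();
	}

	// Append a line of text to the given file.
	public static void writeLine(String path, String line) {
		try (FileWriter writer = new FileWriter(path, true)) { // true = append mode
			writer.write(line);
			writer.write(System.lineSeparator());
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}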

Reposted from blog.csdn.net/u013276512/article/details/113945874