jsoup爬取网站信息之《庆余年》

使用jsoup爬取了某个小说网站中的《庆余年》内容,并将结果以json格式保存到了文本文件中。

具体执行的代码如下:

/**
 * Program entry point: creates a crawler instance and runs it.
 *
 * @param args command-line arguments (not used)
 * @throws IOException if {@code test()} throws one
 */
public static void main(String[] args) throws IOException {
		new TestJsoup3().test();
	}
	
	// Base URL of the target site; the relative paths used by the methods below are appended to it.
	static String path = "http://www.xbiquge.la";
	/**
	 * Loads the book's index page, processes every chapter linked from its
	 * {@code #list} element, and hands the resulting JSON text to
	 * {@code FileUtils.writeLine} together with the hard-coded output path.
	 *
	 * @throws IOException if the index page contains no {@code #list} element,
	 *         or if one of the helpers called here throws one
	 */
	public void test() throws IOException {
		String url = "/2/1690/";
		Document document = JsoupUtils.getRoot(path + url);
		Elements lists = document.select("#list");
		if (lists.isEmpty()) {
			// Fail with a descriptive error instead of an IndexOutOfBoundsException from get(0).
			throw new IOException("No #list element found at " + path + url);
		}
		JSONArray arr = this.analysisList(lists.get(0)); // parse all chapters
		String dpath = "D:\\study\\jsoup\\qynjson.js";
		FileUtils.writeLine(dpath, arr.toJSONString());
	}
	/**
	 * Processes every {@code a[href]} link inside the given element and
	 * collects one JSON object per chapter, in the order the links were selected.
	 *
	 * <p>Sleeps 10 seconds before each chapter request.
	 *
	 * @param list element that contains the chapter links
	 * @return array holding the JSON object built for each chapter
	 * @throws IOException if processing a chapter fails, or if the thread is
	 *         interrupted while waiting between requests
	 */
	private JSONArray analysisList(Element list) throws IOException {
		Elements links = list.select("a[href]");
		JSONArray arr = new JSONArray();
		for (Element link : links) {
			try {
				Thread.sleep(10000); // pause between requests
			} catch (InterruptedException e) {
				// Restore the interrupt status so callers can observe it, and stop
				// crawling instead of silently continuing after a cancellation request.
				Thread.currentThread().interrupt();
				throw new IOException("Interrupted while waiting to fetch the next chapter", e);
			}
			String url = link.attr("href"); // chapter URL as a sub-path; the base path must be prepended
			String name = link.text(); // chapter title
			arr.add(this.analysisChapter(name, path + url));
		}
		return arr;
	}
	
	/**
	 * Loads a single chapter page and packs its title and text into a JSON object.
	 *
	 * <p>The text is taken from the page's {@code #content} element, trimmed,
	 * and stripped of ASCII double-quote characters. The chapter name is
	 * printed to standard output as a progress indicator.
	 *
	 * @param name chapter title, stored under the {@code "name"} key
	 * @param url  URL of the chapter page
	 * @return JSON object with the keys {@code "name"} and {@code "content"}
	 * @throws IOException if the page has no {@code #content} element, or if
	 *         {@code JsoupUtils.getRoot} throws one
	 */
	private JSONObject analysisChapter(String name, String url) throws IOException {
		Document document = JsoupUtils.getRoot(url);
		Elements contents = document.select("#content");
		if (contents.isEmpty()) {
			// Identify the failing chapter instead of surfacing a bare IndexOutOfBoundsException.
			throw new IOException("No #content element for chapter '" + name + "' at " + url);
		}
		Element content = contents.get(0); // chapter body
		// Literal replacement: no regular expression is needed to drop double quotes.
		String text = content.text().trim().replace("\"", "");
		JSONObject json = new JSONObject();
		json.put("name", name);
		json.put("content", text);
		System.out.println(name);
		return json;
	}

代码中缺失的其它jar包信息和工具类,见我的另一篇文章,链接如下:

jsoup爬取网站信息之《本草纲目》

猜你喜欢

转载自blog.csdn.net/u013276512/article/details/112647930