来自:深沉的船
任务描述:
某图书网站按条件查询后,每页返回20条记录,每条记录包含图书的简要信息,以及指向该书详细信息页面的URL链接。
需要抓取网站图书的详细信息,保存到本地数据库中。
任务分析:
用HttpClient模拟浏览器请求URL,将网页内容取回;再用Jericho包解析页面元素,将需要的信息取出,保存到数据库中。
因为数据量比较大,所以采用多线程的方式抓取详细页面,并分析、提取数据。
处理过程:
按条件查询到图书列表信息后,主线程不停地向后翻页,解析出每本图书详细页面的URL,并将URL保存到一个ArrayList中;同时启动多个子线程,分别去抓取详细页面的信息,然后利用Jericho包分析页面数据并保存到数据库中。
代码实现截取如下:
......
public class BookCatcher { private static ArrayList threads= new ArrayList();//存储未处理URL public static boolean isFinished=false; public String getUrl() { try { synchronized (threads) { if (threads.size() > 0) { String tmp = String.valueOf(threads.get(0)); threads.remove(0); return tmp; } else return null; } } catch (Exception e) { return null; } } public void process(){ //处理预处理 //下面开10个线程等待处理 new Thread(new Processer(this)).start(); new Thread(new Processer(this)).start(); new Thread(new Processer(this)).start(); new Thread(new Processer(this)).start(); new Thread(new Processer(this)).start(); new Thread(new Processer(this)).start(); new Thread(new Processer(this)).start(); new Thread(new Processer(this)).start(); new Thread(new Processer(this)).start(); new Thread(new Processer(this)).start(); .... for(int j=0;j<pages;j++)//从第一页翻到最后一页 { ... source = CommonUtil.getSourceByUrl(url); List<Element> elements = source.getAllElementsByClass("ProductTitle"); for (Element element : elements){ String href = element.getContent().getFirstStartTag().getAttributeValue("href"); if (href!=null && !"".equals(href)){ synchronized (threads) { threads.add(bookurl);// 把URL存进去 } } } isFinished=true; //主线程处理完所有的url } class Processer implements Runnable { BookCatcher c; public Processer(BookCatcher c) { this.c = c; } public void run() { String bookUrl = null; while((bookUrl=c.getUrl())!=null || !BookCatcher.isFinished) //当还有记录时就处理 { if(bookUrl!=null) { //处理分析页面数据并将数据保存到数据库 Source source = CommonUtil.getSourceByUrl(bookUrl); String tmp = ""; BookBean bean = new BookBean(); bean.setStoreBookUrl(bookUrl); //书名 StartTag tag = source.getFirstStartTagByClass("BookTitle"); tmp = tag.getRenderer().toString(); bean.setName(tmp); //作者 tag = source.getFirstStartTagByClass("bookAuthor"); if (tag!=null){ List<StartTag> list = tag.getElement().getAllStartTags(HTMLElementName.A); if (list.size()>0) bean.setAuthor(list.get(0).getElement().getContent().toString()); } //书籍图片 tag = 
source.getFirstStartTag("id", "BookImage", false); if (tag!=null) bean.setPicUrl(tag.getAttributeValue("src").trim()); StartTag tagLeft = source.getFirstStartTagByClass("Left"); tmp=tagLeft.getRenderer().toString(); List<String> resList = new ArrayList<String>(); String[] leftArray = tmp.split("·"); for (String str:leftArray){ if ("".equals(str)) continue; resList.add(str); } StartTag tagRight = source.getFirstStartTagByClass("Right"); tmp = tagRight.getRenderer().toString(); String[] rightArray = tmp.split("·"); for (String str:rightArray){ if ("".equals(str)) continue; resList.add(str); } for (String str:resList){ try{ String name = CommonUtil.getString(str.split(":")[0]); String value = CommonUtil.getString(str.split(":")[1]); if ("ISBN".equals(name)) bean.setIsbn(value); if ("出版社".equals(name)) bean.setPublisherOrg(value); if ("页码".equals(name)) bean.setPageNum(value); if ("出版日期".equals(name)) bean.setPublishDate(value); if ("装帧".equals(name)) bean.setWrapType(value); if ("开本".equals(name)) bean.setFormat(value); }catch(ArrayIndexOutOfBoundsException ee){} } //定价 tag = source.getFirstStartTagByClass("BookPrice"); String price = tag.getElement().getAllStartTags(HTMLElementName.STRIKE).get(0).getRenderer().toString(); price = price.substring(1,price.length()); bean.setPrice(price); //零售价格 tag = source.getFirstStartTagByClass("DetailPrice"); if (tag!=null) bean.setStorePrice(tag.getElement().getAllStartTagsByClass("OurPrice").get(0).getRenderer().toString()); else bean.setStorePrice("0"); List<StartTag> tagList = source.getAllStartTagsByClass("ContentValue"); if(tagList!=null && tagList.size()>1){ // 内容简介 tag = tagList.get(0); tmp = tag.getRenderer().toString().trim(); if(tmp.length()>2000) tmp = tmp.substring(0, 1990)+"..."; bean.setContent(tmp); } new BookBO().saveBook(bean); }else//如果没标志处理则休眠一秒再重新开始处理 { try { Thread.sleep(1000); } catch (InterruptedException e) { e.printStackTrace(); } } } } } //CommonUtil中的方法,通过httpclient提交到url,返回的页面信息装入jericho的source public 
static Source getSourceByUrl(final String url) { Source source = null; HttpClient httpClient = new HttpClient(); GetMethod getMethod = new GetMethod(url); getMethod.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY); getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler()); try { int statusCode = httpClient.executeMethod(getMethod); if (statusCode != HttpStatus.SC_OK) { log.error("Method failed: " + getMethod.getStatusLine()); } source = new Source(getMethod.getResponseBodyAsStream()); } catch (HttpException e) { log.error("Please check your provided http address!"); e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { getMethod.releaseConnection(); } return source; }