HTML 解析新工具 HtmlCleaner

    

Java代码
  1.       HtmlCleaner cleaner = new HtmlCleaner();     
  2.         
  3.       TagNode node = cleaner.clean(new URL("http://finance.sina.com.cn/money/nmetal/20091209/10157077895.shtml"));     
  4.       //按tag取.     
  5.       Object[] ns = node.getElementsByName("title", true);    //标题     
  6.         
  7.       if(ns.length > 0) {     
  8.           System.out.println("title="+((TagNode)ns[0]).getText());     
  9.       }     
  10.       // /html/body/div[2]/div[4]/div/div/div/div[2]/p  
  11.       ns = node.evaluateXPath("//div[@class=\"blkContainerSblkCon\"]/p"); // 选取class为指定blkContainerSblkCon的 div下面的所有p标签  
  12.       for (int i = 0; i < ns.length; i++) {  
  13.          String in = cleaner.getInnerHtml((TagNode)ns[i]);  
  14.            System.out.println("<p>"+in + "</p>");  
  15. }  
  16.       String in = cleaner.getInnerHtml((TagNode)ns[0]);  
  17.       System.out.println(in);  
  18.   
  19.       System.out.println(((TagNode)ns[0]).getText());   
        // Download the article page and parse it into a TagNode tree.
        HtmlCleaner cleaner = new HtmlCleaner();
        TagNode node = cleaner.clean(new URL("http://finance.sina.com.cn/money/nmetal/20091209/10157077895.shtml"));

        // Look elements up by tag name; per the HtmlCleaner API the boolean
        // asks for a recursive search through the whole tree.
        Object[] ns = node.getElementsByName("title", true);
        if (ns.length > 0) {
            System.out.println("title=" + ((TagNode) ns[0]).getText());
        }

        // Absolute path noted by the author: /html/body/div[2]/div[4]/div/div/div/div[2]/p
        // Select every <p> that is a direct child of a <div> whose class is "blkContainerSblkCon".
        ns = node.evaluateXPath("//div[@class=\"blkContainerSblkCon\"]/p");
        for (Object paragraph : ns) {
            String in = cleaner.getInnerHtml((TagNode) paragraph);
            System.out.println("<p>" + in + "</p>");
        }

        // The XPath can match nothing (for example if the page layout changes),
        // so check the length before touching ns[0], as the title lookup does.
        if (ns.length > 0) {
            TagNode first = (TagNode) ns[0];
            // Inner HTML keeps the markup; getText() yields the text content only.
            System.out.println(cleaner.getInnerHtml(first));
            System.out.println(first.getText());
        }



Java 代码
  1.         HtmlCleaner cleaner = new HtmlCleaner();     
  2.         String url = "http://finance.sina.com.cn/nmetal/hjfx.html";  
  3.         URL _url = new URL(url);  
  4.         TagNode node = cleaner.clean(_url);     
  5.           
  6.         //按tag取.     
  7.         Object[] ns = node.getElementsByName("title", true);    //标题     
  8.           
  9.         if(ns.length > 0) {   
  10.             System.out.println("title="+((TagNode)ns[0]).getText());     
  11.         }    
  12.           
  13.           
  14.         ns = node.evaluateXPath("//*[@class='Frame-Row3-01-C']/table[2]/tbody/tr/td/a"); // 选取 class 为 Frame-Row3-01-C 的元素下第二个 table 中各单元格内的所有 a 链接  
  15.         for (int i = 0; i < ns.length; i++) {  
  16.               
  17.             //取链接文本  
  18. //           String in = cleaner.getInnerHtml((TagNode)ns[i]);  
  19. //           System.out.println(in);  
  20.               
  21.             //获取链接的  
  22.             TagNode n = (TagNode) ns[i];  
  23. //          System.out.println(n.getAttributeByName("href"));  
  24.             System.out.println(new URL(_url,n.getAttributeByName("href")).toString());  
  25.         }  
  26. //        String in = cleaner.getInnerHtml((TagNode)ns[0]);  
  27. //        System.out.println(in);  
  28.   
  29. //        System.out.println(((TagNode)ns[0]).getText());  
  30.           
  31. //        System.out.println("ul/li:");     
  32. //        //按xpath取     
  33. //        ns = node.evaluateXPath("//div[@class='d_1']//li");     
  34. //        for(Object on : ns) {     
  35. //            TagNode n = (TagNode) on;     
  36. //            System.out.println("\ttext="+n.getText());     
  37. //        }     
  38. //        System.out.println("a:");     
  39. //        //按属性值取     
  40. //        ns = node.getElementsByAttValue("name", "my_href", true, true);     
  41. //        for(Object on : ns) {     
  42. //            TagNode n = (TagNode) on;     
  43. //            System.out.println("\thref="+n.getAttributeByName("href")+", text="+n.getText());     
  44. //        } 

猜你喜欢

转载自nhy520.iteye.com/blog/660409