纯干货,你懂的,各位看官直接看代码:
package com.yida.spider4j.crawler.utils.xml;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;

import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.yida.spider4j.crawler.utils.common.GerneralUtils;

/**
 * Utility class for common XML operations on {@code org.w3c.dom} documents:
 * parsing, serialising and XPath-based lookups.
 *
 * <p>Obtain the shared instance through {@link #getInstance()}.
 *
 * <p>NOTE(review): the instance holds one {@link DocumentBuilder} and one
 * {@link XPath}. JAXP does not guarantee that either is thread-safe, so callers
 * sharing the singleton across threads should synchronise externally.
 *
 * @since 1.0
 * @author Lanxiaowei
 */
public class XMLUtils {

    /** Message shared by every XPath lookup failure. */
    private static final String XPATH_ERROR =
            "Compile xpath expression occur excetion,please check out your xpath expression.";

    private final DocumentBuilder builder;
    private final XPath xpath;

    private XMLUtils() {
        this.builder = newDocumentBuilder();
        this.xpath = XPathFactory.newInstance().newXPath();
    }

    /** Holder class: the singleton is created when the holder is first loaded. */
    private static class SingletonHolder {
        private static final XMLUtils INSTANCE = new XMLUtils();
    }

    /**
     * Returns the shared {@code XMLUtils} instance.
     *
     * @return the singleton instance
     */
    public static final XMLUtils getInstance() {
        return SingletonHolder.INSTANCE;
    }

    /**
     * Creates a non-validating, comment-ignoring {@link DocumentBuilder} with
     * external entity resolution and external DTD loading switched off, so that
     * parsing untrusted content cannot trigger XXE or fetch remote DTDs.
     */
    private static DocumentBuilder newDocumentBuilder() {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setValidating(false);
        factory.setIgnoringComments(true);
        disableFeature(factory, "http://xml.org/sax/features/external-general-entities");
        disableFeature(factory, "http://xml.org/sax/features/external-parameter-entities");
        disableFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd");
        try {
            return factory.newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            throw new RuntimeException(
                    "Create DocumentBuilder instance occur one exception.", e);
        }
    }

    /** Turns a parser feature off; silently skipped when the parser does not know it. */
    private static void disableFeature(DocumentBuilderFactory factory, String feature) {
        try {
            factory.setFeature(feature, false);
        } catch (ParserConfigurationException ignored) {
            // Best effort: an implementation that does not recognise this
            // feature must still be usable, exactly as before the hardening.
        }
    }

    /**
     * Serialises a W3C {@link Document} to an XML string using a default
     * (identity) {@link Transformer}.
     *
     * @param doc the document to serialise
     * @return the XML text of {@code doc}
     * @throws RuntimeException if the transformation fails
     */
    public String document2String(Document doc) {
        StringWriter writer = new StringWriter();
        try {
            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            transformer.transform(new DOMSource(doc), new StreamResult(writer));
        } catch (TransformerException e) {
            throw new RuntimeException(
                    "Transformer org.w3c.dom.document object occur one exception.", e);
        }
        return writer.toString();
    }

    /**
     * Parses the XML document found at the given location.
     *
     * @param path location of the XML document; handed to
     *             {@link DocumentBuilder#parse(String)}, which treats it as a URI
     * @return the parsed document
     * @throws RuntimeException if the content cannot be read or parsed
     */
    public Document parseDocument(String path) {
        try {
            return builder.parse(path);
        } catch (SAXException e) {
            throw new RuntimeException(
                    "The xml path is invalid or parsing xml occur exception.", e);
        } catch (IOException e) {
            throw new RuntimeException(
                    "The xml path is invalid or parsing xml occur exception.", e);
        }
    }

    /**
     * Parses the given XML file.
     *
     * @param file the XML file
     * @return the parsed document
     * @throws RuntimeException if the file cannot be read or parsed
     */
    public Document parseDocument(File file) {
        try {
            return builder.parse(file);
        } catch (SAXException e) {
            throw new RuntimeException(
                    "The input xml file is null or parsing xml occur exception.", e);
        } catch (IOException e) {
            throw new RuntimeException(
                    "The input xml file is null or parsing xml occur exception.", e);
        }
    }

    /**
     * Parses XML read from the given stream. The stream is not closed here.
     *
     * @param is the stream supplying the XML
     * @return the parsed document
     * @throws RuntimeException if the stream cannot be read or parsed
     */
    public Document parseDocument(InputStream is) {
        try {
            return builder.parse(is);
        } catch (SAXException e) {
            throw new RuntimeException(
                    "The input xml fileInputStream is null or parsing xml occur exception.", e);
        } catch (IOException e) {
            throw new RuntimeException(
                    "The input xml fileInputStream is null or parsing xml occur exception.", e);
        }
    }

    /**
     * Parses a markup fragment held in a string into a {@link Document}.
     * The fragment goes through the XML parser, so it must be well-formed XML
     * with a single root element.
     *
     * @param fragment the markup text
     * @return the parsed document
     * @throws RuntimeException if the fragment cannot be parsed
     */
    public Document fragment2Document(String fragment) {
        try {
            return builder.parse(new InputSource(new StringReader(fragment)));
        } catch (SAXException e) {
            throw new RuntimeException(
                    "parse fragment to document occur SAXException,please check your fragment.", e);
        } catch (IOException e) {
            throw new RuntimeException(
                    "parse fragment to document occur one IOException.", e);
        }
    }

    /**
     * Compiles {@code expression} and evaluates it against {@code node}.
     * Shared by all XPath lookups so they fail in the same way.
     */
    private Object evaluate(Node node, String expression, QName returnType) {
        try {
            XPathExpression compiled = this.xpath.compile(expression);
            return compiled.evaluate(node, returnType);
        } catch (XPathExpressionException e) {
            throw new RuntimeException(XPATH_ERROR, e);
        }
    }

    /**
     * Returns every node matched by an XPath expression.
     *
     * @param node       the context node the expression is evaluated against
     * @param expression the XPath expression
     * @return the matching nodes
     * @throws RuntimeException if the expression cannot be compiled or evaluated
     */
    public NodeList selectNodes(Node node, String expression) {
        return (NodeList) evaluate(node, expression, XPathConstants.NODESET);
    }

    /**
     * Returns the single node selected by an XPath expression.
     *
     * @param node       the context node the expression is evaluated against
     * @param expression the XPath expression
     * @return the result of evaluating the expression as {@code XPathConstants.NODE}
     * @throws RuntimeException if the expression cannot be compiled or evaluated
     */
    public Node selectSingleNode(Node node, String expression) {
        return (Node) evaluate(node, expression, XPathConstants.NODE);
    }

    /**
     * Evaluates an XPath expression as a string, i.e. the text value of the
     * first matching node.
     *
     * @param node       the context node the expression is evaluated against
     * @param expression the XPath expression
     * @return the result of evaluating the expression as {@code XPathConstants.STRING}
     * @throws RuntimeException if the expression cannot be compiled or evaluated
     */
    public String getNodeText(Node node, String expression) {
        return (String) evaluate(node, expression, XPathConstants.STRING);
    }

    /**
     * Collects the text content of every node matched by an XPath expression.
     *
     * @param node       the context node the expression is evaluated against
     * @param expression the XPath expression
     * @return the text content of each match in document order, or {@code null}
     *         when nothing matches (kept for backward compatibility)
     * @throws RuntimeException if the expression cannot be compiled or evaluated
     */
    public List<String> getMultiNodeText(Node node, String expression) {
        NodeList matches = selectNodes(node, expression);
        if (matches == null || matches.getLength() == 0) {
            return null;
        }
        List<String> texts = new ArrayList<String>(matches.getLength());
        for (int i = 0; i < matches.getLength(); i++) {
            texts.add(matches.item(i).getTextContent());
        }
        return texts;
    }

    /**
     * Returns the value of a named attribute on the first node matched by an
     * XPath expression.
     *
     * @param node          the context node the expression is evaluated against
     * @param expression    the XPath expression
     * @param atrributeName the attribute name
     * @return the attribute value, or {@code null} when nothing matches, the
     *         match cannot carry attributes, or the attribute is absent
     * @throws RuntimeException if the expression cannot be compiled or evaluated
     */
    public String getNodeAttributeValue(Node node, String expression,
            String atrributeName) {
        Node matchNode = selectSingleNode(node, expression);
        if (matchNode == null) {
            return null;
        }
        return attributeValue(matchNode, atrributeName);
    }

    /**
     * Collects the value of a named attribute from every node matched by an
     * XPath expression. Matches without that attribute are skipped.
     *
     * @param node          the context node the expression is evaluated against
     * @param expression    the XPath expression, e.g. {@code //div/span[@class]}
     * @param atrributeName the attribute name
     * @return the attribute values in document order, or {@code null} when the
     *         expression matches nothing (kept for backward compatibility)
     * @throws RuntimeException if the expression cannot be compiled or evaluated
     */
    public List<String> getMultiNodeAttributeValue(Node node,
            String expression, String atrributeName) {
        NodeList matches = selectNodes(node, expression);
        if (matches == null || matches.getLength() == 0) {
            return null;
        }
        List<String> values = new ArrayList<String>(matches.getLength());
        for (int i = 0; i < matches.getLength(); i++) {
            String value = attributeValue(matches.item(i), atrributeName);
            if (value != null) {
                values.add(value);
            }
        }
        return values;
    }

    /**
     * Reads one attribute from a node. Returns {@code null} when the node has no
     * attribute map (getAttributes() is null for non-Element nodes such as text
     * nodes) or the attribute is absent.
     */
    private static String attributeValue(Node node, String attributeName) {
        NamedNodeMap attributes = node.getAttributes();
        if (attributes == null) {
            return null;
        }
        Node attribute = attributes.getNamedItem(attributeName);
        return attribute == null ? null : attribute.getNodeValue();
    }

    /**
     * Small demo: parses an inline fragment and prints the text and the
     * {@code id} attribute of every {@code name} element that has an id.
     */
    public static void main(String[] args) throws ParserConfigurationException,
            SAXException, IOException {
        String fragment = "<data><employee><name id=\"1\">益达</name><name id=\"2\">yida</name>"
                + "<title>Manager</title></employee></data>";

        XMLUtils util = XMLUtils.getInstance();
        Document doc = util.fragment2Document(fragment);

        List<String> strList = util.getMultiNodeText(doc, "//employee/name[@id]");
        String s = GerneralUtils.joinCollection(strList);
        System.out.println(s);

        strList = util.getMultiNodeAttributeValue(doc, "//employee/name[@id]", "id");
        s = GerneralUtils.joinCollection(strList);
        System.out.println(s);
    }
}
注意:这里说的 Document 指的都是 org.w3c.dom.Document,而不是 JDOM、DOM4J 或 Jsoup 里的 Document。org.w3c.dom.Document 是 JDK 自带的原生类型。