有关W3C Document操作的XML工具类

       纯干货,你懂的,各位看官直接看代码:

package com.yida.spider4j.crawler.utils.xml;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;

import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.yida.spider4j.crawler.utils.common.GerneralUtils;

/**
 * Utility for common operations on W3C DOM ({@code org.w3c.dom}) documents:
 * parsing XML from several kinds of source, serialising a document back to a
 * string, and querying nodes, text and attribute values with XPath.
 *
 * <p>A single shared instance is obtained through {@link #getInstance()}.
 * All failures are reported as {@link RuntimeException} carrying the
 * underlying parser/XPath/transformer exception as its cause.
 *
 * <p>NOTE: JAXP does not guarantee that {@code DocumentBuilder} or
 * {@code XPath} objects are thread-safe, and this class shares one of each.
 * Callers that use the instance from several threads should synchronise
 * externally.
 *
 * @since 1.0
 * @author Lanxiaowei
 */
public class XMLUtils {
	/** Message used by every XPath-based method when evaluation fails. */
	private static final String XPATH_ERROR =
			"Compile xpath expression occur excetion,please check out your xpath expression.";

	private final DocumentBuilder builder;

	private final XPath xpath;

	private XMLUtils() {
		this.builder = newDocumentBuilder();
		this.xpath = XPathFactory.newInstance().newXPath();
	}

	/** Holder class: the instance is created when this class is first loaded. */
	private static class SingletonHolder {
		private static final XMLUtils INSTANCE = new XMLUtils();
	}

	/**
	 * Returns the shared {@code XMLUtils} instance.
	 *
	 * @return the singleton instance
	 */
	public static final XMLUtils getInstance() {
		return SingletonHolder.INSTANCE;
	}

	/**
	 * Creates the non-validating, comment-ignoring builder used by all parse
	 * methods. Resolution of external entities and external DTDs is switched
	 * off where the parser supports it, because parsed content may come from
	 * untrusted sources (XXE protection).
	 */
	private static DocumentBuilder newDocumentBuilder() {
		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
		factory.setValidating(false);
		factory.setIgnoringComments(true);
		disableFeature(factory,
				"http://xml.org/sax/features/external-general-entities");
		disableFeature(factory,
				"http://xml.org/sax/features/external-parameter-entities");
		disableFeature(factory,
				"http://apache.org/xml/features/nonvalidating/load-external-dtd");
		try {
			return factory.newDocumentBuilder();
		} catch (ParserConfigurationException e) {
			throw new RuntimeException(
					"Create DocumentBuilder instance occur one exception.", e);
		}
	}

	/** Turns a parser feature off, tolerating parsers that do not know it. */
	private static void disableFeature(DocumentBuilderFactory factory,
			String feature) {
		try {
			factory.setFeature(feature, false);
		} catch (ParserConfigurationException ignored) {
			// Hardening is best-effort: an implementation that does not
			// recognise the feature must not prevent the utility from working.
		}
	}

	/**
	 * Serialises a W3C document to an XML string.
	 *
	 * @param doc the document to serialise
	 * @return the XML text produced by a default {@code Transformer}
	 * @throws RuntimeException if the transformer cannot be created or the
	 *             transformation fails
	 */
	public String document2String(Document doc) {
		StringWriter writer = new StringWriter();
		try {
			Transformer transformer = TransformerFactory.newInstance()
					.newTransformer();
			transformer.transform(new DOMSource(doc), new StreamResult(writer));
		} catch (TransformerException e) {
			throw new RuntimeException(
				"Transformer org.w3c.dom.document object occur one exception.", e);
		}
		return writer.toString();
	}

	/**
	 * Parses the XML document found at the given location.
	 *
	 * @param path location of the XML document, passed as-is to
	 *            {@code DocumentBuilder.parse(String)}
	 * @return the parsed document
	 * @throws RuntimeException if the content cannot be read or is not
	 *             well-formed XML
	 */
	public Document parseDocument(String path) {
		try {
			return builder.parse(path);
		} catch (SAXException e) {
			throw new RuntimeException(
					"The xml path is invalid or parsing xml occur exception.", e);
		} catch (IOException e) {
			throw new RuntimeException(
					"The xml path is invalid or parsing xml occur exception.", e);
		}
	}

	/**
	 * Parses the XML document stored in the given file.
	 *
	 * @param file the XML file
	 * @return the parsed document
	 * @throws RuntimeException if the file cannot be read or is not
	 *             well-formed XML
	 */
	public Document parseDocument(File file) {
		try {
			return builder.parse(file);
		} catch (SAXException e) {
			throw new RuntimeException(
					"The input xml file is null or parsing xml occur exception.", e);
		} catch (IOException e) {
			throw new RuntimeException(
					"The input xml file is null or parsing xml occur exception.", e);
		}
	}

	/**
	 * Parses the XML document read from the given stream. The stream is not
	 * closed by this method.
	 *
	 * @param is stream supplying the XML content
	 * @return the parsed document
	 * @throws RuntimeException if the stream cannot be read or its content is
	 *             not well-formed XML
	 */
	public Document parseDocument(InputStream is) {
		try {
			return builder.parse(is);
		} catch (SAXException e) {
			throw new RuntimeException(
					"The input xml fileInputStream is null or parsing xml occur exception.", e);
		} catch (IOException e) {
			throw new RuntimeException(
					"The input xml fileInputStream is null or parsing xml occur exception.", e);
		}
	}

	/**
	 * Parses a markup fragment held in a string into a document. The fragment
	 * goes through the XML parser, so it must be well-formed XML.
	 *
	 * @param fragment the markup text
	 * @return the parsed document
	 * @throws RuntimeException if the fragment is not well-formed or cannot
	 *             be read
	 */
	public Document fragment2Document(String fragment) {
		try {
			return builder.parse(new InputSource(new StringReader(fragment)));
		} catch (SAXException e) {
			throw new RuntimeException(
					"parse fragment to document occur SAXException,please check your fragment.", e);
		} catch (IOException e) {
			throw new RuntimeException(
					"parse fragment to document occur one IOException.", e);
		}
	}

	/**
	 * Selects every node matching an XPath expression.
	 *
	 * @param node context node the expression is evaluated against
	 * @param expression the XPath expression
	 * @return the matching nodes
	 * @throws RuntimeException if the expression cannot be compiled or
	 *             evaluated
	 */
	public NodeList selectNodes(Node node, String expression) {
		return (NodeList) evaluate(node, expression, XPathConstants.NODESET);
	}

	/**
	 * Selects a single node matching an XPath expression.
	 *
	 * @param node context node the expression is evaluated against
	 * @param expression the XPath expression
	 * @return the result of evaluating the expression as a single node
	 * @throws RuntimeException if the expression cannot be compiled or
	 *             evaluated
	 */
	public Node selectSingleNode(Node node, String expression) {
		return (Node) evaluate(node, expression, XPathConstants.NODE);
	}

	/**
	 * Evaluates an XPath expression as a string; when several nodes match,
	 * only the first one contributes its text.
	 *
	 * @param node context node the expression is evaluated against
	 * @param expression the XPath expression
	 * @return the string value of the expression
	 * @throws RuntimeException if the expression cannot be compiled or
	 *             evaluated
	 */
	public String getNodeText(Node node, String expression) {
		return (String) evaluate(node, expression, XPathConstants.STRING);
	}

	/**
	 * Collects the text content of every node matching an XPath expression.
	 *
	 * @param node context node the expression is evaluated against
	 * @param expression the XPath expression
	 * @return the text content of each match in result order, or
	 *         {@code null} when nothing matches
	 * @throws RuntimeException if the expression cannot be compiled or
	 *             evaluated
	 */
	public List<String> getMultiNodeText(Node node, String expression) {
		NodeList nodeList = selectNodes(node, expression);
		if (null == nodeList || nodeList.getLength() == 0) {
			return null;
		}
		List<String> list = new ArrayList<String>();
		for (int i = 0; i < nodeList.getLength(); i++) {
			list.add(nodeList.item(i).getTextContent());
		}
		return list;
	}

	/**
	 * Reads an attribute from the single node selected by an XPath
	 * expression.
	 *
	 * @param node context node the expression is evaluated against
	 * @param expression the XPath expression
	 * @param atrributeName name of the attribute to read
	 * @return the attribute value, or {@code null} when no node matches, the
	 *         matched node cannot carry attributes, or the attribute is
	 *         absent
	 * @throws RuntimeException if the expression cannot be compiled or
	 *             evaluated
	 */
	public String getNodeAttributeValue(Node node,
			String expression, String atrributeName) {
		Node matchNode = selectSingleNode(node, expression);
		if (null == matchNode) {
			return null;
		}
		return attributeValue(matchNode, atrributeName);
	}

	/**
	 * Reads an attribute from every node matching an XPath expression.
	 * Matches that do not carry the attribute are skipped.
	 *
	 * @param node context node the expression is evaluated against
	 * @param expression the XPath expression, e.g. {@code //div/span[@class]}
	 * @param atrributeName name of the attribute to read
	 * @return the attribute values found, in result order (possibly empty),
	 *         or {@code null} when the expression matches nothing
	 * @throws RuntimeException if the expression cannot be compiled or
	 *             evaluated
	 */
	public List<String> getMultiNodeAttributeValue(Node node, String expression,String atrributeName) {
		NodeList nodeList = selectNodes(node, expression);
		if (null == nodeList || nodeList.getLength() == 0) {
			return null;
		}
		List<String> list = new ArrayList<String>();
		for (int i = 0; i < nodeList.getLength(); i++) {
			Node attNode = attributeNode(nodeList.item(i), atrributeName);
			if (null == attNode) {
				continue;
			}
			list.add(attNode.getNodeValue());
		}
		return list;
	}

	/** Compiles and evaluates an XPath expression, wrapping any failure. */
	private Object evaluate(Node node, String expression, QName returnType) {
		try {
			return this.xpath.compile(expression).evaluate(node, returnType);
		} catch (XPathExpressionException e) {
			throw new RuntimeException(XPATH_ERROR, e);
		}
	}

	/**
	 * Looks up an attribute node by name. Returns {@code null} both when the
	 * attribute is missing and when the node has no attribute map at all
	 * ({@code Node.getAttributes()} is null for non-element nodes).
	 */
	private static Node attributeNode(Node node, String attributeName) {
		NamedNodeMap attributes = node.getAttributes();
		if (null == attributes) {
			return null;
		}
		return attributes.getNamedItem(attributeName);
	}

	/** Returns the value of the named attribute, or {@code null} if absent. */
	private static String attributeValue(Node node, String attributeName) {
		Node attNode = attributeNode(node, attributeName);
		if (null == attNode) {
			return null;
		}
		return attNode.getNodeValue();
	}

	/**
	 * Small demonstration: parses an inline fragment and prints the text and
	 * the {@code id} attribute of every {@code name} element that has one.
	 *
	 * <p>The checked exceptions in the signature are kept for compatibility;
	 * the utility methods called here report failures as
	 * {@link RuntimeException}.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) throws ParserConfigurationException,
			SAXException, IOException {
		String fragment = "<data><employee><name id=\"1\">益达</name><name id=\"2\">yida</name>"
				+ "<title>Manager</title></employee></data>";

		XMLUtils util = XMLUtils.getInstance();
		Document doc = util.fragment2Document(fragment);

		List<String> strList = util.getMultiNodeText(doc, "//employee/name[@id]");
		String s = GerneralUtils.joinCollection(strList);
		System.out.println(s);

		strList = util.getMultiNodeAttributeValue(doc, "//employee/name[@id]", "id");
		s = GerneralUtils.joinCollection(strList);
		System.out.println(s);
	}
}

 

    注意:这里说的 Document 指的都是 org.w3c.dom.Document,而不是 JDOM、DOM4J 或 Jsoup 里的 Document。org.w3c.dom.Document 是 JDK 原生对象。

 

猜你喜欢

转载自iamyida.iteye.com/blog/2247529
w3c