花了一天时间整理了网上查到的把 Word 转为 HTML 的方法,同时支持旧版 .doc 以及 Word 2007 及以上版本(.docx)的转换。
代码如下(整合了前辈们的代码):
参考博客地址:http://blog.csdn.net/ptzrbin/article/details/43449701
http://blog.csdn.net/u011687117/article/details/29561027
package data.util;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.util.List;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.jsoup.Jsoup;
import org.w3c.dom.Document;
import org.xml.sax.ContentHandler;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.IURIResolver;
import org.apache.poi.xwpf.converter.xhtml.DefaultContentHandlerFactory;
import org.apache.poi.xwpf.converter.xhtml.IContentHandlerFactory;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
/**
* Word转换为Html并读取Html内容工具类
*/
public class WechatWord2Html {
//输出html文件
public static void writeFile(String content, String path) {
FileOutputStream fos = null;
BufferedWriter bw = null;
org.jsoup.nodes.Document doc = Jsoup.parse(content);
content=doc.html();
try {
File file = new File(path);
fos = new FileOutputStream(file);
bw = new BufferedWriter(new OutputStreamWriter(fos,"GB2312"));
bw.write(content);
} catch (FileNotFoundException fnfe) {
fnfe.printStackTrace();
} catch (IOException ioe) {
ioe.printStackTrace();
} finally {
try {
if (bw != null)
bw.close();
if (fos != null)
fos.close();
} catch (IOException ie) {
}
}
}
/**
* Word 转 Html
* 依赖jar包: ooxml-schemas-1.1.jar ;
* org.apache.poi.xwpf.converter.core-1.0.4.jar ;
* org.apache.poi.xwpf.converter.xhtml-1.0.4.jar ;
* @param fileName
* @param outPutFile
* @param fileNameExtension
* @throws TransformerException
* @throws IOException
* @throws ParserConfigurationException
*/
public static void convert2Html(String filePath, String outPutFile ,String fileNameExtension)
throws TransformerException, IOException, ParserConfigurationException {
//filePath :Word文件路径
//outPutFile : 输出文件存放路径
//fileNameExtension : Word后缀
if(fileNameExtension.equals("doc")){ //老版本
HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(filePath));
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter( DocumentBuilderFactory.newInstance().newDocumentBuilder() .newDocument()); wordToHtmlConverter.setPicturesManager( new PicturesManager() { public String savePicture( byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches ) { return "test/"+suggestedName; } } ); wordToHtmlConverter.processDocument(wordDocument); //save pictures List pics=wordDocument.getPicturesTable().getAllPictures(); if(pics!=null){ for(int i=0;i<pics.size();i++){ Picture pic = (Picture)pics.get(i); System.out.println(); try { pic.writeImageContent(new FileOutputStream("D:/test/" + pic.suggestFullFileName())); } catch (FileNotFoundException e) { e.printStackTrace(); } } } Document htmlDocument = wordToHtmlConverter.getDocument(); ByteArrayOutputStream out = new ByteArrayOutputStream(); DOMSource domSource = new DOMSource(htmlDocument); StreamResult streamResult = new StreamResult(out); TransformerFactory tf = TransformerFactory.newInstance(); Transformer serializer = tf.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, "GB2312"); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "HTML"); serializer.transform(domSource, streamResult); out.close(); writeFile(new String(out.toByteArray()), outPutFile); }else if(fileNameExtension.equals("docx")){ //新版本 XWPFDocument document = new XWPFDocument(new FileInputStream(new File(filePath))); XHTMLOptions options = XHTMLOptions.create();// .indent( 4 ); IContentHandlerFactory f = new DefaultContentHandlerFactory(); // Extract image options.setExtractor(new FileImageExtractor(new File("D:/"))); // URI resolver options.URIResolver(new IURIResolver() { @Override public String resolve(String uri) { return "D:/" + uri; } }); ByteArrayOutputStream out = new ByteArrayOutputStream(); ContentHandler contentHandler = f.create(out, null, options); 
XHTMLConverter.getInstance().convert(document, out, options); out.close(); writeFile(new String(out.toByteArray()), outPutFile); } } /** * Html内容提取为String * @param filePath * @return */ public static String readfile(String filePath){ File file = new File(filePath); InputStream input = null; try { input = new FileInputStream(file); } catch (FileNotFoundException e) { e.printStackTrace(); } StringBuffer buffer = new StringBuffer(); byte[] bytes = new byte[1024]; try { for (int n; (n = input.read(bytes)) != -1;) { buffer.append(new String(bytes, 0, n, "GBK")); } } catch (IOException e) { e.printStackTrace(); } // System.out.println(buffer); return buffer.toString(); } /** * 读取html的body内容为String * @param val * @return */ public static String getBody(String val) { String start = "<body>"; String end = "</body>"; int s = val.indexOf(start) + start.length(); int e = val.indexOf(end); return val.substring(s, e); } }