关于 Lucene 处理 Word、Excel、PPT、TXT、PDF 等格式文件的代码如下:
- public class ReadFile {
- /**
- * 处理word2003
- * @param path
- * @return
- * @throws Exception
- */
- public static String readWord(String path) throws Exception {
- String bodyText = null;
- InputStream inputStream = new FileInputStream(path);
- WordExtractor extractor = new WordExtractor(inputStream);
- bodyText = extractor.getText();
- return bodyText;
- }
- /**
- * 处理word2007
- * @param path
- * @return
- * @throws IOException
- * @throws OpenXML4JException
- * @throws XmlException
- */
- public static String readWord2007(String path) throws IOException, OpenXML4JException, XmlException {
- OPCPackage opcPackage = POIXMLDocument.openPackage(path);
- POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage);
- return ex.getText();
- }
- /**
- * 处理excel2003
- * @param path
- * @return
- * @throws IOException
- */
- public static String ReadExcel(String path) throws IOException {
- InputStream inputStream = null;
- String content = null;
- try {
- inputStream = new FileInputStream(path);
- HSSFWorkbook wb = new HSSFWorkbook(inputStream);
- ExcelExtractor extractor = new ExcelExtractor(wb);
- extractor.setFormulasNotResults(true);
- extractor.setIncludeSheetNames(false);
- content = extractor.getText();
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- }
- return content;
- }
- /**
- * 处理excel2007
- * @param path
- * @return
- * @throws IOException
- */
- public static String readExcel2007(String path) throws IOException {
- StringBuffer content = new StringBuffer();
- // 构造 XSSFWorkbook 对象,strPath 传入文件路径
- XSSFWorkbook xwb = new XSSFWorkbook(path);
- // 循环工作表Sheet
- for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {
- XSSFSheet xSheet = xwb.getSheetAt(numSheet);
- if (xSheet == null) {
- continue;
- }
- // 循环行Row
- for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {
- XSSFRow xRow = xSheet.getRow(rowNum);
- if (xRow == null) {
- continue;
- }
- // 循环列Cell
- for (int cellNum = 0; cellNum <= xRow.getLastCellNum(); cellNum++) {
- XSSFCell xCell = xRow.getCell(cellNum);
- if (xCell == null) {
- continue;
- }
- if (xCell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {
- content.append(xCell.getBooleanCellValue());
- } else if (xCell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {
- content.append(xCell.getNumericCellValue());
- } else {
- content.append(xCell.getStringCellValue());
- }
- }
- }
- }
- return content.toString();
- }
- /**
- * 处理ppt
- * @param path
- * @return
- */
- public static String readPowerPoint(String path) {
- StringBuffer content = new StringBuffer("");
- InputStream inputStream = null;
- try {
- inputStream = new FileInputStream(path);
- SlideShow ss = new SlideShow(new HSLFSlideShow(new FileInputStream(path)));// is
- // 为文件的InputStream,建立SlideShow
- Slide[] slides = ss.getSlides();// 获得每一张幻灯片
- for (int i = 0; i < slides.length; i++) {
- TextRun[] t = slides[i].getTextRuns();// 为了取得幻灯片的文字内容,建立TextRun
- for (int j = 0; j < t.length; j++) {
- content.append(t[j].getText());// 这里会将文字内容加到content中去
- }
- }
- } catch (Exception ex) {
- System.out.println(ex.toString());
- }
- return content.toString();
- }
- /**
- * 处理pdf
- * @param path
- * @return
- * @throws IOException
- */
- public static String readPdf(String path) throws IOException {
- StringBuffer content = new StringBuffer("");// 文档内容
- PDDocument pdfDocument = null;
- try {
- FileInputStream fis = new FileInputStream(path);
- PDFTextStripper stripper = new PDFTextStripper();
- pdfDocument = PDDocument.load(fis);
- StringWriter writer = new StringWriter();
- stripper.writeText(pdfDocument, writer);
- content.append(writer.getBuffer().toString());
- fis.close();
- } catch (java.io.IOException e) {
- System.err.println("IOException=" + e);
- System.exit(1);
- } finally {
- if (pdfDocument != null) {
- org.pdfbox.cos.COSDocument cos = pdfDocument.getDocument();
- cos.close();
- pdfDocument.close();
- }
- }
- return content.toString();
- }
- /**
- * 处理txt
- * @param path
- * @return
- * @throws IOException
- */
- public static String readTxt(String path) throws IOException {
- StringBuffer sb = new StringBuffer("");
- InputStream is = new FileInputStream(path);
- // 必须设置成GBK,否则将出现乱码
- BufferedReader reader = new BufferedReader(new InputStreamReader(is, "GBK"));
- try {
- String line = "";
- while ((line = reader.readLine()) != null) {
- sb.append(line + "\r");
- }
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- }
- return sb.toString().trim();
- }
- }
使用jar包如下: