Converting PDF to DOC with Java

PDF to DOC

Someone recently shared some technical documentation with me, but it was in PDF format, and I wanted DOC files that I could open in an editor. Most of the converters I found online charge a fee, and I did not want to pay for something this simple, so I wrote my own solution.
I have tested it myself and it works, so feel free to use it as is!
Enough talk, here is the code!

package com.rick1024k.data_conversion.PDF;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

import com.alibaba.fastjson.JSONObject;
import com.rick1024k.data_conversion.utils.FileUtils;

/**
 * Command-line helper that extracts the text of PDF files with PDFBox and
 * writes it to files carrying a {@code .doc} extension.
 *
 * <p>The output is the UTF-8 text produced by {@link PDFTextStripper}; only
 * the file name ends in {@code .doc}.
 */
public class PDFTools {

	/**
	 * Extracts the text of one PDF file and writes it next to the source file,
	 * using the same base name with a {@code .doc} extension. An existing
	 * output file is overwritten.
	 *
	 * <p>Files whose name does not end in {@code .pdf} (compared ignoring
	 * case) are skipped silently. An {@link IOException} during conversion is
	 * reported on standard output and is not rethrown.
	 *
	 * @param pdfPath directory that contains the PDF file
	 * @param pdfName file name of the PDF, including its extension
	 */
	public static void pdf2DOC(String pdfPath, String pdfName) {
		// A name without a dot has no extension at all, so it cannot be a PDF;
		// rejecting it here also keeps the substring calls below in range.
		int dotIndex = pdfName.lastIndexOf('.');
		if (dotIndex < 0 || !pdfName.substring(dotIndex + 1).equalsIgnoreCase("pdf")) {
			return;
		}

		// Let File join directory and name instead of hard-coding a separator.
		File pdfFile = new File(pdfPath, pdfName);
		File docFile = new File(pdfPath, pdfName.substring(0, dotIndex) + ".doc");

		// The document is loaded before the output stream is opened, so an
		// unreadable PDF does not leave an empty .doc file behind.
		// try-with-resources closes all three resources even when extraction fails.
		try (PDDocument doc = PDDocument.load(pdfFile);
				FileOutputStream fos = new FileOutputStream(docFile);
				Writer writer = new OutputStreamWriter(fos, "UTF-8")) {
			PDFTextStripper stripper = new PDFTextStripper();
			stripper.setSortByPosition(true);
			// Convert every page, from the first to the last.
			stripper.setStartPage(1);
			stripper.setEndPage(doc.getNumberOfPages());
			stripper.writeText(doc, writer);
			System.out.println(pdfFile.getPath() + "--------文件转换成功!");
		} catch (IOException e) {
			System.out.println(pdfFile.getPath() + "--------文件转换报错!");
			e.printStackTrace();
		}
	}

	/**
	 * Collects the files under {@code path} via {@code FileUtils.getAllFile},
	 * prints the collected map as JSON, and passes every (directory, file name)
	 * pair to {@link #pdf2DOC(String, String)}.
	 *
	 * <p>Despite its name, this method produces {@code .doc} files, not
	 * Markdown.
	 *
	 * @param path root directory to scan
	 */
	public static void PDF2Markdown(String path) {
		Map<String, List<String>> fileMap = new HashMap<>();
		FileUtils.getAllFile(path, fileMap);
		System.out.println(JSONObject.toJSONString(fileMap));
		for (Map.Entry<String, List<String>> entry : fileMap.entrySet()) {
			for (String fileName : entry.getValue()) {
				pdf2DOC(entry.getKey(), fileName);
			}
		}
	}

	/**
	 * Runs the conversion on a hard-coded folder.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		String path = "C:/Resource/RICK/DOC/PDF文件夹";
		PDF2Markdown(path);
	}

}

  • File utility class:
package com.rick1024k.data_conversion.utils;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class FileUtils {

	public static void getAllFile(String path, Map<String, List<String>> fileMap) {
		getAllFile(path, fileMap, 0);
	}

	/*
	 * 函数名:getFile 作用:使用递归,输出指定文件夹内的所有文件 参数:path:文件夹路径 deep:表示文件的层次深度,控制前置空格的个数
	 * 前置空格缩进,显示文件层次结构
	 */
	public static void getAllFile(String path, Map<String, List<String>> fileMap, int deep) {
		// 获得指定文件对象
		File file = new File(path);
		// 获得该文件夹内的所有文件
		File[] fileList = file.listFiles();
		List<String> fileNameList = new ArrayList<>();
		for (File fileData : fileList) {
			if (fileData.isFile()) {// 如果是文件
				fileNameList.add(fileData.getName());
			} else if (fileData.isDirectory()) {// 如果是文件夹
				getAllFile(fileData.getPath(), fileMap, deep + 1);
			}
		}
		fileMap.put(path, fileNameList);
	}
}

  • Required Maven dependency:
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
		<dependency>
			<groupId>org.apache.pdfbox</groupId>
			<artifactId>pdfbox</artifactId>
			<version>2.0.18</version>
		</dependency>
Published 17 original articles · won praise 6 · views 30000 +

Guess you like

Origin blog.csdn.net/keenstyle/article/details/104343567