Java应用之——读取Word英文文档,统计单词频次并输出到Excel

本文是Java读取文件、写入文件的一个应用,相关知识见上篇博文:https://blog.csdn.net/nnnora/article/details/80734917

一个例子:读取word英文文档,统计单词频次并输出到excel

    /** Matches one "word": a run of ASCII letters, apostrophes and hyphens. Compiled once and reused. */
    private static final Pattern WORD_PATTERN = Pattern.compile("[a-zA-Z'\\-]+");

    /**
     * Reads an English Word document, counts how often each word occurs, and writes the
     * result (most frequent first) to the Excel file {@code wordsResult.xls} in the
     * current working directory.
     *
     * <p>NOTE(review): the {@code file} argument is currently not consulted; the document
     * is always loaded from a fixed classpath resource. Confirm with callers whether the
     * uploaded file should be used instead.
     *
     * @param file the uploaded document (currently unused, see note above)
     * @return the stream that was used to write {@code wordsResult.xls}; it has already
     *         been closed when this method returns. {@code null} if reading the document
     *         failed or the output file could not be opened.
     */
    public FileOutputStream getwordsFrequency(MultipartFile file) {
        FileOutputStream fos = null;
        try {
            // Read the Word document.
            String path = "Shocking level of sexual harassment at music festivals.docx";
            String buffer = readWordText(path);

            // TreeMap keeps the words in alphabetical order; the stable sort below
            // therefore leaves words with equal counts alphabetically ordered.
            Map<String, Integer> map = new TreeMap<>();
            int n = countWords(buffer, map);

            List<Map.Entry<String, Integer>> list = new ArrayList<>(map.entrySet());
            Collections.sort(list, new Comparator<Map.Entry<String, Integer>>() {
                @Override
                public int compare(Map.Entry<String, Integer> m1, Map.Entry<String, Integer> m2) {
                    // Descending by occurrence count.
                    return m2.getValue().compareTo(m1.getValue());
                }
            });
            System.out.println("统计分析如下:");
            System.out.println("文章中单词总数" + n + "个");

            // Write the result to Excel. The workbook is closed even if filling or
            // writing it fails.
            try (HSSFWorkbook workbook = new HSSFWorkbook()) {
                fillSheet(workbook, list);

                // Save the workbook. try-with-resources closes the stream on both the
                // success and the failure path; the (closed) stream is still returned
                // to keep the original contract.
                try (FileOutputStream out = new FileOutputStream("wordsResult.xls")) {
                    fos = out;
                    workbook.write(out);
                    logger.info("写入成功");
                } catch (IOException e) {
                    logger.error("@@WriteFileError", e);
                }
            }
        } catch (Exception e) {
            logger.error("@@ReadFileError", e);
        }
        return fos;
    }

    /**
     * Extracts the plain text of a {@code .doc} or {@code .docx} classpath resource.
     * Returns an empty string (after printing a notice) for any other extension.
     */
    private String readWordText(String path) throws IOException {
        if (path.endsWith(".doc")) {
            try (InputStream in = new ClassPathResource(path).getInputStream();
                 WordExtractor wordExtractor = new WordExtractor(in)) {
                return wordExtractor.getText();
            }
        }
        if (path.endsWith(".docx")) {
            try (InputStream in = new ClassPathResource(path).getInputStream();
                 XWPFDocument xdoc = new XWPFDocument(in)) {
                return new XWPFWordExtractor(xdoc).getText();
            }
        }
        System.out.println("此文件不是word文件!");
        return "";
    }

    /**
     * Adds the occurrence count of every word found in {@code text} to {@code counts}
     * and returns the total number of words matched.
     */
    private int countWords(String text, Map<String, Integer> counts) {
        Matcher matcher = WORD_PATTERN.matcher(text);
        int total = 0;
        while (matcher.find()) {
            String word = matcher.group();
            Integer previous = counts.get(word);
            // First occurrence starts at 1, otherwise bump the existing count.
            counts.put(word, previous == null ? 1 : previous + 1);
            total++;
        }
        return total;
    }

    /**
     * Creates the frequency sheet in {@code workbook}: a header row followed by one
     * row per entry, in the order given.
     */
    private void fillSheet(HSSFWorkbook workbook, List<Map.Entry<String, Integer>> entries) {
        HSSFSheet sheet = workbook.createSheet("词频文件");

        // Header row (row 0).
        HSSFRow header = sheet.createRow(0);
        header.createCell(0).setCellValue("单词");
        header.createCell(1).setCellValue("频次");

        // Data rows start at row 1.
        int rowIndex = 1;
        for (Map.Entry<String, Integer> entry : entries) {
            HSSFRow row = sheet.createRow(rowIndex++);
            row.createCell(0).setCellValue(entry.getKey());
            row.createCell(1).setCellValue(entry.getValue());
        }
    }

猜你喜欢

转载自blog.csdn.net/NNnora/article/details/80735345
今日推荐