Reading a large xlsx file with millions of rows in Java

1. I had a free weekend, so I am writing up a problem I ran into this week: the data in a large xlsx file with more than 600,000 rows (60W+) had to be written into HBase.
1. At first I tried the xls approach, but it did not work: an xls sheet can hold at most 65,536 rows (row index 0–65535).
2. Next I tried SXSSFWorkbook, but it is only meant for writing large xlsx files and has no read support. In a local test it ran out of memory, even though the class writes temporary data to local disk while handling the document. A second attempt after some modifications still ran out of memory.
3. Finally, I found a solution on the Apache official website:
official website address: large xlsx file or xml file read
4. After a few simple modifications, writing everything to HBase takes less than a minute. You need the Apache poi-ooxml and rj.jar packages on the classpath. The code below can be used as-is; the first row of the xlsx file must contain the column names:

package com.xlsx.utils;

import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackageAccess;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.function.Consumer;

/**
 * Streams the rows of a large {@code .xlsx} workbook to a callback using Apache POI's
 * SAX-based event API, so the sheet is never materialised in memory as a whole.
 *
 * <p>The first row encountered (row 1 of the first sheet parsed) is treated as the header
 * row: its cell texts become the keys of the maps handed to the consumer. Every following
 * non-empty row is delivered as one {@code header -> cell text} map. When several sheets are
 * processed in one call, the header of the first sheet is reused for the later sheets and
 * their first rows are treated as data.
 *
 * <p>Instances are not designed for concurrent use.
 */
public class ReadBigDataXlsxFile {

    /** Row callback; receives one map per data row. */
    private final Consumer<Map<String,String>> consumer;

    /** Path of the workbook to read. */
    private final String fileName;

    /**
     * @param fileName path of the {@code .xlsx} file to read
     * @param consumer callback invoked once per non-empty data row with a freshly
     *                 allocated {@code header -> cell text} map
     */
    public ReadBigDataXlsxFile(String fileName,Consumer<Map<String,String>> consumer){
        this.fileName = fileName;
        this.consumer = consumer;
    }

    /**
     * Parses a single sheet and feeds its rows to the consumer.
     *
     * @param sheetId relationship id of the sheet, passed to {@code XSSFReader.getSheet}
     * @throws OpenXML4JException if the package cannot be opened or read
     * @throws IOException        if reading or closing the file fails
     * @throws SAXException       if the sheet XML cannot be parsed
     */
    public void processOneSheet(String sheetId) throws OpenXML4JException, IOException, SAXException {
        // Open read-only: the file is never modified, and closing a read-only package
        // does not attempt to save anything back to disk.
        try (OPCPackage pkg = OPCPackage.open(fileName, PackageAccess.READ)) {
            XSSFReader reader = new XSSFReader(pkg);
            XMLReader parser = fetchSheetParser(reader.getSharedStringsTable());
            try (InputStream sheet = reader.getSheet(sheetId)) {
                parser.parse(new InputSource(sheet));
            }
        }
    }

    /**
     * Parses every sheet of the workbook, in the order returned by
     * {@code XSSFReader.getSheetsData()}, and feeds all rows to the consumer.
     *
     * @throws OpenXML4JException if the package cannot be opened or read
     * @throws IOException        if reading or closing the file fails
     * @throws SAXException       if a sheet's XML cannot be parsed
     */
    public void processAllSheets() throws OpenXML4JException, IOException, SAXException {
        try (OPCPackage pkg = OPCPackage.open(fileName, PackageAccess.READ)) {
            XSSFReader reader = new XSSFReader(pkg);
            // One parser (and therefore one handler) is shared by all sheets, which is
            // what carries the header row over from the first sheet to the others.
            XMLReader parser = fetchSheetParser(reader.getSharedStringsTable());
            Iterator<InputStream> sheets = reader.getSheetsData();
            while (sheets.hasNext()) {
                try (InputStream sheet = sheets.next()) {
                    parser.parse(new InputSource(sheet));
                }
            }
        }
    }

    /**
     * Creates a SAX reader whose content handler turns sheet XML into row maps.
     *
     * @param sst shared strings table of the workbook being read
     * @return a reader ready to parse sheet XML streams
     * @throws SAXException if no SAX parser can be created
     */
    public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
        // NOTE(review): the parser is created with default settings; if workbooks can come
        // from untrusted sources, consider a hardened reader (DTDs/external entities off).
        XMLReader xmlReader = XMLReaderFactory.createXMLReader();
        xmlReader.setContentHandler(new SheetHandler(sst, consumer));
        return xmlReader;
    }

    /**
     * SAX handler for worksheet XML: collects the cells of each {@code <row>} and hands
     * the finished row to the consumer when the row element closes.
     */
    private static class SheetHandler extends DefaultHandler{

        /** Text stored for a shared-string cell that has no value. */
        private static final String BLANK = " ";

        private final SharedStringsTable sst;

        /** Per-row processing logic supplied by the caller. */
        private final Consumer<Map<String,String>> consumer;

        /** Header row of the workbook: column letters -> column name. */
        private final Map<String, String> tableHead = new HashMap<>();

        /** Text of the {@code <v>} element currently being read. */
        private final StringBuilder cellText = new StringBuilder();

        /** Cells of the row currently being read; replaced for every row. */
        private Map<String,String> rowData = new HashMap<>();

        /** Number of rows seen so far; row 1 fills {@link #tableHead}. */
        private int rowNum = 0;

        /** Column letters of the current cell, e.g. {@code "A"} or {@code "AB"}. */
        private String key;

        /** Whether the current cell's value is an index into the shared strings table. */
        private boolean cellIsSharedString;

        /** Whether a {@code <v>} element has been seen for the current cell. */
        private boolean cellHasValue;

        /** Whether the parser is currently inside a {@code <v>} element. */
        private boolean insideValue;

        private SheetHandler(SharedStringsTable sst,Consumer<Map<String,String>> consumer){
            this.sst = sst;
            this.consumer = consumer;
        }

        @Override
        public void startElement(String uri, String localName, String name, Attributes attributes) throws SAXException {
            if ("row".equals(name)) {
                rowNum++;
                // Fresh map per row: no stale cells from the previous row, and the
                // consumer may safely keep a reference to the map it was given.
                rowData = new HashMap<>();
            } else if ("c".equals(name)) {
                key = columnLetters(attributes.getValue("r"));
                cellIsSharedString = "s".equals(attributes.getValue("t"));
                cellHasValue = false;
            } else if ("v".equals(name)) {
                cellText.setLength(0);
                insideValue = true;
            }
        }

        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
            if ("v".equals(name)) {
                insideValue = false;
                cellHasValue = true;
                String text = cellText.toString();
                storeCell(cellIsSharedString ? resolveSharedString(text) : text);
            } else if ("c".equals(name)) {
                // A shared-string cell without any value is recorded as a blank.
                if (cellIsSharedString && !cellHasValue) {
                    storeCell(BLANK);
                }
            } else if ("row".equals(name)) {
                // Deliver on row end so the last row is not lost; the header row and
                // rows without any cell are not delivered.
                if (rowNum > 1 && !rowData.isEmpty()) {
                    consumer.accept(rowData);
                }
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            // SAX may split one text node into several calls, so accumulate.
            if (insideValue) {
                cellText.append(ch, start, length);
            }
        }

        /** Puts the current cell into the header (row 1) or into the current data row. */
        private void storeCell(String text) {
            if (rowNum == 1) {
                tableHead.put(key, text);
            } else {
                // Columns without a header cell fall back to their column letters.
                rowData.put(tableHead.getOrDefault(key, key), text);
            }
        }

        /** Looks up a shared-string index; an empty index yields the blank marker. */
        private String resolveSharedString(String indexText) throws SAXException {
            if (indexText.isEmpty()) {
                return BLANK;
            }
            try {
                int index = Integer.parseInt(indexText.trim());
                // XSSFRichTextString also concatenates rich-text runs, for which the
                // plain <t> element of the entry is absent.
                String text = new XSSFRichTextString(sst.getItems().get(index)).toString();
                return text == null ? "" : text;
            } catch (NumberFormatException | IndexOutOfBoundsException e) {
                throw new SAXException("Invalid shared string index '" + indexText
                        + "' in row " + rowNum + ", column " + key, e);
            }
        }

        /** Extracts the column letters from a cell reference, e.g. {@code "AB12" -> "AB"}. */
        private String columnLetters(String cellRef) throws SAXException {
            if (cellRef == null) {
                throw new SAXException("Cell without 'r' attribute in row " + rowNum);
            }
            int end = 0;
            while (end < cellRef.length() && !Character.isDigit(cellRef.charAt(end))) {
                end++;
            }
            return cellRef.substring(0, end);
        }
    }

}

Usage: new ReadBigDataXlsxFile(fileName, e -> {}).processAllSheets();

Guess you like

Origin blog.csdn.net/u013326684/article/details/123289476