poi_sax读取excel,解决大数据量导致的内存溢出问题

前言

  记录一次解决poi读取excel内存溢出问题的过程,使用poi的sax解析方式处理海量数据的excel,也包括对空单元的处理及日期格式转化。

解决过程

   sax事件驱动解析与原来的解析方式最大的区别就是,sax将excel转换成xml格式然后一行去读取,而不是一下将excel解析到内存中,这样就可以有效的解决由数据量大导致的内存溢出问题了,sax解析时会忽略掉空格,这一点要单独去处理。下面用代码来说话。

1.sax将excel转换成xml

  为了更好的使用sax首先看看被转化成xml是什么样子的。

public static void main(String[] args) {
        String filename ="D:\\8月\\otc\\sax解析测试\\0221otcposdata20190909.xlsm";
        OPCPackage pkg;
        try {
            pkg = OPCPackage.open(filename);
            XSSFReader r = new XSSFReader(pkg);
            //查看转换的xml原始文件,方便理解后面解析时的处理,
            InputStream in = r.getSheet("rId1");
            byte[] buf = new byte[1024];
            int len;
            while ((len = in.read(buf)) != -1) {
                System.out.write(buf, 0, len);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

解析后的样子,发现有好多标签,cols、col、row、 c 、v等等这些就是excel中的元素,后面的解析会用到。

2.解析excel2007

  通过继承DefaultHandler类,重写process(),startElement(),characters(),endElement()这四个方法。

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.BuiltinFormats;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFReader.SheetIterator;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFCellStyle;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;


public class ExcelXlsxReader extends DefaultHandler {

    private IExcelRowReader rowReader;

    public void setRowReader(IExcelRowReader rowReader) {
        this.rowReader = rowReader;
    }

    private SharedStringsTable sst;
    private StylesTable stylesTable;
    private String lastContents;
    private int sheetIndex = -1;
    private CellDataType nextDataType = CellDataType.SSTINDEX;
    private final DataFormatter formatter = new DataFormatter();
    private short formatIndex;
    private String formatString;
    private List<List<String>> templist = new ArrayList<List<String>>();
    private List<String> rowList = null;
    private boolean allowNullRow = true;
    private String dimension;
    private int longest;
    private List<String> currentRow;
    private boolean isSSTIndex = false;
    private String sheetName = null;


    /**
     * 遍历工作簿中所有的电子表格
     * 
     * @param filename
     * @throws IOException
     * @throws OpenXML4JException
     * @throws SAXException
     * @throws Exception
     */
    public void process(String filename,String newFileName,String fileType) throws IOException,
            OpenXML4JException, SAXException {
        OPCPackage pkg = OPCPackage.open(filename);
        XSSFReader r = new XSSFReader(pkg);
        stylesTable = r.getStylesTable();
        SharedStringsTable sst = r.getSharedStringsTable();
        XMLReader parser = fetchSheetParser(sst);
        SheetIterator sheets = (SheetIterator) r.getSheetsData();
        while (sheets.hasNext()) {
            InputStream sheet = sheets.next();
            sheetName = sheets.getSheetName();
            sheetIndex++;
            InputSource sheetSource = new InputSource(sheet);
            parser.parse(sheetSource);
            sheet.close();
        }
        //写文件
        rowReader.writeExcel(templist,sheetName,newFileName,fileType);
        templist = null;
    }

    public XMLReader fetchSheetParser(SharedStringsTable sst)
            throws SAXException {
        XMLReader parser = XMLReaderFactory
                .createXMLReader("org.apache.xerces.parsers.SAXParser");
        this.sst = sst;
        parser.setContentHandler(this);
        return parser;
    }

    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        // 得到单元格内容的值
        lastContents += new String(ch, start, length);
    }

    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes attributes) throws SAXException {
        
        if (qName.equals("dimension")) {
            dimension = attributes.getValue("ref");
            longest = covertRowIdtoInt(dimension.substring(dimension
                    .indexOf(":") + 1));
        }
        //
        if (qName.equals("row")) {
            currentRow = new ArrayList<String>();
        }
        //单元格 
        if (qName.equals("c")) {
            // 判断单元格的值是SST的索引
            if (attributes.getValue("t") != null
                    && attributes.getValue("t").equals("s")) {
                isSSTIndex = true;
                nextDataType = CellDataType.SSTINDEX;
            } else {
                isSSTIndex = false;
                // 当单元格的值不是SST的索引放一个空值占位。
                currentRow.add("");
                // 判断单元格格式类型
                setNextDataType(attributes);
            }
        }
        lastContents = "";

    }

    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        // 行结束,存储一行数据
        if (qName.equals("row")) {
            if (isWriteRow(currentRow)) {
                rowList = new ArrayList<String>();
                rowList.addAll(currentRow);
                templist.add(rowList);
                rowList = null;
            }
        }
        // 取值
        if (qName.equals("v")) {
            // 单元格的值是SST的索引
            if (isSSTIndex) {
                lastContents = getDataValue(lastContents);
            } else {
                // 单元格的值不是SST的索引,删除占位空格,再取值
                currentRow.remove(currentRow.size() - 1);
                lastContents = getDataValue(lastContents);
            }
            currentRow.add(lastContents);
        }
    }

    /**
     * 单元格中的数据类型枚举
     */
    enum CellDataType {
        SSTINDEX, NUMBER, DATE, NULL
    }
    
    /**
     * 在取值前处理单元格数据类型,目前只对日期格式进行处理,可扩展。
     */
    public void setNextDataType(Attributes attributes) {
        nextDataType = CellDataType.NUMBER;
        formatIndex = -1;
        formatString = null;
        String dateFormat = "m/d/yy,yyyy\\-mm\\-dd;@,yyyy/m/d;@,yyyy/m/d\\ h:mm;@,mm/dd/yy;@,m/d;@,"
                + "yy/m/d;@,m/d/yy;@,[$-409]yyyy/m/d\\ h:mm\\ AM/PM;@,[$-F800]dddd\\,\\ mmmm\\ dd\\,\\ yyyy";
        String cellStyleStr = attributes.getValue("s");
        if (cellStyleStr != null) {
            int styleIndex = Integer.parseInt(cellStyleStr);
            XSSFCellStyle style = stylesTable.getStyleAt(styleIndex);
            formatIndex = style.getDataFormat();
            formatString = style.getDataFormatString();
            // 对日期类型进行处理
            if (dateFormat.contains(formatString)) {
                nextDataType = CellDataType.DATE;
                formatString = "yyyy-MM-dd";
            }
            if (formatString == null) {
                nextDataType = CellDataType.NULL;
                formatString = BuiltinFormats.getBuiltinFormat(formatIndex);
            }
        }
    }

    private boolean isWriteRow(List<String> list) {
        boolean flag = false;
        for (int i = 0; i < list.size(); i++) {
            if (list.get(i) != null && !list.get(i).equals("")) {
                flag = true;
                break;
            }
        }
        if (allowNullRow) {
            if (!flag) {
                flag = true;// 允许一次空行
                allowNullRow = false;
            }
        }
        return flag;

    }

    public static int covertRowIdtoInt(String rowId) {
        int firstDigit = -1;
        for (int c = 0; c < rowId.length(); ++c) {
            if (Character.isDigit(rowId.charAt(c))) {
                firstDigit = c;
                break;
            }
        }
        String newRowId = rowId.substring(0, firstDigit);
        int num = 0;
        int result = 0;
        int length = newRowId.length();
        for (int i = 0; i < length; i++) {
            char ch = newRowId.charAt(length - i - 1);
            num = (int) (ch - 'A' + 1);
            num *= Math.pow(26, i);
            result += num;
        }
        return result;

    }

    /**
     * 对解析出来的数据进行类型处理
     */
    public String getDataValue(String value) {
        String thisStr = null;
        switch (nextDataType) {
        case SSTINDEX:
            String sstIndex = value.toString();
            try {
                int idx = Integer.parseInt(sstIndex);
                XSSFRichTextString rtss = new XSSFRichTextString(sst
                        .getEntryAt(idx));
                thisStr = rtss.toString();
                rtss = null;
            } catch (NumberFormatException ex) {
                thisStr = value.toString();
            }
            break;
        case DATE:
            thisStr = formatter.formatRawCellContents(
                    Double.parseDouble(value), formatIndex, formatString);
            break;
        default:
            thisStr = value;
            break;
        }

        return thisStr;
    }
    
}

3.解析excel2003

  与读取excel2007有所区别,Excel2003需要通过继承HSSFListener类来解析,重写process()、processRecord()两个方法,在processRecord中进行读取操作。

import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.poi.hssf.eventusermodel.EventWorkbookBuilder.SheetRecordCollectingListener;
import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
import org.apache.poi.hssf.eventusermodel.MissingRecordAwareHSSFListener;
import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord;
import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord;
import org.apache.poi.hssf.model.HSSFFormulaParser;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BlankRecord;
import org.apache.poi.hssf.record.BoolErrRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.FormulaRecord;
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

public class ExcelXlsReader implements HSSFListener {

    private int minColumns = -1;

    private POIFSFileSystem fs;

    private int lastRowNumber;

    private int lastColumnNumber;

    /** Should we output the formula, or the value it has? */
    private boolean outputFormulaValues = true;

    /** For parsing Formulas */
    private SheetRecordCollectingListener workbookBuildingListener;

    // excel2003工作薄
    private HSSFWorkbook stubWorkbook;

    // Records we pick up as we process
    private SSTRecord sstRecord;

    private FormatTrackingHSSFListener formatListener;

    // 表索引
    private int sheetIndex = -1;

    private BoundSheetRecord[] orderedBSRs;

    @SuppressWarnings("unchecked")
    private ArrayList boundSheetRecords = new ArrayList();

    // For handling formulas with string results
    private int nextRow;

    private int nextColumn;

    private boolean outputNextStringRecord;

    // 当前行
    private int curRow = 0;

    // 存储行记录的容器
    private List<String> rowlist = new ArrayList<String>();;
    private boolean allowNullRow = true;
    @SuppressWarnings("unused")
    private String sheetName;
    private List<List<String>> templist = new ArrayList<List<String>>();
    private List<String> r = null;
    private IExcelRowReader rowReader;

    public void setRowReader(IExcelRowReader rowReader) {
        this.rowReader = rowReader;
    }

    /**
     * 遍历excel下所有的sheet
     * 
     * @throws IOException
     */
    public void process(String fileName,String newFileName,String fileType) throws IOException {
        this.fs = new POIFSFileSystem(new FileInputStream(fileName));
        MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this);
        formatListener = new FormatTrackingHSSFListener(listener);
        HSSFEventFactory factory = new HSSFEventFactory();     
        HSSFRequest request = new HSSFRequest();
        if (outputFormulaValues) {
            request.addListenerForAllRecords(formatListener);          
        } else {
            workbookBuildingListener = new SheetRecordCollectingListener(formatListener);
            request.addListenerForAllRecords(workbookBuildingListener);
        }
        factory.processWorkbookEvents(request, fs);
        rowReader.writeExcel(templist,sheetName,newFileName,fileType);    
           templist = null;
    }

    /**
     * HSSFListener 监听方法,处理 Record
     */
    @SuppressWarnings("unchecked")
    public void processRecord(Record record) {
        int thisRow = -1;
        int thisColumn = -1;
        String thisStr = null;
        String value = null;
        switch (record.getSid()) {
        case BoundSheetRecord.sid:
            boundSheetRecords.add(record);
            break;
        case BOFRecord.sid:
            BOFRecord br = (BOFRecord) record;
            if (br.getType() == BOFRecord.TYPE_WORKSHEET) {
                // 如果有需要,则建立子工作薄
                if (workbookBuildingListener != null && stubWorkbook == null) {
                    stubWorkbook = workbookBuildingListener.getStubHSSFWorkbook();
                }

                sheetIndex++;
                if (orderedBSRs == null) {
                    orderedBSRs = BoundSheetRecord.orderByBofPosition(boundSheetRecords);
                }
                sheetName = orderedBSRs[sheetIndex].getSheetname();
                //ExcelRowReader.SHEETNAME = sheetName;
            }
            break;

        case SSTRecord.sid:
            sstRecord = (SSTRecord) record;
            break;

        case BlankRecord.sid:
            BlankRecord brec = (BlankRecord) record;
            thisRow = brec.getRow();
            thisColumn = brec.getColumn();
            thisStr = "";
            rowlist.add(thisColumn, thisStr);
            break;
        case BoolErrRecord.sid: // 单元格为布尔类型
            BoolErrRecord berec = (BoolErrRecord) record;
            thisRow = berec.getRow();
            thisColumn = berec.getColumn();
            thisStr = berec.getBooleanValue() + "";
            rowlist.add(thisColumn, thisStr);
            break;

        case FormulaRecord.sid: // 单元格为公式类型
            FormulaRecord frec = (FormulaRecord) record;
            thisRow = frec.getRow();
            thisColumn = frec.getColumn();
            if (outputFormulaValues) {
                if (Double.isNaN(frec.getValue())) {
                    outputNextStringRecord = true;
                    nextRow = frec.getRow();
                    nextColumn = frec.getColumn();
                } else {
                    thisStr = formatListener.formatNumberDateCell(frec);
                }
            } else {
                thisStr = '"' + HSSFFormulaParser.toFormulaString(stubWorkbook, frec.getParsedExpression()) + '"';
            }
            rowlist.add(thisColumn, thisStr);
            break;
        case StringRecord.sid:// 单元格中公式的字符串
            if (outputNextStringRecord) {
                StringRecord srec = (StringRecord) record;
                thisStr = srec.getString();
                thisRow = nextRow;
                thisColumn = nextColumn;
                outputNextStringRecord = false;
            }
            break;
        case LabelRecord.sid:
            LabelRecord lrec = (LabelRecord) record;
            curRow = thisRow = lrec.getRow();
            thisColumn = lrec.getColumn();
            value = lrec.getValue().trim();
            value = value.equals("") ? " " : value;
            this.rowlist.add(thisColumn, value);
            break;
        case LabelSSTRecord.sid: // 单元格为字符串类型
            LabelSSTRecord lsrec = (LabelSSTRecord) record;
            curRow = thisRow = lsrec.getRow();
            thisColumn = lsrec.getColumn();
            if (sstRecord == null) {
                rowlist.add(thisColumn, " ");
            } else {
                value = sstRecord.getString(lsrec.getSSTIndex()).toString().trim();
                value = value.equals("") ? " " : value;
                rowlist.add(thisColumn, value);
            }
            break;
        case NumberRecord.sid: // 单元格为数字类型
            NumberRecord numrec = (NumberRecord) record;
            curRow = thisRow = numrec.getRow();
            thisColumn = numrec.getColumn();
            value = formatListener.formatNumberDateCell(numrec).trim();
            value = value.equals("") ? " " : value;
            // 向容器加入列值
            rowlist.add(thisColumn, value);
            break;
        default:
            break;
        }

        // 遇到新行的操作
        if (thisRow != -1 && thisRow != lastRowNumber) {
            lastColumnNumber = -1;
        }

        // 空值的操作
        if (record instanceof MissingCellDummyRecord) {
            MissingCellDummyRecord mc = (MissingCellDummyRecord) record;
            curRow = thisRow = mc.getRow();
            thisColumn = mc.getColumn();
            rowlist.add(thisColumn, " ");
        }

        // 更新行和列的值
        if (thisRow > -1)
            lastRowNumber = thisRow;
        if (thisColumn > -1)
            lastColumnNumber = thisColumn;

        // 行结束时的操作
        if (record instanceof LastCellOfRowDummyRecord) {
            if (minColumns > 0) {
                // 列值重新置空
                if (lastColumnNumber == -1) {
                    lastColumnNumber = 0;
                }
            }
            lastColumnNumber = -1;

            // 每行结束时, 调用getRows() 方法
            if(isWriteRow(rowlist)){
                //rowReader.getRows(sheetIndex, curRow, rowlist);
                r = new ArrayList<String>();
                r.addAll(rowlist);
                templist.add(r);
                r = null;
            }
            // 清空容器
            rowlist.clear();
        }
    }
    private boolean isWriteRow(List list){
        boolean flag = false; 
        for (int i = 0; i < list.size(); i++) {
            if (list.get(i)!= null && !list.get(i).equals("")) {
                flag = true;
                break;
            }
        }
        if (allowNullRow) {
            if (!flag) {
                flag = true;//允许一次空行
                allowNullRow = false;
            }            
        }
        return flag;       
    }


}

猜你喜欢

转载自www.cnblogs.com/wangchl/p/11400025.html