Android byte 数组转 String

不知道什么原因,Android 的 String.java 中有好多方法(主要是各个构造方法)不能直接使用,其源码实现只是抛出这样的异常:

throw new UnsupportedOperationException("Use StringFactory instead.");

但是,StringFactory.java 这个类又被标记为 @hide,应用代码无法直接调用,只好把其中的方法拷贝出来单独使用:

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

/**
 * Static helpers that decode a byte array into a {@link String}.
 *
 * <p>The UTF-8 decoder is adapted from Android's hidden {@code StringFactory} class. It replaces
 * every ill-formed subsequence with U+FFFD following the "maximal subpart" practice recommended
 * by the Unicode Standard. All other charsets are delegated to {@link Charset#decode}.
 *
 * <p>The class keeps no state between calls; every method works only on its arguments.
 */
public final class Utils {

    /** U+FFFD REPLACEMENT CHARACTER, emitted once for each ill-formed UTF-8 subsequence. */
    private static final char REPLACEMENT_CHAR = (char) 0xfffd;

    /**
     * Number of continuation bytes expected after a lead byte in the range 0xc0..0xff, indexed by
     * {@code leadByte & 0x3f}. A value of 0 marks a byte that can never start a well-formed
     * sequence (0xc0, 0xc1 and 0xf5..0xff).
     */
    private static final int[] TABLE_UTF8_NEEDED = {
            //0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f
            0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xc0 - 0xcf
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0xd0 - 0xdf
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xe0 - 0xef
            3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xf0 - 0xff
    };

    /** Not instantiable: this class only hosts static helpers. */
    private Utils() {
    }

    /**
     * Decodes the whole array as UTF-8.
     *
     * @param data the bytes to decode
     * @return the decoded string
     * @throws NullPointerException if {@code data} is null
     */
    public static String newStringFromBytes(byte[] data) {
        if (data == null) {
            throw new NullPointerException("data == null");
        }
        return newStringFromBytes(data, 0, data.length);
    }

    /**
     * Decodes {@code byteCount} bytes starting at {@code offset} as UTF-8.
     *
     * @param data the bytes to decode
     * @param offset index of the first byte to decode
     * @param byteCount number of bytes to decode
     * @return the decoded string
     * @throws NullPointerException if {@code data} is null
     * @throws IndexOutOfBoundsException if the region does not lie inside {@code data}
     */
    public static String newStringFromBytes(byte[] data, int offset, int byteCount) {
        return newStringFromBytes(data, offset, byteCount, StandardCharsets.UTF_8);
    }

    /**
     * Decodes {@code byteCount} bytes starting at {@code offset} using {@code charset}.
     *
     * <p>UTF-8 is decoded by the inlined decoder in this class; any other charset is decoded
     * through {@link Charset#decode}.
     *
     * @param data the bytes to decode
     * @param offset index of the first byte to decode
     * @param byteCount number of bytes to decode
     * @param charset the charset the bytes are encoded in
     * @return the decoded string
     * @throws NullPointerException if {@code data} or {@code charset} is null
     * @throws IndexOutOfBoundsException if the region does not lie inside {@code data}
     */
    public static String newStringFromBytes(byte[] data, int offset, int byteCount,
            Charset charset) {
        if (data == null) {
            throw new NullPointerException("data == null");
        }
        // (offset | byteCount) < 0 is true when either value is negative; the subtraction form
        // of the second test cannot overflow the way offset + byteCount could.
        if ((offset | byteCount) < 0 || byteCount > data.length - offset) {
            throw new IndexOutOfBoundsException("length=" + data.length + "; regionStart=" + offset
                    + "; regionLength=" + byteCount);
        }
        if (charset == null) {
            throw new NullPointerException("charset == null");
        }

        // Only the UTF-8 decoder is inlined; everything else goes through the Charset.
        if ("UTF-8".equals(charset.name())) {
            return decodeUtf8(data, offset, byteCount);
        }
        // CharBuffer.toString() copies exactly the decoded characters, so no assumption about
        // the buffer's backing array is needed.
        return charset.decode(ByteBuffer.wrap(data, offset, byteCount)).toString();
    }

    /**
     * Converts a UTF-8 byte region (already bounds-checked by the caller) to a String.
     *
     * <p>It implements the W3C recommended UTF-8 decoder:
     * https://www.w3.org/TR/encoding/#utf-8-decoder
     *
     * <pre>
     * Unicode 3.2 Well-Formed UTF-8 Byte Sequences
     * Code Points        First  Second Third Fourth
     * U+0000..U+007F     00..7F
     * U+0080..U+07FF     C2..DF 80..BF
     * U+0800..U+0FFF     E0     A0..BF 80..BF
     * U+1000..U+CFFF     E1..EC 80..BF 80..BF
     * U+D000..U+D7FF     ED     80..9F 80..BF
     * U+E000..U+FFFF     EE..EF 80..BF 80..BF
     * U+10000..U+3FFFF   F0     90..BF 80..BF 80..BF
     * U+40000..U+FFFFF   F1..F3 80..BF 80..BF 80..BF
     * U+100000..U+10FFFF F4     80..8F 80..BF 80..BF
     * </pre>
     *
     * <p>Malformed input: each maximal subpart of an ill-formed subsequence is replaced by a
     * single U+FFFD, and decoding restarts right after it. For example the bytes
     * {@code 41 C0 AF 41 F4 80 80 41} have the maximal subparts {@code C0}, {@code AF} and
     * {@code F4 80 80}, so the output is {@code "A\ufffd\ufffdA\ufffdA"}. See "Best Practices
     * for Using U+FFFD" in chapter 3 of the Unicode Standard.
     */
    private static String decodeUtf8(byte[] data, int offset, int byteCount) {
        // Every output char consumes at least one input byte (a surrogate pair consumes four),
        // so byteCount chars is always enough room.
        char[] out = new char[byteCount];
        int outLen = 0;

        int idx = offset;
        int end = offset + byteCount;

        int codePoint = 0;
        int utf8BytesSeen = 0;
        int utf8BytesNeeded = 0;
        // Allowed range for the next continuation byte; narrowed after E0/ED/F0/F4 lead bytes
        // to reject overlong forms, surrogates and code points above U+10FFFF.
        int lowerBound = 0x80;
        int upperBound = 0xbf;

        while (idx < end) {
            int b = data[idx++] & 0xff;
            if (utf8BytesNeeded == 0) {
                if ((b & 0x80) == 0) { // ASCII char. 0xxxxxxx
                    out[outLen++] = (char) b;
                    continue;
                }

                if ((b & 0x40) == 0) { // 10xxxxxx is illegal as first byte
                    out[outLen++] = REPLACEMENT_CHAR;
                    continue;
                }

                // 11xxxxxx
                utf8BytesNeeded = TABLE_UTF8_NEEDED[b & 0x3f];
                if (utf8BytesNeeded == 0) {
                    out[outLen++] = REPLACEMENT_CHAR;
                    continue;
                }

                // Payload bits of the lead byte, by utf8BytesNeeded:
                // 1: b & 0x1f
                // 2: b & 0x0f
                // 3: b & 0x07
                codePoint = b & (0x3f >> utf8BytesNeeded);
                if (b == 0xe0) {
                    lowerBound = 0xa0;
                } else if (b == 0xed) {
                    upperBound = 0x9f;
                } else if (b == 0xf0) {
                    lowerBound = 0x90;
                } else if (b == 0xf4) {
                    upperBound = 0x8f;
                }
            } else {
                if (b < lowerBound || b > upperBound) {
                    // The bytes seen so far are ill-formed. Substitute them with U+FFFD.
                    out[outLen++] = REPLACEMENT_CHAR;
                    codePoint = 0;
                    utf8BytesNeeded = 0;
                    utf8BytesSeen = 0;
                    lowerBound = 0x80;
                    upperBound = 0xbf;
                    // A decoder must never consume well-formed subsequences as part of its
                    // error handling: the current byte may itself start a valid sequence, so
                    // step back and parse it again in the next iteration.
                    idx--;
                    continue;
                }

                // The narrowed range applies to the first continuation byte only.
                lowerBound = 0x80;
                upperBound = 0xbf;
                codePoint = (codePoint << 6) | (b & 0x3f);
                utf8BytesSeen++;
                if (utf8BytesNeeded != utf8BytesSeen) {
                    continue;
                }

                // Encode chars from U+10000 up as surrogate pairs.
                if (codePoint < 0x10000) {
                    out[outLen++] = (char) codePoint;
                } else {
                    out[outLen++] = (char) ((codePoint >> 10) + 0xd7c0);
                    out[outLen++] = (char) ((codePoint & 0x3ff) + 0xdc00);
                }

                utf8BytesSeen = 0;
                utf8BytesNeeded = 0;
                codePoint = 0;
            }
        }

        // The input ended in the middle of a sequence: substitute the truncated tail by U+FFFD.
        if (utf8BytesNeeded != 0) {
            out[outLen++] = REPLACEMENT_CHAR;
        }

        // Copies only the used prefix; no intermediate trimmed array or shared builder needed.
        return String.valueOf(out, 0, outLen);
    }
}

猜你喜欢

转载自blog.csdn.net/hegan2010/article/details/107118640
今日推荐