Unicode码点转UTF-8编码

UTF-8是Unicode的一种实现(编码)方案。简单来说,Unicode为每个字符规定了一个唯一的数值,即码点(code point),字符与码点一一对应;而UTF-8规定了如何把码点编码成1~4个字节:先根据码点所在的范围确定字节数,再把码点的各个二进制位按固定格式填入这些字节,从而有组织地得到编码值

这里写图片描述

查看文字的Unicode码点与UTF-8编码值

一般的实现中,位数达到21位的码点(需要4个字节)的编码值是通过多个码元组合得到的;这里为了简单,直接把1~4个UTF-8字节按高位字节在前的顺序拼进一个uint32_t中返回

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <stdint.h>

/*
 * Character-class predicates for hexadecimal digits.
 * Note: despite its name, IS_ONE2NINE accepts '0' through '9'.
 *
 * Every macro argument is parenthesized so that an expression argument
 * (e.g. IS_ONE_CPB(a & b)) is evaluated as a unit. Arguments are expanded
 * twice, so do not pass expressions with side effects.
 */
#define IS_ONE2NINE(ch) (((ch) >= '0') && ((ch) <= '9'))
#define IS_A2F(ch) (((ch) >= 'A') && ((ch) <= 'F'))
#define IS_a2f(ch) (((ch) >= 'a') && ((ch) <= 'f'))

/*
 * Code-point range predicates: how many UTF-8 bytes a code point needs.
 * NOTE(review): IS_THREE_CPB also accepts the surrogate range
 * 0xD800..0xDFFF, which is not valid in well-formed UTF-8; the original
 * ranges are kept unchanged for compatibility.
 */
#define IS_ONE_CPB(u) (((u) >= (uint32_t)0x0000) && ((u) <= (uint32_t)0x007F))
#define IS_TWO_CPB(u) (((u) >= (uint32_t)0x0080) && ((u) <= (uint32_t)0x07FF))
#define IS_THREE_CPB(u) (((u) >= (uint32_t)0x0800) && ((u) <= (uint32_t)0xFFFF))
#define IS_FOUR_CPB(u) (((u) >= (uint32_t)0x10000) && ((u) <= (uint32_t)0x10FFFF))


/*
 * Parse m hexadecimal digits starting at s and return their value.
 *
 * s: points at the first hex digit (upper- or lower-case letters accepted).
 * m: number of digits to consume; m <= 0 yields 0. More than 8 digits do not
 *    fit in 32 bits, so the most significant ones are shifted out.
 *
 * A non-hex character (including the NUL terminator of a too-short string)
 * trips assert(0). When assertions are compiled out, parsing stops at that
 * character and the value accumulated so far is returned, so the function
 * never reads past the end of the string.
 */
uint32_t get_hex4(const char* s,int m) {
    uint32_t h = 0;
    for (int i = 0; i < m; ++i, ++s) {
        uint32_t digit;
        if (IS_ONE2NINE(*s)) {
            digit = (uint32_t)(*s - '0');
        } else if (IS_A2F(*s)) {
            digit = 10u + (uint32_t)(*s - 'A');
        } else if (IS_a2f(*s)) {
            digit = 10u + (uint32_t)(*s - 'a');
        } else {
            assert(0);
            break;
        }
        /* Make room for the next nibble, then append it. */
        h = (h << 4) | digit;
    }
    return h;
}

/*
 * Convert a "\uXXXX"-style escape into its UTF-8 encoding, packed into a
 * uint32_t with the leading byte in the most significant used position
 * (e.g. "\u6211" -> 0xE68891 for the bytes E6 88 91).
 *
 * s: must start with a backslash followed by 'u' (checked with assert),
 *    followed by m hexadecimal digits.
 * m: number of hex digits after "\u" (4 for BMP code points, up to 6 here).
 *
 * Returns the packed UTF-8 bytes. Code points above 0x10FFFF trip assert(0);
 * when assertions are compiled out, 0 is returned for them.
 */
uint32_t parse_utf8(const char* s,int m) {
    assert(*s == '\\' && *(s + 1) == 'u');
    s = s + 2;
    uint32_t u = get_hex4(s,m);

    if (IS_ONE_CPB(u)) {
        /* 0xxxxxxx */
        return u;
    }
    if (IS_TWO_CPB(u)) {
        /* 110xxxxx 10xxxxxx */
        return ((0xC0u | (u >> 6)) << 8)
             | (0x80u | (u & 0x3Fu));
    }
    if (IS_THREE_CPB(u)) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        return ((0xE0u | (u >> 12)) << 16)
             | ((0x80u | ((u >> 6) & 0x3Fu)) << 8)
             | (0x80u | (u & 0x3Fu));
    }
    if (IS_FOUR_CPB(u)) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        return ((0xF0u | (u >> 18)) << 24)
             | ((0x80u | ((u >> 12) & 0x3Fu)) << 16)
             | ((0x80u | ((u >> 6) & 0x3Fu)) << 8)
             | (0x80u | (u & 0x3Fu));
    }

    /* Not a Unicode code point. Return a value so that builds without
       assertions do not fall off the end of a non-void function. */
    assert(0);
    return 0;
}

/*
 * Demo driver: prints the packed UTF-8 encoding of a few code points next to
 * the expected value.
 *
 * The uint32_t results are cast to unsigned long and printed with %lx so the
 * format specifier matches the argument type on every platform; the output is
 * the same as with %x.
 */
int main(void) {

    /* Character: 我  code point: \u6211  UTF-8 bytes: 0xE68891 */
    printf("unicode编码(我)：0x%lx\t期待值:0xE68891\n",(unsigned long)parse_utf8("\\u6211",4));
    /* Character: 爱  code point: \u7231  UTF-8 bytes: 0xE788B1 */
    printf("unicode编码(爱)：0x%lx\t期待值:0xE788B1\n",(unsigned long)parse_utf8("\\u7231",4));
    /* Character: 你  code point: \u4F60  UTF-8 bytes: 0xE4BDA0 */
    printf("unicode编码(你): 0x%lx\t期待值:0xE4BDA0\n",(unsigned long)parse_utf8("\\u4F60",4));

    /* 21-bit code point (four-byte UTF-8 sequence); the correct UTF-8 bytes
       for U+10EBEA are F4 8E AF AA. */
    printf("0x%lx\n",(unsigned long)parse_utf8("\\u10EBEA",6));
    return 0;
}

Unicode码点转UTF-8编码

猜你喜欢