/*
 * UTF-8 是 Unicode 的一种编码实现方案。简单来说,Unicode 为每个字符规定了一个唯一的数值(码点),字符与码点一一对应;而 UTF-8 规定了如何根据码点所在的范围,把码点编码成 1 到 4 个字节(码元),解码时再由这些字节组合还原出码点,从而找到对应的字符。
 * 一般的实现中,较大的码点(最多 21 位)都要通过多个码元组合得到,并逐个码元地输出;这里为了简单,直接使用 uint32_t 把这些字节按高位在前的顺序组合在一起。
 */
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Character-class tests for a single hexadecimal digit.
 * Despite its name, IS_ONE2NINE accepts '0' through '9' inclusive.
 *
 * Every argument is parenthesized so that an expression argument such as
 * (c & 0x7F) or (cond ? a : b) is compared as a whole. Each macro evaluates
 * its argument twice, so do not pass expressions with side effects.
 */
#define IS_ONE2NINE(ch) (((ch) >= '0') && ((ch) <= '9'))
#define IS_A2F(ch) (((ch) >= 'A') && ((ch) <= 'F'))
#define IS_a2f(ch) (((ch) >= 'a') && ((ch) <= 'f'))
/*
 * Code point range tests: IS_<N>_CPB(u) is true when code point u falls in
 * the range that UTF-8 encodes with N bytes.
 *   ONE:   U+0000  .. U+007F
 *   TWO:   U+0080  .. U+07FF
 *   THREE: U+0800  .. U+FFFF   (surrogates U+D800..U+DFFF are NOT excluded)
 *   FOUR:  U+10000 .. U+10FFFF
 */
#define IS_ONE_CPB(u) (((u) >= (uint32_t)0x0000) && ((u) <= (uint32_t)0x007F))
#define IS_TWO_CPB(u) (((u) >= (uint32_t)0x0080) && ((u) <= (uint32_t)0x07FF))
#define IS_THREE_CPB(u) (((u) >= (uint32_t)0x0800) && ((u) <= (uint32_t)0xFFFF))
#define IS_FOUR_CPB(u) (((u) >= (uint32_t)0x10000) && ((u) <= (uint32_t)0x10FFFF))
/*
 * Map one hexadecimal digit character to its numeric value.
 * Returns 0..15 for '0'-'9', 'A'-'F', 'a'-'f', and -1 for anything else
 * (including the string terminator '\0').
 */
static int hex_digit_value(char ch) {
    if (ch >= '0' && ch <= '9') {
        return ch - '0';
    }
    if (ch >= 'A' && ch <= 'F') {
        return 10 + (ch - 'A');
    }
    if (ch >= 'a' && ch <= 'f') {
        return 10 + (ch - 'a');
    }
    return -1;
}

/*
 * Parse the first m characters of s as a hexadecimal number, most
 * significant digit first, and return its value.
 *
 * s: text to read; upper- and lower-case hex digits are accepted.
 * m: number of digits to consume. m <= 0 yields 0. The result is 32 bits
 *    wide, so with m > 8 the earliest digits are shifted out and lost.
 *
 * A non-hex character within the first m positions is a caller error and
 * trips the assertion. When assertions are compiled out (NDEBUG) parsing
 * stops at that character and the value accumulated so far is returned, so
 * the function never reads past the string's terminating '\0'.
 */
uint32_t get_hex4(const char* s,int m) {
    uint32_t h = 0;
    for (int i = 0; i < m; ++i) {
        const int digit = hex_digit_value(s[i]);
        if (digit < 0) {
            assert(0);
            break;
        }
        /* Make room for the next nibble, then append it. */
        h = (h << 4) | (uint32_t)digit;
    }
    return h;
}
/*
 * Convert an escape of the form "\u" followed by m hex digits into the UTF-8
 * encoding of that code point.
 *
 * s: must start with a backslash and 'u' (asserted), followed by m hex digits.
 * m: number of hex digits to read after the "\u" prefix.
 *
 * Returns the 1 to 4 UTF-8 bytes packed into one uint32_t, lead byte in the
 * most significant occupied byte (e.g. U+6211 -> 0xE68891).
 *
 * A code point above U+10FFFF is a caller error and trips the assertion;
 * when assertions are compiled out, the packed encoding of U+FFFD
 * (REPLACEMENT CHARACTER, 0xEFBFBD) is returned instead of falling off the
 * end of the function.
 *
 * NOTE(review): surrogate code points U+D800..U+DFFF are encoded as ordinary
 * 3-byte sequences here, which strict UTF-8 forbids - confirm whether callers
 * need them rejected or combined into surrogate pairs.
 */
uint32_t parse_utf8(const char* s,int m) {
    assert(*s == '\\' && *(s + 1) == 'u');
    s = s + 2;
    uint32_t u = get_hex4(s,m);
    if (IS_ONE_CPB(u)) {
        /* 0xxxxxxx: ASCII is encoded as itself. */
        return u;
    }
    if (IS_TWO_CPB(u)) {
        /* 110xxxxx 10xxxxxx: bits 6-10 go to the lead byte, bits 0-5 to the
         * continuation byte. The shifted value must be masked so the low
         * six bits do not spill into the continuation byte. */
        return UINT32_C(0xC080)
             | ((u << 2) & UINT32_C(0x1F00))
             | (u & UINT32_C(0x3F));
    }
    if (IS_THREE_CPB(u)) {
        /* 1110xxxx 10xxxxxx 10xxxxxx: bits 12-15, 6-11, 0-5. */
        return UINT32_C(0xE08080)
             | ((u << 4) & UINT32_C(0x0F0000))
             | ((u << 2) & UINT32_C(0x003F00))
             | (u & UINT32_C(0x00003F));
    }
    if (IS_FOUR_CPB(u)) {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: bits 18-20, 12-17, 6-11, 0-5.
         * The lead-byte term is masked so that lower code point bits do not
         * corrupt the continuation bytes. */
        return UINT32_C(0xF0808080)
             | ((u << 6) & UINT32_C(0x07000000))
             | ((u << 4) & UINT32_C(0x003F0000))
             | ((u << 2) & UINT32_C(0x00003F00))
             | (u & UINT32_C(0x0000003F));
    }
    /* Not a Unicode code point (greater than U+10FFFF). */
    assert(0);
    return UINT32_C(0xEFBFBD);
}
/*
 * Demo driver: prints the packed UTF-8 value produced by parse_utf8 for a few
 * escapes, next to the expected value embedded in the output text
 * ("期待值" means "expected value").
 *
 * PRIx32 is used because parse_utf8 returns uint32_t, which is not guaranteed
 * to be the unsigned int that a plain %x requires.
 */
int main(void) {
    /* Character: 我   code point: U+6211   expected packed UTF-8: 0xE68891 */
    printf("unicode编码(我):0x%" PRIx32 "\t期待值:0xE68891\n",parse_utf8("\\u6211",4));
    /* Character: 爱   code point: U+7231   expected packed UTF-8: 0xE788B1 */
    printf("unicode编码(爱):0x%" PRIx32 "\t期待值:0xE788B1\n",parse_utf8("\\u7231",4));
    /* Character: 你   code point: U+4F60   expected packed UTF-8: 0xE4BDA0 */
    printf("unicode编码(你): 0x%" PRIx32 "\t期待值:0xE4BDA0\n",parse_utf8("\\u4F60",4));
    /* Code point that needs the full 21 bits (4-byte form); the original
     * author notes the result may differ from other implementations. */
    printf("0x%" PRIx32 "\n",parse_utf8("\\u10EBEA",6));
    return 0;
}