linux应用程序_1_文本浏览器_2_encoding_2_各种编码格式
记事本以ascii编码格式保存文件
英文:ascii编码,一个字符占一字节,编码小于128
中文:gbk编码,一个字占两字节,低字节编码大于等于128
utf-8是unicode码的一种变长字节编码方式(unicode是字符集,utf-8、utf-16是它的具体编码方式)
第一个字节,从最高位开始,连续的N个1,决定这个字由几个字节表示,组合时舍去这N个1
N=0——————1个字节
N=1——————不是合法的首字节(10xxxxxx只用作后续字节)
N=2——————2个字节
N=k(2<k<7)——k个字节
其余字节,最高位固定为1,次高位固定为0,组合时只取低六位
utf-16是一个字占两字节的编码格式,分为be(大端)和le(小端)两种
对于utf-16be,高字节在前,低字节在后
对于utf-16le,低字节在前,高字节在后
以ascii.c为例分析:
编码结构体:
/*
 * Operations table for the "ascii" encoding. It bundles the encoding name
 * with the detector and decoder callbacks defined in this file.
 */
static T_EncodingOpr g_tAsciiEncodingOpr = {
    .pcName   = "ascii",
    /* Header length 0 - presumably the number of leading bytes to skip
     * before the text starts; confirm against T_EncodingOpr's users. */
    .iHeadLen = 0,
    .IsSuppot = AsciiIsSuppot,   /* detection callback */
    .GetCode  = AsciiGetCode,    /* per-character decoding callback */
};
编码格式解析:
如果不是utf-8、utf16le、utf16be,这里认为就是ascii
/*
 * Decide whether a buffer should be handled by the "ascii" encoding.
 *
 * Detection works by exclusion: if the buffer starts with the UTF-8,
 * UTF-16LE or UTF-16BE byte-order mark it is rejected, anything else is
 * accepted.
 *
 * pcBufHead: start of the data to inspect; at most the first 3 bytes are
 *            compared.
 * Returns 1 when no byte-order mark is found, 0 otherwise.
 */
static int AsciiIsSuppot(char *pcBufHead)
{
    /* Byte-order marks; only the first 3 / 2 / 2 bytes take part in the
     * comparison, the literal's trailing NUL is never reached. */
    static const char acBomUtf8[]    = "\xEF\xBB\xBF";
    static const char acBomUtf16le[] = "\xFF\xFE";
    static const char acBomUtf16be[] = "\xFE\xFF";

    int iHasBom = (strncmp(acBomUtf8, pcBufHead, 3) == 0)
               || (strncmp(acBomUtf16le, pcBufHead, 2) == 0)
               || (strncmp(acBomUtf16be, pcBufHead, 2) == 0);

    return iHasBom ? 0 : 1;
}
解析、获取编码:
以指针取回编码字符,函数返回字符占据字节数,以供后面处理字符使用
/*
 * Decode one character from an ASCII / two-byte (GBK) byte stream.
 *
 * pucBufEndStart: first byte of the character to decode.
 * pucBufEndEnd:   one past the last valid byte of the buffer.
 * pdwCode:        receives the decoded value; left untouched when 0 is
 *                 returned.
 *
 * Returns the number of bytes consumed:
 *   1 - a byte below 0x80 (ASCII), or a byte >= 0x80 that is the very last
 *       byte of the buffer (truncated two-byte character, returned as-is);
 *   2 - a byte >= 0x80 followed by a second byte; the code is
 *       (second byte << 8) + first byte;
 *   0 - no bytes left.
 */
static int AsciiGetCode(unsigned char *pucBufEndStart,unsigned char *pucBufEndEnd, unsigned int *pdwCode)
{
    unsigned char ucLead;

    /* Check the bounds before touching the buffer: the start pointer may
     * legitimately be the one-past-the-end pointer. */
    if (pucBufEndStart >= pucBufEndEnd)
    {
        return 0;
    }

    ucLead = pucBufEndStart[0];

    if ((ucLead >= 0x80) && ((pucBufEndStart + 1) < pucBufEndEnd))
    {
        *pdwCode = ((unsigned int)pucBufEndStart[1] << 8) + ucLead;
        return 2;
    }

    /* Plain ASCII, or a lead byte cut off by the end of the buffer. Only
     * one byte exists here, so only one byte may be reported as consumed. */
    *pdwCode = ucLead;
    return 1;
}
ascii编码初始化:
添加支持的字库,注册编码结构体
/*
 * Set up the "ascii" encoding: attach the "freetype", "gbk" and "ascii"
 * font operations to it (in that order), then register it.
 *
 * Returns whatever RegisterEncoding() returns; the results of the
 * AddFontOprForEncoding() calls are not checked.
 */
int AsciiEncodingInit(void)
{
    static char *const apcFontNames[] = { "freetype", "gbk", "ascii" };
    unsigned int dwIdx;

    for (dwIdx = 0; dwIdx < sizeof(apcFontNames) / sizeof(apcFontNames[0]); dwIdx++)
    {
        AddFontOprForEncoding(&g_tAsciiEncodingOpr, GetFontOpr(apcFontNames[dwIdx]));
    }

    return RegisterEncoding(&g_tAsciiEncodingOpr);
}
(四种编码的)完整代码:
ascii.c
#include <config.h>
#include <encoding_manager.h>
#include <fonts_manager.h>
#include <string.h>
/* Forward declarations of the callbacks referenced by the table below. */
static int AsciiIsSuppot(char *pcBufHead);
static int AsciiGetCode(unsigned char *pucBufEndStart, unsigned char *pucBufEndEnd, unsigned int *pdwCode);

/*
 * Operations table for the "ascii" encoding. It bundles the encoding name
 * with the detector and decoder callbacks defined in this file.
 */
static T_EncodingOpr g_tAsciiEncodingOpr = {
    .pcName   = "ascii",
    /* Header length 0 - presumably the number of leading bytes to skip
     * before the text starts; confirm against T_EncodingOpr's users. */
    .iHeadLen = 0,
    .IsSuppot = AsciiIsSuppot,   /* detection callback */
    .GetCode  = AsciiGetCode,    /* per-character decoding callback */
};
/*
 * Decide whether a buffer should be handled by the "ascii" encoding.
 *
 * Detection works by exclusion: if the buffer starts with the UTF-8,
 * UTF-16LE or UTF-16BE byte-order mark it is rejected, anything else is
 * accepted.
 *
 * pcBufHead: start of the data to inspect; at most the first 3 bytes are
 *            compared.
 * Returns 1 when no byte-order mark is found, 0 otherwise.
 */
static int AsciiIsSuppot(char *pcBufHead)
{
    /* Byte-order marks; only the first 3 / 2 / 2 bytes take part in the
     * comparison, the literal's trailing NUL is never reached. */
    static const char acBomUtf8[]    = "\xEF\xBB\xBF";
    static const char acBomUtf16le[] = "\xFF\xFE";
    static const char acBomUtf16be[] = "\xFE\xFF";

    int iHasBom = (strncmp(acBomUtf8, pcBufHead, 3) == 0)
               || (strncmp(acBomUtf16le, pcBufHead, 2) == 0)
               || (strncmp(acBomUtf16be, pcBufHead, 2) == 0);

    return iHasBom ? 0 : 1;
}
/*
 * Decode one character from an ASCII / two-byte (GBK) byte stream.
 *
 * pucBufEndStart: first byte of the character to decode.
 * pucBufEndEnd:   one past the last valid byte of the buffer.
 * pdwCode:        receives the decoded value; left untouched when 0 is
 *                 returned.
 *
 * Returns the number of bytes consumed:
 *   1 - a byte below 0x80 (ASCII), or a byte >= 0x80 that is the very last
 *       byte of the buffer (truncated two-byte character, returned as-is);
 *   2 - a byte >= 0x80 followed by a second byte; the code is
 *       (second byte << 8) + first byte;
 *   0 - no bytes left.
 */
static int AsciiGetCode(unsigned char *pucBufEndStart,unsigned char *pucBufEndEnd, unsigned int *pdwCode)
{
    unsigned char ucLead;

    /* Check the bounds before touching the buffer: the start pointer may
     * legitimately be the one-past-the-end pointer. */
    if (pucBufEndStart >= pucBufEndEnd)
    {
        return 0;
    }

    ucLead = pucBufEndStart[0];

    if ((ucLead >= 0x80) && ((pucBufEndStart + 1) < pucBufEndEnd))
    {
        *pdwCode = ((unsigned int)pucBufEndStart[1] << 8) + ucLead;
        return 2;
    }

    /* Plain ASCII, or a lead byte cut off by the end of the buffer. Only
     * one byte exists here, so only one byte may be reported as consumed. */
    *pdwCode = ucLead;
    return 1;
}
/*
 * Set up the "ascii" encoding: attach the "freetype", "gbk" and "ascii"
 * font operations to it (in that order), then register it.
 *
 * Returns whatever RegisterEncoding() returns; the results of the
 * AddFontOprForEncoding() calls are not checked.
 */
int AsciiEncodingInit(void)
{
    static char *const apcFontNames[] = { "freetype", "gbk", "ascii" };
    unsigned int dwIdx;

    for (dwIdx = 0; dwIdx < sizeof(apcFontNames) / sizeof(apcFontNames[0]); dwIdx++)
    {
        AddFontOprForEncoding(&g_tAsciiEncodingOpr, GetFontOpr(apcFontNames[dwIdx]));
    }

    return RegisterEncoding(&g_tAsciiEncodingOpr);
}
utf8.c
#include <config.h>
#include <encoding_manager.h>
#include <fonts_manager.h>
#include <string.h>
#include <stdio.h>
/* Forward declarations of the callbacks referenced by the table below. */
static int Utf8IsSuppot(char *pcBufHead);
static int Utf8GetCode(unsigned char *pucBufStart, unsigned char *pucBufEnd, unsigned int *pdwCode);

/*
 * Operations table for the "utf8" encoding. It bundles the encoding name
 * with the detector and decoder callbacks defined in this file.
 */
static T_EncodingOpr g_tUtf8EncodingOpr = {
    .pcName   = "utf8",
    /* 3 matches the length of the byte-order mark that Utf8IsSuppot()
     * looks for - presumably the bytes skipped before decoding; confirm
     * against T_EncodingOpr's users. */
    .iHeadLen = 3,
    .IsSuppot = Utf8IsSuppot,    /* detection callback */
    .GetCode  = Utf8GetCode,     /* per-character decoding callback */
};
/*
 * Report whether a buffer starts with the UTF-8 byte-order mark
 * (0xEF 0xBB 0xBF).
 *
 * pcBufHead: start of the data to inspect; at most 3 bytes are compared.
 * Returns 1 when the mark is present, 0 otherwise.
 */
static int Utf8IsSuppot(char *pcBufHead)
{
    /* Only the first 3 bytes are compared; the literal's NUL is unused. */
    static const char acBomUtf8[] = "\xEF\xBB\xBF";

    return (strncmp(acBomUtf8, pcBufHead, 3) == 0) ? 1 : 0;
}
/*
 * Count the consecutive 1 bits of a byte, starting at the most
 * significant bit and stopping at the first 0 bit.
 *
 * ucVal: byte to examine.
 * Returns a value in the range 0..8 (8 for 0xFF).
 */
static int GetHeadBits(unsigned char ucVal)
{
    unsigned int dwMask = 0x80;
    int iOnes = 0;

    /* Walk the mask from bit 7 down to bit 0; it becomes 0 after bit 0. */
    while ((dwMask != 0) && ((ucVal & dwMask) != 0))
    {
        iOnes++;
        dwMask >>= 1;
    }

    return iOnes;
}
/*
 * Decode one UTF-8 sequence.
 *
 * The number of leading 1 bits in the first byte (from GetHeadBits) is
 * used as the sequence length; a first byte with no leading 1 bit is a
 * single-byte character. The payload of the first byte is what remains
 * after dropping those leading bits; every following byte contributes
 * its low 6 bits.
 *
 * pucBufStart: first byte of the sequence.
 * pucBufEnd:   one past the last valid byte of the buffer.
 * pdwCode:     receives the decoded value; untouched when 0 is returned.
 *
 * Returns the number of bytes consumed, or 0 when the buffer is exhausted
 * or the sequence would run past pucBufEnd.
 *
 * NOTE(review): lead bytes with exactly one leading 1 bit (continuation
 * bytes) or more than six are not rejected, and following bytes are not
 * checked for the 10xxxxxx pattern - confirm whether callers rely on this.
 */
static int Utf8GetCode(unsigned char *pucBufStart, unsigned char *pucBufEnd, unsigned int *pdwCode)
{
    int iLeadOnes;
    int iIdx;
    unsigned int dwAcc;

    if(pucBufStart >= pucBufEnd)
    {
        DBG_PRINT("End\r\n");
        return 0;
    }

    iLeadOnes = GetHeadBits(pucBufStart[0]);
    if(pucBufStart + iLeadOnes > pucBufEnd)
    {
        DBG_PRINT("Error at Utf8GetCode : pucBufStart + iBits > pucBufEnd\r\n");
        return 0;
    }

    /* Single-byte character: the byte is the code. */
    if(iLeadOnes == 0)
    {
        *pdwCode = pucBufStart[0];
        return 1;
    }

    /* Strip the length marker from the first byte ... */
    dwAcc = pucBufStart[0] & (0xffu >> iLeadOnes);

    /* ... then append 6 payload bits from each following byte. */
    for(iIdx = 1; iIdx < iLeadOnes; iIdx++)
    {
        dwAcc = (dwAcc << 6) | (unsigned int)(pucBufStart[iIdx] & 0x3f);
    }

    *pdwCode = dwAcc;
    return iLeadOnes;
}
/*
 * Set up the "utf8" encoding: attach the "freetype", "gbk" and "ascii"
 * font operations to it (in that order), then register it.
 *
 * Returns whatever RegisterEncoding() returns; the results of the
 * AddFontOprForEncoding() calls are not checked.
 */
int Utf8EncodingInit(void)
{
    static char *const apcFontNames[] = { "freetype", "gbk", "ascii" };
    unsigned int dwIdx;

    for (dwIdx = 0; dwIdx < sizeof(apcFontNames) / sizeof(apcFontNames[0]); dwIdx++)
    {
        AddFontOprForEncoding(&g_tUtf8EncodingOpr, GetFontOpr(apcFontNames[dwIdx]));
    }

    return RegisterEncoding(&g_tUtf8EncodingOpr);
}
utf16le.c
#include <config.h>
#include <encoding_manager.h>
#include <fonts_manager.h>
#include <string.h>
/* Forward declarations of the callbacks referenced by the table below. */
static int Utf16leIsSuppot(char *pcBufHead);
static int Utf16leGetCode(unsigned char *pucBufStart, unsigned char *pucBufEnd, unsigned int *pdwCode);

/*
 * Operations table for the "utf16le" encoding. It bundles the encoding
 * name with the detector and decoder callbacks defined in this file.
 */
static T_EncodingOpr g_tUtf16leEncodingOpr = {
    .pcName   = "utf16le",
    /* Utf16leIsSuppot() only accepts data that begins with the 2-byte
     * mark 0xFF 0xFE, so the header is 2 bytes long - the same way the
     * utf8 table uses 3 for its 3-byte mark. With 0 the mark itself would
     * be decoded as the first character.
     * NOTE(review): assumes the framework skips iHeadLen bytes before
     * decoding - confirm against the caller. */
    .iHeadLen = 2,
    .IsSuppot = Utf16leIsSuppot, /* detection callback */
    .GetCode  = Utf16leGetCode,  /* per-character decoding callback */
};
/*
 * Report whether a buffer starts with the UTF-16LE byte-order mark
 * (0xFF 0xFE).
 *
 * pcBufHead: start of the data to inspect; at most 2 bytes are compared.
 * Returns 1 when the mark is present, 0 otherwise.
 */
static int Utf16leIsSuppot(char *pcBufHead)
{
    /* Only the first 2 bytes are compared; the literal's NUL is unused. */
    static const char acBomUtf16le[] = "\xFF\xFE";

    return (strncmp(acBomUtf16le, pcBufHead, 2) == 0) ? 1 : 0;
}
/*
 * Decode one 16-bit little-endian code unit.
 *
 * pucBufStart: first byte of the unit (low byte).
 * pucBufEnd:   one past the last valid byte of the buffer.
 * pdwCode:     receives (second byte << 8) + first byte; untouched when 0
 *              is returned.
 *
 * Returns 2 when two bytes were available, 0 otherwise.
 */
static int Utf16leGetCode(unsigned char *pucBufStart, unsigned char *pucBufEnd, unsigned int *pdwCode)
{
    unsigned int dwLow;
    unsigned int dwHigh;

    /* Fewer than two bytes left: nothing to decode. */
    if (pucBufStart + 1 >= pucBufEnd)
    {
        return 0;
    }

    dwLow  = pucBufStart[0];
    dwHigh = pucBufStart[1];
    *pdwCode = (dwHigh << 8) | dwLow;
    return 2;
}
/*
 * Set up the "utf16le" encoding: attach the "gbk" and "ascii" font
 * operations to it (in that order), then register it.
 *
 * Returns whatever RegisterEncoding() returns; the results of the
 * AddFontOprForEncoding() calls are not checked.
 */
int Utf16leEncodingInit(void)
{
    static char *const apcFontNames[] = { "gbk", "ascii" };
    unsigned int dwIdx;

    for (dwIdx = 0; dwIdx < sizeof(apcFontNames) / sizeof(apcFontNames[0]); dwIdx++)
    {
        AddFontOprForEncoding(&g_tUtf16leEncodingOpr, GetFontOpr(apcFontNames[dwIdx]));
    }

    return RegisterEncoding(&g_tUtf16leEncodingOpr);
}
utf16be.c
#include <config.h>
#include <encoding_manager.h>
#include <fonts_manager.h>
#include <string.h>
/* Forward declarations of the callbacks referenced by the table below. */
static int Utf16beIsSuppot(char *pcBufHead);
static int Utf16beGetCode(unsigned char *pucBufStart, unsigned char *pucBufEnd, unsigned int *pdwCode);

/*
 * Operations table for the "utf16be" encoding. It bundles the encoding
 * name with the detector and decoder callbacks defined in this file.
 */
static T_EncodingOpr g_tUtf16beEncodingOpr = {
    .pcName   = "utf16be",
    /* Utf16beIsSuppot() only accepts data that begins with the 2-byte
     * mark 0xFE 0xFF, so the header is 2 bytes long - the same way the
     * utf8 table uses 3 for its 3-byte mark. With 0 the mark itself would
     * be decoded as the first character.
     * NOTE(review): assumes the framework skips iHeadLen bytes before
     * decoding - confirm against the caller. */
    .iHeadLen = 2,
    .IsSuppot = Utf16beIsSuppot, /* detection callback */
    .GetCode  = Utf16beGetCode,  /* per-character decoding callback */
};
/*
 * Report whether a buffer starts with the UTF-16BE byte-order mark
 * (0xFE 0xFF).
 *
 * pcBufHead: start of the data to inspect; at most 2 bytes are compared.
 * Returns 1 when the mark is present, 0 otherwise.
 */
static int Utf16beIsSuppot(char *pcBufHead)
{
    /* Only the first 2 bytes are compared; the literal's NUL is unused. */
    static const char acBomUtf16be[] = "\xFE\xFF";

    return (strncmp(acBomUtf16be, pcBufHead, 2) == 0) ? 1 : 0;
}
/*
 * Decode one 16-bit big-endian code unit.
 *
 * pucBufStart: first byte of the unit (high byte).
 * pucBufEnd:   one past the last valid byte of the buffer.
 * pdwCode:     receives (first byte << 8) + second byte; untouched when 0
 *              is returned.
 *
 * Returns 2 when two bytes were available, 0 otherwise.
 */
static int Utf16beGetCode(unsigned char *pucBufStart, unsigned char *pucBufEnd, unsigned int *pdwCode)
{
    /* Two bytes are needed, so pucBufStart[1] must lie before pucBufEnd.
     * A ">" comparison here would let a single trailing byte through and
     * read one byte past the buffer. */
    if (pucBufStart + 1 >= pucBufEnd)
    {
        return 0;
    }

    *pdwCode = ((unsigned int)pucBufStart[0] << 8) + pucBufStart[1];
    return 2;
}
/*
 * Set up the "utf16be" encoding: attach the "gbk" and "ascii" font
 * operations to it (in that order), then register it.
 *
 * Returns whatever RegisterEncoding() returns; the results of the
 * AddFontOprForEncoding() calls are not checked.
 */
int Utf16beEncodingInit(void)
{
    static char *const apcFontNames[] = { "gbk", "ascii" };
    unsigned int dwIdx;

    for (dwIdx = 0; dwIdx < sizeof(apcFontNames) / sizeof(apcFontNames[0]); dwIdx++)
    {
        AddFontOprForEncoding(&g_tUtf16beEncodingOpr, GetFontOpr(apcFontNames[dwIdx]));
    }

    return RegisterEncoding(&g_tUtf16beEncodingOpr);
}