Unicode 字符串转 UTF-8 字符串,适用于所有操作系统,包括 Linux、macOS、Windows 等,不需要依赖 iconv 库。在 Linux 上实现 Unicode 到 UTF-8 的转码函数,可以替代 Windows 里的 WideCharToMultiByte 函数。适合所有 Unicode 字符串,包括 UTF-16 和 UTF-32。
函数原型:
int unicode_to_utf8(const wchar_t * u16str, size_t u16str_len, char * out_buf, size_t out_buf_size)
参数说明:
const wchar_t * u16str:要转换的unicode字符串;
size_t u16str_len:要转换的unicode字符串长度,必须指定;
char * out_buf:用于接收UTF-8字符串输出的缓冲区指针,如果为NULL,则函数返回需要的字节数。
size_t out_buf_size: 用于接收UTF-8字符串输出的缓冲区的尺寸,如果out_buf为NULL,则忽略此参数,否则,函数最多转换输出out_buf_size个字节。
备注:如果 out_buf_size 大于实际输出的字节数,函数会在输出内容的末尾添加一个 '\0' 字符;返回值是输出的字节数,不包含这个 '\0'。
/*
 * Read the code point that starts at s[*pos] and advance *pos past it.
 * A high surrogate (D800..DBFF) immediately followed by a low surrogate
 * (DC00..DFFF) is merged into one supplementary-plane code point, which is
 * what makes UTF-16 input (16-bit wchar_t) convert correctly. Every other
 * unit is returned as-is; a negative wchar_t becomes a huge unsigned value
 * and is therefore rejected later by the encoder.
 */
static unsigned long utf8conv_next_code_point(const wchar_t *s, size_t len, size_t *pos)
{
    unsigned long cp = (unsigned long)s[(*pos)++];

    if (cp >= 0xD800UL && cp <= 0xDBFFUL && *pos < len) {
        unsigned long lo = (unsigned long)s[*pos];

        if (lo >= 0xDC00UL && lo <= 0xDFFFUL) {
            cp = 0x10000UL + ((cp - 0xD800UL) << 10) + (lo - 0xDC00UL);
            (*pos)++;
        }
    }
    return cp;
}

/*
 * Encode cp as UTF-8 into seq. Returns the number of bytes produced (1..4),
 * or 0 when cp is above U+10FFFF and cannot be encoded.
 */
static size_t utf8conv_encode(unsigned long cp, unsigned char seq[4])
{
    if (cp <= 0x7FUL) {
        seq[0] = (unsigned char)cp;
        return 1;
    }
    if (cp <= 0x7FFUL) {
        /* Two-byte lead is 110xxxxx (0xC0); 0xE0 is the three-byte lead. */
        seq[0] = (unsigned char)(0xC0UL | (cp >> 6));
        seq[1] = (unsigned char)(0x80UL | (cp & 0x3FUL));
        return 2;
    }
    if (cp <= 0xFFFFUL) {
        seq[0] = (unsigned char)(0xE0UL | (cp >> 12));
        seq[1] = (unsigned char)(0x80UL | ((cp >> 6) & 0x3FUL));
        seq[2] = (unsigned char)(0x80UL | (cp & 0x3FUL));
        return 3;
    }
    if (cp <= 0x10FFFFUL) {
        seq[0] = (unsigned char)(0xF0UL | (cp >> 18));
        seq[1] = (unsigned char)(0x80UL | ((cp >> 12) & 0x3FUL));
        seq[2] = (unsigned char)(0x80UL | ((cp >> 6) & 0x3FUL));
        seq[3] = (unsigned char)(0x80UL | (cp & 0x3FUL));
        return 4;
    }
    return 0;
}

/**
 * Convert a wide-character string (UTF-16 or UTF-32 units) to UTF-8.
 *
 * @param u16str       Input units; NULL is treated as an empty string.
 * @param u16str_len   Number of wchar_t units to convert. The length is always
 *                     taken from this argument; no terminator is searched for.
 * @param out_buf      Destination buffer, or NULL to only measure the output.
 * @param out_buf_size Capacity of out_buf in bytes; ignored when out_buf is NULL.
 * @return When out_buf is NULL, the number of bytes a full conversion needs.
 *         Otherwise the number of bytes actually written. The terminating
 *         '\0' is never included in the count.
 *
 * Only whole characters are written: conversion stops at the first character
 * whose encoding does not fit in the remaining space. A '\0' is appended only
 * if at least one byte of capacity is left after the converted data.
 * Units that do not map to U+0000..U+10FFFF produce no output. An unpaired
 * surrogate is still emitted as a three-byte sequence.
 */
int unicode_to_utf8(const wchar_t * u16str, size_t u16str_len, char * out_buf, size_t out_buf_size)
{
    unsigned char seq[4];
    size_t pos = 0;
    size_t written = 0;
    size_t seq_len;
    size_t i;

    if (u16str == NULL) {
        u16str_len = 0;
    }

    if (out_buf == NULL) {
        /* Measuring pass only: add up the encoded length of every character. */
        size_t needed = 0;

        while (pos < u16str_len) {
            needed += utf8conv_encode(utf8conv_next_code_point(u16str, u16str_len, &pos), seq);
        }
        return (int)needed;
    }

    while (pos < u16str_len) {
        seq_len = utf8conv_encode(utf8conv_next_code_point(u16str, u16str_len, &pos), seq);
        /* written never exceeds out_buf_size, so the subtraction cannot wrap. */
        if (seq_len > out_buf_size - written) {
            break;
        }
        for (i = 0; i < seq_len; i++) {
            out_buf[written++] = (char)seq[i];
        }
    }

    if (written < out_buf_size) {
        out_buf[written] = '\0';
    }
    return (int)written;
}
附: 演示代码
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#ifdef WIN32
#include <windows.h>
#include <tchar.h>
#endif
/*
 * Read the code point that starts at s[*pos] and advance *pos past it.
 * A high surrogate (D800..DBFF) immediately followed by a low surrogate
 * (DC00..DFFF) is merged into one supplementary-plane code point, which is
 * what makes UTF-16 input (16-bit wchar_t) convert correctly. Every other
 * unit is returned as-is; a negative wchar_t becomes a huge unsigned value
 * and is therefore rejected later by the encoder.
 */
static unsigned long utf8conv_next_code_point(const wchar_t *s, size_t len, size_t *pos)
{
    unsigned long cp = (unsigned long)s[(*pos)++];

    if (cp >= 0xD800UL && cp <= 0xDBFFUL && *pos < len) {
        unsigned long lo = (unsigned long)s[*pos];

        if (lo >= 0xDC00UL && lo <= 0xDFFFUL) {
            cp = 0x10000UL + ((cp - 0xD800UL) << 10) + (lo - 0xDC00UL);
            (*pos)++;
        }
    }
    return cp;
}

/*
 * Encode cp as UTF-8 into seq. Returns the number of bytes produced (1..4),
 * or 0 when cp is above U+10FFFF and cannot be encoded.
 */
static size_t utf8conv_encode(unsigned long cp, unsigned char seq[4])
{
    if (cp <= 0x7FUL) {
        seq[0] = (unsigned char)cp;
        return 1;
    }
    if (cp <= 0x7FFUL) {
        /* Two-byte lead is 110xxxxx (0xC0); 0xE0 is the three-byte lead. */
        seq[0] = (unsigned char)(0xC0UL | (cp >> 6));
        seq[1] = (unsigned char)(0x80UL | (cp & 0x3FUL));
        return 2;
    }
    if (cp <= 0xFFFFUL) {
        seq[0] = (unsigned char)(0xE0UL | (cp >> 12));
        seq[1] = (unsigned char)(0x80UL | ((cp >> 6) & 0x3FUL));
        seq[2] = (unsigned char)(0x80UL | (cp & 0x3FUL));
        return 3;
    }
    if (cp <= 0x10FFFFUL) {
        seq[0] = (unsigned char)(0xF0UL | (cp >> 18));
        seq[1] = (unsigned char)(0x80UL | ((cp >> 12) & 0x3FUL));
        seq[2] = (unsigned char)(0x80UL | ((cp >> 6) & 0x3FUL));
        seq[3] = (unsigned char)(0x80UL | (cp & 0x3FUL));
        return 4;
    }
    return 0;
}

/**
 * Convert a wide-character string (UTF-16 or UTF-32 units) to UTF-8.
 *
 * @param u16str       Input units; NULL is treated as an empty string.
 * @param u16str_len   Number of wchar_t units to convert. The length is always
 *                     taken from this argument; no terminator is searched for.
 * @param out_buf      Destination buffer, or NULL to only measure the output.
 * @param out_buf_size Capacity of out_buf in bytes; ignored when out_buf is NULL.
 * @return When out_buf is NULL, the number of bytes a full conversion needs.
 *         Otherwise the number of bytes actually written. The terminating
 *         '\0' is never included in the count.
 *
 * Only whole characters are written: conversion stops at the first character
 * whose encoding does not fit in the remaining space. A '\0' is appended only
 * if at least one byte of capacity is left after the converted data.
 * Units that do not map to U+0000..U+10FFFF produce no output. An unpaired
 * surrogate is still emitted as a three-byte sequence.
 */
int unicode_to_utf8(const wchar_t * u16str, size_t u16str_len, char * out_buf, size_t out_buf_size)
{
    unsigned char seq[4];
    size_t pos = 0;
    size_t written = 0;
    size_t seq_len;
    size_t i;

    if (u16str == NULL) {
        u16str_len = 0;
    }

    if (out_buf == NULL) {
        /* Measuring pass only: add up the encoded length of every character. */
        size_t needed = 0;

        while (pos < u16str_len) {
            needed += utf8conv_encode(utf8conv_next_code_point(u16str, u16str_len, &pos), seq);
        }
        return (int)needed;
    }

    while (pos < u16str_len) {
        seq_len = utf8conv_encode(utf8conv_next_code_point(u16str, u16str_len, &pos), seq);
        /* written never exceeds out_buf_size, so the subtraction cannot wrap. */
        if (seq_len > out_buf_size - written) {
            break;
        }
        for (i = 0; i < seq_len; i++) {
            out_buf[written++] = (char)seq[i];
        }
    }

    if (written < out_buf_size) {
        out_buf[written] = '\0';
    }
    return (int)written;
}
/*
 * Demo driver: converts the same sample string into out_buf three times,
 * each time passing a different capacity to unicode_to_utf8, and prints the
 * returned byte count. Outside WIN32 the converted bytes are also printed;
 * under WIN32 they are decoded back with MultiByteToWideChar instead.
 */
int main(int argc, char * argv[])
{
    /* Capacities handed to unicode_to_utf8, in the order they are tried. */
    static const size_t capacities[] = { 64, 12, 17 };
    wchar_t src[] = L"我是中国人 ABc", decode[10];
    char out_buf[64];
    size_t run;

    for (run = 0; run < sizeof capacities / sizeof capacities[0]; run++) {
        size_t out_len = unicode_to_utf8(src, wcslen(src), out_buf, capacities[run]);

        printf("output: %d\n", (int)out_len);
#ifdef WIN32
        {
            int len = MultiByteToWideChar(CP_UTF8, 0, out_buf, (int)out_len, decode, 10);
            decode[len] = '\0';
        }
#else
        /* The converter may leave the buffer unterminated when it is full. */
        out_buf[out_len] = '\0';
        printf("%s\n", out_buf);
#endif
    }
    return 0;
}