unicode_to_utf8转换函数源码

Unicode字符串转UTF-8字符串,适合于所有操作系统,包含Linux、macOS、Windows等。不需要依赖iconv库。在Linux上实现Unicode到UTF-8的转码函数,可以替代Windows里的WideCharToMultiByte函数。适合所有Unicode字符串,包含UTF-16、UTF-32。

函数原型:

int  unicode_to_utf8(const wchar_t * u16str, size_t u16str_len, char * out_buf, size_t out_buf_size)

 参数说明:

const wchar_t * u16str:要转换的unicode字符串;

size_t u16str_len:要转换的unicode字符串长度,必须指定;

char * out_buf:用于接收UTF-8字符串输出的缓冲区指针,如果为NULL,则函数返回需要的字节数。

size_t out_buf_size: 用于接收UTF-8字符串输出的缓冲区的尺寸,如果out_buf为NULL,则忽略此参数,否则,函数最多转换输出out_buf_size个字节。

备注:如果out_buf_size大于实际输出的字节数,函数会在输出内容之后添加一个'\0'字符(返回值不包含该字符)。

/**
 * Convert a wide-character string (UTF-16 or UTF-32 code units) to UTF-8.
 *
 * @param u16str        Input code units; NULL is treated as an empty string.
 * @param u16str_len    Number of wchar_t units in u16str (not bytes).
 * @param out_buf       Destination buffer, or NULL to only measure the
 *                      number of bytes the conversion needs.
 * @param out_buf_size  Capacity of out_buf in bytes; ignored when out_buf
 *                      is NULL.
 * @return Number of UTF-8 bytes written (or required, when out_buf is NULL),
 *         not counting the terminator.
 *
 * Only whole characters are written: conversion stops at the first character
 * whose encoding does not fit in the remaining space. A '\0' is appended only
 * when at least one byte of out_buf is left over after the converted data.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * one supplementary code point. Unpaired surrogates are encoded as ordinary
 * 3-byte sequences (which is not strictly valid UTF-8). Units that are not
 * Unicode code points (negative or above U+10FFFF) are skipped.
 */
int  unicode_to_utf8(const wchar_t * u16str, size_t u16str_len, char * out_buf, size_t out_buf_size)
{
	size_t i = 0;
	size_t total = 0;	/* bytes written so far (or required, when measuring) */

	if (u16str == NULL) {
		u16str_len = 0;
	}

	while (i < u16str_len) {
		/* A negative wchar_t wraps to a huge value here and is rejected below. */
		unsigned long cp = (unsigned long)u16str[i];
		unsigned char enc[4];
		size_t        enc_len;
		size_t        used = 1;	/* input units consumed by this character */
		size_t        k;

		/* Merge a UTF-16 surrogate pair into a single code point. */
		if (cp >= 0xd800UL && cp <= 0xdbffUL && i + 1 < u16str_len) {
			unsigned long lo = (unsigned long)u16str[i + 1];

			if (lo >= 0xdc00UL && lo <= 0xdfffUL) {
				cp = 0x10000UL + ((cp - 0xd800UL) << 10) + (lo - 0xdc00UL);
				used = 2;
			}
		}

		if (cp <= 0x7fUL) {
			enc[0] = (unsigned char)cp;
			enc_len = 1;
		}
		else if (cp <= 0x7ffUL) {
			/* 110xxxxx 10xxxxxx */
			enc[0] = (unsigned char)(0xc0UL | (cp >> 6));
			enc[1] = (unsigned char)(0x80UL | (cp & 0x3fUL));
			enc_len = 2;
		}
		else if (cp <= 0xffffUL) {
			/* 1110xxxx 10xxxxxx 10xxxxxx */
			enc[0] = (unsigned char)(0xe0UL | (cp >> 12));
			enc[1] = (unsigned char)(0x80UL | ((cp >> 6) & 0x3fUL));
			enc[2] = (unsigned char)(0x80UL | (cp & 0x3fUL));
			enc_len = 3;
		}
		else if (cp <= 0x10ffffUL) {
			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
			enc[0] = (unsigned char)(0xf0UL | (cp >> 18));
			enc[1] = (unsigned char)(0x80UL | ((cp >> 12) & 0x3fUL));
			enc[2] = (unsigned char)(0x80UL | ((cp >> 6) & 0x3fUL));
			enc[3] = (unsigned char)(0x80UL | (cp & 0x3fUL));
			enc_len = 4;
		}
		else {
			/* Not a Unicode code point: contributes no output. */
			enc_len = 0;
		}

		if (out_buf != NULL) {
			/* total never exceeds out_buf_size, so the subtraction cannot wrap. */
			if (enc_len > out_buf_size - total) {
				break;
			}
			for (k = 0; k < enc_len; k++) {
				out_buf[total + k] = (char)enc[k];
			}
		}

		total += enc_len;
		i += used;
	}

	if (out_buf != NULL && total < out_buf_size) {
		out_buf[total] = '\0';
	}

	return (int)total;
}

附: 演示代码

#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>
#ifdef WIN32
#include <windows.h>
#include <tchar.h>
#endif

/**
 * Convert a wide-character string (UTF-16 or UTF-32 code units) to UTF-8.
 *
 * @param u16str        Input code units; NULL is treated as an empty string.
 * @param u16str_len    Number of wchar_t units in u16str (not bytes).
 * @param out_buf       Destination buffer, or NULL to only measure the
 *                      number of bytes the conversion needs.
 * @param out_buf_size  Capacity of out_buf in bytes; ignored when out_buf
 *                      is NULL.
 * @return Number of UTF-8 bytes written (or required, when out_buf is NULL),
 *         not counting the terminator.
 *
 * Only whole characters are written: conversion stops at the first character
 * whose encoding does not fit in the remaining space. A '\0' is appended only
 * when at least one byte of out_buf is left over after the converted data.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * one supplementary code point. Unpaired surrogates are encoded as ordinary
 * 3-byte sequences (which is not strictly valid UTF-8). Units that are not
 * Unicode code points (negative or above U+10FFFF) are skipped.
 */
int  unicode_to_utf8(const wchar_t * u16str, size_t u16str_len, char * out_buf, size_t out_buf_size)
{
	size_t i = 0;
	size_t total = 0;	/* bytes written so far (or required, when measuring) */

	if (u16str == NULL) {
		u16str_len = 0;
	}

	while (i < u16str_len) {
		/* A negative wchar_t wraps to a huge value here and is rejected below. */
		unsigned long cp = (unsigned long)u16str[i];
		unsigned char enc[4];
		size_t        enc_len;
		size_t        used = 1;	/* input units consumed by this character */
		size_t        k;

		/* Merge a UTF-16 surrogate pair into a single code point. */
		if (cp >= 0xd800UL && cp <= 0xdbffUL && i + 1 < u16str_len) {
			unsigned long lo = (unsigned long)u16str[i + 1];

			if (lo >= 0xdc00UL && lo <= 0xdfffUL) {
				cp = 0x10000UL + ((cp - 0xd800UL) << 10) + (lo - 0xdc00UL);
				used = 2;
			}
		}

		if (cp <= 0x7fUL) {
			enc[0] = (unsigned char)cp;
			enc_len = 1;
		}
		else if (cp <= 0x7ffUL) {
			/* 110xxxxx 10xxxxxx */
			enc[0] = (unsigned char)(0xc0UL | (cp >> 6));
			enc[1] = (unsigned char)(0x80UL | (cp & 0x3fUL));
			enc_len = 2;
		}
		else if (cp <= 0xffffUL) {
			/* 1110xxxx 10xxxxxx 10xxxxxx */
			enc[0] = (unsigned char)(0xe0UL | (cp >> 12));
			enc[1] = (unsigned char)(0x80UL | ((cp >> 6) & 0x3fUL));
			enc[2] = (unsigned char)(0x80UL | (cp & 0x3fUL));
			enc_len = 3;
		}
		else if (cp <= 0x10ffffUL) {
			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
			enc[0] = (unsigned char)(0xf0UL | (cp >> 18));
			enc[1] = (unsigned char)(0x80UL | ((cp >> 12) & 0x3fUL));
			enc[2] = (unsigned char)(0x80UL | ((cp >> 6) & 0x3fUL));
			enc[3] = (unsigned char)(0x80UL | (cp & 0x3fUL));
			enc_len = 4;
		}
		else {
			/* Not a Unicode code point: contributes no output. */
			enc_len = 0;
		}

		if (out_buf != NULL) {
			/* total never exceeds out_buf_size, so the subtraction cannot wrap. */
			if (enc_len > out_buf_size - total) {
				break;
			}
			for (k = 0; k < enc_len; k++) {
				out_buf[total + k] = (char)enc[k];
			}
		}

		total += enc_len;
		i += used;
	}

	if (out_buf != NULL && total < out_buf_size) {
		out_buf[total] = '\0';
	}

	return (int)total;
}


/*
 * Demo: convert one sample string with three different output capacities
 * and print the byte count returned for each.
 *
 * With this sample text (five 3-byte characters followed by " ABc"):
 *   64 - everything fits;
 *   12 - exactly four 3-byte characters fit;
 *   17 - output stops after 'A'.
 *
 * On non-Windows builds the converted bytes are printed as well; on Windows
 * they are converted back with MultiByteToWideChar instead.
 */
int main(int argc, char * argv[])
{
	static const size_t capacities[] = { 64, 12, 17 };
	wchar_t src[] = L"我是中国人 ABc";
	/* One spare byte so that terminating at index out_len is always in bounds. */
	char    out_buf[64 + 1];
	size_t  i;
#ifdef WIN32
	wchar_t decode[10];
	int     len;
#endif

	(void)argc;
	(void)argv;

	for (i = 0; i < sizeof capacities / sizeof capacities[0]; i++) {
		size_t out_len = (size_t)unicode_to_utf8(src, wcslen(src), out_buf, capacities[i]);

		printf("output: %d\n", (int)out_len);
#ifdef WIN32
		/* Leave room for the terminator written at decode[len]. */
		len = MultiByteToWideChar(CP_UTF8, 0, out_buf, (int)out_len, decode,
		                          (int)(sizeof decode / sizeof decode[0]) - 1);
		decode[len] = L'\0';
#else
		out_buf[out_len] = '\0';
		printf("%s\n", out_buf);
#endif
	}

	return 0;
}

猜你喜欢

转载自blog.csdn.net/ababab12345/article/details/122215499
今日推荐