desmume/tools/vio2sf/src/foobar8/pfc/utf8.cpp

#include "pfc.h"

#include <locale.h>

//utf8 stuff

// Fallback definitions for the Windows-style integer type names used in this file.
// Note: #ifndef only sees preprocessor macros, so these typedefs are skipped only
// when the name has been #defined; an earlier typedef of the same name does not
// suppress them (a repeated typedef to the same type is legal in C++).
#ifndef BYTE
typedef unsigned char BYTE;
#endif

#ifndef UINT
typedef unsigned int UINT;
#endif

// Lead-byte classification tables, indexed by (sequence length - 1):
// a lead byte b announces an (i+1)-byte sequence when (b & mask_tab[i]) == val_tab[i].
static const unsigned char mask_tab[6]={0x80,0xE0,0xF0,0xF8,0xFC,0xFE};

static const unsigned char val_tab[6]={0,0xC0,0xE0,0xF0,0xF8,0xFC};

// Smallest code point that really needs (i+1) bytes; a decoded value below this
// limit means the sequence is an overlong encoding and must be rejected.
static const unsigned min_tab[6]={0,0x80,0x800,0x10000,0x200000,0x4000000};

// Decodes one UTF-8 sequence (1..6 bytes, i.e. values up to 0x7FFFFFFF).
//   p_utf8 - input bytes
//   wide   - optional; receives the code point, or 0 when nothing was decoded
//   max    - maximum number of input bytes that may be examined (clamped to 6)
// Returns the number of bytes consumed, or 0 for a NUL byte, max==0, an invalid
// lead byte, a truncated or malformed sequence, or an overlong encoding.
// Reading stops at the first byte that is not a continuation byte (a NUL
// included) and never goes beyond max bytes.
// Surrogate values and values above 0x10FFFF are not filtered out here.
unsigned utf8_decode_char(const char *p_utf8,unsigned * wide,unsigned max)
{
	const unsigned char * utf8 = (const unsigned char*)p_utf8;

	if (wide) *wide = 0;

	if (max==0)
		return 0;
	else if (max>6) max = 6;

	// Plain ASCII: one byte, except NUL which is reported as "nothing decoded".
	if (utf8[0]<0x80)
	{
		if (wide) *wide = utf8[0];
		return utf8[0]>0 ? 1 : 0;
	}

	// Classify the lead byte. Index 0 (ASCII) cannot match here, so a successful
	// match yields a length of at least 2; stray continuation bytes and 0xFE/0xFF
	// match nothing and fall out through the max check.
	unsigned cnt=0;
	while(1)
	{
		if ((*utf8&mask_tab[cnt])==val_tab[cnt]) break;
		if (++cnt>=max) return 0;
	}
	cnt++;

	// Payload bits carried by the lead byte itself.
	unsigned res=(0xFF>>(cnt+1))&*utf8;

	unsigned n;
	for (n=1;n<cnt;n++)
	{
		// Every trailing byte must look like 10xxxxxx; this also catches a NUL
		// terminator in the middle of a sequence before any later byte is read.
		if ((utf8[n]&0xC0) != 0x80)
			return 0;

		res=(res<<6)|(utf8[n]&0x3F);
	}

	// Reject overlong forms (e.g. C0 AF or E0 80 AF for '/').
	if (res<min_tab[cnt-1])
		return 0;

	if (wide)
		*wide=res;

	return cnt;
}


// Encodes one code point as UTF-8 (1..6 bytes, values up to 0x7FFFFFFF).
//   wide   - code point to encode
//   target - output buffer, or 0 to only query the required length;
//            no terminator is written
// Returns the number of bytes the encoding takes, or 0 when wide is above 0x7FFFFFFF.
unsigned utf8_encode_char(unsigned wide,char * target)
{
	// Marker bits of the first byte, indexed by (encoded length - 1).
	static const unsigned char lead_marker[6] = {0x00,0xC0,0xE0,0xF0,0xF8,0xFC};

	if (wide > 0x7FFFFFFF)
		return 0;

	unsigned length;
	if (wide < 0x80)            length = 1;
	else if (wide < 0x800)      length = 2;
	else if (wide < 0x10000)    length = 3;
	else if (wide < 0x200000)   length = 4;
	else if (wide < 0x4000000)  length = 5;
	else                        length = 6;

	if (target == 0)
		return length;

	// Fill the continuation bytes back to front, six payload bits at a time.
	unsigned idx;
	for (idx = length - 1; idx > 0; idx--)
	{
		target[idx] = (char)(0x80 | (wide & 0x3F));
		wide >>= 6;
	}

	// Whatever is left fits next to the length marker in the first byte.
	target[0] = (char)(lead_marker[length - 1] | wide);

	return length;
}

// Encodes one code point as UTF-16.
//   cur_wchar - code point to encode
//   out       - receives one code unit, or two (a surrogate pair) for code
//               points at or above 0x10000; no terminator is written
// Returns the number of code units written, or 0 when cur_wchar is 0 or above
// 0x10FFFF (the highest value a surrogate pair can express).
// Values in the surrogate range 0xD800..0xDFFF are passed through unchanged as
// a single code unit.
unsigned utf16_encode_char(unsigned cur_wchar,WCHAR * out)
{
	if (cur_wchar==0 || cur_wchar>0x10FFFF)
		return 0;

	if (cur_wchar<0x10000)
	{
		*out = (WCHAR)cur_wchar;
		return 1;
	}

	// Supplementary planes: subtract 0x10000 to get a 20-bit value, then split it
	// into a high surrogate (0xD800..0xDBFF, top 10 bits) and a low surrogate
	// (0xDC00..0xDFFF, bottom 10 bits).
	unsigned c = cur_wchar - 0x10000;
	out[0] = (WCHAR)(0xD800 | (0x3FF & (c>>10)) );
	out[1] = (WCHAR)(0xDC00 | (0x3FF & c) ) ;
	return 2;
}

// Decodes one code point from UTF-16 input.
//   src - input code units; src[1] is read only when src[0] is a high surrogate
//   out - always written: the decoded code point, or 0 at the terminator
// Returns the number of code units consumed: 0 at a NUL unit, 2 for a valid
// surrogate pair, 1 otherwise. A high surrogate that is not followed by a low
// surrogate is returned as-is with a length of 1.
unsigned utf16_decode_char(const WCHAR * src,unsigned * out)
{
	const unsigned lead = src[0];

	if (lead == 0)
	{
		*out = 0;
		return 0;
	}

	if ((lead & 0xFC00) == 0xD800)
	{
		const unsigned trail = src[1];
		if ((trail & 0xFC00) == 0xDC00)
		{
			// Recombine the two 10-bit halves and re-add the 0x10000 offset.
			*out = 0x10000 + ( ((lead & 0x3FF) << 10) | (trail & 0x3FF) );
			return 2;
		}
	}

	*out = lead;
	return 1;
}


// Returns the code point of the UTF-8 character at src.
// The result is 0 when utf8_decode_char() decodes nothing (NUL byte or invalid
// input). The decoder is called without an explicit byte limit here, so the
// limit comes from a default argument declared outside this file.
UINT utf8_get_char(const char * src)
{
	UINT rv = 0;
	utf8_decode_char(src,&rv);
	return rv;
}


// Returns the length in bytes of the UTF-8 character at s, as reported by
// utf8_decode_char(); 0 means a NUL byte or an invalid sequence.
unsigned utf8_char_len(const char * s)
{
	// Null output pointer: only the length is wanted, not the code point.
	return utf8_decode_char(s,0);
}

// Returns the byte offset reached after stepping over up to count UTF-8
// characters of ptr. Stepping ends early at the NUL terminator or at the first
// sequence utf8_char_len() cannot measure.
int skip_utf8_chars(const char * ptr,int count)
{
	int offset = 0;
	while(count != 0 && ptr[offset] != 0)
	{
		const int step = utf8_char_len(ptr+offset);
		if (step<=0) break;
		offset += step;
		count--;
	}
	return offset;
}

// Converts at most len bytes of UTF-8 from src into UTF-16 at dst.
// Conversion stops at the NUL terminator, when len is used up, at the first
// sequence that fails to decode, or at the first code point that
// utf16_encode_char() refuses. dst is always NUL-terminated.
// Returns the number of UTF-16 code units written, terminator excluded.
// No output size is passed in, so dst has to be sized by the caller.
unsigned convert_utf8_to_utf16(const char * src,WCHAR * dst,unsigned len)
{
	unsigned written = 0;
	for(;;)
	{
		if (*src == 0 || len == 0) break;

		unsigned code;
		const unsigned consumed = utf8_decode_char(src,&code,len);
		if (consumed==0 || consumed>len) break;
		src += consumed;
		len -= consumed;

		const unsigned emitted = utf16_encode_char(code,dst);
		if (emitted==0) break;
		dst += emitted;
		written += emitted;
	}
	*dst = 0;
	return written;
}

// Converts at most len UTF-16 code units from src into UTF-8 at dst.
// Conversion stops at the NUL terminator, when len is used up, when a
// surrogate pair would extend past len, or when utf8_encode_char() refuses a
// value. dst is always NUL-terminated.
// Returns the number of bytes written, terminator excluded.
// No output size is passed in, so dst has to be sized by the caller.
unsigned convert_utf16_to_utf8(const WCHAR * src,char * dst,unsigned len)
{
	unsigned written = 0;
	for(;;)
	{
		if (*src == 0 || len == 0) break;

		unsigned code;
		const unsigned consumed = utf16_decode_char(src,&code);
		if (consumed==0 || consumed>len) break;
		src += consumed;
		len -= consumed;

		const unsigned emitted = utf8_encode_char(code,dst);
		if (emitted==0) break;
		dst += emitted;
		written += emitted;
	}
	*dst = 0;
	return written;
}

// Converts a string in the system's multibyte ("ANSI") encoding to wide characters.
//   src - input string
//   dst - output buffer; always NUL-terminated on return
//   len - input length limit, first clamped through strlen_max()
// Returns the number of wide characters written (terminator excluded), or 0
// when the underlying conversion reports failure.
unsigned convert_ansi_to_utf16(const char * src,WCHAR * dst,unsigned len)
{
	len = strlen_max(src,len);
	unsigned rv;
#ifdef WIN32
	// The output capacity passed to the API is taken from estimate_ansi_to_utf16();
	// presumably dst was sized with the same estimate by the caller - TODO confirm.
	rv = MultiByteToWideChar(CP_ACP,0,src,len,dst,estimate_ansi_to_utf16(src));
#else
	// Selects the environment's locale for character conversion; this changes
	// process-wide locale state on every call.
	setlocale(LC_CTYPE,"");
	// NOTE(review): the third argument of mbstowcs limits the wide characters
	// written, not the bytes read - src is consumed up to its NUL terminator.
	rv = mbstowcs(dst,src,len);
#endif
	// mbstowcs signals failure with (size_t)-1, which is seen as a negative
	// value after the signed cast; treat it as an empty result.
	if ((signed)rv<0) rv = 0;
	dst[rv]=0;
	return rv;
}

// Converts wide characters to the system's multibyte ("ANSI") encoding.
//   src - input wide string
//   dst - output buffer; always NUL-terminated on return
//   len - input length limit, first clamped through wcslen_max()
// Returns the number of bytes written (terminator excluded), or 0 when the
// underlying conversion reports failure.
unsigned convert_utf16_to_ansi(const WCHAR * src,char * dst,unsigned len)
{
	len = wcslen_max(src,len);
	unsigned rv;
#ifdef WIN32
	// The output capacity passed to the API is taken from estimate_utf16_to_ansi();
	// presumably dst was sized with the same estimate by the caller - TODO confirm.
	rv = WideCharToMultiByte(CP_ACP,0,src,len,dst,estimate_utf16_to_ansi(src),0,0);
#else
	// Selects the environment's locale for character conversion; this changes
	// process-wide locale state on every call.
	setlocale(LC_CTYPE,"");
	// NOTE(review): the third argument of wcstombs limits the bytes written,
	// not the wide characters read - src is consumed up to its NUL terminator.
	rv = wcstombs(dst,src,len);
#endif
	// wcstombs signals failure with (size_t)-1, which is seen as a negative
	// value after the signed cast; treat it as an empty result.
	if ((signed)rv<0) rv = 0;
	dst[rv]=0;
	return rv;
}

// Converts UTF-8 to the system's multibyte ("ANSI") encoding by going through
// a temporary UTF-16 buffer.
//   src - UTF-8 input
//   dst - output buffer, filled by convert_utf16_to_ansi()
//   len - input length limit, first clamped through strlen_max()
// Returns whatever convert_utf16_to_ansi() returns for the intermediate string.
unsigned convert_utf8_to_ansi(const char * src,char * dst,unsigned len)
{//meh
	len = strlen_max(src,len);

	unsigned temp_len = estimate_utf8_to_utf16(src,len);
	mem_block_t<WCHAR> temp_block;
	// Small buffers come from the stack via alloca (released when this function
	// returns); anything above PFC_ALLOCA_LIMIT bytes is taken from temp_block.
	WCHAR * temp = (temp_len * sizeof(WCHAR) <= PFC_ALLOCA_LIMIT) ? (WCHAR*)alloca(temp_len * sizeof(WCHAR)) : temp_block.set_size(temp_len);
	// NOTE(review): assert is normally compiled out in release builds, so a null
	// temp would not be caught there.
	assert(temp);

	// len is reused: from here on it is the number of UTF-16 units produced.
	len = convert_utf8_to_utf16(src,temp,len);
	return convert_utf16_to_ansi(temp,dst,len);
}

// Converts the system's multibyte ("ANSI") encoding to UTF-8 by going through
// a temporary UTF-16 buffer.
//   src - ANSI input
//   dst - output buffer, filled by convert_utf16_to_utf8()
//   len - input length limit, first clamped through strlen_max()
// Returns whatever convert_utf16_to_utf8() returns for the intermediate string.
unsigned convert_ansi_to_utf8(const char * src,char * dst,unsigned len)
{//meh
	len = strlen_max(src,len);

	unsigned temp_len = estimate_ansi_to_utf16(src,len);
	mem_block_t<WCHAR> temp_block;
	// Small buffers come from the stack via alloca (released when this function
	// returns); anything above PFC_ALLOCA_LIMIT bytes is taken from temp_block.
	WCHAR * temp = (temp_len * sizeof(WCHAR) <= PFC_ALLOCA_LIMIT) ? (WCHAR*)alloca(temp_len * sizeof(WCHAR)) : temp_block.set_size(temp_len);
	// NOTE(review): assert is normally compiled out in release builds, so a null
	// temp would not be caught there.
	assert(temp);

	// len is reused: from here on it is the number of UTF-16 units produced.
	len = convert_ansi_to_utf16(src,temp,len);
	return convert_utf16_to_utf8(temp,dst,len);
}

// Appends a string given in the system's multibyte ("ANSI") encoding: the
// input is converted to UTF-8 in a temporary buffer and handed to add_string_n().
//   src - ANSI input
//   len - input length limit, first clamped through strlen_max()
void string_base::add_string_ansi(const char * src,unsigned len)
{
	len = strlen_max(src,len);

	unsigned temp_len = estimate_ansi_to_utf8(src,len);
	mem_block_t<char> temp_block;
	// Small buffers come from the stack via alloca (released when this function
	// returns); anything above PFC_ALLOCA_LIMIT bytes is taken from temp_block.
	char * temp = (temp_len * sizeof(char) <= PFC_ALLOCA_LIMIT) ? (char*)alloca(temp_len * sizeof(char)) : temp_block.set_size(temp_len);
	// NOTE(review): assert is normally compiled out in release builds, so a null
	// temp would not be caught there.
	assert(temp);

	// len is reused: from here on it is the number of UTF-8 bytes produced.
	len = convert_ansi_to_utf8(src,temp,len);
	add_string_n(temp,len);
}

// Appends a UTF-16 string: the input is converted to UTF-8 in a temporary
// buffer and handed to add_string_n().
//   src - UTF-16 input
//   len - input length limit, first clamped through wcslen_max()
void string_base::add_string_utf16(const WCHAR * src,unsigned len)
{
	len = wcslen_max(src,len);

	unsigned temp_len = estimate_utf16_to_utf8(src,len);
	mem_block_t<char> temp_block;
	// Small buffers come from the stack via alloca (released when this function
	// returns); anything above PFC_ALLOCA_LIMIT bytes is taken from temp_block.
	char * temp = (temp_len * sizeof(char) <= PFC_ALLOCA_LIMIT) ? (char*)alloca(temp_len * sizeof(char)) : temp_block.set_size(temp_len);
	// NOTE(review): assert is normally compiled out in release builds, so a null
	// temp would not be caught there.
	assert(temp);

	// len is reused: from here on it is the number of UTF-8 bytes produced.
	len = convert_utf16_to_utf8(src,temp,len);
	add_string_n(temp,len);
}

// Returns true when every character of the NUL-terminated string param is
// accepted by utf8_decode_char(). Returns false at the first rejected sequence,
// or when walking the string raises a structured exception (for example an
// unreadable pointer) - the __except(1) handler catches any exception.
bool is_valid_utf8(const char * param)
{
	__try {
		for(const char * cursor = param; *cursor; )
		{
			const unsigned step = utf8_decode_char(cursor,0);
			if (step==0) return false;
			cursor += step;
		}
		return true;
	}
	__except(1)
	{
		return false;
	}
}

// Returns true when the NUL-terminated string param consists only of 7-bit
// ASCII bytes (0x01..0x7F); an empty string counts as ASCII.
bool is_lower_ascii(const char * param)
{
	for(;*param;param++)
	{
		// Compare as unsigned: whether plain char is signed is up to the
		// compiler, so a "< 0" test would miss high bytes where it is unsigned.
		if ((unsigned char)*param >= 0x80) return false;
	}
	return true;
}

// Returns true when ptr points at a NUL byte. A structured exception raised
// while reading *ptr is also reported as "end of string": the __except(1)
// handler catches any exception.
static bool check_end_of_string(const char * ptr)
{
	__try {
		return !*ptr;
	}
	__except(1) {return true;}
}

// Copies src to out without ever splitting a UTF-8 character.
//   src      - UTF-8 input
//   out      - output buffer of maxbytes bytes
//   maxbytes - total capacity of out, terminator included
// Copies whole characters while they fit, then NUL-terminates. Copying also
// stops at the first sequence utf8_char_len() cannot measure, or when reading
// src raises a structured exception.
// Returns the number of bytes written, terminator excluded. With maxbytes==0
// nothing is written at all.
unsigned strcpy_utf8_truncate(const char * src,char * out,unsigned maxbytes)
{
	unsigned rv = 0 , ptr = 0;
	if (maxbytes>0)
	{
		maxbytes--;//for null
		while(!check_end_of_string(src) && maxbytes>0)
		{
			__try {
				unsigned delta = utf8_char_len(src);
				if (delta>maxbytes || delta==0) break;
				// Charge this character against the remaining capacity; without
				// this the loop would keep copying past the end of out.
				maxbytes -= delta;
				do
				{
					out[ptr++] = *(src++);
				} while(--delta);
			} __except(1) { break; }
			// Only advance the committed length after a whole character was
			// copied, so a fault mid-character leaves no partial sequence.
			rv = ptr;
		}
		out[rv]=0;
	}
	return rv;
}

// Copies src to out, re-encoding every valid UTF-8 character and substituting
// the code point `replace` for each byte that does not start a valid sequence.
//   src     - input, possibly containing invalid UTF-8
//   out     - output buffer; NUL-terminated on return. No size is passed in,
//             so the caller must size it for the worst case (each invalid
//             byte becomes one encoded `replace`).
//   replace - code point written in place of invalid input
// Processing ends at the NUL terminator or where src becomes unreadable.
void recover_invalid_utf8(const char * src,char * out,unsigned replace)
{
	while(!check_end_of_string(src))
	{
		unsigned c,d;
		__try {
			d = utf8_decode_char(src,&c);
		} __except(1) {d = 0;}
		if (d==0) c = replace;
		out += utf8_encode_char(c,out);
		// Move past what was just handled: the decoded character, or a single
		// byte when decoding failed. Without this the loop never terminates.
		src += (d>0) ? d : 1;
	}
	*out = 0;
}

unsigned string8::replace_char(unsigned c1,unsigned c2,unsigned start)
{
	string8 temp(get_ptr()+start);
	truncate(start);
	const char * ptr = temp;
	unsigned rv = 0;
	while(*ptr)
	{
		unsigned test;
		unsigned delta = utf8_decode_char(ptr,&test);
		if (delta==0 || test==0) break;
		if (test == c1) {test = c2;rv++;}
		add_char(test);
		ptr += delta;
	}
	return rv;
}

// Counts the UTF-8 characters in the first num bytes of p.
//   p   - UTF-8 input
//   num - byte budget
// Counting stops at the NUL terminator, at the first sequence that fails to
// decode, or when the budget is used up. A character that would extend past
// the budget is not counted.
unsigned strlen_utf8(const char * p,unsigned num)
{
	unsigned w,d;
	unsigned ret = 0;
	for(;num;)
	{
		// Limit the decoder to the remaining budget: this guarantees d <= num,
		// so the subtraction below cannot wrap around and extend the scan.
		d = utf8_decode_char(p,&w,num);
		if (w==0 || d==0) break;
		ret++;
		p+=d;
		num-=d;
	}
	return ret;
}

// Returns the number of bytes occupied by the first count UTF-8 characters of
// string. Fewer characters are covered when utf8_decode_char() returns 0 first
// (NUL terminator or invalid sequence).
unsigned utf8_chars_to_bytes(const char * string,unsigned count)
{
	unsigned offset = 0;
	for(unsigned remaining = count; remaining != 0; remaining--)
	{
		const unsigned step = utf8_decode_char(string+offset,0);
		if (step==0) break;
		offset += step;
	}
	return offset;
}