desmume/tools/vio2sf/src/foobar8/pfc/utf8.cpp

432 lines
8.6 KiB
C++

#include "pfc.h"
#include <locale.h>
//utf8 stuff
// Fallback aliases for an 8-bit unsigned byte and an unsigned int.
// NOTE(review): #ifndef only tests for a preprocessor macro of that name, so
// each guard skips its typedef only where BYTE / UINT are #defined; where a
// platform header supplies them as typedefs the lines below are still compiled.
// Confirm this is the intended behaviour on every supported target.
#ifndef BYTE
typedef unsigned char BYTE;
#endif
#ifndef UINT
typedef unsigned int UINT;
#endif
// Lead-byte classification tables, indexed by (sequence length - 1):
// a lead byte b opens an (i+1)-byte sequence when (b & mask_tab[i]) == val_tab[i].
static const unsigned char mask_tab[6]={0x80,0xE0,0xF0,0xF8,0xFC,0xFE};
static const unsigned char val_tab[6]={0,0xC0,0xE0,0xF0,0xF8,0xFC};

// Decodes one UTF-8 sequence (legacy 1..6 byte form, up to 31 bits).
//   p_utf8 - pointer to the first byte of the sequence.
//   wide   - optional output; receives the decoded code point, or 0 on failure.
//   max    - number of bytes that may be read from p_utf8 (values above 6 are
//            treated as 6).
// Returns the number of bytes consumed, or 0 when max is 0, the first byte is
// the terminating null, the sequence is truncated / malformed, or the sequence
// is an overlong (non-shortest) encoding.
unsigned utf8_decode_char(const char *p_utf8,unsigned * wide,unsigned max)
{
	// Smallest code point that genuinely needs N bytes, indexed by N.
	// Anything below this was encoded with more bytes than necessary.
	static const unsigned min_value[7]={0,0,0x80,0x800,0x10000,0x200000,0x4000000};

	const unsigned char * utf8 = (const unsigned char*)p_utf8;
	if (wide) *wide = 0;
	if (max==0) return 0;
	if (max>6) max = 6;

	// Plain ASCII; a null byte is reported as "nothing decoded".
	if (utf8[0]<0x80)
	{
		if (wide) *wide = utf8[0];
		return utf8[0]>0 ? 1 : 0;
	}

	// Classify the lead byte. Index 0 can never match here (high bit is set),
	// so a stray continuation byte or 0xFE/0xFF falls off the end of the
	// tables (or exceeds max) and is rejected.
	unsigned cnt = 0;
	while ((utf8[0]&mask_tab[cnt])!=val_tab[cnt])
	{
		if (++cnt>=max) return 0;
	}
	cnt++;// table index -> sequence length in bytes

	// Payload bits of the lead byte, then 6 bits per continuation byte.
	// A null terminator is not a continuation byte, so the scan never reads
	// beyond the end of a null-terminated string.
	unsigned res = utf8[0] & (0xFF>>(cnt+1));
	for (unsigned n=1;n<cnt;n++)
	{
		if ((utf8[n]&0xC0)!=0x80) return 0;
		res = (res<<6)|(utf8[n]&0x3F);
	}

	// Reject every overlong form (e.g. C0 80, E0 80 A0, F0 8F BF BF).
	if (res<min_value[cnt]) return 0;

	if (wide) *wide = res;
	return cnt;
}
// Encodes one code point as UTF-8 (legacy 1..6 byte form, up to 0x7FFFFFFF).
//   wide   - code point to encode.
//   target - output buffer, or null to only query the encoded length.
//            No terminator is written.
// Returns the number of bytes the encoding takes, or 0 if wide is above
// 0x7FFFFFFF and cannot be represented.
unsigned utf8_encode_char(unsigned wide,char * target)
{
	// Length-marker bits of the lead byte, indexed by sequence length.
	static const unsigned char lead_prefix[7]={0x00,0x00,0xC0,0xE0,0xF0,0xF8,0xFC};

	unsigned length;
	if (wide > 0x7FFFFFFF) return 0;
	else if (wide >= 0x4000000) length = 6;
	else if (wide >= 0x200000) length = 5;
	else if (wide >= 0x10000) length = 4;
	else if (wide >= 0x800) length = 3;
	else if (wide >= 0x80) length = 2;
	else length = 1;

	if (target == 0) return length;

	// Emit continuation bytes from the tail forwards, 6 payload bits each.
	for (unsigned pos = length - 1; pos > 0; pos--)
	{
		target[pos] = (char)(0x80 | (wide & 0x3F));
		wide >>= 6;
	}
	// Whatever is left fits below the marker bits of the lead byte.
	target[0] = (char)(lead_prefix[length] | wide);
	return length;
}
// Encodes one code point as UTF-16.
//   cur_wchar - code point to encode.
//   out       - output buffer with room for two units; no terminator is written.
// Returns the number of units written: 1 for values below 0x10000, 2 (a
// surrogate pair) for 0x10000..0x10FFFF, and 0 for the null character or for
// values above 0x10FFFF, which UTF-16 cannot represent.
unsigned utf16_encode_char(unsigned cur_wchar,WCHAR * out)
{
	// The surrogate mechanism carries 20 bits on top of a 0x10000 offset, so
	// the last encodable code point is 0x10FFFF (not (1<<20)-1).
	if (cur_wchar==0 || cur_wchar>0x10FFFF) return 0;

	if (cur_wchar<0x10000)
	{
		*out = (WCHAR)cur_wchar;
		return 1;
	}

	const unsigned c = cur_wchar - 0x10000;
	// High surrogate (0xD800..0xDBFF) takes the top 10 bits, low surrogate
	// (0xDC00..0xDFFF) the bottom 10 bits.
	out[0] = (WCHAR)(0xD800 | (0x3FF & (c>>10)) );
	out[1] = (WCHAR)(0xDC00 | (0x3FF & c) );
	return 2;
}
// Decodes one code point from a null-terminated UTF-16 string.
//   src - pointer to the first unit to decode.
//   out - always written: the decoded code point, or 0 at the terminator.
// Returns the number of units consumed: 0 at the terminator, 2 for a valid
// high+low surrogate pair, otherwise 1. A high surrogate that is not followed
// by a low surrogate, and a lone low surrogate, are passed through unchanged
// as single units.
unsigned utf16_decode_char(const WCHAR * src,unsigned * out)
{
	const unsigned int lead = src[0];
	if (lead == 0)
	{
		*out = 0;
		return 0;
	}

	// The following unit is only inspected after a high surrogate.
	if ((lead & 0xFC00) == 0xD800)
	{
		const unsigned int trail = src[1];
		if ((trail & 0xFC00) == 0xDC00)
		{
			*out = 0x10000 + ( ((lead & 0x3FF) << 10) | (trail & 0x3FF) );
			return 2;
		}
	}

	*out = lead;
	return 1;
}
// Returns the code point of the UTF-8 sequence at src.
// The result starts out as 0 and is filled in by utf8_decode_char; the byte
// count that utf8_decode_char returns is ignored.
UINT utf8_get_char(const char * src)
{
	UINT codepoint = 0;
	utf8_decode_char(src,&codepoint);
	return codepoint;
}
// Returns the value utf8_decode_char reports for the sequence at s, i.e. the
// number of bytes consumed; the decoded code point itself is not requested.
unsigned utf8_char_len(const char * s)
{
	unsigned * const no_output = 0;
	return utf8_decode_char(s,no_output);
}
// Returns the byte offset reached after stepping over up to `count` UTF-8
// characters starting at ptr.
// Stepping stops early at the string terminator or at the first sequence for
// which utf8_char_len reports 0. The counter is only compared against zero, so
// a negative count leaves the terminator / decode failure as the stop condition.
int skip_utf8_chars(const char * ptr,int count)
{
	int offset = 0;
	while (count != 0 && ptr[offset] != 0)
	{
		const int step = utf8_char_len(ptr + offset);
		if (step <= 0) break;
		offset += step;
		count--;
	}
	return offset;
}
// Converts up to `len` bytes of UTF-8 from src into UTF-16 in dst.
//   dst - output buffer; a terminating 0 is always appended. Its capacity is
//         not passed in, so the caller must size it beforehand.
// Conversion stops at the source terminator, when len is used up, at the first
// sequence that fails to decode within the remaining bytes, or at a code point
// that utf16_encode_char refuses.
// Returns the number of UTF-16 units written, not counting the terminator.
unsigned convert_utf8_to_utf16(const char * src,WCHAR * dst,unsigned len)
{
	unsigned written = 0;
	for (;;)
	{
		if (*src == 0 || len == 0) break;

		unsigned codepoint;
		const unsigned consumed = utf8_decode_char(src,&codepoint,len);
		if (consumed == 0 || consumed > len) break;
		src += consumed;
		len -= consumed;

		const unsigned units = utf16_encode_char(codepoint,dst);
		if (units == 0) break;
		dst += units;
		written += units;
	}
	*dst = 0;
	return written;
}
// Converts up to `len` UTF-16 units from src into UTF-8 in dst.
//   dst - output buffer; a terminating 0 is always appended. Its capacity is
//         not passed in, so the caller must size it beforehand.
// Conversion stops at the source terminator, when len is used up, when a
// surrogate pair would need more units than remain, or at a code point that
// utf8_encode_char refuses.
// Returns the number of bytes written, not counting the terminator.
// NOTE(review): utf16_decode_char looks at the unit after a high surrogate
// before the "consumed > len" check runs, so that unit is read even when it
// lies beyond len - confirm callers always provide a readable unit there.
unsigned convert_utf16_to_utf8(const WCHAR * src,char * dst,unsigned len)
{
	unsigned written = 0;
	for (;;)
	{
		if (*src == 0 || len == 0) break;

		unsigned codepoint;
		const unsigned consumed = utf16_decode_char(src,&codepoint);
		if (consumed == 0 || consumed > len) break;
		src += consumed;
		len -= consumed;

		const unsigned bytes = utf8_encode_char(codepoint,dst);
		if (bytes == 0) break;
		dst += bytes;
		written += bytes;
	}
	*dst = 0;
	return written;
}
// Converts a string from the platform multibyte ("ANSI") encoding to wide characters.
//   src - input string; len is first passed through strlen_max(src,len)
//         (presumably clamping it to the actual string length - TODO confirm).
//   dst - output buffer; receives the converted characters plus a terminating 0.
//         Its capacity is not passed in, so the caller must size it beforehand.
// Returns the number of wide characters produced, not counting the terminator;
// a negative / failed conversion result is reported as 0 with dst left empty.
unsigned convert_ansi_to_utf16(const char * src,WCHAR * dst,unsigned len)
{
len = strlen_max(src,len);
unsigned rv;
#ifdef WIN32
// The output capacity handed to the API is estimate_ansi_to_utf16(src), i.e. it is derived from src rather than from len.
rv = MultiByteToWideChar(CP_ACP,0,src,len,dst,estimate_ansi_to_utf16(src));
#else
// Select the environment's LC_CTYPE locale so mbstowcs interprets src in the user's encoding.
// NOTE(review): this changes process-wide locale state on every call.
setlocale(LC_CTYPE,"");
// NOTE(review): the third argument of mbstowcs caps the number of wide characters written,
// not the number of input bytes read, and src is consumed up to its terminator - confirm
// this is acceptable when len is shorter than the string.
rv = mbstowcs(dst,src,len);
#endif
// A failed mbstowcs returns (size_t)-1, which is negative once viewed as signed; treat that as "nothing converted".
if ((signed)rv<0) rv = 0;
dst[rv]=0;
return rv;
}
// Converts a wide-character string to the platform multibyte ("ANSI") encoding.
//   src - input string; len is first passed through wcslen_max(src,len)
//         (presumably clamping it to the actual string length - TODO confirm).
//   dst - output buffer; receives the converted bytes plus a terminating 0.
//         Its capacity is not passed in, so the caller must size it beforehand.
// Returns the number of bytes produced, not counting the terminator;
// a negative / failed conversion result is reported as 0 with dst left empty.
unsigned convert_utf16_to_ansi(const WCHAR * src,char * dst,unsigned len)
{
len = wcslen_max(src,len);
unsigned rv;
#ifdef WIN32
// The output capacity handed to the API is estimate_utf16_to_ansi(src), i.e. it is derived from src rather than from len.
rv = WideCharToMultiByte(CP_ACP,0,src,len,dst,estimate_utf16_to_ansi(src),0,0);
#else
// Select the environment's LC_CTYPE locale so wcstombs produces the user's encoding.
// NOTE(review): this changes process-wide locale state on every call.
setlocale(LC_CTYPE,"");
// NOTE(review): the third argument of wcstombs caps the number of output bytes, while len
// counts wide characters; in a multibyte locale the output may be cut short as soon as a
// character needs more than one byte - confirm against callers.
rv = wcstombs(dst,src,len);
#endif
// A failed wcstombs returns (size_t)-1, which is negative once viewed as signed; treat that as "nothing converted".
if ((signed)rv<0) rv = 0;
dst[rv]=0;
return rv;
}
// Converts UTF-8 to the platform ANSI encoding by going through a temporary
// UTF-16 buffer: src -> convert_utf8_to_utf16 -> convert_utf16_to_ansi -> dst.
// The scratch buffer is sized with estimate_utf8_to_utf16 and lives on the
// stack when its byte size is within PFC_ALLOCA_LIMIT, otherwise in a
// mem_block_t owned by this call.
// Returns whatever convert_utf16_to_ansi returns for the intermediate string.
unsigned convert_utf8_to_ansi(const char * src,char * dst,unsigned len)
{
	len = strlen_max(src,len);
	const unsigned temp_len = estimate_utf8_to_utf16(src,len);

	mem_block_t<WCHAR> temp_block;
	WCHAR * temp;
	if (temp_len * sizeof(WCHAR) <= PFC_ALLOCA_LIMIT)
		temp = (WCHAR*)alloca(temp_len * sizeof(WCHAR));
	else
		temp = temp_block.set_size(temp_len);
	assert(temp);

	const unsigned wide_len = convert_utf8_to_utf16(src,temp,len);
	return convert_utf16_to_ansi(temp,dst,wide_len);
}
// Converts the platform ANSI encoding to UTF-8 by going through a temporary
// UTF-16 buffer: src -> convert_ansi_to_utf16 -> convert_utf16_to_utf8 -> dst.
// The scratch buffer is sized with estimate_ansi_to_utf16 and lives on the
// stack when its byte size is within PFC_ALLOCA_LIMIT, otherwise in a
// mem_block_t owned by this call.
// Returns whatever convert_utf16_to_utf8 returns for the intermediate string.
unsigned convert_ansi_to_utf8(const char * src,char * dst,unsigned len)
{
	len = strlen_max(src,len);
	const unsigned temp_len = estimate_ansi_to_utf16(src,len);

	mem_block_t<WCHAR> temp_block;
	WCHAR * temp;
	if (temp_len * sizeof(WCHAR) <= PFC_ALLOCA_LIMIT)
		temp = (WCHAR*)alloca(temp_len * sizeof(WCHAR));
	else
		temp = temp_block.set_size(temp_len);
	assert(temp);

	const unsigned wide_len = convert_ansi_to_utf16(src,temp,len);
	return convert_utf16_to_utf8(temp,dst,wide_len);
}
// Appends an ANSI-encoded string to this object after converting it to UTF-8.
// The conversion runs into a scratch buffer sized with estimate_ansi_to_utf8
// (stack-allocated when within PFC_ALLOCA_LIMIT, otherwise held by a
// mem_block_t for the duration of the call); the converted bytes are then
// handed to add_string_n.
void string_base::add_string_ansi(const char * src,unsigned len)
{
	len = strlen_max(src,len);
	const unsigned temp_len = estimate_ansi_to_utf8(src,len);

	mem_block_t<char> temp_block;
	char * temp;
	if (temp_len * sizeof(char) <= PFC_ALLOCA_LIMIT)
		temp = (char*)alloca(temp_len * sizeof(char));
	else
		temp = temp_block.set_size(temp_len);
	assert(temp);

	const unsigned utf8_len = convert_ansi_to_utf8(src,temp,len);
	add_string_n(temp,utf8_len);
}
// Appends a UTF-16 string to this object after converting it to UTF-8.
// The conversion runs into a scratch buffer sized with estimate_utf16_to_utf8
// (stack-allocated when within PFC_ALLOCA_LIMIT, otherwise held by a
// mem_block_t for the duration of the call); the converted bytes are then
// handed to add_string_n.
void string_base::add_string_utf16(const WCHAR * src,unsigned len)
{
	len = wcslen_max(src,len);
	const unsigned temp_len = estimate_utf16_to_utf8(src,len);

	mem_block_t<char> temp_block;
	char * temp;
	if (temp_len * sizeof(char) <= PFC_ALLOCA_LIMIT)
		temp = (char*)alloca(temp_len * sizeof(char));
	else
		temp = temp_block.set_size(temp_len);
	assert(temp);

	const unsigned utf8_len = convert_utf16_to_utf8(src,temp,len);
	add_string_n(temp,utf8_len);
}
// Returns true when every sequence in the null-terminated string decodes
// successfully with utf8_decode_char, false at the first one that does not.
// The whole scan runs inside __try/__except; if the handler is entered the
// string is reported as invalid.
bool is_valid_utf8(const char * param)
{
	__try {
		for (const char * cursor = param; *cursor != 0; )
		{
			const unsigned step = utf8_decode_char(cursor,0);
			if (step == 0) return false;
			cursor += step;
		}
		return true;
	}
	__except(1)
	{
		return false;
	}
}
// Returns true when the null-terminated string contains only bytes in the
// 7-bit ASCII range (0x01..0x7F); an empty string counts as ASCII.
// Bytes are inspected as unsigned values so the result does not depend on
// whether plain char is signed on the target platform.
bool is_lower_ascii(const char * param)
{
	for (const unsigned char * p = (const unsigned char*)param; *p != 0; ++p)
	{
		if (*p >= 0x80) return false;
	}
	return true;
}
// Reports whether ptr points at a terminating null byte.
// The read of *ptr is wrapped in __try/__except, and the handler returns true,
// so a fault raised by the dereference is reported as "end of string" instead
// of propagating to the caller.
// NOTE(review): __try/__except is compiler-specific syntax; how it is provided
// on other toolchains is not visible in this file.
static bool check_end_of_string(const char * ptr)
{
__try {
return !*ptr;
}
__except(1) {return true;}
}
// Copies src into out without splitting a UTF-8 character, writing at most
// maxbytes bytes including the terminating null.
//   src      - source string.
//   out      - destination buffer of at least maxbytes bytes.
//   maxbytes - capacity of out; when 0 nothing is written at all.
// Copying stops at the end of src, at the first sequence utf8_char_len rejects,
// when the next character no longer fits, or when reading src faults.
// Returns the number of bytes copied, not counting the terminator.
unsigned strcpy_utf8_truncate(const char * src,char * out,unsigned maxbytes)
{
	unsigned rv = 0 , ptr = 0;
	if (maxbytes>0)
	{
		maxbytes--;//reserve room for the terminating null
		while(!check_end_of_string(src) && maxbytes>0)
		{
			__try {
				unsigned delta = utf8_char_len(src);
				if (delta>maxbytes || delta==0) break;
				// Charge this character against the remaining space; without
				// this the limit would only be compared per character and the
				// copy could run past the end of out.
				maxbytes -= delta;
				do
				{
					out[ptr++] = *(src++);
				} while(--delta);
			} __except(1) { break; }
			// Only advance the committed length after a whole character was
			// copied, so a fault mid-character leaves it out of the result.
			rv = ptr;
		}
		out[rv]=0;
	}
	return rv;
}
// Rewrites src into out as UTF-8, substituting `replace` for every byte that
// does not start a decodable sequence.
//   src     - input string, possibly containing invalid UTF-8.
//   out     - output buffer; a terminating null is appended. Its capacity is
//             not passed in, so the caller must size it beforehand.
//   replace - code point emitted in place of each undecodable byte.
void recover_invalid_utf8(const char * src,char * out,unsigned replace)
{
	while(!check_end_of_string(src))
	{
		unsigned c,d;
		__try {
			d = utf8_decode_char(src,&c);
		} __except(1) {d = 0;}
		if (d==0)
		{
			c = replace;
			d = 1;//step over exactly one offending byte, then resynchronise
		}
		// Consume the input that was just handled; without this the loop
		// would re-read the same position forever while writing to out.
		src += d;
		out += utf8_encode_char(c,out);
	}
	*out = 0;
}
unsigned string8::replace_char(unsigned c1,unsigned c2,unsigned start)
{
string8 temp(get_ptr()+start);
truncate(start);
const char * ptr = temp;
unsigned rv = 0;
while(*ptr)
{
unsigned test;
unsigned delta = utf8_decode_char(ptr,&test);
if (delta==0 || test==0) break;
if (test == c1) {test = c2;rv++;}
add_char(test);
ptr += delta;
}
return rv;
}
// Counts the UTF-8 characters in the first `num` bytes of p.
// Counting stops at the string terminator, at the first sequence that fails to
// decode, or when num bytes have been consumed. A character that would extend
// beyond the num-byte window is not counted.
// Returns the number of complete characters found.
unsigned strlen_utf8(const char * p,unsigned num)
{
	unsigned ret = 0;
	while (num>0)
	{
		unsigned w;
		// Bound the decode by the bytes that remain: this keeps the read
		// inside the window and guarantees d <= num, so the subtraction
		// below cannot wrap around.
		const unsigned d = utf8_decode_char(p,&w,num);
		if (d==0 || w==0) break;
		ret++;
		p += d;
		num -= d;
	}
	return ret;
}
// Returns how many bytes the first `count` UTF-8 characters of `string` occupy.
// The walk ends early at the first position where utf8_decode_char reports 0,
// in which case the bytes accumulated so far are returned.
unsigned utf8_chars_to_bytes(const char * string,unsigned count)
{
	unsigned offset = 0;
	for (unsigned remaining = count; remaining != 0; --remaining)
	{
		const unsigned step = utf8_decode_char(string+offset,0);
		if (step == 0) break;
		offset += step;
	}
	return offset;
}