195 lines
6.9 KiB
Python
195 lines
6.9 KiB
Python
![]() |
#!/usr/bin/python
|
||
|
# -*- coding: ascii -*-
|
||
|
# Unicode Data Converter
|
||
|
#
|
||
|
# This file is part of the Public Domain C Library (PDCLib).
|
||
|
# Permission is granted to use, modify, and / or redistribute at will.
|
||
|
"""
|
||
|
Converts the character information provided by Unicode in the UnicodeData.txt
|
||
|
file from the Unicode character database into a table for use by PDCLib.
|
||
|
|
||
|
Usage: Download the UnicodeData.txt file to the same directory as this script
|
||
|
and then run it. Both Python 2 and 3 are supported.
|
||
|
|
||
|
Download the data from
|
||
|
ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
||
|
|
||
|
We do some simple "run" compression, because characters in the Unicode Data file
|
||
|
tend to come in groups with the same properties.
|
||
|
"""
|
||
|
import os
|
||
|
|
||
|
# Character-class flag bits. Each constant is a distinct power of two so that
# several classes can be combined into one value with `|`.
#
# MUST BE KEPT SYNCHRONIZED WITH _PDCLIB_locale.h
BIT_ALPHA = 1 << 0
BIT_BLANK = 1 << 1
BIT_CNTRL = 1 << 2
BIT_GRAPH = 1 << 3
BIT_PUNCT = 1 << 4
BIT_SPACE = 1 << 5
BIT_LOWER = 1 << 6
BIT_UPPER = 1 << 7
BIT_DIGIT = 1 << 8
BIT_XDIGT = 1 << 9
|
||
|
|
||
|
# Category to bitfield mapping
#
# Keys are the two-letter general category codes found in UnicodeData.txt;
# values are the BIT_* flags given to every character of that category.

# Flag combinations shared by whole families of categories.
_LETTER = BIT_ALPHA | BIT_GRAPH
_PUNCTUATION = BIT_PUNCT | BIT_GRAPH

categories = {
    # Letters
    'Lu': _LETTER | BIT_UPPER,  # Uppercase
    'Ll': _LETTER | BIT_LOWER,  # Lowercase
    'Lt': _LETTER | BIT_UPPER,  # Title case. Upper?
    'Lm': _LETTER,              # Modifier. Case?
    'Lo': _LETTER,              # "Other" letter (e.g. Ideograph)

    # Numbers
    'Nd': BIT_DIGIT | BIT_GRAPH,  # Decimal digit
    'Nl': BIT_GRAPH,              # Letter-like numeric character
    'No': BIT_GRAPH,              # Other numeric

    # Punctuation
    'Pc': _PUNCTUATION,  # Connecting punctuation
    'Pd': _PUNCTUATION,  # Dash punctuation
    'Ps': _PUNCTUATION,  # Opening punctuation
    'Pe': _PUNCTUATION,  # Closing punctuation
    'Pi': _PUNCTUATION,  # Opening quote
    'Pf': _PUNCTUATION,  # Closing quote
    'Po': _PUNCTUATION,  # Other punctuation

    # Symbols
    'Sm': BIT_GRAPH,  # Mathematical symbol
    'Sc': BIT_GRAPH,  # Currency symbol
    'Sk': BIT_GRAPH,  # Non-letterlike modifier symbol
    'So': BIT_GRAPH,  # Other symbol

    # Separators
    'Zs': BIT_SPACE,  # Non-zero-width space character
    'Zl': BIT_SPACE,  # Line separator
    'Zp': BIT_SPACE,  # Paragraph separator

    # Controls
    'Cc': BIT_CNTRL,  # C0/C1 control codes
}
|
||
|
|
||
|
# Characters with special properties
#
# Keys are code points, values are the complete BIT_* flag set for that code
# point. An entry here REPLACES the flags derived from the character's general
# category (it is not OR-ed with them), so every entry must list all the flags
# the character should have.
special = {
    # Blank characters
    0x0020: BIT_SPACE | BIT_BLANK,              # space
    0x0009: BIT_SPACE | BIT_BLANK | BIT_CNTRL,  # tab (still a control code)

    # Remaining C standard white-space characters. Their category (Cc) only
    # yields BIT_CNTRL, so the space flag has to be added here.
    0x000A: BIT_SPACE | BIT_CNTRL,  # line feed
    0x000B: BIT_SPACE | BIT_CNTRL,  # vertical tab
    0x000C: BIT_SPACE | BIT_CNTRL,  # form feed
    0x000D: BIT_SPACE | BIT_CNTRL,  # carriage return

    # Digits
    0x0030: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0031: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0032: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0033: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0034: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0035: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0036: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0037: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0038: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,
    0x0039: BIT_XDIGT | BIT_DIGIT | BIT_GRAPH,

    # A-F (hex uppercase)
    0x0041: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
    0x0042: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
    0x0043: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
    0x0044: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
    0x0045: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,
    0x0046: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_UPPER,

    # a-f (hex lowercase)
    0x0061: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
    0x0062: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
    0x0063: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
    0x0064: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
    0x0065: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
    0x0066: BIT_XDIGT | BIT_ALPHA | BIT_GRAPH | BIT_LOWER,
}
|
||
|
|
||
|
class Group(object):
    """A run of consecutive code points that share the same properties.

    Attributes:
        start: first code point of the run.
        flags: BIT_* flag set common to every character in the run.
        upper_delta: offset from a character to its upper-case mapping.
        lower_delta: offset from a character to its lower-case mapping.
        chars: list of (code point, name) tuples, in insertion order.
    """

    def __init__(self, start, flags, upper_delta, lower_delta):
        self.start = start
        self.flags = flags
        self.upper_delta = upper_delta
        self.lower_delta = lower_delta
        self.chars = []

    def add_char(self, num, label):
        """Append code point `num`, named `label`, to the run."""
        self.chars.append((num, label))

    def write_to_file(self, f):
        """Write the run to the file-like object `f`.

        Emits one `//` comment line per member character, then a single table
        row holding start, length, flags, lower delta and upper delta (in that
        order, matching the column header written by the script).
        """
        for char in self.chars:
            f.write("// %x %s\n" % char)
        f.write(" { 0x%X, \t0x%X, \t0x%X, \t%d, \t%d },\n" %
                (self.start, len(self.chars), self.flags,
                 self.lower_delta, self.upper_delta))

    def next(self):
        """Return the code point that would directly follow this run."""
        return self.start + len(self.chars)
|
||
|
|
||
|
# All groups collected so far, in the order their first character was seen.
groups = []


def add_char(num, upper, lower, bits, label):
    """Record one character in `groups`.

    The character joins the most recent group when it is the code point
    directly after that group and has the same flags and the same upper/lower
    case deltas; otherwise a new group starting at `num` is appended.

    num   -- code point of the character
    upper -- code point of its upper-case mapping
    lower -- code point of its lower-case mapping
    bits  -- BIT_* flag set of the character
    label -- character name, kept for the comments in the generated file
    """
    upper_delta = upper - num
    lower_delta = lower - num

    last = groups[-1] if groups else None
    extends_last = (
        last is not None
        and num == last.next()
        and last.flags == bits
        and last.upper_delta == upper_delta
        and last.lower_delta == lower_delta
    )

    if not extends_last:
        last = Group(num, bits, upper_delta, lower_delta)
        groups.append(last)

    last.add_char(num, label)
|
||
|
|
||
|
# Script body: read every record of UnicodeData.txt, hand it to add_char() so
# that runs of identical characters are merged, then write the collected groups
# out as the C table _PDCLIB_wcinfo.
#
# NOTE(review): each input line is treated as one character. UnicodeData.txt
# describes some large blocks as a "<..., First>" / "<..., Last>" pair of
# lines; such ranges are not expanded here, so only their two end points get
# table entries. Confirm whether the C side compensates for this.
with open('UnicodeData.txt', 'r') as in_file:
    with open('_PDCLIB_unicodedata.c', 'w') as out_file:
        try:
            for line in in_file:
                # Tolerate blank lines (e.g. a stray newline at end of file)
                # instead of failing on the 15-field unpack below.
                line = line.strip()
                if not line:
                    continue

                (num_hex, name, category, combining_class, bidi_class, decomposition,
                 numeric_type, numeric_digit, numeric_value, mirrored, u1name, iso_com,
                 upper_case_hex, lower_case_hex, title_case_hex) = line.split(";")

                num = int(num_hex, 16)
                # An empty case-mapping field means the character maps to itself.
                upper_case = int(upper_case_hex, 16) if upper_case_hex else num
                lower_case = int(lower_case_hex, 16) if lower_case_hex else num
                bits = special.get(num, categories.get(category, 0))

                # Skip characters that carry no information: no flags and no
                # case mapping. The mappings default to `num` (not 0), so that
                # is the value to compare against.
                if upper_case == num and lower_case == num and bits == 0:
                    continue

                add_char(num, upper_case, lower_case, bits, name)

            out_file.write("""
/* Unicode Character Information ** AUTOMATICALLY GENERATED FILE **
*
* This file is part of the PDCLib public domain C Library, but is automatically
* generated from the Unicode character data information file found at
* ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
*
* As a result, the licensing that applies to that file also applies to this
* file. The licensing which applies to the Unicode character data can be found
* in Exhibit 1 of the Unicode Terms of Use, found at
* http://www.unicode.org/copyright.html#Exhibit1
*/
#ifndef REGTEST
#include <_PDCLIB_locale.h>

const _PDCLIB_wcinfo_t _PDCLIB_wcinfo[] = {
// { value, \tlength, \tflags,\tlower,\tupper\t}, // name
""")
            for g in groups:
                g.write_to_file(out_file)
            out_file.write('};\n\n')
            out_file.write("""
const size_t _PDCLIB_wcinfo_size = sizeof(_PDCLIB_wcinfo) / sizeof(_PDCLIB_wcinfo[0]);
#endif

#ifdef TEST
#include "_PDCLIB_test.h"
int main( void )
{
return TEST_RESULTS;
}
#endif

""")
        except BaseException:
            # Never leave a partially written table behind. The file is closed
            # first so that it can be removed on every platform; the enclosing
            # `with` closing it a second time is harmless.
            out_file.close()
            os.remove('_PDCLIB_unicodedata.c')
            raise
|