| #!/usr/bin/env python |
| """ |
| Unicode case folding database conversion utility |
| |
| Parses the database and generates a C++ function which implements the case |
| folding algorithm. The database entries are of the form: |
| |
| <code>; <status>; <mapping>; # <name> |
| |
| <status> can be one of four characters: |
| C - Common mappings |
| S - mappings for Simple case folding |
| F - mappings for Full case folding |
| T - special case for Turkish I characters |
| |
| Right now this generates a function which implements simple case folding (C+S |
| entries). |
| """ |
| |
| import sys |
| import re |
| import urllib2 |
| |
| # This variable will body of the mappings function |
| body = "" |
| |
| # Reads file line-by-line, extracts Common and Simple case fold mappings and |
| # returns a (from_char, to_char, from_name) tuple. |
| def mappings(f): |
| previous_from = -1 |
| expr = re.compile(r'^(.*); [CS]; (.*); # (.*)') |
| for line in f: |
| m = expr.match(line) |
| if not m: continue |
| from_char = int(m.group(1), 16) |
| to_char = int(m.group(2), 16) |
| from_name = m.group(3) |
| |
| if from_char <= previous_from: |
| raise Exception("Duplicate or unsorted characters in input") |
| yield from_char, to_char, from_name |
| previous_from = from_char |
| |
| # Computes the shift (to_char - from_char) in a mapping. |
| def shift(mapping): |
| return mapping[1] - mapping[0] |
| |
| # Computes the stride (from_char2 - from_char1) of two mappings. |
| def stride2(mapping1, mapping2): |
| return mapping2[0] - mapping1[0] |
| |
| # Computes the stride of a list of mappings. The list should have at least two |
| # mappings. All mappings in the list are assumed to have the same stride. |
| def stride(block): |
| return stride2(block[0], block[1]) |
| |
| |
| # b is a list of mappings. All the mappings are assumed to have the same |
| # shift and the stride between adjecant mappings (if any) is constant. |
| def dump_block(b): |
| global body |
| |
| if len(b) == 1: |
| # Special case for handling blocks of length 1. We don't even need to |
| # emit the "if (C < X) return C" check below as all characters in this |
| # range will be caught by the "C < X" check emitted by the first |
| # non-trivial block. |
| body += " // {2}\n if (C == {0:#06x})\n return {1:#06x};\n".format(*b[0]) |
| return |
| |
| first = b[0][0] |
| last = first + stride(b) * (len(b)-1) |
| modulo = first % stride(b) |
| |
| # All characters before this block map to themselves. |
| body += " if (C < {0:#06x})\n return C;\n".format(first) |
| body += " // {0} characters\n".format(len(b)) |
| |
| # Generic pattern: check upper bound (lower bound is checked by the "if" |
| # above) and modulo of C, return C+shift. |
| pattern = " if (C <= {0:#06x} && C % {1} == {2})\n return C + {3};\n" |
| |
| if stride(b) == 2 and shift(b[0]) == 1 and modulo == 0: |
| # Special case: |
| # We can elide the modulo-check because the expression "C|1" will map |
| # the intervening characters to themselves. |
| pattern = " if (C <= {0:#06x})\n return C | 1;\n" |
| elif stride(b) == 1: |
| # Another special case: X % 1 is always zero, so don't emit the |
| # modulo-check. |
| pattern = " if (C <= {0:#06x})\n return C + {3};\n" |
| |
| body += pattern.format(last, stride(b), modulo, shift(b[0])) |
| |
| current_block = [] |
| f = urllib2.urlopen(sys.argv[1]) |
| for m in mappings(f): |
| if len(current_block) == 0: |
| current_block.append(m) |
| continue |
| |
| if shift(current_block[0]) != shift(m): |
| # Incompatible shift, start a new block. |
| dump_block(current_block) |
| current_block = [m] |
| continue |
| |
| if len(current_block) == 1 or stride(current_block) == stride2(current_block[-1], m): |
| current_block.append(m) |
| continue |
| |
| # Incompatible stride, start a new block. |
| dump_block(current_block) |
| current_block = [m] |
| f.close() |
| |
| dump_block(current_block) |
| |
| print '//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//' |
| print '//' |
| print '// This file was generated by utils/unicode-case-fold.py from the Unicode' |
| print '// case folding database at' |
| print '// ', sys.argv[1] |
| print '//' |
| print '// To regenerate this file, run:' |
| print '// utils/unicode-case-fold.py \\' |
| print '// "{}" \\'.format(sys.argv[1]) |
| print '// > lib/Support/UnicodeCaseFold.cpp' |
| print '//' |
| print '//===----------------------------------------------------------------------===//' |
| print '' |
| print '#include "llvm/Support/Unicode.h"' |
| print '' |
| print "int llvm::sys::unicode::foldCharSimple(int C) {" |
| print body |
| print " return C;" |
| print "}" |