db/dic2csv.py
changeset 120 c282594b523b
parent 4 140c907a66be
child 121 dcefb3741425
     1.1 --- a/db/dic2csv.py	Sun Mar 15 22:16:09 2015 +0100
     1.2 +++ b/db/dic2csv.py	Mon Mar 23 21:42:45 2015 +0100
     1.3 @@ -24,8 +24,20 @@
     1.4  
     1.5  args = p.parse_args()
     1.6  
     1.7 +try:  # prefer PyICU so upper-casing can follow the rules of args.lang
     1.8 +    from icu import UnicodeString, Locale
     1.9 +except ImportError:  # PyICU unavailable: warn on stderr and fall back
    1.10 +    print("warning: PyICU not installed, using fallback", file=sys.stderr)
    1.11 +    def upper(x):  # fallback: plain str.upper(), not locale-aware
    1.12 +        return x.upper()
    1.13 +else:
    1.14 +    locale = Locale(args.lang)  # ICU locale built once from args.lang
    1.15 +    def upper(x):  # upper-case x with ICU, using the locale built above
    1.16 +        u = UnicodeString(x)
    1.17 +        return str(u.toUpper(locale))
    1.18 +
    1.19  _all = (
    1.20 -    word.match(line).group(1).upper()
    1.21 +    upper(word.match(line).group(1))
    1.22          for line in FileInput(
    1.23                  args.hunspell + "/" + args.lang + ".dic",
    1.24                  openhook=hook_encoded(args.encoding)
    1.25 @@ -36,6 +48,9 @@
    1.26  _words.sort()
    1.27  _words = [w for w, g in itertools.groupby(_words)]
    1.28  
    1.29 +while len(_words) > 2 * 65536:  # halve until at most 2 * 65536 remain
    1.30 +    _words = _words[0::2]  # keep every second entry, starting with the first
    1.31 +
    1.32  if len(_words) > 65536:
    1.33      _words = _words[:65536]
    1.34  elif len(_words) < 65536: