db/sort.py
changeset 457 7802147af797
child 1513 e7f7e42385b5
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/db/sort.py	Sun Feb 21 21:44:32 2016 +0100
     1.3 @@ -0,0 +1,74 @@
     1.4 +#! /usr/bin/env python3
     1.5 +
     1.6 +from argparse import ArgumentParser
     1.7 +from fileinput import FileInput, hook_encoded
     1.8 +import re, itertools, sys
     1.9 +
    1.10 +try:
    1.11 +    from math import log2
    1.12 +except:
    1.13 +    from math import log
    1.14 +    def log2(x): return log(x) / log(2)
    1.15 +
# Group 1 captures the shortest run of non-whitespace characters at the start
# of a line, stopping at the first "/", the first whitespace character, or the
# end of the string.
word = re.compile(r"(\S*?)(/|\s.*|$)")
# Applied with match(): succeeds for strings that begin with a digit or that
# contain an apostrophe anywhere (the "[^']*'" alternative consumes everything
# up to the first apostrophe).
unwanted = re.compile(r"(^\d|[^']*')")
# Succeeds for lines whose first character is whitespace.
space = re.compile(r'^\s')
    1.19 +
    1.20 +p = ArgumentParser(description="create dictionary csv out of hunspell data")
    1.21 +p.add_argument('--hunspell', '-H', type=str, default="/usr/share/hunspell",
    1.22 +    help='directory where hunspell dictionary files reside (default: /usr/share/hunspell)')
    1.23 +p.add_argument('--lang', '-l', type=str, default="en_US",
    1.24 +    help='use dictionary for language LANG (default: en_US)')
    1.25 +p.add_argument('--encoding', '-e', type=str, default="utf-8",
    1.26 +    help='file encoding (default: utf-8)')
    1.27 +p.add_argument('--full', '-f', action='store_true',
    1.28 +    help="full list - don't reduce to 65536 words")
    1.29 +
    1.30 +args = p.parse_args()
    1.31 +
    1.32 +try:
    1.33 +    from icu import UnicodeString, Locale
    1.34 +except ImportError:
    1.35 +    print("warning: PyICU not installed, using fallback", file=sys.stderr)
    1.36 +    def upper(x):
    1.37 +        return x.upper();
    1.38 +else:
    1.39 +    locale = Locale(args.lang)
    1.40 +    def upper(x):
    1.41 +        u = UnicodeString(x)
    1.42 +        return str(u.toUpper(locale))
    1.43 +
    1.44 +_all = (
    1.45 +    upper(word.match(line).group(1))
    1.46 +        for line in FileInput(
    1.47 +                args.hunspell + "/" + args.lang + ".dic",
    1.48 +                openhook=hook_encoded(args.encoding)
    1.49 +            )
    1.50 +        if not space.match(line)
    1.51 +)
    1.52 +_words = [w for w in _all if len(w) > 2 and not unwanted.match(w)]
    1.53 +_words.sort()
    1.54 +_words = [w for w, g in itertools.groupby(_words)]
    1.55 +
    1.56 +if not args.full:
    1.57 +    while len(_words) > 65536 * 2:
    1.58 +        _words = _words[::2]
    1.59 +
    1.60 +if len(_words) > 65536:
    1.61 +    if not args.full:
    1.62 +        _words = _words[:65536]
    1.63 +elif len(_words) < 65536:
    1.64 +    sys.stderr.write(
    1.65 +            "warning for {}: only {:.2f} bit in wordlist, that makes {:.2f} bit for 5 words\n".format(
    1.66 +                    args.lang,
    1.67 +                    log2(len(_words)),
    1.68 +                    log2(len(_words))*5
    1.69 +                )
    1.70 +        )
    1.71 +    _words.extend(_words[:65536-len(_words)])
    1.72 +
    1.73 +if not args.full:
    1.74 +    assert len(_words) == 65536, "lenght is {}".format(len(_words))
    1.75 +
    1.76 +for i, w in enumerate(_words):
    1.77 +    print("{l},{i},{w},0".format(l=args.lang[:2], i=i, w=w))