db/sort.py
author Krista Grothoff <krista@pep-project.org>
Tue, 18 Oct 2016 00:33:42 +0200
branch ENGINE-109
changeset 1296 9491e990650c
parent 457 7802147af797
child 1513 e7f7e42385b5
permissions -rw-r--r--
ENGINE-109: closed branch
     1 #! /usr/bin/env python3
     2 
     3 from argparse import ArgumentParser
     4 from fileinput import FileInput, hook_encoded
     5 import re, itertools, sys
     6 
     7 try:
     8     from math import log2
     9 except:
    10     from math import log
    11     def log2(x): return log(x) / log(2)
    12 
    13 word = re.compile(r"(\S*?)(/|\s.*|$)")
    14 unwanted = re.compile(r"(^\d|[^']*')")
    15 space = re.compile(r'^\s')
    16 
    17 p = ArgumentParser(description="create dictionary csv out of hunspell data")
    18 p.add_argument('--hunspell', '-H', type=str, default="/usr/share/hunspell",
    19     help='directory where hunspell dictionary files reside (default: /usr/share/hunspell)')
    20 p.add_argument('--lang', '-l', type=str, default="en_US",
    21     help='use dictionary for language LANG (default: en_US)')
    22 p.add_argument('--encoding', '-e', type=str, default="utf-8",
    23     help='file encoding (default: utf-8)')
    24 p.add_argument('--full', '-f', action='store_true',
    25     help="full list - don't reduce to 65536 words")
    26 
    27 args = p.parse_args()
    28 
    29 try:
    30     from icu import UnicodeString, Locale
    31 except ImportError:
    32     print("warning: PyICU not installed, using fallback", file=sys.stderr)
    33     def upper(x):
    34         return x.upper();
    35 else:
    36     locale = Locale(args.lang)
    37     def upper(x):
    38         u = UnicodeString(x)
    39         return str(u.toUpper(locale))
    40 
    41 _all = (
    42     upper(word.match(line).group(1))
    43         for line in FileInput(
    44                 args.hunspell + "/" + args.lang + ".dic",
    45                 openhook=hook_encoded(args.encoding)
    46             )
    47         if not space.match(line)
    48 )
    49 _words = [w for w in _all if len(w) > 2 and not unwanted.match(w)]
    50 _words.sort()
    51 _words = [w for w, g in itertools.groupby(_words)]
    52 
    53 if not args.full:
    54     while len(_words) > 65536 * 2:
    55         _words = _words[::2]
    56 
    57 if len(_words) > 65536:
    58     if not args.full:
    59         _words = _words[:65536]
    60 elif len(_words) < 65536:
    61     sys.stderr.write(
    62             "warning for {}: only {:.2f} bit in wordlist, that makes {:.2f} bit for 5 words\n".format(
    63                     args.lang,
    64                     log2(len(_words)),
    65                     log2(len(_words))*5
    66                 )
    67         )
    68     _words.extend(_words[:65536-len(_words)])
    69 
    70 if not args.full:
    71     assert len(_words) == 65536, "lenght is {}".format(len(_words))
    72 
    73 for i, w in enumerate(_words):
    74     print("{l},{i},{w},0".format(l=args.lang[:2], i=i, w=w))