db/sort.py
author Krista Bennett <krista@pep-project.org>
Fri, 01 Jun 2018 10:30:19 +0200
branch ENGINE-254
changeset 2723 7194b9c8599d
parent 1513 e7f7e42385b5
permissions -rw-r--r--
close branch
vb@457
     1
#! /usr/bin/env python3
vb@457
     2
vb@1513
     3
# This file is under GNU General Public License 3.0
vb@1513
     4
# see LICENSE.txt
vb@1513
     5
vb@1513
     6
vb@457
     7
from argparse import ArgumentParser
vb@457
     8
from fileinput import FileInput, hook_encoded
vb@457
     9
import re, itertools, sys
vb@457
    10
vb@457
    11
# math.log2 is not available on every Python version; fall back to a
# hand-written equivalent only when that import fails.
try:
    from math import log2
except ImportError:
    from math import log

    def log2(x):
        """Return the base-2 logarithm of x."""
        return log(x) / log(2)

# Leading token of a dictionary line: group(1) is everything up to the first
# "/" or the first whitespace character (or the end of the line).
word = re.compile(r"(\S*?)(/|\s.*|$)")
# Used with .match(): hits entries that start with a digit or that contain
# an apostrophe anywhere.
unwanted = re.compile(r"(^\d|[^']*')")
# Lines that start with whitespace (this includes empty lines).
space = re.compile(r'^\s')

# Command line interface: where the hunspell files live, which language to
# read, the file encoding, and whether to keep the full word list.
p = ArgumentParser(description="create dictionary csv out of hunspell data")
p.add_argument('--hunspell', '-H', type=str, default="/usr/share/hunspell",
    help='directory where hunspell dictionary files reside (default: /usr/share/hunspell)')
p.add_argument('--lang', '-l', type=str, default="en_US",
    help='use dictionary for language LANG (default: en_US)')
p.add_argument('--encoding', '-e', type=str, default="utf-8",
    help='file encoding (default: utf-8)')
p.add_argument('--full', '-f', action='store_true',
    help="full list - don't reduce to 65536 words")

# Parse sys.argv once; the rest of the script reads its options from `args`.
args = p.parse_args()

# Pick the upper-casing routine: ICU with the locale built from args.lang
# when PyICU can be imported, Python's built-in str.upper() otherwise.
try:
    from icu import UnicodeString, Locale
except ImportError:
    print("warning: PyICU not installed, using fallback", file=sys.stderr)

    def upper(x):
        """Return x in upper case using str.upper()."""
        return x.upper()
else:
    locale = Locale(args.lang)

    def upper(x):
        """Return x in upper case using ICU's toUpper with `locale`."""
        u = UnicodeString(x)
        return str(u.toUpper(locale))

# Read <hunspell dir>/<lang>.dic and collect the upper-cased leading token of
# every line that does not start with whitespace.  The file is opened through
# a context manager so that it is closed once the lines have been consumed.
with FileInput(
        args.hunspell + "/" + args.lang + ".dic",
        openhook=hook_encoded(args.encoding)
    ) as _dic:
    _all = (
        upper(word.match(line).group(1))
        for line in _dic
        if not space.match(line)
    )
    # Keep tokens longer than two characters that are not `unwanted`;
    # the set drops duplicates and sorted() gives the final ordering.
    _words = sorted({w for w in _all if len(w) > 2 and not unwanted.match(w)})

# Fit the word list to exactly 65536 entries (log2(65536) == 16 bit per word).
_WORDLIST_SIZE = 65536

if not args.full:
    # Halve oversized lists by keeping every second word until no more than
    # twice the target size is left.
    while len(_words) > _WORDLIST_SIZE * 2:
        _words = _words[::2]

if len(_words) > _WORDLIST_SIZE:
    if not args.full:
        _words = _words[:_WORDLIST_SIZE]
elif len(_words) < _WORDLIST_SIZE:
    if not _words:
        # Nothing to pad with, and log2(0) below would raise ValueError.
        sys.exit("error for {}: no usable words in dictionary".format(args.lang))
    sys.stderr.write(
            "warning for {}: only {:.2f} bit in wordlist, that makes {:.2f} bit for 5 words\n".format(
                    args.lang,
                    log2(len(_words)),
                    log2(len(_words))*5
                )
        )
    # Pad by repeating the list from its start as often as needed, so that
    # lists shorter than half the target size are filled up completely too.
    # NOTE: as before, padding is also applied when --full is given.
    _count = len(_words)
    _words.extend([_words[i % _count] for i in range(_WORDLIST_SIZE - _count)])

if not args.full:
    assert len(_words) == _WORDLIST_SIZE, "lenght is {}".format(len(_words))

# Emit one CSV row per word: first two characters of the language code,
# running index, the word, and a constant 0.
_lang_code = args.lang[:2]
for i, w in enumerate(_words):
    print("{l},{i},{w},0".format(l=_lang_code, i=i, w=w))