db/dic2csv.py
author vb
Wed, 25 Jun 2014 18:48:18 +0200
changeset 2 f1fc3afacfd6
child 4 140c907a66be
permissions -rw-r--r--
extra files for db and testing
vb@2
     1
#! /usr/bin/env python3
"""Create a dictionary CSV out of hunspell data.

Reads ``<hunspell dir>/<lang>.dic``, extracts the upper-cased base word of
every entry, drops short or unwanted entries, and prints exactly
``WORDLIST_SIZE`` rows of the form ``<lang prefix>,<index>,<word>`` to
standard output.
"""

from argparse import ArgumentParser
from fileinput import FileInput, hook_encoded
import itertools
import re
import sys

try:
    from math import log2
except ImportError:
    # Only a missing math.log2 should trigger the fallback; anything else
    # (e.g. KeyboardInterrupt) must propagate.
    from math import log

    def log2(x):
        """Return the base-2 logarithm of *x*."""
        return log(x) / log(2)

# Number of rows the generated word list must contain (2**16 entries).
WORDLIST_SIZE = 65536

# Group 1 is the leading run of non-whitespace up to the first "/" (or up to
# the first whitespace / end of line when there is no "/").
word = re.compile(r"(\S*?)(/|\s.*|$)")
# Matches words that start with a digit or contain an apostrophe.
unwanted = re.compile(r"(^\d|[^']*')")
# Matches lines that start with whitespace (including blank lines).
space = re.compile(r'^\s')


def extract_words(lines):
    """Return the sorted, de-duplicated, upper-cased words found in *lines*.

    Lines starting with whitespace are skipped.  Words of two characters or
    fewer, words starting with a digit and words containing an apostrophe
    are dropped.
    """
    candidates = (
        word.match(line).group(1).upper()
        for line in lines
        if not space.match(line)
    )
    return sorted(
        {w for w in candidates if len(w) > 2 and not unwanted.match(w)}
    )


def fit_to_size(words, size=WORDLIST_SIZE):
    """Return a new list of exactly *size* entries built from *words*.

    A list that is too long is truncated.  A list that is too short is
    padded by repeating its entries from the start, cycling as often as
    needed, so even a list shorter than ``size / 2`` reaches *size*.

    Raises:
        ValueError: if *words* is empty and *size* is positive, because
            there is nothing to pad with.
    """
    if len(words) >= size:
        return words[:size]
    if not words:
        raise ValueError("cannot pad an empty word list")
    missing = size - len(words)
    return words + list(itertools.islice(itertools.cycle(words), missing))


def main(argv=None):
    """Parse the command line and print the dictionary CSV to stdout."""
    p = ArgumentParser(description="create dictionary csv out of hunspell data")
    p.add_argument('--hunspell', '-H', type=str, default="/usr/share/hunspell",
        help='directory where hunspell dictionary files reside (default: /usr/share/hunspell)')
    p.add_argument('--lang', '-l', type=str, default="en_US",
        help='use dictionary for language LANG (default: en_US)')
    p.add_argument('--encoding', '-e', type=str, default="utf-8",
        help='file encoding (default: utf-8)')
    args = p.parse_args(argv)

    dic_path = args.hunspell + "/" + args.lang + ".dic"
    try:
        # The context manager closes the dictionary file once it is read.
        with FileInput(dic_path, openhook=hook_encoded(args.encoding)) as lines:
            words = extract_words(lines)
    except OSError as err:
        sys.exit("error: cannot read {}: {}".format(dic_path, err))

    if not words:
        # log2(0) is undefined and there is nothing to pad with.
        sys.exit("error: no usable words found in {}".format(dic_path))

    if len(words) < WORDLIST_SIZE:
        sys.stderr.write(
            "warning for {}: only {:.2f} bit in wordlist, that makes {:.2f} bit for 5 words\n".format(
                args.lang,
                log2(len(words)),
                log2(len(words)) * 5
            )
        )

    words = fit_to_size(words, WORDLIST_SIZE)
    assert len(words) == WORDLIST_SIZE, "length is {}".format(len(words))

    for i, w in enumerate(words):
        print("{l},{i},{w}".format(l=args.lang[:2], i=i, w=w))


if __name__ == "__main__":
    main()