db/dic2csv.py
author Dirk Zimmermann <dz@pep.security>
Tue, 19 Nov 2019 15:37:54 +0100
branchsync
changeset 4209 bcb1587bf92f
parent 1513 e7f7e42385b5
permissions -rw-r--r--
IOS-1663 Xcode: Update generated file list
     1 #! /usr/bin/env python3
     2 
     3 # This file is under GNU General Public License 3.0
     4 # see LICENSE.txt
     5 
     6 
     7 from argparse import ArgumentParser
     8 from fileinput import FileInput, hook_encoded
     9 import re, itertools, sys
    10 
    11 try:
    12     from math import log2
    13 except:
    14     from math import log
    15     def log2(x): return log(x) / log(2)
    16 
    17 word = re.compile(r"(\S*?)(/|\s.*|$)")
    18 unwanted = re.compile(r"(^\d|[^']*')")
    19 space = re.compile(r'^\s')
    20 
    21 p = ArgumentParser(description="create dictionary csv out of hunspell data")
    22 p.add_argument('--hunspell', '-H', type=str, default="/usr/share/hunspell",
    23     help='directory where hunspell dictionary files reside (default: /usr/share/hunspell)')
    24 p.add_argument('--lang', '-l', type=str, default="en_US",
    25     help='use dictionary for language LANG (default: en_US)')
    26 p.add_argument('--encoding', '-e', type=str, default="utf-8",
    27     help='file encoding (default: utf-8)')
    28 p.add_argument('--cut', '-c', action='store_true',
    29     help="cut list - reduce to 65536 words")
    30 
    31 args = p.parse_args()
    32 
    33 try:
    34     from icu import UnicodeString, Locale
    35 except ImportError:
    36     print("warning: PyICU not installed, using fallback", file=sys.stderr)
    37     def upper(x):
    38         return x.upper();
    39 else:
    40     locale = Locale(args.lang)
    41     def upper(x):
    42         u = UnicodeString(x)
    43         return str(u.toUpper(locale))
    44 
    45 _all = (
    46     upper(word.match(line).group(1))
    47         for line in FileInput(
    48                 args.hunspell + "/" + args.lang + ".dic",
    49                 openhook=hook_encoded(args.encoding)
    50             )
    51         if not space.match(line)
    52 )
    53 _words = [w for w in _all if len(w) > 2 and not unwanted.match(w)]
    54 _words.sort()
    55 _words = [w for w, g in itertools.groupby(_words)]
    56 
    57 if args.cut:
    58     while len(_words) > 65536 * 2:
    59         _words = _words[::2]
    60 
    61 if len(_words) > 65536:
    62     if args.cut:
    63         _words = _words[:65536]
    64 elif len(_words) < 65536:
    65     sys.stderr.write(
    66             "warning for {}: only {:.2f} bit in wordlist, that makes {:.2f} bit for 5 words\n".format(
    67                     args.lang,
    68                     log2(len(_words)),
    69                     log2(len(_words))*5
    70                 )
    71         )
    72     _words.extend(_words[:65536-len(_words)])
    73 
    74 if args.cut:
    75     assert len(_words) == 65536, "lenght is {}".format(len(_words))
    76 
    77 for i, w in enumerate(_words):
    78     print("{l},{i},{w},0".format(l=args.lang[:2], i=i, w=w))