1 #! /usr/bin/env python3
3 from argparse import ArgumentParser
4 from fileinput import FileInput, hook_encoded
5 import re, itertools, sys
def log2(x):
    """Return the base-2 logarithm of *x*.

    Delegates to ``math.log2`` instead of dividing two natural logarithms:
    the quotient ``log(x) / log(2)`` picks up rounding error even for exact
    powers of two, while ``math.log2`` returns them exactly.

    Raises:
        ValueError: if *x* is not positive.
    """
    # Local import: this wrapper owns the module-level name ``log2``, and the
    # file's visible import block does not bring ``math`` into scope.
    import math

    return math.log2(x)
# Group 1 is the shortest run of non-whitespace characters that is followed
# by a "/", by whitespace, or by the end of the string (presumably "/" is the
# hunspell affix-flag separator -- confirm against the .dic format).
word = re.compile(r"(\S*?)(/|\s.*|$)")

# With .match(): hits entries that start with a digit or that contain an
# apostrophe anywhere.
unwanted = re.compile(r"(^\d|[^']*')")

# Hits lines whose first character is whitespace.
space = re.compile(r"^\s")
# Command-line interface: all three options are optional strings with
# defaults, so the script can run without any arguments.
p = ArgumentParser(description="create dictionary csv out of hunspell data")
p.add_argument(
    '--hunspell', '-H',
    help='directory where hunspell dictionary files reside (default: /usr/share/hunspell)',
    default="/usr/share/hunspell",
    type=str,
)
p.add_argument(
    '--lang', '-l',
    help='use dictionary for language LANG (default: en_US)',
    default="en_US",
    type=str,
)
p.add_argument(
    '--encoding', '-e',
    help='file encoding (default: utf-8)',
    default="utf-8",
    type=str,
)
28 word.match(line).group(1).upper()
29 for line in FileInput(
30 args.hunspell + "/" + args.lang + ".dic",
31 openhook=hook_encoded(args.encoding)
33 if not space.match(line)
# Build the candidate word list in one pass:
#   * drop entries of two characters or fewer and entries hit by `unwanted`;
#   * collapse runs of equal neighbours into a single entry (groupby only
#     merges adjacent duplicates, so this relies on the order of `_all`).
_words = [
    key
    for key, _run in itertools.groupby(
        entry
        for entry in _all
        if not (len(entry) <= 2 or unwanted.match(entry))
    )
]
39 if len(_words) > 65536:
40 _words = _words[:65536]
41 elif len(_words) < 65536:
43 "warning for {}: only {:.2f} bit in wordlist, that makes {:.2f} bit for 5 words\n".format(
49 _words.extend(_words[:65536-len(_words)])
# The word list must hold exactly 65536 entries before it is printed.
# Checked with an explicit raise rather than an `assert` statement so the
# check still runs under `python -O`; the exception type is unchanged.
if len(_words) != 65536:
    raise AssertionError("length is {}".format(len(_words)))
# Print one CSV row per word: <first two chars of --lang>,<index>,<word>,0
_lang_prefix = args.lang[:2]
for index, entry in enumerate(_words):
    row = "{l},{i},{w},0".format(l=_lang_prefix, i=index, w=entry)
    print(row)