vb@2
|
1 |
#! /usr/bin/env python3
|
vb@2
|
2 |
|
vb@1517
|
3 |
# This file is under GNU General Public License 3.0
|
vb@1517
|
4 |
# see LICENSE.txt
|
vb@1517
|
5 |
|
vb@1517
|
6 |
|
vb@2
|
7 |
from argparse import ArgumentParser
|
vb@2
|
8 |
from fileinput import FileInput, hook_encoded
|
vb@2
|
9 |
import re, itertools, sys
|
vb@2
|
10 |
|
vb@2
|
11 |
try:
    from math import log2
except ImportError:
    # math.log2 is missing on old interpreters; emulate it with the natural
    # logarithm.  Only ImportError is caught so that unrelated problems
    # (KeyboardInterrupt, SystemExit, ...) are not silently swallowed.
    from math import log

    def log2(x):
        """Return the base-2 logarithm of x."""
        return log(x) / log(2)
|
vb@2
|
16 |
|
vb@2
|
17 |
# Patterns used while reading the dictionary file.  All of them are applied
# with .match(), i.e. anchored at the start of the string.

# Group 1 captures the leading run of non-whitespace characters up to (not
# including) the first '/', the first whitespace character or the end of the
# line.  Presumably the part after '/' holds hunspell flags - it is discarded.
word = re.compile(r"(\S*?)(/|\s.*|$)")

# Matches entries that must not end up in the word list: anything starting
# with a digit, and anything containing an apostrophe.
unwanted = re.compile(r"(^\d|[^']*')")

# Matches lines that begin with a whitespace character.
space = re.compile(r'^\s')
|
vb@2
|
20 |
|
vb@2
|
21 |
# Command line interface of the script.
p = ArgumentParser(description="create dictionary csv out of hunspell data")

# Directory that contains the <lang>.dic files.
p.add_argument(
    '--hunspell', '-H',
    type=str,
    default="/usr/share/hunspell",
    help='directory where hunspell dictionary files reside (default: /usr/share/hunspell)',
)

# Dictionary / locale name, e.g. en_US.
p.add_argument(
    '--lang', '-l',
    type=str,
    default="en_US",
    help='use dictionary for language LANG (default: en_US)',
)

# Text encoding used to decode the dictionary file.
p.add_argument(
    '--encoding', '-e',
    type=str,
    default="utf-8",
    help='file encoding (default: utf-8)',
)

# Boolean switch: shrink an oversized word list down to 65536 entries.
p.add_argument(
    '--cut', '-c',
    action='store_true',
    help="cut list - reduce to 65536 words",
)
|
vb@2
|
30 |
|
vb@2
|
31 |
# Parse the command line once, at module level; the resulting namespace
# (hunspell, lang, encoding, cut) is read by the code further down.
args = p.parse_args()
|
vb@2
|
32 |
|
vb@120
|
33 |
# Pick the implementation of upper(): PyICU with the locale derived from
# --lang when the package is importable, Python's built-in case mapping
# otherwise.
try:
    from icu import UnicodeString, Locale
except ImportError:
    print("warning: PyICU not installed, using fallback", file=sys.stderr)

    def upper(x):
        """Return x converted to upper case via its own upper() method."""
        return x.upper()
else:
    locale = Locale(args.lang)

    def upper(x):
        """Return x converted to upper case by ICU for the selected locale."""
        return str(UnicodeString(x).toUpper(locale))
|
vb@120
|
44 |
|
vb@2
|
45 |
# Read <hunspell dir>/<lang>.dic and build the sorted list of unique,
# upper-cased words.
#
# FileInput is used as a context manager so the dictionary file is closed as
# soon as it has been read instead of staying open until garbage collection.
with FileInput(
    args.hunspell + "/" + args.lang + ".dic",
    openhook=hook_encoded(args.encoding),
) as _dic:
    # Lazily yields the upper-cased first field of every line that does not
    # start with whitespace.
    _all = (
        upper(word.match(line).group(1))
        for line in _dic
        if not space.match(line)
    )
    # Keep only entries longer than two characters that are not rejected by
    # `unwanted`; the set removes duplicates, sorted() restores the ordering.
    # The generator must be consumed here, while the file is still open.
    _words = sorted({w for w in _all if len(w) > 2 and not unwanted.match(w)})
|
vb@2
|
56 |
|
vb@1417
|
57 |
# Bring the word list to the target size of 65536 entries.
#   * more words than that: only reduced when --cut was given
#   * fewer words than that: always padded by repeating the list
_WORDLIST_SIZE = 65536

if args.cut:
    # Drop every second word until at most twice the target size is left.
    # Presumably this keeps the surviving words spread over the whole sorted
    # list before the hard truncation below.
    while len(_words) > _WORDLIST_SIZE * 2:
        _words = _words[::2]

if len(_words) > _WORDLIST_SIZE:
    if args.cut:
        _words = _words[:_WORDLIST_SIZE]
elif len(_words) < _WORDLIST_SIZE:
    # NOTE: an empty word list makes log2() raise ValueError here.
    sys.stderr.write(
        "warning for {}: only {:.2f} bit in wordlist, that makes {:.2f} bit for 5 words\n".format(
            args.lang,
            log2(len(_words)),
            log2(len(_words))*5
        )
    )
    # Pad with the list's own words, starting again from the beginning as
    # often as necessary.  A single slice of the list (the previous approach)
    # can at most double it, so lists with fewer than 32768 words never
    # reached the target size.  The tuple() snapshot and the missing count
    # are both evaluated before extend() starts mutating the list.
    _words.extend(
        itertools.islice(
            itertools.cycle(tuple(_words)),
            _WORDLIST_SIZE - len(_words),
        )
    )

if args.cut:
    # With --cut the list must have exactly the target size at this point.
    assert len(_words) == _WORDLIST_SIZE, "lenght is {}".format(len(_words))
|
vb@2
|
76 |
|
vb@2
|
77 |
# Emit one CSV row per word on stdout:
#   <first two characters of --lang>,<running index>,<word>,0
_lang_code = args.lang[:2]
for _index, _entry in enumerate(_words):
    print("{l},{i},{w},0".format(l=_lang_code, i=_index, w=_entry))
|