db/dic2csv.py
changeset 121 dcefb3741425
parent 120 c282594b523b
child 1417 3507e785e538
     1.1 --- a/db/dic2csv.py	Mon Mar 23 21:42:45 2015 +0100
     1.2 +++ b/db/dic2csv.py	Mon Mar 23 23:08:26 2015 +0100
     1.3 @@ -21,6 +21,8 @@
     1.4      help='use dictionary for language LANG (default: en_US)')
     1.5  p.add_argument('--encoding', '-e', type=str, default="utf-8",
     1.6      help='file encoding (default: utf-8)')
     1.7 +p.add_argument('--full', '-f', action='store_true',
     1.8 +    help="full list - don't reduce to 65536 words")
     1.9  
    1.10  args = p.parse_args()
    1.11  
    1.12 @@ -48,11 +50,13 @@
    1.13  _words.sort()
    1.14  _words = [w for w, g in itertools.groupby(_words)]
    1.15  
    1.16 -while len(_words) > 65536 * 2:
    1.17 -    _words = _words[::2]
    1.18 +if not args.full:
    1.19 +    while len(_words) > 65536 * 2:
    1.20 +        _words = _words[::2]
    1.21  
    1.22  if len(_words) > 65536:
    1.23 -    _words = _words[:65536]
    1.24 +    if not args.full:
    1.25 +        _words = _words[:65536]
    1.26  elif len(_words) < 65536:
    1.27      sys.stderr.write(
    1.28              "warning for {}: only {:.2f} bit in wordlist, that makes {:.2f} bit for 5 words\n".format(
    1.29 @@ -63,7 +67,8 @@
    1.30          )
    1.31      _words.extend(_words[:65536-len(_words)])
    1.32  
    1.33 -assert len(_words) == 65536, "lenght is {}".format(len(_words))
    1.34 +if not args.full:
    1.35 +    assert len(_words) == 65536, "lenght is {}".format(len(_words))
    1.36  
    1.37  for i, w in enumerate(_words):
    1.38      print("{l},{i},{w},0".format(l=args.lang[:2], i=i, w=w))