db/csv2csv.py
changeset 788 f75a6f866c38
child 1478 88dae00c8954
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/db/csv2csv.py	Sat Jul 02 16:53:53 2016 +0200
     1.3 @@ -0,0 +1,51 @@
     1.4 +#! /usr/bin/env python3
     1.5 +"""Re-write a CSV file, dropping rows whose third field is too long.
     1.6 +
     1.7 +Each kept row is printed to stdout as ``<field 0>,<index>,<field 2>,0``,
     1.8 +where ``<index>`` counts the kept rows from 0.
     1.9 +"""
    1.10 +
    1.11 +from argparse import ArgumentParser
    1.12 +from fileinput import FileInput, hook_encoded
    1.13 +import re, itertools, sys
    1.14 +
    1.15 +# Input lines that start with whitespace (blank lines included) are skipped.
    1.16 +space = re.compile(r'^\s')
    1.17 +
    1.18 +p = ArgumentParser(description="re-write re-order csv and strip lines with too long words")
    1.19 +# NOTE(review): the default below ends in ".cvs", presumably a typo for ".csv";
    1.20 +# left unchanged so the command-line behaviour stays the same.
    1.21 +p.add_argument('--input', '-i', type=str, default="somefile.cvs",
    1.22 +    help='input file')
    1.23 +p.add_argument('--length', '-l', type=int, default=100,
    1.24 +    help='min word length to stripp a line')
    1.25 +
    1.26 +args = p.parse_args()
    1.27 +
    1.28 +# NOTE(review): UnicodeString and locale are not referenced anywhere else in
    1.29 +# this view of the file; confirm before removing.
    1.30 +try:
    1.31 +    from icu import UnicodeString, Locale
    1.32 +except ImportError:
    1.33 +    print("warning: PyICU not installed, using fallback", file=sys.stderr)
    1.34 +else:
    1.35 +    locale = Locale("utf-8")
    1.36 +
    1.37 +# The context manager closes the input once every row has been written.
    1.38 +with FileInput(args.input, openhook=hook_encoded("utf-8")) as source:
    1.39 +    # Drop the line terminator before splitting so that it can never end up
    1.40 +    # inside the last field.
    1.41 +    _all = (
    1.42 +            line.rstrip('\r\n').split(',')
    1.43 +            for line in source
    1.44 +            if not space.match(line)
    1.45 +    )
    1.46 +
    1.47 +    # Rows with fewer than three fields have no word to measure or print.
    1.48 +    _some = (
    1.49 +            row for row in _all
    1.50 +            if len(row) > 2 and len(row[2]) < args.length
    1.51 +    )
    1.52 +
    1.53 +    for i, w in enumerate(_some):
    1.54 +        print("{l},{i},{w},0".format(l=w[0], i=i, w=w[2]))