db/csv2csv.py
author Krista Grothoff <krista@pep-project.org>
Mon, 24 Oct 2016 21:33:35 +0200
branch ENGINE-128
changeset 1334 85973d7d2859
parent 788 f75a6f866c38
child 1478 88dae00c8954
permissions -rw-r--r--
ENGINE-128: closing branch
Edouard@788
     1
#! /usr/bin/env python3
"""Re-number the rows of a comma-separated word list, dropping long words.

Reads the file given by --input as UTF-8, ignores every line that starts
with whitespace (this includes blank lines), and writes one line per
remaining row to stdout in the form

    <field 0>,<new index>,<field 2>,0

Rows whose third field is --length characters or longer are left out; the
new index counts only the rows that are actually written.
"""

from argparse import ArgumentParser
from fileinput import FileInput, hook_encoded
import re, itertools, sys

# Matches a line whose first character is whitespace. A blank line ("\n")
# matches as well, so blank lines are skipped together with indented ones.
space = re.compile(r'^\s')

p = ArgumentParser(description="re-write re-order csv and strip lines with too long words")
p.add_argument('--input', '-i', type=str, default="somefile.cvs",
    help='input file')
p.add_argument('--length', '-l', type=int, default=100,
    help='min word length to stripp a line')

args = p.parse_args()

# PyICU is optional: a missing module only produces a warning on stderr.
# NOTE(review): neither UnicodeString nor `locale` is used by the code
# visible here -- confirm whether anything else depends on them.
try:
    from icu import UnicodeString, Locale
except ImportError:
    print("warning: PyICU not installed, using fallback", file=sys.stderr)
else:
    locale = Locale("utf-8")

# The context manager closes the input file once every row has been written.
with FileInput(args.input, openhook=hook_encoded("utf-8")) as source:
    # Lazily split each accepted line into its comma-separated fields.
    # The line terminator is removed first so that it cannot end up inside
    # the third field when a row has exactly three fields.
    _all = (
            line.rstrip('\r\n').split(',')
            for line in source
            if not space.match(line)
    )

    # Keep only rows whose word (third field) is shorter than --length.
    # A generator expression used as an assignment value must be wrapped in
    # parentheses. A row with fewer than three fields raises IndexError here.
    _some = (row for row in _all if len(row[2]) < args.length)

    # Enumerate the *filtered* rows so that the length limit takes effect and
    # the emitted indices stay consecutive.
    for i, w in enumerate(_some):
        print("{l},{i},{w},0".format(l=w[0], i=i, w=w[2]))