#!/usr/bin/env python3
import argparse
import os
import re
from glob import glob
from tqdm import tqdm
from ucca.convert import to_text
from ucca.ioutil import file2passage, get_passages_with_progress_bar
desc = """Parses files in UCCA standard format, and writes as text files or a text file with a line per passage."""
[docs]def numeric(x):
try:
return tuple(map(int, re.findall("\d+", x)))
except ValueError:
return x
[docs]def write_text(passage, f, sentences, lang, prepend_id=False):
for line in to_text(passage, sentences=sentences, lang=lang):
fields = [passage.ID, line] if prepend_id else [line]
print(*fields, file=f, sep="\t")
[docs]def main(args):
os.makedirs(args.outdir, exist_ok=True)
if args.join:
out_file = os.path.join(args.outdir, args.join)
with open(out_file, "w", encoding="utf-8") as f:
for passage in get_passages_with_progress_bar(sorted(args.filenames, key=numeric), desc="Converting"):
write_text(passage, f, sentences=args.sentences, lang=args.lang, prepend_id=args.prepend_id)
print("Wrote '%s'." % out_file)
else: # one file per passage
for pattern in args.filenames:
for filename in tqdm(glob(pattern) or [pattern], desc="Converting", unit=" passages"):
passage = file2passage(filename)
basename = os.path.splitext(os.path.basename(filename))[0]
with open(os.path.join(args.outdir, basename + ".txt"), "w", encoding="utf-8") as f:
write_text(passage, f, sentences=args.sentences, lang=args.lang, prepend_id=args.prepend_id)
if __name__ == "__main__":
argparser = argparse.ArgumentParser(description=desc)
argparser.add_argument("filenames", nargs="+", help="passage file names to convert")
argparser.add_argument("-o", "--outdir", default=".", help="output directory")
argparser.add_argument("-s", "--sentences", action="store_true", help="split to sentences using spaCy")
argparser.add_argument("-l", "--lang", default="en", help="language two-letter code for sentence model")
argparser.add_argument("-j", "--join", help="write just one text file with this name, with one line per passage")
argparser.add_argument("-p", "--prepend-id", action="store_true", help="prepend the passage ID to the output text")
main(argparser.parse_args())