Source code for scripts.join_passages

#!/usr/bin/env python3

import argparse
import os
import sys
from collections import defaultdict

import ucca.convert
from ucca.ioutil import passage2file, get_passages

# Program description handed to argparse.ArgumentParser(description=...) in the
# command-line entry point of this script.
desc = """Parses XML/pickle files in UCCA standard format, and writes a single passage.
"""


def main(args):
    """Join loaded passages into one passage per group and write each to a file.

    All passages are loaded from ``args.filenames``. Without
    ``args.join_by_prefix`` they form a single group whose ID is the ID of
    the first passage; with it, passages are grouped by their ID minus the
    last 3 characters. Each group is handed to ``ucca.convert.join_passages``
    and the result is written to
    ``<outdir>/<prefix><joined ID>.<pickle|xml>``.

    :param args: parsed command-line namespace providing ``filenames``,
                 ``outdir``, ``prefix``, ``remarks``, ``binary`` and
                 ``join_by_prefix``
    :raises IndexError: if no passages were loaded and ``join_by_prefix``
                        is not set (there is no first passage to name the
                        group after)
    """
    os.makedirs(args.outdir, exist_ok=True)
    passages = list(get_passages(args.filenames))
    if args.join_by_prefix:
        # Group passages whose IDs share all but the last 3 characters.
        subsets = defaultdict(list)
        for passage in passages:
            subsets[passage.ID[:-3]].append(passage)
    else:
        subsets = {passages[0].ID: passages}
    # The output format does not change between groups, so decide it once.
    extension = "pickle" if args.binary else "xml"
    for passage_id, subset in sorted(subsets.items()):
        print("Joining passages " + ", ".join(passage.ID for passage in subset), file=sys.stderr)
        # Join only the current group; passing the full `passages` list here
        # would put every input passage into every output file.
        joined = ucca.convert.join_passages(subset, passage_id=passage_id, remarks=args.remarks)
        outfile = "%s/%s.%s" % (args.outdir, args.prefix + joined.ID, extension)
        print("Writing joined passage file '%s'..." % outfile, file=sys.stderr)
        passage2file(joined, outfile, binary=args.binary)
if __name__ == "__main__":
    # Command-line entry point: build the parser, then hand the parsed
    # namespace to main().
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("filenames", nargs="+", help="passage file names to join")

    # Options that take a value: (short flag, long flag, default, help).
    for short_flag, long_flag, default_value, help_text in (
        ("-o", "--outdir", ".", "output directory"),
        ("-p", "--prefix", "", "output filename prefix"),
    ):
        parser.add_argument(short_flag, long_flag, default=default_value, help=help_text)

    # Boolean switches: (short flag, long flag, help).
    for short_flag, long_flag, help_text in (
        ("-r", "--remarks", "annotate original IDs"),
        ("-b", "--binary", "write in pickle binary format (.pickle)"),
        ("-j", "--join-by-prefix",
         "join each set of passages whose IDs share all but the last 3 characters"),
    ):
        parser.add_argument(short_flag, long_flag, action="store_true", help=help_text)

    main(parser.parse_args())