Source code for scripts.convert_2_0_to_1_2

import csv
import os
from argparse import ArgumentParser

from ucca import layer1
from ucca.ioutil import get_passages_with_progress_bar, write_passage
from ucca.normalization import destroy, copy_edge

desc = """Convert the English Wiki corpus from version 2.0 to 1.2"""


def replace_time_and_quantifier(edge):
    """Retag a Time or Quantifier edge, flattening its parent when redundant.

    An edge tagged ``Time`` or ``Quantifier`` is retagged as ``Adverbial``
    when its parent reports ``is_scene()``, and as ``Elaborator`` otherwise.
    Afterwards, if the parent has exactly one parent of its own and the
    parent's first incoming edge carries the same tag the edge was just given,
    everything the parent iterates over (presumably its outgoing edges) is
    copied onto the grandparent with ``copy_edge`` and the parent is destroyed.

    :param edge: the edge to inspect; its ``tag`` may be modified in place
    :return: True if the edge's tag was replaced, False if the edge was left alone
    """
    if edge.tag not in (layer1.EdgeTags.Time, layer1.EdgeTags.Quantifier):
        return False

    unit = edge.parent
    if unit.is_scene():
        edge.tag = layer1.EdgeTags.Adverbial
    else:
        edge.tag = layer1.EdgeTags.Elaborator

    # The comparison uses the *new* tag: the parent is only dissolved when it
    # hangs from its single parent by an edge of the same category.
    if len(unit.parents) == 1 and unit.incoming[0].tag == edge.tag:
        grandparent = unit.parents[0]
        for child_edge in unit:
            copy_edge(child_edge, parent=grandparent)
        destroy(unit)
    return True
# Conversion rules that convert_passage applies, in order, to every edge of a passage.
# Each rule is called with a single edge and returns a truthy value if it changed something.
RULES = (replace_time_and_quantifier,)
def convert_passage(passage, report_writer):
    """Apply every rule in ``RULES`` to each edge of the passage, logging changes.

    For each rule, all nodes of the passage's ``layer1.LAYER_ID`` layer are
    visited and the rule is called on every edge the node iterates over.
    Whenever a rule returns a truthy value, one row is written to the report.

    :param passage: the passage to convert; it is modified in place by the rules
    :param report_writer: object with a ``writerow`` method (e.g. a ``csv.writer``)
        receiving ``(rule name, passage ID, edge, parent before, parent after)``
    """
    for rule in RULES:
        for node in passage.layer(layer1.LAYER_ID).all:
            for edge in node:
                parent = edge.parent
                # Capture the parent's string form now, since the rule may alter it.
                before = str(parent)
                if not rule(edge):
                    continue
                row = (rule.__name__, passage.ID, edge, before, parent)
                report_writer.writerow(row)
def main(args):
    """Convert all given passages, write them out, and log each change to a CSV report.

    :param args: parsed command-line arguments providing ``passages``, ``outdir``,
        ``prefix``, ``outfile`` and ``verbose``
    """
    os.makedirs(args.outdir, exist_ok=True)
    header = ("rule", "passage", "edge", "before", "after")
    with open(args.outfile, "w", encoding="utf-8", newline="") as report_file:
        report = csv.writer(report_file)
        report.writerow(header)
        passages = get_passages_with_progress_bar(args.passages, desc="Converting")
        for passage in passages:
            convert_passage(passage, report_writer=report)
            write_passage(
                passage,
                outdir=args.outdir,
                prefix=args.prefix,
                verbose=args.verbose,
            )
            # Flush after every passage so the report on disk keeps up with progress.
            report_file.flush()
    print("Wrote '%s'" % args.outfile)
if __name__ == "__main__":
    argparser = ArgumentParser(description=desc)
    # The default report path is the parser's program name with its extension
    # replaced by ".csv".
    default_outfile = os.path.splitext(argparser.prog)[0] + ".csv"

    argparser.add_argument(
        "passages",
        nargs="+",
        help="the corpus, given as xml/pickle file names",
    )
    argparser.add_argument(
        "-o", "--outdir",
        default=".",
        help="output directory",
    )
    argparser.add_argument(
        "-p", "--prefix",
        default="",
        help="output filename prefix",
    )
    argparser.add_argument(
        "-O", "--outfile",
        default=default_outfile,
        help="log file",
    )
    argparser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="print more information",
    )

    main(argparser.parse_args())