Source code for scripts.site_pickle_to_standard
import argparse
import os
import pickle
from glob import glob
from xml.etree.ElementTree import Element
import ucca.convert
from ucca.ioutil import write_passage
desc = """Parses pickle files containing XML in UCCA site format, and convert to standard XML"""
[docs]def pickle_site2passage(filename):
"""Opens a pickle file containing XML in UCCA site format and returns its parsed Passage object"""
with open(filename, "rb") as h:
root = elem = pickle.load(h)
while isinstance(elem, list):
try:
elem = next(e for e in elem if isinstance(e, (Element, list)))
except StopIteration:
raise ValueError("Cannot parse %s" % root)
return ucca.convert.from_site(elem)
[docs]def main(args):
os.makedirs(args.out_dir, exist_ok=True)
exceptions = []
for pattern in args.filenames:
for filename in sorted(glob(pattern)) or [pattern]:
print("Reading '%s'..." % filename)
try:
passage = pickle_site2passage(filename)
write_passage(passage, outdir=args.out_dir, binary=args.binary, basename=os.path.basename(filename))
except ValueError as e:
exceptions.append((filename, e))
if exceptions:
for filename, e in exceptions:
print("'%s': %s" % (filename, e))
if __name__ == "__main__":
argparser = argparse.ArgumentParser(description=desc)
argparser.add_argument("filenames", nargs="*", help="pickle file names to convert")
argparser.add_argument("-o", "--out-dir", default=".", help="output directory")
argparser.add_argument("-b", "--binary", help="output binary pickle")
main(argparser.parse_args())