Source code for scripts.site_to_standard
import argparse
import os
import sqlite3
from glob import glob
from xml.etree.ElementTree import ElementTree, fromstring
import ucca.convert
from ucca.ioutil import write_passage
# Help text shown by the command-line parser (passed as its description).
desc = """Parses an XML in UCCA site format.
The input can be given as either an XML file or a DB file with passage ID
and user name, and the output is either the standard format XML or
a pickled object.
Possible input methods are using a DB file with pid and user, which gets the
annotation of the specified user for the specified passage from the DB file,
or using filenames of a site-formatted XML file.
"""
def site2passage(filename):
    """Read a site-format XML file and convert it with ucca.convert.from_site."""
    with open(filename, encoding="utf-8") as xml_file:
        # Announce the file only once it has been opened successfully.
        print("Reading '%s'..." % filename)
        root = ElementTree().parse(xml_file)
        return ucca.convert.from_site(root)
def db2passage(handle, pid, user):
    """Gets the annotation of user to pid from the DB handle - returns a passage

    :param handle: DB cursor on which the queries are executed
    :param pid: passage ID, matched against the ``paid`` column of ``xmls``
    :param user: user name, matched against the ``username`` column of ``users``
    :return: result of ucca.convert.from_site on the stored XML
    :raises ValueError: if the user is not in the DB, or has no annotation
        stored for the passage
    """
    handle.execute("SELECT id FROM users WHERE username=?", (user,))
    user_row = handle.fetchone()
    if user_row is None:
        # fetchone() returns None when no row matched; fail with a clear message
        # instead of an opaque "NoneType is not subscriptable".
        raise ValueError("User '%s' not found in DB" % user)
    uid = user_row[0]
    # Rows are ordered by descending ts, so the first one has the highest ts.
    handle.execute("SELECT xml FROM xmls WHERE paid=? AND uid=? ORDER BY ts DESC", (pid, uid))
    xml_row = handle.fetchone()
    if xml_row is None:
        raise ValueError("No annotation of passage %s by user '%s' found in DB" % (pid, user))
    return ucca.convert.from_site(fromstring(xml_row[0]))
def _expand_patterns(patterns):
    """Yield the file names matched by each glob pattern, in sorted order per pattern."""
    for pattern in patterns:
        # A pattern that matches nothing is passed through as a literal name,
        # so a missing file is reported when it is opened rather than skipped.
        for filename in sorted(glob(pattern)) or [pattern]:
            yield filename


def main(args):
    """Convert the requested passages and write each one to args.out_dir.

    If args.filenames is non-empty, every matching site-format XML file is
    converted; otherwise the passages listed in args.pids are fetched from
    the DB file args.db for args.user. With neither, nothing is written.

    :param args: parsed command-line arguments (filenames, db, pids, user,
        out_dir, binary)
    """
    os.makedirs(args.out_dir, exist_ok=True)
    if args.filenames:
        for filename in _expand_patterns(args.filenames):
            write_passage(site2passage(filename), outdir=args.out_dir, binary=args.binary)
    elif args.pids:
        # Open the DB once for all passage IDs and always close it afterwards.
        connection = sqlite3.connect(args.db)
        try:
            cursor = connection.cursor()
            for pid in args.pids:
                write_passage(db2passage(cursor, pid, args.user),
                              outdir=args.out_dir, binary=args.binary)
        finally:
            connection.close()
def check_illegal_combinations(args):
    """Report inconsistent DB options through the parser; return args unchanged.

    The DB file, the passage IDs and the user name must be given together:
    a DB file needs both of the others, and neither of the others may be
    given without a DB file.
    """
    if args.db:
        if not args.pids or not args.user:
            argparser.error("Must specify a username and a passage ID when using DB file option")
    elif args.pids or args.user:
        argparser.error("Cannot use user and passage ID options without DB file")
    return args
if __name__ == "__main__":
    # Kept at module level under this name: check_illegal_combinations reports
    # errors through the global `argparser`.
    argparser = argparse.ArgumentParser(description=desc)

    # Input from site-format XML files.
    argparser.add_argument("filenames", nargs="*", help="XML file name to convert")

    # Input from a DB file; these three options are validated together.
    argparser.add_argument("-d", "--db", help="DB file to get input from")
    argparser.add_argument("-p", "--pids", nargs="*", type=int, help="PassageIDs to query DB")
    argparser.add_argument("-u", "--user", help="Username to DB query")

    # Output options.
    argparser.add_argument("-o", "--out-dir", default=".", help="output directory for standard XML")
    argparser.add_argument("-b", "--binary", help="output file for binary pickle")

    main(check_illegal_combinations(argparser.parse_args()))