# Source code for scripts.site_to_text
#! /usr/bin/python3
import argparse
import pickle
from xml.etree.ElementTree import ElementTree, fromstring
import psycopg2
import ucca.convert
# Command-line description shown by argparse in the --help output.
desc = """Parses an XML in UCCA site format.
The input can be given as either an XML file or a DB with passage ID
and user name, and the output is either plain text or
a pickled object.
Possible input methods are using a DB with pid and user, which gets the
annotation of the specified user for the specified passage from the DB,
or using filename of a site-formatted XML file.
"""
def site2passage(filename):
    """Parse a site-format XML file and convert it to a passage.

    :param filename: path of the XML file; it is read as UTF-8 text
    :return: whatever ucca.convert.from_site builds from the root element
    """
    tree = ElementTree()
    with open(filename, encoding="utf-8") as xml_file:
        # ElementTree.parse() returns the root element of the document
        root = tree.parse(xml_file)
    return ucca.convert.from_site(root)
def db2passage(handle, pid, user):
    """Gets the annotation of user to pid from the DB handle - returns a passage

    :param handle: DB cursor-like object providing execute() and fetchone()
    :param pid: passage ID, matched against the ``paid`` column of ``xmls``
    :param user: username, matched against ``username`` in ``users``
    :return: the result of ucca.convert.from_site on the newest matching XML
    :raise ValueError: if the user is unknown, or has no annotation for pid
    """
    handle.execute("SET search_path to oabend")
    handle.execute("SELECT id FROM users WHERE username=%s", (user,))
    user_row = handle.fetchone()
    if user_row is None:
        # fetchone() yields None on an empty result; fail with a clear message
        # instead of an opaque "NoneType is not subscriptable" error
        raise ValueError("No user named {!r} in the DB".format(user))
    uid = user_row[0]
    # Rows are ordered newest-first, so the first row is the latest annotation
    handle.execute("SELECT xml,ts FROM xmls WHERE paid=%s AND uid=%s " +
                   "ORDER BY ts DESC", (pid, uid))
    xml_row = handle.fetchone()
    if xml_row is None:
        raise ValueError("No annotation by user {!r} for passage ID {!r} "
                         "in the DB".format(user, pid))
    raw_xml = xml_row[0]  # the timestamp column is only needed for ordering
    return ucca.convert.from_site(fromstring(raw_xml))
def _usage_error(message):
    """Report a command-line usage error and terminate.

    Uses the module-level ``argparser`` (created only when the file runs as a
    script) if it exists; otherwise exits with the message, so that calling
    main() from an importing module does not fail with a NameError.
    """
    parser = globals().get("argparser")
    if parser is None:
        raise SystemExit(message)
    parser.error(message)  # prints usage and exits; never returns


def main(args):
    """Convert one site-format annotation to text or to a pickled passage.

    :param args: parsed command-line namespace with the attributes
                 filename, db, host, pid, user, binary, outfile and lang
    Exits with a usage error when the combination of input options is illegal.
    The passage is read either from the XML file or from the DB, then pickled
    to args.binary if given, otherwise converted with ucca.convert.to_text and
    written to args.outfile, or printed when no output file is given.
    """
    # Checking for illegal combinations
    if args.db and args.filename:
        _usage_error("Only one source, XML or DB file, can be used")
    if (not args.db) and (not args.filename):
        _usage_error("Must specify one source, XML or DB file")
    if args.db and not (args.pid and args.user):
        _usage_error("Must specify a username and a passage ID when " +
                     "using DB file option")
    if (args.pid or args.user) and not args.db:
        _usage_error("Cannot use user and passage ID options without DB file")

    if args.filename:
        passage = site2passage(args.filename)
    else:
        conn = psycopg2.connect(host=args.host, database=args.db)
        try:
            c = conn.cursor()
            passage = db2passage(c, args.pid, args.user)
        finally:
            # Always release the DB connection, even if the lookup fails
            conn.close()

    if args.binary:
        with open(args.binary, "wb") as binf:
            pickle.dump(passage, binf)
    else:
        # NOTE(review): write() below assumes to_text returns a single str;
        # confirm against ucca.convert.to_text's actual return type.
        output = ucca.convert.to_text(passage, lang=args.lang)
        if args.outfile:
            with open(args.outfile, "w", encoding="utf-8") as outf:
                outf.write(output)
        else:
            print(output)
if __name__ == "__main__":
    # The parser is kept in the module-level name `argparser` because main()
    # reports illegal option combinations through it.
    argparser = argparse.ArgumentParser(description=desc)
    argparser.add_argument("filename", nargs="?", help="XML file name to convert")
    argparser.add_argument("-o", "--outfile", help="output file for text")
    argparser.add_argument("-b", "--binary", help="output file for binary pickle")
    argparser.add_argument("-d", "--db", help="DB name to get input from")
    argparser.add_argument("--host", help="DB host server to get input from")
    argparser.add_argument("-p", "--pid", type=int, help="PassageID to query DB")
    argparser.add_argument("-u", "--user", help="Username to DB query")
    argparser.add_argument("-l", "--lang", default="en", help="language two-letter code for sentence model")
    main(argparser.parse_args())