Source code for uccaapp.upload_conllu_passages

#!/usr/bin/env python3
import sys

import argparse
import re
from glob import glob

from ucca.convert import to_json, from_text
from uccaapp.api import ServerAccessor

try:
    from simplejson.scanner import JSONDecodeError
except ImportError:
    from json.decoder import JSONDecodeError

desc = """Upload passages from CoNLL-U files including complete tokenization, and create annotation task for each"""


[docs]class ConlluPassageUploader(ServerAccessor): def __init__(self, user_id, annotation_user_id, source_id, project_id, **kwargs): super().__init__(**kwargs) self.set_source(source_id) self.set_project(project_id) self.set_user(user_id) self.annotation_user = dict(id=annotation_user_id) if annotation_user_id else self.user
[docs] def upload_passages(self, filenames, **kwargs): del kwargs for pattern in filenames: filenames = sorted(glob(pattern)) if not filenames: raise IOError("Not found: " + pattern) for filename in sorted(filenames): with open(filename, encoding="utf-8") as f: external_id = None tokens = [] try: for line in f: line = line.strip() m = re.match(r"^# sent_id = (.*)", line) if m: external_id = m.group(1) elif line: tokens.append(line.split("\t")[1]) else: self.upload_passage(external_id, tokens) external_id = None tokens = [] if tokens: self.upload_passage(external_id, tokens) except (IndexError, AssertionError) as e: raise ValueError(filename) from e
[docs] def upload_passage(self, external_id, tokens): assert external_id, "Missing external ID for passage %s" % tokens assert tokens, "Empty passage %s" % external_id passage_out = self.create_passage(text=" ".join(tokens), external_id=external_id, type="PUBLIC", source=self.source) task_in = dict(type="TOKENIZATION", status="SUBMITTED", project=self.project, user=self.user, passage=passage_out, manager_comment="External ID: "+external_id, user_comment="", parent=None, is_demo=False, is_active=True) tok_task_out = self.create_task(**task_in) tok_user_task_in = dict(tok_task_out) passage = list(from_text(tokens, tokenized=True))[0] tok_user_task_in.update(to_json(passage, return_dict=True, tok_task=True)) self.submit_task(**tok_user_task_in) task_in = dict(type="ANNOTATION", status="NOT_STARTED", project=self.project, user=self.annotation_user, passage=tok_task_out["passage"], manager_comment="External ID: "+external_id, user_comment=external_id, parent=tok_task_out, is_demo=False, is_active=True) self.create_task(**task_in) print("Uploaded passage "+external_id+" successfully")
[docs] @staticmethod def add_arguments(argparser): argparser.add_argument("filenames", nargs="+", help="filename pattern of CoNLL-U files") ServerAccessor.add_project_id_argument(argparser) ServerAccessor.add_source_id_argument(argparser) ServerAccessor.add_user_id_argument(argparser) argparser.add_argument("--annotation-user-id", type=int, help="user id for annotation tasks, if different") ServerAccessor.add_arguments(argparser)
[docs]def main(**kwargs): ConlluPassageUploader(**kwargs).upload_passages(**kwargs)
if __name__ == "__main__": argument_parser = argparse.ArgumentParser(description=desc) ConlluPassageUploader.add_arguments(argument_parser) main(**vars(argument_parser.parse_args())) sys.exit(0)