Source code for scripts.statistics
#!/usr/bin/env python3
import argparse
from collections import Counter
import pandas as pd
from ucca import layer0, layer1
from ucca.ioutil import get_passages_with_progress_bar
desc = """Prints statistics on UCCA passages"""
[docs]def main(args):
df = pd.DataFrame(index=args.directories, columns=["sentences", "tokens", "nodes", "discontinuous", "reentrant",
"implicit", "edges", "primary", "remote"])
df.fillna(0, inplace=True)
for i, directory in enumerate(args.directories):
row = df.loc[directory]
for passage in get_passages_with_progress_bar(directory, desc=directory):
l1 = passage.layer(layer1.LAYER_ID)
non_terminals = [n for n in l1.all if n not in l1.heads and len(n.get_terminals()) > 1]
edges = {e for n in non_terminals for e in n}
remote_counter = Counter(e.attrib.get("remote", False) for e in edges)
row["sentences"] += 1
row["tokens"] += len(passage.layer(layer0.LAYER_ID).all)
row["nodes"] += len(non_terminals)
row["discontinuous"] += sum(1 for n in non_terminals if n.discontiguous)
row["reentrant"] += sum(1 for n in non_terminals if any(e.attrib.get("remote") for e in n.incoming))
row["edges"] += len(edges)
row["primary"] += remote_counter[False]
row["remote"] += remote_counter[True]
row["implicit"] += sum(1 for n in l1.all if n.attrib.get("implicit"))
# Change to percentages
df["discontinuous"] *= 100. / df["nodes"]
df["reentrant"] *= 100. / df["nodes"]
df["implicit"] *= 100. / df["nodes"]
df["primary"] *= 100. / df["edges"]
df["remote"] *= 100. / df["edges"]
# Print
if args.outfile:
df.T.to_csv(args.outfile, float_format="%.2f", sep="&", line_terminator=" \\\\\n")
print("Saved to " + args.outfile)
else:
with pd.option_context("display.max_rows", None, "display.max_columns", None):
print(df.T)
if __name__ == '__main__':
argparser = argparse.ArgumentParser(description=desc)
argparser.add_argument("directories", nargs="+", help="directories to process")
argparser.add_argument("-o", "--outfile", help="output file for statistics")
main(argparser.parse_args())