#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
# encoding: utf-8


"""
parsetei2.py
(C) 2011 by Damir Cavar <dcavar@me.com>

See: Python 3 for (Computational and Corpus) Linguists
URL: http://www.cavar.me/damir/py4cl/


The general idea is taken from Dive into Python 3, Chapter on XML
http://diveintopython3.ep.io/xml.html

Opens XML files that are encoded in TEI and prints info about s-tags,
if present, and their content.
"""


import sys
import xml.etree.ElementTree as etree


def parseXML(file):
   """Print the sentences of one TEI file followed by its tag frequencies.

   Names that do not end in ".xml" are skipped without any output.
   Otherwise the file name is printed and the document is parsed. For
   every TEI ``p`` element, each TEI ``s`` element found below it is
   announced with its ``n`` attribute ("-" when absent) and passed to
   parseS together with a dict shared across the whole file. A blank line
   is printed after each paragraph. Finally every key of that dict is
   printed with its value divided by the sum of all values, separated by
   a tab.

   Args:
      file: Path of the file to process.
   """
   # Guard clause: only names with a literal ".xml" suffix are processed.
   if file[-4:] != ".xml":
      return

   print(file)
   tei = "{http://www.tei-c.org/ns/1.0}"
   document = etree.parse(file).getroot()

   tag_counts = {}
   for paragraph in document.iter(tei + "p"):
      for sentence in paragraph.iter(tei + "s"):
         print("Sentence number:", sentence.attrib.get("n", "-"))
         parseS(sentence, tag_counts)
      print("")

   # With no counted tokens the loop body never runs, so the division
   # below is never attempted with a zero total.
   token_total = sum(tag_counts.values())
   for tag, count in tag_counts.items():
      print(tag, count / token_total, sep="\t")


def parseS(root, frqp):
   """Print the tokens of one sentence element and count their types.

   Every direct child of *root* is treated as a token. Its stripped text
   and its stripped ``type`` attribute are printed on one shared line,
   joined by a backslash and followed by a space; the line is terminated
   after the last child. The count stored in *frqp* under the type is
   incremented for each child.

   Args:
      root: Element whose direct children are the tokens.
      frqp: Dict mapping a ``type`` attribute value to its count; updated
         in place. Children without a ``type`` attribute are counted
         under the empty string.
   """
   for w in root:
      # Element.text is None for an empty element such as <w/>; treat it
      # as an empty token instead of failing on None.strip().
      token = (w.text or "").strip()
      tag = w.attrib.get("type", "").strip()
      print(token, tag, sep="\\", end=" ")
      frqp[tag] = frqp.get(tag, 0) + 1
   print("")


if __name__ == '__main__':
   # Hand every command-line argument to parseXML in the order given.
   for path in sys.argv[1:]:
      parseXML(path)