#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
# encoding: utf-8

"""
(C) 2011 by Damir Cavar <dcavar@me.com>

See: Python 3 for (Computational and Corpus) Linguists
URL: http://www.cavar.me/damir/py4cl/

The general idea is taken from Dive into Python 3, chapter on XML.

Opens XML files that are encoded in TEI and prints information about
the s-tags, if present, and their content.
"""

import sys
import xml.etree.ElementTree as etree

def parseXML(file):
   """Print the TEI sentences of one XML file and its tag frequencies.

   Names that do not end in ".xml" are ignored without any output.
   For every TEI ``s`` element found below a TEI ``p`` element the
   value of its ``n`` attribute is printed ("-" when absent) and the
   element is handed to ``parseS``, which prints its children and
   counts their ``type`` values.  Afterwards each counted value is
   printed with its share of the overall count, separated by a tab.

   Args:
      file: Name of the file to read.

   Returns:
      None.
   """
   # Guard clause: only names with the ".xml" suffix are processed.
   if file[-4:] != ".xml":
      return

   document = etree.parse(file).getroot()
   counts = {}

   # Walk paragraphs in document order, then the sentences inside each.
   for paragraph in document.iter("{http://www.tei-c.org/ns/1.0}p"):
      for sentence in paragraph.iter("{http://www.tei-c.org/ns/1.0}s"):
         print("Sentence number:", sentence.attrib.get("n", "-"))
         parseS(sentence, counts)

   # An empty table skips the loop, so the division never sees a zero.
   grand_total = sum(counts.values())
   for label, count in counts.items():
      print(label, count / grand_total, sep="\t")

def parseS(root, frqp):
   """Print the children of one element and count their ``type`` values.

   Each direct child of ``root`` is printed as ``token\\tag`` followed
   by a single space, where ``token`` is the child's stripped text and
   ``tag`` is its stripped ``type`` attribute (empty string when the
   attribute is absent).  The count for ``tag`` in ``frqp`` is then
   increased by one.

   Args:
      root: Element whose direct children are processed.
      frqp: Dictionary mapping tag to count; updated in place.

   Returns:
      None.
   """
   for w in root:
      # Element.text is None for an empty element or one that starts
      # with a child element; treat that as an empty token, not a crash.
      token = (w.text or "").strip()
      tag = w.attrib.get("type", "").strip()
      # NOTE(review): no newline is written after the last token, so the
      # caller's next print continues on the same line -- confirm intent.
      print(token, tag, sep="\\", end=" ")
      frqp[tag] = frqp.get(tag, 0) + 1

if __name__ == '__main__':
   for i in sys.argv[1:]: