"""
parsetei2.py
(C) 2011 by Damir Cavar <dcavar@me.com>
See: Python 3 for (Computational and Corpus) Linguists
URL: http://www.cavar.me/damir/py4cl/
The general idea is taken from Dive into Python 3, Chapter on XML
http://diveintopython3.ep.io/xml.html
Opens XML files that are encoded in TEI and prints info about s-tags,
if present, and their content.
"""
import sys
import xml.etree.ElementTree as etree
def parseXML(file):
    """Print the tagged sentences of a TEI file and its tag frequencies.

    Paths that do not end in ``.xml`` are skipped silently. For any other
    path the name is printed, the document is parsed, and every TEI ``s``
    element located below a TEI ``p`` element is announced with its ``n``
    attribute (``-`` when absent) and handed to ``parseS``. Afterwards each
    tag collected by ``parseS`` is printed with its relative frequency,
    separated by a tab.

    Errors raised while opening or parsing the file are not caught here.

    Args:
        file: Path of the file to process.
    """
    if not file.endswith(".xml"):
        return
    print(file)
    root = etree.parse(file).getroot()
    tei = "{http://www.tei-c.org/ns/1.0}"
    frqp = {}
    # iter() walks all descendants, so a sentence inside a paragraph that is
    # itself nested in another paragraph is reachable from both of them.
    # Remember what was already handled so it is printed and counted once.
    seen = set()
    for par in root.iter(tei + "p"):
        for sent in par.iter(tei + "s"):
            if id(sent) in seen:
                continue
            seen.add(id(sent))
            print("Sentence number:", sent.attrib.get("n", "-"))
            parseS(sent, frqp)
            print("")
    total = sum(frqp.values())
    # With no tags at all the loop body never runs, so total == 0 is safe.
    for tag, count in frqp.items():
        print(tag, count / total, sep="\t")
def parseS(root, frqp):
    """Print the tokens of one sentence and count their tags.

    Every direct child of *root* is treated as a token element. Its stripped
    text and the stripped value of its ``type`` attribute (empty string when
    the attribute is missing) are printed as a pair joined by a backslash,
    all pairs on one line, and the counter for that tag in *frqp* is
    incremented.

    Args:
        root: Element whose direct children are the token elements.
        frqp: Dictionary mapping tag to count; updated in place.
    """
    for w in root:
        # An empty element such as <w type="X"/> has text None; treat it as
        # an empty token instead of failing on None.strip().
        token = (w.text or "").strip()
        tag = w.attrib.get("type", "").strip()
        print(token, tag, sep="\\", end=" ")
        frqp[tag] = frqp.get(tag, 0) + 1
    # Terminate the line of token/tag pairs.
    print("")
if __name__ == "__main__":
    # Every command-line argument is treated as a path to process.
    for path in sys.argv[1:]:
        parseXML(path)