# Py4CL readbrown2.py.html

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
# encoding: utf-8


"""
readbrown2.py
(C) 2011 by Damir Cavar <dcavar@me.com>

See: Python 3 for (Computational and Corpus) Linguists
URL: http://damir.cavar.me/py4cl/

Process Brown corpus files.
Split each slash-annotated token into a tuple of token and
part-of-speech tag, count only the PoS tags, and generate a relative
frequency profile.
"""

import sys
from collections import Counter


def readBrownFile(filename, mycounters):
   """Count the part-of-speech tags of one Brown corpus file.

   The file is read as UTF-8 and split on whitespace into tokens of the
   form ``word/tag``.  The tag of every token is added to *mycounters*,
   which is updated in place.

   Parameters:
      filename: path of the corpus file to read.
      mycounters: a collections.Counter (or any object with a compatible
         ``update`` method) that accumulates the tag counts.

   Returns:
      None.  If the file cannot be opened or read (IOError), a message
      is printed and *mycounters* is left unchanged.
   """
   try:
      # The context manager closes the file even if read() fails.
      with open(filename, mode='r', encoding='utf8') as ifp:
         text = ifp.read()
   except IOError:
      print("Could not read file:", filename)
      return

   # Split on the LAST slash so that tokens which themselves contain a
   # slash (e.g. "1/2/cd") yield the real tag; tokens without any slash
   # carry no tag and are skipped instead of raising IndexError.
   parts = (token.rpartition("/") for token in text.split())
   mycounters.update(tag for _, sep, tag in parts if sep)

if __name__ == '__main__':
   # Accumulate PoS-tag counts over every file named on the command line.
   tag_counts = Counter()
   for path in sys.argv[1:]:
      readBrownFile(path, tag_counts)

   # Emit one "tag<TAB>relative frequency" line per tag, where the
   # relative frequency is the tag's count divided by the count of all tags.
   grand_total = sum(tag_counts.values())
   for tag, count in tag_counts.items():
      print(tag, count/grand_total, sep="\t")