# Py4CL readbrown.py.html

#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
# encoding: utf-8


"""
readbrown.py
(C) 2011 by Damir Cavar <dcavar@me.com>

See: Python 3 for (Computational and Corpus) Linguists
URL: http://www.cavar.me/damir/py4cl/

Process Brown corpus files.
Split each slash-annotated token into a tuple of token and
part-of-speech, count these tuples, and generate
a relative frequency profile.
"""


import sys
from collections import Counter


def readBrownFile(filename, mycounters):
   """Count the token/part-of-speech tuples of one Brown corpus file.

   The file is read as UTF-8 and split on whitespace. Each token is split
   at its last slash, so "dog/nn" is counted as ("dog", "nn") and a token
   that itself contains slashes, such as "and/or/cc", is counted as
   ("and/or", "cc"). A token without any slash is counted as a 1-tuple.

   Parameters:
      filename: path of the file to read.
      mycounters: Counter that is updated in place with the tuples.

   Returns:
      None. If the file cannot be opened or read (IOError), a message is
      printed and mycounters is left unchanged.

   Raises:
      UnicodeDecodeError: if the file content is not valid UTF-8; only
      IOError is handled here.
   """
   try:
      # The with-statement closes the file even when read() raises.
      with open(filename, mode='r', encoding='utf8') as ifp:
         tokens = ifp.read().split()
   except IOError:
      print("Could not read file:", filename)
      return
   # rsplit with maxsplit=1 keeps slashes that belong to the token itself,
   # so only the trailing annotation is separated from the word form.
   mycounters.update(tuple(token.rsplit("/", 1)) for token in tokens)


if __name__ == '__main__':
   # Accumulate the tuple counts of every file named on the command line
   # into a single Counter.
   pair_counts = Counter()
   for path in sys.argv[1:]:
      readBrownFile(path, pair_counts)

   # Print one tab-separated line per counted tuple: the tuple itself and
   # its count divided by the total number of counted tokens.
   n_tokens = sum(pair_counts.values())
   for pair, count in pair_counts.items():
      print(pair, count / n_tokens, sep="\t")