# ~/Documents/Teaching/DGfS Herbstschule 2005/Code/Brown-ngram.py.html

#!/usr/bin/env python

"""
Brown-ngram.py
(C) 2005 by Damir Cavar

This code is free; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This is an implementation of a simple ngram generator for the Brown corpus format.

Each token is structured as follows:
word + "/" + part of speech
"""


import sys, glob, os.path, string
from ngram import Ngrams


def makeNgrams(ngrams, filename):
   """Add the word/tag tokens of one Brown-format file to ngrams.

   The file is read as whitespace-separated tokens of the form
   word + "/" + part of speech.  Each token is split at its LAST slash, so
   a word that itself contains a slash (e.g. "1/2/cd") keeps its slash and
   still yields a (word, tag) pair.  A token without any slash yields a
   one-element tuple.  The word is lower-cased; the tag is left untouched.

   Tokens whose word is empty or starts with a punctuation character are
   skipped.

   Parameters:
      ngrams   -- object providing addNgram(tuple); it is updated in place.
      filename -- path of the corpus file to read.

   Returns:
      The same ngrams object.  If the file cannot be read, a message is
      written to standard output and ngrams is returned unchanged.
   """
   try:
      # The with-block closes the file even when read() itself fails.
      with open(filename) as corpus:
         tokens = corpus.read().split()
   except IOError:
      # Same text the former print statement produced, written in a form
      # that is valid under both Python 2 and Python 3.
      sys.stdout.write("Cannot read from file %s\n" % filename)
      # Nothing was read, so there is nothing to add.
      return ngrams
   for token in tokens:
      parts = token.rsplit("/", 1)
      word = parts[0]
      # An empty word means the token started with "/"; treat it like any
      # other punctuation token instead of failing on word[0].
      if not word or word[0] in string.punctuation:
         continue
      parts[0] = word.lower()
      ngrams.addNgram(tuple(parts))
   return ngrams


if __name__ == "__main__":
   # Script entry point: load every file named on the command line into one
   # shared Ngrams collection, then print its frequency profile line by line.
   # All print calls take a single argument so that they produce the same
   # output under Python 2 and are also valid Python 3 syntax.
   # NOTE(review): the 2 passed to Ngrams is presumably the n-gram order and
   # the False passed to frequencyProfile presumably selects an output mode;
   # confirm both against the ngram module.
   myNgrams = Ngrams(2)
   if len(sys.argv) > 1:
      for pattern in sys.argv[1:]:
         # Each argument is treated as a glob pattern, so wildcards are
         # expanded here rather than relying on the shell.
         for path in glob.glob(os.path.normcase(pattern)):
            print("Loading file: %s" % path)
            myNgrams = makeNgrams(myNgrams, path)
      for entry in myNgrams.frequencyProfile(False):
         print(entry)
   else:
      print("Usage:")
      print("python Brown-ngram.py filename[s]")