~/Documents/Teaching/DGfS Herbstschule 2005/Code/freq6.py.html

#! /usr/bin/env python
# -*- coding: utf8 -*-

"""
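freq6.py
(C) 2005 by Damir Cavar <dcavar@indiana.edu>
GNU General Public License

Functionality: Counts the words in the text files given on the command line,
skipping English function words, and writes each word's relative frequency
to log.txt.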
"""

import sys, os.path, glob, string, codecs, re
from operator import itemgetter

# Reserved dictionary key under which the running total of counted tokens is
# stored alongside the per-word counts.
countername = u"__count__"

# English function words (stop words) that are excluded from the frequency
# count. Written as one whitespace-separated block and split into a list, so
# adding or removing a word does not require quoting or commas.
functionWordsEN = """
   a abroad about above across after again
   against ago ahead all almost alongside already
   also although always am amid amidst among amongst
   an and another any anybody anyone anything anywhere
   apart are aren't around as aside at away back
   backward backwards be because been before beforehand
   behind being below between beyond both but by
   can can't cannot could couldn't dare daren't despite
   did didn't directly do does doesn't doing don't
   done down during each either else elsewhere enough
   even ever evermore every everybody everyone everything
   everywhere except fairly farther few fewer for
   forever forward from further furthermore had hadn't
   half hardly has hasn't i have haven't having he
   hence her here hers herself him himself his how
   however if in indeed inner inside instead into
   is isn't it its itself just keep kept later
   least less lest like likewise little low lower
   many may mayn't me might mightn't mine minus
   moreover most much must mustn't my myself near
   need needn't neither never nevertheless next no
   no-one nobody none nor not nothing notwithstanding
   now nowhere of off often on once one ones
   only onto opposite or other others otherwise ought
   oughtn't our ours ourselves out outside over own
   past per perhaps please plus provided quite rather
   really round same self selves several shall shan't
   she should shouldn't since so some somebody someday
   someone something sometimes somewhat still such that
   than the their theirs them themselves then there
   therefore these they thing things this those though
   through throughout thus till to together too towards
   under underneath undoing unless unlike until up upon
   upwards us versus very via was wasn't way we
   well were weren't what whatever when whence whenever
   where whereby wherein wherever whether which whichever
   while whilst whither who whoever whom whose why
   will with within without won't would wouldn't yet
   you your yours yourself yourselves thou thee thy
""".split()


def countWords(words, filename):
   """Add the word counts of one UTF-8 text file to ``words``.

   The file is tokenized with a regular expression, every token is
   lower-cased, and tokens listed in ``functionWordsEN`` are skipped.

   Parameters:
      words    -- dictionary mapping word -> count; it is updated in place.
                  The entry under ``countername`` holds the total number of
                  counted (non-function-word) tokens so far.
      filename -- path of the file to read.

   Returns the same ``words`` dictionary.

   If the file cannot be opened or read (IOError), a message is printed to
   standard output and ``words`` is returned with its counts unchanged.
   """
   count = words.get(countername, 0)

   # Build the lookup set once per call: membership tests against the list
   # would rescan all function words for every single token.
   stopwords = frozenset(functionWordsEN)

   # The pattern contains no backslashes, so a plain u"" literal has the same
   # value as the former ur"" literal and also parses on Python 3.
   # NOTE(review): at most ONE letter is matched after an apostrophe, so
   # "don't" stays whole but "we'll" is split into "we'l" and "l" -- confirm
   # whether that is intended before changing the tokenization.
   pattern = u"[A-Za-zčČćĆšŠžŽđĐ]+'?[A-Za-zčČćĆšŠžŽđĐ]?"

   try:
      # The with-block closes the handle even if read() fails part-way.
      with codecs.open(filename, "r", "utf8") as infile:
         text = infile.read()
   except IOError:
      # Same text as the former print statement, valid on Python 2 and 3.
      sys.stdout.write("Cannot read from file: %s\n" % (filename,))
   else:
      for token in re.findall(pattern, text):
         token = token.lower()
         if token not in stopwords:
            words[token] = words.get(token, 0) + 1
            count += 1

   words[countername] = count
   return words


if __name__ == "__main__":
   # Accumulate counts over every file matched by the command-line arguments;
   # each argument is treated as a glob pattern.
   words = {}
   for x in sys.argv[1:]:
      for y in glob.glob(os.path.normcase(x)):
         words = countWords(words, y)

   # Total number of counted tokens; the default of 1 only applies when no
   # file was processed at all.
   count = words.get(countername, 1)

   # (word, count) pairs, most frequent first.
   wordsort = sorted(words.items(), key=itemgetter(1), reverse=True)

   try:
      # The with-block closes log.txt even if one of the writes fails.
      with codecs.open("log.txt", "w", "utf8") as logfile:
         logfile.write("word\tfrequency\n")
         for word, freq in wordsort:
            # The total lives in the same dictionary; it is not a word.
            if word != countername:
               # Relative frequency = word count / total counted tokens.
               logfile.write(word + "\t" + str(float(freq)/float(count)) + "\n")
   except IOError:
      # Same text as the former print statement, valid on Python 2 and 3.
      sys.stdout.write("Output error.\n")