#! /usr/bin/env python
# -*- coding: utf8 -*-

"""
freq4.py
(C) 2005 by Damir Cavar <dcavar@indiana.edu>
GNU General Public License

Functionality: Counting words

Usage: freq4.py FILE_OR_PATTERN [FILE_OR_PATTERN ...]

Every argument is expanded with glob; the whitespace-separated,
lower-cased tokens of all matching UTF-8 files are counted, and the relative
frequency of each token is written to "log.txt" in the current directory.
"""

import codecs
import glob
import os.path
import string  # no longer used here; kept in case other code relies on it
import sys
from operator import itemgetter


def countWords(words, filename):
    """Counts words in file and returns dictionary.

    words    -- dictionary mapping token -> absolute count; it is updated
                in place. The reserved key "__count__" holds the running
                total of all tokens counted so far.
    filename -- path of a UTF-8 encoded text file.

    Returns the same `words` dictionary. If the file cannot be opened or
    read (IOError), a message is printed and only "__count__" is
    (re)written with its previous value.

    NOTE: a literal token "__count__" in the input shares the key used
    for the running total, so its own frequency is overwritten.
    """
    count = words.get("__count__", 0)
    try:
        # The context manager closes the file even if reading fails.
        with codecs.open(filename, "r", "utf8") as infile:
            # split() without arguments already discards all surrounding
            # whitespace, so the tokens only need lower-casing.
            tokens = [token.lower() for token in infile.read().split()]
        for token in tokens:
            words[token] = words.get(token, 0) + 1
        count += len(tokens)
    except IOError:
        # Single-argument print() produces identical output on Python 2 and 3.
        print("Cannot read from file: %s" % filename)
    words["__count__"] = count
    return words


def _writeFrequencies(words, logname="log.txt"):
    """Writes a tab-separated table of relative frequencies to `logname`.

    Rows are ordered by descending absolute count; the bookkeeping entry
    "__count__" is used as the divisor and is not written as a row.
    """
    # Items sorted by value
    # The keyword argument `key` allows easy selection of sorting criteria
    # (an in-place words.items().sort(key=..., reverse=True) on a list works
    # the same way).
    wordsort = sorted(words.items(), key=itemgetter(1), reverse=True)
    count = words.get("__count__", 1)
    try:
        with codecs.open(logname, "w", "utf8") as outfile:
            outfile.write("word\tfrequency\n")
            for word, freq in wordsort:
                if word != "__count__":
                    outfile.write(
                        word + "\t" + str(float(freq) / float(count)) + "\n")
    except IOError:
        print("Output error.")


def main(argv=None):
    """Counts the words of all files named in `argv` and writes log.txt.

    argv -- list of file names or glob patterns; defaults to sys.argv[1:].
    """
    if argv is None:
        argv = sys.argv[1:]
    words = {}
    for pattern in argv:
        for path in glob.glob(os.path.normcase(pattern)):
            words = countWords(words, path)
    _writeFrequencies(words)


if __name__ == "__main__":
    main()