#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
ngram.py
(C) 2005 by Damir Cavar

ngram class

License:
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2 of the License, or (at your
option) any later version.

Description:
Generates frequency profiles over ngrams.
Serializes ngram models to files.
"""

import os.path
import pickle
import sys
from operator import itemgetter


class Ngrams:
    """Ngram class for counting ngrams and storing ngram models.

    All data lives in the public dictionary ``self.ngrams``. It maps each
    ngram to its absolute count and additionally holds two bookkeeping
    entries: ``"__count__"`` (total number of ngram tokens added) and
    ``"__n__"`` (the ngram length this model accepts).
    """

    # Bookkeeping keys that share the dictionary with the real ngrams and
    # therefore must never be treated as ngrams themselves.
    _RESERVED = ("__count__", "__n__")

    def __init__(self, n=2):
        """Create an empty model for ngrams of length n."""
        self.ngrams = {}
        self.ngrams["__count__"] = 0
        self.ngrams["__n__"] = n
        # Cache for relativeFrequencyProfile(); rebuilt only when the
        # counts were modified since the last computation.
        self.__ngramrel = {}
        self.__changed = False

    def _counts(self):
        """Return the (ngram, count) pairs without the bookkeeping keys."""
        return [(gram, count) for gram, count in self.ngrams.items()
                if gram not in self._RESERVED]

    def getNgramFrequency(self, ngram):
        """Returns the absolute frequency of an ngram (0 if unknown)."""
        if ngram in self._RESERVED:
            # Never leak the bookkeeping values as if they were counts.
            return 0
        return self.ngrams.get(ngram, 0)

    def getNgramRelativeFrequency(self, ngram):
        """Returns the relative frequency of an ngram.

        This is the absolute frequency divided by the number of tokens,
        or 0.0 if the model is empty.
        """
        total = self.ngrams["__count__"]
        if total > 0:
            return float(self.getNgramFrequency(ngram)) / float(total)
        return 0.0

    def getNumberTokens(self):
        """Return number of tokens."""
        return self.ngrams["__count__"]

    def getNumberTypes(self):
        """Return number of types (distinct ngrams)."""
        # Subtract the two bookkeeping entries.
        return len(self.ngrams) - len(self._RESERVED)

    def addNgram(self, ngram):
        """Adds an ngram to the collection.

        Ngrams whose length differs from the model's n are silently
        ignored, as are the reserved bookkeeping keys.
        """
        if ngram in self._RESERVED:
            return
        if len(ngram) == self.ngrams["__n__"]:
            self.ngrams[ngram] = self.ngrams.get(ngram, 0) + 1
            self.ngrams["__count__"] += 1
            self.__changed = True

    def removeNgram(self, ngram):
        """Removes one occurrence of an ngram from the collection by
        decreasing its counter. If the counter equals 0 after decreasing,
        the ngram is removed from the collection.

        Unknown ngrams and the reserved bookkeeping keys are ignored.
        """
        if ngram in self._RESERVED or ngram not in self.ngrams:
            return
        if self.ngrams[ngram] > 1:
            self.ngrams[ngram] -= 1
        else:
            del self.ngrams[ngram]
        # Only an ngram that was actually present reduces the token count.
        self.ngrams["__count__"] -= 1
        self.__changed = True

    def frequencyProfile(self, increasing=True):
        """Returns the frequency profile of the ngram items.

        The result is a list of (ngram, count) tuples. If increasing is
        true the list is sorted by increasing count, otherwise by
        decreasing count.
        """
        return sorted(self._counts(), key=itemgetter(1),
                      reverse=not increasing)

    def relativeFrequencyProfile(self, increasing=True):
        """Returns the relative frequency profile of the ngram items.

        The result is a list of (ngram, relative frequency) tuples. If
        increasing is true the list is sorted by increasing frequency,
        otherwise by decreasing frequency.
        """
        if self.__changed:
            # Counts were modified since the cache was built: recompute.
            self.__ngramrel = dict(
                (gram, self.getNgramRelativeFrequency(gram))
                for gram, _ in self._counts())
            self.__changed = False
        return sorted(self.__ngramrel.items(), key=itemgetter(1),
                      reverse=not increasing)

    def getMostFrequent(self, ngram=None):
        """Returns the most frequent ngram as an (ngram, count) tuple.

        The ngram parameter is unused and kept only for backward
        compatibility. Raises IndexError if the model is empty.
        """
        return self.frequencyProfile()[-1]

    def getLeastFrequent(self, ngram=None):
        """Returns the least frequent ngram as an (ngram, count) tuple.

        The ngram parameter is unused and kept only for backward
        compatibility. Raises IndexError if the model is empty.
        """
        return self.frequencyProfile()[0]

    def serialize(self, filename="ngrams"):
        """Dump the ngram model to a file.

        With the default filename the model is written to
        "ngrams<n>.p". Errors are reported on stdout, not raised.
        """
        try:
            if filename == "ngrams":
                filename = filename + str(self.ngrams["__n__"]) + ".p"
            # Pickle data is binary; the context manager closes the file
            # even if dumping fails.
            with open(filename, "wb") as handle:
                pickle.dump(self.ngrams, handle)
        except Exception as e:
            print("Exception %s" % e)

    def deSerialize(self, filename="ngrams"):
        """Read ngram model from filename.

        With the default filename the model is read from "ngrams<n>.p".
        A missing file leaves the model untouched. Errors are reported
        on stdout, not raised.
        """
        try:
            if filename == "ngrams":
                filename = filename + str(self.ngrams["__n__"]) + ".p"
            if os.path.exists(filename):
                # NOTE(security): unpickling can execute arbitrary code;
                # only load model files from trusted sources.
                with open(filename, "rb") as handle:
                    loaded = pickle.load(handle)
                if not isinstance(loaded, dict) or \
                        not all(key in loaded for key in self._RESERVED):
                    raise ValueError("%s is not an ngram model" % filename)
                self.ngrams = loaded
                # The cached relative frequencies are stale now.
                self.__changed = True
        except Exception as e:
            print("Exception %s" % e)