#! /usr/bin/env python # -*- coding: utf-8 -*- """ lidtrainer.py (C) 2005 by Damir Cavar <dcavar@indiana.edu> License: This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. Functionality: Lidtrainer processes all the files given as parameters to the script in the following way: It extracts all tri-grams from all files. It keeps track of the frequencies of single tri-grams over all documents. It prints the sorted list (based on frequency/probability) of the tri-grams to the screen. The output can be piped to a file. This file represents the language model for Lid. Read about Lid to understand how this algorithm works. Please send your comments and suggestions! """ __version__ = 0.2 __author__ = "Damir Cavar" import sys, re, os.path, glob from string import * class Trigrams: trigrams = {} # tri-grams are stored in a dictionary num = 0 # number of tri-grams characters = 0 # number of characters def createTrigrams(self, text): """Creates trigrams from characters.""" text = re.sub(r"\n", r" ", text) text = re.sub(r"\s+", r" ", text) self.characters = self.characters + len(text) # go thru list up to one but last word and take # the actual word and the following word together for i in range(len(text) - 2): self.num += 1 self.trigrams[text[i:i+3]] = self.trigrams.get(text[i:i+3], 0) + 1 def calcProb(self): """Calculate the probabilities for each trigram.""" for x in self.trigrams.keys(): self.trigrams[x] = float(self.trigrams[x]) / float(self.num) def eliminateFrequences(self, num): """Eliminates all bigrams with a frequency <= num""" for x in self.trigrams.keys(): if self.trigrams[x] <= num: value = self.trigrams[x] del self.trigrams[x] self.num -= value def createTrigramNSC(self, text): """Creates bigrams without punctuation symbols.""" self.createTrigrams(self.cleanTextSC(text)) def cleanTextSC(self, text): """Eliminates punctuation symbols from the submitted text.""" for i in punctuation: if i in text: text = replace(text, i, " ") return text def cleanPBIG(self): """Eliminate tri-grams that contain punctuation marks.""" for i in self.trigrams.keys(): for a in punctuation: if a in i: value = self.trigrams[i] del self.trigrams[i] self.num -= value break if __name__ == "__main__": myTrigrams = Trigrams() if len(sys.argv) > 1: for x in sys.argv[1:]: for y in glob.glob(os.path.normcase(x)): try: myTrigrams.createTrigrams(myTrigrams.cleanTextSC(open(y).read())) except IOError: pass myTrigrams.eliminateFrequences(2) myTrigrams.calcProb() pairs = zip(myTrigrams.trigrams.values(), myTrigrams.trigrams.keys()) pairs.sort() pairs.reverse() for i in pairs: print i[1], i[0] else: print "Usage:" print "python lidtrainer.py [document1] ..."