"""
tokens2Ngram.py
Version 0.2
(C) 2010-2012 by Damir Cavar (http://cavar.me/damir/)
Simple ngram model generation algorithm.
Requires as input a token list generated by the tokenizer.py script.
Generates a CSV frequency profile of N-grams, where the N-gram can be
a string or itemized. It can also generate a tab-delimited N-gram
frequency profile, with the N-gram being a string and space-delimited,
or itemized and tab-delimited. The N-gram model can be filtered:
a. all N-grams that contain stop-words, and
b. all N-grams that contain punctuation marks can be eliminated
from the output.
"""
import sys, os, os.path, glob, getopt, codecs, re
"""
Fix output for Windows and Komodo Edit...
if you have problems with the encoding of the output when
piping it on Windows to some file, try to set the system
variable PYTHONIOENCODING to utf-8 (also possible solution
on Mac or Linux. Or try by uncommenting the following line
"""
stpwdict = set( ("a","a's","able","about","above","according","accordingly",
"across","actually","after","afterwards","again","against",
"ain't","all","allow","allows","almost","alone","along",
"already","also","although","always","am","among","amongst",
"an","and","another","any","anybody","anyhow","anyone",
"anything","anyway","anyways","anywhere","apart","appear",
"appreciate","appropriate","are","aren't","around","as",
"aside","ask","asking","associated","at","available","away",
"awfully","be","became","because","become","becomes",
"becoming","been","before","beforehand","behind","being",
"believe","below","beside","besides","best","better",
"between","beyond","both","brief","but","by","c'mon",
"c's","came","can","can't","cannot","cant","cause",
"causes","certain","certainly","changes","clearly",
"co","com","come","comes","concerning","consequently",
"consider","considering","contain","containing","contains",
"corresponding","could","couldn't","course","currently",
"definitely","described","despite","did","didn't",
"different","do","does","doesn't","doing","don't","done",
"down","downwards","during","each","edu","eg","eight",
"either","else","elsewhere","enough","entirely","especially",
"et","etc","even","ever","every","everybody","everyone",
"everything","everywhere","ex","exactly","example","except",
"far","few","fifth","first","five","followed","following",
"follows","for","former","formerly","forth","four","from",
"further","furthermore","get","gets","getting","given",
"gives","go","goes","going","gone","got","gotten",
"greetings","had","hadn't","happens","hardly","has",
"hasn't","have","haven't","having","he","he's","hello",
"help","hence","her","here","here's","hereafter","hereby",
"herein","hereupon","hers","herself","hi","him","himself",
"his","hither","hopefully","how","howbeit","however","i'd",
"s","i","it","I","It","The","A","i'll","i'm","i've","ie","if",
"ignored","immediate","in",
"inasmuch","inc","indeed","indicate","indicated","indicates",
"inner","insofar","instead","into","inward","is","isn't","it",
"it'd","it'll","it's","its","itself","just","keep","keeps",
"kept","know","known","knows","last","lately","later","latter",
"latterly","least","less","lest","let","let's","like","liked",
"likely","little","look","looking","looks","ltd","mainly",
"many","may","maybe","me","mean","meanwhile","merely","might",
"more","moreover","most","mostly","much","must","my","myself",
"name","namely","nd","near","nearly","necessary","need",
"needs","neither","never","nevertheless","new","next","nine",
"no","nobody","non","none","noone","nor","normally","not",
"nothing","novel","now","nowhere","obviously","of","off",
"often","oh","ok","okay","old","on","once","one","ones",
"only","onto","or","other","others","otherwise","ought",
"our","ours","ourselves","out","outside","over","overall",
"own","particular","particularly","per","perhaps","placed",
"please","plus","possible","presumably","probably",
"provides","que","quite","qv","rather","rd","re",
"really","reasonably","regarding","regardless","regards",
"relatively","respectively","right","said","same","saw",
"say","saying","says","second","secondly","see","seeing",
"seem","seemed","seeming","seems","seen","self","selves",
"sensible","sent","serious","seriously","seven","several",
"shall","she","should","shouldn't","since","six","so",
"some","somebody","somehow","someone","something",
"sometime","sometimes","somewhat","somewhere","soon",
"sorry","specified","specify","specifying","still","sub",
"such","sup","sure","t's","take","taken","tell","tends",
"th","than","thank","thanks","thanx","that","that's","thats",
"the","their","theirs","them","themselves","then","thence",
"there","there's","thereafter","thereby","therefore","therein",
"theres","thereupon","these","they","they'd","they'll",
"they're","they've","think","third","this","thorough","thoroughly",
"those","though","three","through","throughout","thru","thus",
"to","together","too","took","toward","towards","tried","tries",
"truly","try","trying","twice","two","un","under","unfortunately",
"unless","unlikely","until","unto","up","upon","us","use","used",
"useful","uses","using","usually","value","various","very","via",
"viz","vs","want","wants","was","wasn't","way","we","we'd","we'll",
"we're","we've","welcome","well","went","were","weren't","what",
"what's","whatever","when","whence","whenever","where","where's",
"whereafter","whereas","whereby","wherein","whereupon","wherever",
"whether","which","while","whither","who","who's","whoever",
"whole","whom","whose","why","will","willing","wish","with",
"within","without","won't","wonder","would","wouldn't","yes",
"yet","you","you'd","you'll","you're","you've","your","yours",
"yourself","yourselves","zero") )
regnonw = re.compile("\W+")
def main(fname, model, n):
    """Accumulate N-gram counts from a token file into *model*.

    fname -- path to a whitespace-separated token file (read as UTF-8)
    model -- dict mapping N-gram tuples to integer counts, updated in place
    n     -- length of the N-grams to count

    Silently returns if *fname* is not a regular file; prints a message and
    returns if the file cannot be read.
    """
    if not os.path.isfile(fname):
        return
    try:
        # with-statement closes the file even if reading fails part-way
        with open(fname, mode="r", encoding="utf-8") as inStream:
            tokens = tuple(inStream.read().split())
    except IOError:
        print("Cannot read from file:", fname)
        # bug fix: the original fell through here and hit a NameError on
        # the never-assigned 'tokens' below
        return
    # slide a window of length n over the token sequence
    for i in range(len(tokens) - (n - 1)):
        ngram = tokens[i:i + n]
        model[ngram] = model.get(ngram, 0) + 1
def prettyPrint(model, outputstring, tabdelimited, ltokens, rtokens, ctokens, stopwords, punctuation):
    """Print the N-gram model to stdout as specified in the parameters.

    model        -- dict mapping N-gram tuples to integer counts
    outputstring -- True: N-gram printed as one space-delimited string;
                    False: tokens itemized individually
    tabdelimited -- True: tab-delimited lines; False: CSV with CRLF endings
    ltokens      -- if non-empty, keep only N-grams whose first token is listed
    rtokens      -- if non-empty, keep only N-grams whose last token is listed
    ctokens      -- if non-empty, keep only N-grams containing a listed token
    stopwords    -- drop N-grams containing a stop word (case-insensitive)
    punctuation  -- drop N-grams containing a token starting with punctuation
    """
    for ngram, count in model.items():
        # guard-clause filters; replaces the original flag bookkeeping and
        # removes the unused 'tok' generator
        if ltokens and ngram[0] not in ltokens:
            continue
        if rtokens and ngram[-1] not in rtokens:
            continue
        if ctokens and not any(x in ctokens for x in ngram):
            continue
        if stopwords and any(x.lower() in stpwdict for x in ngram):
            continue
        if punctuation and any(regnonw.match(x) for x in ngram):
            continue
        # escape double quotes for CSV; a list (not a one-shot generator)
        # so it can be joined safely in any branch
        tokens = [t.replace('"', '""') for t in ngram]
        if tabdelimited:
            if outputstring:
                print(" ".join(tokens), count, sep="\t", file=sys.stdout)
            else:
                print("\t".join(tokens), count, sep="\t", file=sys.stdout)
        else:
            if outputstring:
                print('"' + " ".join(tokens) + '"', count, sep=",", end="\r\n", file=sys.stdout)
            else:
                print('"' + "\",\"".join(tokens) + '"', count, sep=",", end="\r\n", file=sys.stdout)
def usage():
    """Write the command-line help text for tokens2Ngram.py to stdout."""
    helptext = """
tokens2Ngram.py
-n number -- number is integer for length of N-grams, e.g. -n 2 for bigrams
-i -- itemize ngram rather than ngram as string space delimited,
      e.g. -i will activate output: "the","house",4
-t -- tab-delimited output, activates output: the\thouse\t4
-l token -- show only ngrams that have token as left token
-r token -- show only ngrams that have token as right token
-c token -- show only ngrams that contain token
-s -- filter out ngrams that contain stopwords
-p -- filter out ngrams that contain punctuation symbols
Your Python 3 interpreter might be called python rather than python3 on Windows.
It might be called python3.2 on Macs. Change the example commandlines appropriately.
Example:
python3 tokens2Ngram.py -n 4 -t mytext.txt
or, if the script is made executable:
./tokens2Ngram.py -n 4 -t mytext.txt
To show ngrams that have "the" as the left token and "of" as the right token:
./tokens2Ngram.py -n 3 -l the -r of mytext.txt
To show ngrams that have "the" or "a" as the left token:
./tokens2Ngram.py -n 3 -l the -l a mytext.txt
"""
    print(helptext)
if __name__ == '__main__':
    # Parse the command line. Long options that take a value must be declared
    # with a trailing '='; the original omitted it for --lefttoken,
    # --righttoken and --containstoken, so getopt gave them no argument and
    # a[0] raised IndexError on the empty string.
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hitl:r:n:c:spv",
                                   ["help", "number=", "itemngram", "tabdelimited",
                                    "lefttoken=", "righttoken=", "containstoken=",
                                    "verbose", "stopwords", "punctuation"])
    except getopt.GetoptError as err:
        print(err)
        sys.exit(2)
    n = 2                 # N-gram length
    verbose = False
    outputstring = True   # N-gram as one space-delimited string by default
    tabdelimited = False  # CSV output by default
    stopwords = False     # do not filter stop words by default
    punctuation = False   # do not filter punctuation by default
    ltokens = []          # required left-edge tokens (-l)
    rtokens = []          # required right-edge tokens (-r)
    ctokens = []          # required contained tokens (-c)
    for o, a in opts:
        if o in ("-v", "--verbose"):
            verbose = True
        elif o in ("-i", "--itemngram"):
            outputstring = False
        elif o in ("-t", "--tabdelimited"):
            tabdelimited = True
        elif o in ("-h", "--help"):
            usage()
            sys.exit()
        elif o in ("-n", "--number"):
            try:
                # tolerate a value arriving as '=4' (e.g. from --number=4 on
                # some shells); a[:1] is safe on an empty string
                n = int(a[1:]) if a[:1] == "=" else int(a)
            except ValueError:
                print("\nParameter of option -n or --number has to be an integer!\n")
                usage()
                sys.exit(2)
        elif o in ("-l", "--lefttoken"):
            ltokens.append(a[1:] if a[:1] == "=" else a)
        elif o in ("-r", "--righttoken"):
            rtokens.append(a[1:] if a[:1] == "=" else a)
        elif o in ("-s", "--stopwords"):
            stopwords = True
        elif o in ("-p", "--punctuation"):
            punctuation = True
        elif o in ("-c", "--containstoken"):
            ctokens.append(a[1:] if a[:1] == "=" else a)
        else:
            assert False, "unhandled option"
    model = {}
    # Each positional argument may be a glob pattern; expand ~ and
    # environment variables in every matched path before reading it.
    for pattern in args:
        for path in glob.glob(pattern):
            main(os.path.expanduser(os.path.expandvars(path)), model, n)
    prettyPrint(model, outputstring, tabdelimited, ltokens, rtokens, ctokens, stopwords, punctuation)