"""
tokens2Ngram.py
Version 0.2
(C) 2010-2012 by Damir Cavar (http://cavar.me/damir/)
Simple ngram model generation algorithm.
Requires as input a token list generated by the tokenizer.py script.
Generates a CSV frequency profile of N-grams, where the N-gram can be
a string or itemized. It can also generate a tab-delimited N-gram
frequency profile, with the N-gram being a string and space-delimited,
or itemized and tab-delimited. The N-gram model can be filtered:
a. all N-grams that contain stop-words, and
b. all N-grams that contain punctuation marks can be eliminated
from the output.
"""
import sys, os, os.path, glob, getopt, codecs, re
"""
Fix output for Windows and Komodo Edit...
if you have problems with the encoding of the output when
piping it on Windows to some file, try to set the system
variable PYTHONIOENCODING to utf-8 (also possible solution
on Mac or Linux. Or try by uncommenting the following line
"""
stpwdict = set( ("a","a's","able","about","above","according","accordingly",
"across","actually","after","afterwards","again","against",
"ain't","all","allow","allows","almost","alone","along",
"already","also","although","always","am","among","amongst",
"an","and","another","any","anybody","anyhow","anyone",
"anything","anyway","anyways","anywhere","apart","appear",
"appreciate","appropriate","are","aren't","around","as",
"aside","ask","asking","associated","at","available","away",
"awfully","be","became","because","become","becomes",
"becoming","been","before","beforehand","behind","being",
"believe","below","beside","besides","best","better",
"between","beyond","both","brief","but","by","c'mon",
"c's","came","can","can't","cannot","cant","cause",
"causes","certain","certainly","changes","clearly",
"co","com","come","comes","concerning","consequently",
"consider","considering","contain","containing","contains",
"corresponding","could","couldn't","course","currently",
"definitely","described","despite","did","didn't",
"different","do","does","doesn't","doing","don't","done",
"down","downwards","during","each","edu","eg","eight",
"either","else","elsewhere","enough","entirely","especially",
"et","etc","even","ever","every","everybody","everyone",
"everything","everywhere","ex","exactly","example","except",
"far","few","fifth","first","five","followed","following",
"follows","for","former","formerly","forth","four","from",
"further","furthermore","get","gets","getting","given",
"gives","go","goes","going","gone","got","gotten",
"greetings","had","hadn't","happens","hardly","has",
"hasn't","have","haven't","having","he","he's","hello",
"help","hence","her","here","here's","hereafter","hereby",
"herein","hereupon","hers","herself","hi","him","himself",
"his","hither","hopefully","how","howbeit","however","i'd",
"s","i","it","I","It","The","A","i'll","i'm","i've","ie","if",
"ignored","immediate","in",
"inasmuch","inc","indeed","indicate","indicated","indicates",
"inner","insofar","instead","into","inward","is","isn't","it",
"it'd","it'll","it's","its","itself","just","keep","keeps",
"kept","know","known","knows","last","lately","later","latter",
"latterly","least","less","lest","let","let's","like","liked",
"likely","little","look","looking","looks","ltd","mainly",
"many","may","maybe","me","mean","meanwhile","merely","might",
"more","moreover","most","mostly","much","must","my","myself",
"name","namely","nd","near","nearly","necessary","need",
"needs","neither","never","nevertheless","new","next","nine",
"no","nobody","non","none","noone","nor","normally","not",
"nothing","novel","now","nowhere","obviously","of","off",
"often","oh","ok","okay","old","on","once","one","ones",
"only","onto","or","other","others","otherwise","ought",
"our","ours","ourselves","out","outside","over","overall",
"own","particular","particularly","per","perhaps","placed",
"please","plus","possible","presumably","probably",
"provides","que","quite","qv","rather","rd","re",
"really","reasonably","regarding","regardless","regards",
"relatively","respectively","right","said","same","saw",
"say","saying","says","second","secondly","see","seeing",
"seem","seemed","seeming","seems","seen","self","selves",
"sensible","sent","serious","seriously","seven","several",
"shall","she","should","shouldn't","since","six","so",
"some","somebody","somehow","someone","something",
"sometime","sometimes","somewhat","somewhere","soon",
"sorry","specified","specify","specifying","still","sub",
"such","sup","sure","t's","take","taken","tell","tends",
"th","than","thank","thanks","thanx","that","that's","thats",
"the","their","theirs","them","themselves","then","thence",
"there","there's","thereafter","thereby","therefore","therein",
"theres","thereupon","these","they","they'd","they'll",
"they're","they've","think","third","this","thorough","thoroughly",
"those","though","three","through","throughout","thru","thus",
"to","together","too","took","toward","towards","tried","tries",
"truly","try","trying","twice","two","un","under","unfortunately",
"unless","unlikely","until","unto","up","upon","us","use","used",
"useful","uses","using","usually","value","various","very","via",
"viz","vs","want","wants","was","wasn't","way","we","we'd","we'll",
"we're","we've","welcome","well","went","were","weren't","what",
"what's","whatever","when","whence","whenever","where","where's",
"whereafter","whereas","whereby","wherein","whereupon","wherever",
"whether","which","while","whither","who","who's","whoever",
"whole","whom","whose","why","will","willing","wish","with",
"within","without","won't","wonder","would","wouldn't","yes",
"yet","you","you'd","you'll","you're","you've","your","yours",
"yourself","yourselves","zero") )
regnonw = re.compile("\W+")
def main(fname, model, n):
    """Accumulate N-gram counts from a token file into *model*.

    fname -- path to a whitespace-separated token file (read as UTF-8)
    model -- dict mapping N-gram tuples to integer counts, updated in place
    n     -- length of the N-grams to count

    Silently returns if *fname* is not a regular file; prints a message and
    returns if the file cannot be read.
    """
    if not os.path.isfile(fname):
        return
    try:
        # with-statement closes the file even if reading fails part-way
        with open(fname, mode="r", encoding="utf-8") as inStream:
            tokens = tuple(inStream.read().split())
    except IOError:
        print("Cannot read from file:", fname)
        # bug fix: the original fell through here and hit a NameError on
        # the never-assigned 'tokens' below
        return
    # slide a window of length n over the token sequence
    for i in range(len(tokens) - (n - 1)):
        ngram = tokens[i:i + n]
        model[ngram] = model.get(ngram, 0) + 1
def prettyPrint(model, outputstring, tabdelimited, ltokens, rtokens, ctokens, stopwords, punctuation):
    """Print the N-gram model to stdout as specified in the parameters.

    model        -- dict mapping N-gram tuples to integer counts
    outputstring -- True: N-gram printed as one space-delimited string;
                    False: tokens itemized individually
    tabdelimited -- True: tab-delimited lines; False: CSV with CRLF endings
    ltokens      -- if non-empty, keep only N-grams whose first token is listed
    rtokens      -- if non-empty, keep only N-grams whose last token is listed
    ctokens      -- if non-empty, keep only N-grams containing a listed token
    stopwords    -- drop N-grams containing a stop word (case-insensitive)
    punctuation  -- drop N-grams containing a token starting with punctuation
    """
    for ngram, count in model.items():
        # guard-clause filters; replaces the original flag bookkeeping and
        # removes the unused 'tok' generator
        if ltokens and ngram[0] not in ltokens:
            continue
        if rtokens and ngram[-1] not in rtokens:
            continue
        if ctokens and not any(x in ctokens for x in ngram):
            continue
        if stopwords and any(x.lower() in stpwdict for x in ngram):
            continue
        if punctuation and any(regnonw.match(x) for x in ngram):
            continue
        # escape double quotes for CSV; a list (not a one-shot generator)
        # so it can be joined safely in any branch
        tokens = [t.replace('"', '""') for t in ngram]
        if tabdelimited:
            if outputstring:
                print(" ".join(tokens), count, sep="\t", file=sys.stdout)
            else:
                print("\t".join(tokens), count, sep="\t", file=sys.stdout)
        else:
            if outputstring:
                print('"' + " ".join(tokens) + '"', count, sep=",", end="\r\n", file=sys.stdout)
            else:
                print('"' + "\",\"".join(tokens) + '"', count, sep=",", end="\r\n", file=sys.stdout)
def usage():
    """Write the command-line help text for tokens2Ngram.py to stdout."""
    helptext = """
tokens2Ngram.py
-n number -- number is integer for length of N-grams, e.g. -n 2 for bigrams
-i -- itemize ngram rather than ngram as string space delimited,
      e.g. -i will activate output: "the","house",4
-t -- tab-delimited output, activates output: the\thouse\t4
-l token -- show only ngrams that have token as left token
-r token -- show only ngrams that have token as right token
-c token -- show only ngrams that contain token
-s -- filter out ngrams that contain stopwords
-p -- filter out ngrams that contain punctuation symbols
Your Python 3 interpreter might be called python rather than python3 on Windows.
It might be called python3.2 on Macs. Change the example commandlines appropriately.
Example:
python3 tokens2Ngram.py -n 4 -t mytext.txt
or, if the script is made executable:
./tokens2Ngram.py -n 4 -t mytext.txt
To show ngrams that have "the" as the left token and "of" as the right token:
./tokens2Ngram.py -n 3 -l the -r of mytext.txt
To show ngrams that have "the" or "a" as the left token:
./tokens2Ngram.py -n 3 -l the -l a mytext.txt
"""
    print(helptext)
if __name__ == '__main__':
    # Parse the command line. Long options that take a value must be declared
    # with a trailing '='; the original omitted it for --lefttoken,
    # --righttoken and --containstoken, so getopt gave them no argument and
    # a[0] raised IndexError on the empty string.
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hitl:r:n:c:spv",
                                   ["help", "number=", "itemngram", "tabdelimited",
                                    "lefttoken=", "righttoken=", "containstoken=",
                                    "verbose", "stopwords", "punctuation"])
    except getopt.GetoptError as err:
        print(err)
        sys.exit(2)
    n = 2                 # N-gram length
    verbose = False
    outputstring = True   # N-gram as one space-delimited string by default
    tabdelimited = False  # CSV output by default
    stopwords = False     # do not filter stop words by default
    punctuation = False   # do not filter punctuation by default
    ltokens = []          # required left-edge tokens (-l)
    rtokens = []          # required right-edge tokens (-r)
    ctokens = []          # required contained tokens (-c)
    for o, a in opts:
        if o in ("-v", "--verbose"):
            verbose = True
        elif o in ("-i", "--itemngram"):
            outputstring = False
        elif o in ("-t", "--tabdelimited"):
            tabdelimited = True
        elif o in ("-h", "--help"):
            usage()
            sys.exit()
        elif o in ("-n", "--number"):
            try:
                # tolerate a value arriving as '=4' (e.g. from --number=4 on
                # some shells); a[:1] is safe on an empty string
                n = int(a[1:]) if a[:1] == "=" else int(a)
            except ValueError:
                print("\nParameter of option -n or --number has to be an integer!\n")
                usage()
                sys.exit(2)
        elif o in ("-l", "--lefttoken"):
            ltokens.append(a[1:] if a[:1] == "=" else a)
        elif o in ("-r", "--righttoken"):
            rtokens.append(a[1:] if a[:1] == "=" else a)
        elif o in ("-s", "--stopwords"):
            stopwords = True
        elif o in ("-p", "--punctuation"):
            punctuation = True
        elif o in ("-c", "--containstoken"):
            ctokens.append(a[1:] if a[:1] == "=" else a)
        else:
            assert False, "unhandled option"
    model = {}
    # Each positional argument may be a glob pattern; expand ~ and
    # environment variables in every matched path before reading it.
    for pattern in args:
        for path in glob.glob(pattern):
            main(os.path.expanduser(os.path.expandvars(path)), model, n)
    prettyPrint(model, outputstring, tabdelimited, ltokens, rtokens, ctokens, stopwords, punctuation)