1:":"; exec mzscheme -r $0 "$@"
2:
3:;;; ----------------------------------------------------
4:;;; Filename: average-mi.scm
5:;;; Author: Damir Cavar <dcavar@me.com>
6:;;;
7:;;; (C) 2006 by Damir Cavar
8:;;;
9:;;; This code is published under the restrictive GPL!
10:;;; Please find the text of the GPL here:
11:;;; http://www.gnu.org/licenses/gpl.txt
12:;;;
13:;;; It is free for use, change, etc. as long as the copyright
14:;;; note above is included in any modified version of the code.
15:;;;
16:;;; This script assumes that the text is raw and encoded in UTF8.
17:;;;
18:;;; Functions:
19:;;; 1. The text file is loaded into memory.
20:;;; 2. Punctuation marks are eliminated from the text.
21:;;; 3. The text is tokenized.
22:;;; 4. Two adjacent tokens are placed into a hash-table as keys, the value
23:;;; is the absolute frequency (i.e. count) of each token in the
24:;;; text.
25:;;; 5. Single tokens are placed in a second hash-table.
26:;;; 6. Bigrams that do not contain stopwords are printed out, together
27:;;; with their Mutual Information score, as well as the average
28:;;; left and right Mutual Information score.
29:;;;
30:;;; If the command line parameters contain more than one text file,
31:;;; the above results are accumulated over all the input text files.
32:;;;
33:;;; Usage:
34:;;; mzscheme -r average-mi.scm test1.txt test2.txt ...
35:;;; ----------------------------------------------------
36:
37:(require (lib "list.ss"))
38:(require (lib "string.ss")) ; for string-uppercase!
39:(require (lib "string.ss" "srfi" "13"))
40:(require (lib "char-set.ss" "srfi" "14"))
41:(require (lib "vector-lib.ss" "srfi" "43"))
42:(require (lib "pregexp.ss")) ; for Perl compatible regular expressions
43:;(load "english.ss")
44:
45:
46:;;; global counters
47:(define tokencount 0.0) ; total number of words
48:(define bigramcount 0.0) ; total number of bigrams
49:
50:;;; hash-table containers
51:(define types (make-hash-table 'equal)) ; words and their counts
52:(define tags (make-hash-table 'equal))
53:(define bigramstoto (make-hash-table 'equal)) ; bigrams and their counts
54:(define bigramstato (make-hash-table 'equal)) ; bigrams and their counts
55:(define bigramstata (make-hash-table 'equal)) ; bigrams and their counts
56:(define bigramstota (make-hash-table 'equal)) ; bigrams and their counts
57:
58:;;; extra hash-tables
59:(define lefttoken (make-hash-table 'equal)) ; key = left token in bigram, value = list of bigrams with key left
60:(define righttoken (make-hash-table 'equal)) ; key = right token in bigram, value = list of bigrams with key right
61:(define lefttag (make-hash-table 'equal)) ; key = left token in bigram, value = list of bigrams with key left
62:(define righttag (make-hash-table 'equal)) ; key = right token in bigram, value = list of bigrams with key right
63:(define tags (make-hash-table 'equal))
64:
65:
66:;;; sort hash table with by value
67:;;; assuming values = reals/ints
68:;;; returning a sorted list of key-value tuples (lists)
69:(define sort-by-value
70: (lambda (table)
71: (let ([keyval (hash-table-map table (lambda (key val) (list key val)))])
72: (sort keyval (lambda (a b)
73: (< (cadr a) (cadr b)))))))
74:
75:
76:(define add-data
77: (lambda (tokenlist)
78: ;(printf "Adding tokens and tags...\n")
79: ; remember the total counts of tokens and bigrams
80: (let ([count (- (length tokenlist) 1)])
81: (set! tokencount (+ tokencount (length tokenlist)))
82: (set! bigramcount (+ bigramcount count))
83:
84: ; count the first token in the list
85: (hash-table-put! types (caar tokenlist)
86: (+ (hash-table-get types (caar tokenlist) 0.0) 1))
87: (hash-table-put! tags (cadar tokenlist)
88: (+ (hash-table-get tags (cadar tokenlist) 0.0) 1))
89:
90: ; loop over the rest of the tokens
91: (let loop ([i 1])
92: (let* ([word (car (list-ref tokenlist i))] ; right token
93: [wordp (car (list-ref tokenlist (- i 1)))] ; left token
94: [tag (cadr (list-ref tokenlist i))] ; tag of right token
95: [tagp (cadr (list-ref tokenlist (- i 1)))] ; tag of left token
96: [bigramtato (list tagp word)] ; bigram = previous and current token as list
97: [bigramtota (list wordp tag)] ; bigram = previous and current token as list
98: [bigramtoto (list wordp word)] ; bigram = previous and current token as list
99: [bigramtata (list tagp tag)]) ; bigram = previous and current token as list
100:
101: ; store the bigram in the value for left and right
102: (let ([listval (hash-table-get lefttoken wordp '())])
103: (unless (member listval bigramtota)
104: (hash-table-put! lefttoken wordp (cons bigramtota listval))))
105: (let ([listval (hash-table-get righttoken word '())])
106: (unless (member listval bigramtato)
107: (hash-table-put! righttoken word (cons bigramtato listval))))
108:
109: (let ([listval (hash-table-get lefttag tagp '())])
110: (unless (member listval bigramtato)
111: (hash-table-put! lefttag tagp (cons bigramtato listval))))
112: (let ([listval (hash-table-get righttag tag '())])
113: (unless (member listval bigramtota)
114: (hash-table-put! righttag tag (cons bigramtota listval))))
115:
116: ; store tokens and bigrams in their hash-tables
117: (hash-table-put! types word
118: (+ (hash-table-get types word 0.0) 1)) ; increment counter for token
119: (hash-table-put! tags tag
120: (+ (hash-table-get tags tag 0.0) 1)) ; increment counter for token
121:
122: (hash-table-put! bigramstoto bigramtoto
123: (+ (hash-table-get bigramstoto bigramtoto 0.0) 1)) ; increment counter for bigram
124: (hash-table-put! bigramstato bigramtato
125: (+ (hash-table-get bigramstato bigramtato 0.0) 1)) ; increment counter for bigram
126: (hash-table-put! bigramstota bigramtota
127: (+ (hash-table-get bigramstota bigramtota 0.0) 1)) ; increment counter for bigram
128: (hash-table-put! bigramstata bigramtata
129: (+ (hash-table-get bigramstata bigramtata 0.0) 1)) ; increment counter for bigram
130:
131: ; go back to loop, if more tokens left
132: (if (< i count)
133: (loop (+ i 1))))))))
134:
135:
136:;;; log2 of value
137:;;; Base transformation:
138:;;; log2 is the natural log divided by the natural log of 2 (the base)
139:(define log2
140: (lambda (x)
141: (/ (log x) (log 2))))
142:
143:
144:;;; mutual-information of P(x), P(y), P(xy)
145:;;; calculate pointwise MI as
146:;;; P(XY) * log2( P(XY) / (P(X) * P(Y)) )
147:(define mutual-information
148: (lambda (px py pxy)
149: (* pxy (log2 (/ pxy (* px py))))))
150:
151:
152:;;; load-file (filename)
153:;;; load text from file into a string variable and return it
154:(define load-file
155: (lambda (name)
156: (call-with-input-file name
157: (lambda (p)
158: (read-string (file-size name) p)))))
159:
160:
161:; MI for token=word and tags in context
162:(define get-average-lrmi-tag
163: (lambda (token)
164: (let ([avleft 0.0]
165: [avright 0.0]
166: [px (/ (hash-table-get types token 0.0) tokencount)])
167: ; for all bigrams with token left, get MI and average it
168: (let* ([val (hash-table-get lefttoken token '())]
169: [lenval (length val)])
170: (for-each (lambda (bigram)
171: (let ([py (/ (hash-table-get tags (cadr bigram) 0.0) tokencount)]
172: [pxy (/ (hash-table-get bigramstota bigram 0.0) bigramcount)])
173: (set! avright (+ avright (mutual-information px py pxy)))))
174: val)
175: (unless (= 0 lenval)
176: (set! avright (/ avright lenval))))
177: ; for all bigrams with token right, get MI and average it
178: (let* ([val (hash-table-get righttoken token '())]
179: [lenval (length val)])
180: (for-each (lambda (bigram)
181: (let ([py (/ (hash-table-get tags (car bigram) 0.0) tokencount)]
182: [pxy (/ (hash-table-get bigramstato bigram 0.0) bigramcount)])
183: (set! avleft (+ avleft (mutual-information px py pxy)))))
184: val)
185: (unless (= 0 lenval)
186: (set! avleft (/ avleft (length val)))))
187: (list avleft avright))))
188:
189:
190:
191:; MI for token=tag and words in context
192:(define get-average-lrmi-token
193: (lambda (token)
194: (let ([avleft 0.0]
195: [avright 0.0]
196: [px (/ (hash-table-get tags token 0.0) tokencount)])
197: ; for all bigrams with token left, get MI and average it
198: (let* ([val (hash-table-get lefttag token '())]
199: [lenval (length val)])
200: (for-each (lambda (bigram)
201: (let ([py (/ (hash-table-get types (cadr bigram) 0.0) tokencount)]
202: [pxy (/ (hash-table-get bigramstato bigram 0.0) bigramcount)])
203: (set! avright (+ avright (mutual-information px py pxy)))))
204: val)
205: (unless (= 0 lenval)
206: (set! avright (/ avright lenval))))
207: ; for all bigrams with token right, get MI and average it
208: (let* ([val (hash-table-get righttag token '())]
209: [lenval (length val)])
210: (for-each (lambda (bigram)
211: (let ([py (/ (hash-table-get types (car bigram) 0.0) tokencount)]
212: [pxy (/ (hash-table-get bigramstota bigram 0.0) bigramcount)])
213: (set! avleft (+ avleft (mutual-information px py pxy)))))
214: val)
215: (unless (= 0 lenval)
216: (set! avleft (/ avleft (length val)))))
217: (list avleft avright))))
218:
219:
220:
221:; split token into list with (word tag)
222:(define prepare-brown
223: (lambda (tokens)
224: (map (lambda (token)
225: (list (substring token 0 (string-index token #\/))
226: (substring token (+ (string-index token #\/) 1))))
227: tokens)))
228:
229:
230:
231:(begin
232: (vector-for-each (lambda (i fname)
233: (printf "Processing file ~a (~a of ~a)\n" fname (+ i 1) (vector-length argv))
234: (let ([text (load-file fname)])
235: (string-lowercase! text)
236: (add-data (prepare-brown (string-tokenize text)))))
237: argv)
238:
239:
240: (let ([result (sort-by-value types)])
241: (printf "token\tfreq\tlMI\tlvMI\trMI\trvMI\n")
242: (for-each (lambda (token)
243: (let ([MI (get-average-lrmi-tag (car token))])
244: (printf "~a\t~a\t~a\t~a\t~a\t~a\n"
245: (car token)
246: (/ (cadr token) tokencount)
247: (if (< (car MI) (cadr MI))
248: "-"
249: "+")
250: (car MI)
251: (if (< (cadr MI) (car MI))
252: "-"
253: "+")
254: (cadr MI))))
255: (reverse result)))
256: (printf "---------------------------------------------------------\n")
257: (printf "Number of tokens: ~a\n" tokencount)
258: (printf "Number of types: ~a\n" (hash-table-count types))
259: (printf "Type/Token ratio: ~a\n" (/ (hash-table-count types) tokencount))
260: (printf "Number of bigrams: ~a\n" bigramcount)
261: (printf "Number of bigram types: ~a\n" (hash-table-count bigramstoto))
262: (printf "Bigram type/token ratio: ~a\n" (/ (hash-table-count bigramstoto) bigramcount))
263: (printf "---------------------------------------------------------\n")
264: (let ([result (sort-by-value tags)])
265: (printf "Number of tags: ~a\n" (hash-table-count tags))
266: (printf "token\tfreq\tlMI\tlvMI\trMI\trvMI\n")
267: (for-each (lambda (token)
268: (let ([MI (get-average-lrmi-token (car token))])
269: (printf "~a\t~a\t~a\t~a\t~a\t~a\n"
270: (car token)
271: (/ (cadr token) tokencount)
272: (if (< (car MI) (cadr MI))
273: "-"
274: "+")
275: (car MI)
276: (if (< (cadr MI) (car MI))
277: "-"
278: "+")
279: (cadr MI))))
280: (reverse result)))
281: (printf "---------------------------------------------------------\n")
282: (let ([result (sort-by-value bigramstoto)])
283: (printf "token\tfreq\tMI\n")
284: (for-each (lambda (token)
285: (let* ([bigram (car token)]
286: [MI (mutual-information (/ (hash-table-get types (car bigram)) tokencount)
287: (/ (hash-table-get types (cadr bigram)) tokencount)
288: (/ (cadr token) bigramcount))])
289: (printf "~a ~a\t~a\t~a\n" (car bigram) (cadr bigram) (/ (cadr token) tokencount) MI)))
290: (reverse result)))
291: (printf "---------------------------------------------------------\n")
292: (let ([result (sort-by-value bigramstota)])
293: (printf "token\tfreq\tMI\n")
294: (for-each (lambda (token)
295: (let* ([bigram (car token)]
296: [MI (mutual-information (/ (hash-table-get types (car bigram)) tokencount)
297: (/ (hash-table-get tags (cadr bigram)) tokencount)
298: (/ (cadr token) bigramcount))])
299: (printf "~a ~a\t~a\t~a\n" (car bigram) (cadr bigram) (/ (cadr token) tokencount) MI)))
300: (reverse result)))
301: (printf "---------------------------------------------------------\n")
302: (let ([result (sort-by-value bigramstato)])
303: (printf "token\tfreq\tMI\n")
304: (for-each (lambda (token)
305: (let* ([bigram (car token)]
306: [MI (mutual-information (/ (hash-table-get tags (car bigram)) tokencount)
307: (/ (hash-table-get types (cadr bigram)) tokencount)
308: (/ (cadr token) bigramcount))])
309: (printf "~a ~a\t~a\t~a\n" (car bigram) (cadr bigram) (/ (cadr token) tokencount) MI)))
310: (reverse result)))
311: (printf "---------------------------------------------------------\n")
312: (let ([result (sort-by-value bigramstata)])
313: (printf "token\tfreq\tMI\n")
314: (for-each (lambda (token)
315: (let* ([bigram (car token)]
316: [MI (mutual-information (/ (hash-table-get tags (car bigram)) tokencount)
317: (/ (hash-table-get tags (cadr bigram)) tokencount)
318: (/ (cadr token) bigramcount))])
319: (printf "~a ~a\t~a\t~a\n" (car bigram) (cadr bigram) (/ (cadr token) tokencount) MI)))
320: (reverse result)))
321: )