1:":"; exec mzscheme -r $0 "$@"
   2:
   3:;;; ----------------------------------------------------
   4:;;; Filename: average-mi.scm
   5:;;; Author:   Damir Cavar <dcavar@me.com>
   6:;;;
   7:;;; (C) 2006 by Damir Cavar
   8:;;;
   9:;;; This code is published under the restrictive GPL!
  10:;;; Please find the text of the GPL here:
  11:;;; http://www.gnu.org/licenses/gpl.txt
  12:;;; 
  13:;;; It is free for use, change, etc. as long as the copyright
  14:;;; note above is included in any modified version of the code.
  15:;;; 
  16:;;; This script assumes that the text is raw and encoded in UTF8.
  17:;;;
  18:;;; Functions:
  19:;;; 1. The text file is loaded into memory.
  20:;;; 2. Punctuation marks are eliminated from the text.
  21:;;; 3. The text is tokenized.
  22:;;; 4. Two adjacent tokens are placed into a hash-table as keys, the value
  23:;;;    is the absolute frequency (i.e. count) of each token in the
  24:;;;    text.
  25:;;; 5. Single tokens are placed in a second hash-table.
  26:;;; 6. Bigrams that do not contain stopwords are printed out, together
  27:;;;    with their Mutual Information score, as well as the average
  28:;;;    left and right Mutual Information score.
  29:;;;
  30:;;; If the command line parameters contain more than one text file,
  31:;;; the above results are accumulated over all the input text files.
  32:;;;
  33:;;; Usage:
  34:;;; mzscheme -r average-mi.scm test1.txt test2.txt ...
  35:;;; ----------------------------------------------------
  36:
  37:(require (lib "list.ss"))
  38:(require (lib "string.ss")) ; for string-uppercase!
  39:(require (lib "string.ss"     "srfi" "13"))
  40:(require (lib "char-set.ss"   "srfi" "14"))
  41:(require (lib "vector-lib.ss" "srfi" "43"))
  42:(require (lib "pregexp.ss"))  ; for Perl compatible regular expressions
  43:;(load "english.ss")
  44:
  45:
  46:;;; global counters
  47:(define tokencount  0.0) ; total number of words
  48:(define bigramcount 0.0) ; total number of bigrams
  49:
  50:;;; hash-table containers
  51:(define types   (make-hash-table 'equal)) ; words and their counts
  52:(define tags    (make-hash-table 'equal))
  53:(define bigramstoto (make-hash-table 'equal)) ; bigrams and their counts
  54:(define bigramstato (make-hash-table 'equal)) ; bigrams and their counts
  55:(define bigramstata (make-hash-table 'equal)) ; bigrams and their counts
  56:(define bigramstota (make-hash-table 'equal)) ; bigrams and their counts
  57:
  58:;;; extra hash-tables
  59:(define lefttoken  (make-hash-table 'equal)) ; key = left token in bigram, value = list of bigrams with key left
  60:(define righttoken (make-hash-table 'equal)) ; key = right token in bigram, value = list of bigrams with key right
  61:(define lefttag    (make-hash-table 'equal)) ; key = left token in bigram, value = list of bigrams with key left
  62:(define righttag   (make-hash-table 'equal)) ; key = right token in bigram, value = list of bigrams with key right
  63:(define tags       (make-hash-table 'equal))
  64:
  65:
  66:;;; sort hash table with by value
  67:;;; assuming values = reals/ints
  68:;;; returning a sorted list of key-value tuples (lists)
  69:(define sort-by-value
  70:  (lambda (table)
  71:    (let ([keyval (hash-table-map table (lambda (key val) (list key val)))])
  72:      (sort keyval (lambda (a b)
  73:                     (< (cadr a) (cadr b)))))))
  74:
  75:
  76:(define add-data
  77:  (lambda (tokenlist)
  78:    ;(printf "Adding tokens and tags...\n")
  79:    ; remember the total counts of tokens and bigrams
  80:    (let ([count (- (length tokenlist) 1)])
  81:      (set! tokencount  (+ tokencount  (length tokenlist)))
  82:      (set! bigramcount (+ bigramcount count))
  83:
  84:      ; count the first token in the list
  85:      (hash-table-put! types (caar tokenlist)
  86:                       (+ (hash-table-get types (caar tokenlist) 0.0) 1))
  87:      (hash-table-put! tags (cadar tokenlist)
  88:                       (+ (hash-table-get tags (cadar tokenlist) 0.0) 1))
  89:
  90:      ; loop over the rest of the tokens
  91:      (let loop ([i 1])
  92:        (let* ([word     (car  (list-ref tokenlist i))]       ; right token
  93:               [wordp    (car  (list-ref tokenlist (- i 1)))] ; left token
  94:               [tag      (cadr (list-ref tokenlist i))]       ; tag of right token
  95:               [tagp     (cadr (list-ref tokenlist (- i 1)))] ; tag of left token
  96:               [bigramtato (list tagp  word)] ; bigram = previous and current token as list
  97:               [bigramtota (list wordp tag)]  ; bigram = previous and current token as list
  98:               [bigramtoto (list wordp word)] ; bigram = previous and current token as list
  99:               [bigramtata (list tagp  tag)])  ; bigram = previous and current token as list
 100:
 101:          ; store the bigram in the value for left and right
 102:          (let ([listval (hash-table-get lefttoken wordp '())])
 103:            (unless (member listval bigramtota)
 104:              (hash-table-put! lefttoken wordp (cons bigramtota listval))))
 105:          (let ([listval (hash-table-get righttoken word '())])
 106:            (unless (member listval bigramtato)
 107:              (hash-table-put! righttoken word (cons bigramtato listval))))
 108:          
 109:          (let ([listval (hash-table-get lefttag tagp '())])
 110:            (unless (member listval bigramtato)
 111:              (hash-table-put! lefttag tagp (cons bigramtato listval))))
 112:          (let ([listval (hash-table-get righttag tag '())])
 113:            (unless (member listval bigramtota)
 114:              (hash-table-put! righttag tag (cons bigramtota listval))))
 115:
 116:          ; store tokens and bigrams in their hash-tables
 117:          (hash-table-put! types word
 118:                           (+ (hash-table-get types word 0.0) 1)) ; increment counter for token
 119:          (hash-table-put! tags tag
 120:                           (+ (hash-table-get tags tag 0.0) 1)) ; increment counter for token
 121:          
 122:          (hash-table-put! bigramstoto bigramtoto
 123:                           (+ (hash-table-get bigramstoto bigramtoto 0.0) 1))   ; increment counter for bigram
 124:          (hash-table-put! bigramstato bigramtato
 125:                           (+ (hash-table-get bigramstato bigramtato 0.0) 1))   ; increment counter for bigram
 126:          (hash-table-put! bigramstota bigramtota
 127:                           (+ (hash-table-get bigramstota bigramtota 0.0) 1))   ; increment counter for bigram
 128:          (hash-table-put! bigramstata bigramtata
 129:                           (+ (hash-table-get bigramstata bigramtata 0.0) 1))   ; increment counter for bigram
 130:
 131:        ; go back to loop, if more tokens left
 132:        (if (< i count)
 133:            (loop (+ i 1))))))))
 134:
 135:
 136:;;; log2 of value
 137:;;; Base transformation:
 138:;;;   log2 is the natural log divided by the natural log of 2 (the base)
 139:(define log2
 140:    (lambda (x)
 141:      (/ (log x) (log 2))))
 142:
 143:
 144:;;; mutual-information of P(x), P(y), P(xy)
 145:;;; calculate pointwise MI as
 146:;;; P(XY) * log2( P(XY) / (P(X) * P(Y)) )
 147:(define mutual-information
 148:  (lambda (px py pxy)
 149:    (* pxy (log2 (/ pxy (* px py))))))
 150:
 151:
 152:;;; load-file (filename)
 153:;;; load text from file into a string variable and return it
 154:(define load-file
 155:  (lambda (name)
 156:    (call-with-input-file name
 157:      (lambda (p)
 158:        (read-string (file-size name) p)))))
 159:
 160:
 161:; MI for token=word and tags in context
 162:(define get-average-lrmi-tag
 163:  (lambda (token)
 164:    (let ([avleft  0.0]
 165:          [avright 0.0]
 166:          [px      (/ (hash-table-get types token 0.0) tokencount)])
 167:      ; for all bigrams with token left, get MI and average it
 168:      (let* ([val    (hash-table-get lefttoken token '())]
 169:             [lenval (length val)])
 170:        (for-each (lambda (bigram)
 171:                    (let ([py  (/ (hash-table-get tags (cadr bigram) 0.0) tokencount)]
 172:                          [pxy (/ (hash-table-get bigramstota bigram 0.0) bigramcount)])
 173:                      (set! avright (+ avright (mutual-information px py pxy)))))
 174:                  val)
 175:        (unless (= 0 lenval)
 176:          (set! avright (/ avright lenval))))
 177:      ; for all bigrams with token right, get MI and average it
 178:      (let* ([val    (hash-table-get righttoken token '())]
 179:             [lenval (length val)])
 180:        (for-each (lambda (bigram)
 181:                    (let ([py  (/ (hash-table-get tags (car bigram) 0.0) tokencount)]
 182:                          [pxy (/ (hash-table-get bigramstato bigram 0.0) bigramcount)])
 183:                      (set! avleft (+ avleft (mutual-information px py pxy)))))
 184:                  val)
 185:        (unless (= 0 lenval)
 186:          (set! avleft (/ avleft (length val)))))
 187:      (list avleft avright))))
 188:
 189:
 190:
 191:; MI for token=tag and words in context
 192:(define get-average-lrmi-token
 193:  (lambda (token)
 194:    (let ([avleft  0.0]
 195:          [avright 0.0]
 196:          [px      (/ (hash-table-get tags token 0.0) tokencount)])
 197:      ; for all bigrams with token left, get MI and average it
 198:      (let* ([val    (hash-table-get lefttag token '())]
 199:             [lenval (length val)])
 200:        (for-each (lambda (bigram)
 201:                    (let ([py  (/ (hash-table-get types (cadr bigram) 0.0) tokencount)]
 202:                          [pxy (/ (hash-table-get bigramstato bigram 0.0) bigramcount)])
 203:                      (set! avright (+ avright (mutual-information px py pxy)))))
 204:                  val)
 205:        (unless (= 0 lenval)
 206:          (set! avright (/ avright lenval))))
 207:      ; for all bigrams with token right, get MI and average it
 208:      (let* ([val    (hash-table-get righttag token '())]
 209:             [lenval (length val)])
 210:        (for-each (lambda (bigram)
 211:                    (let ([py  (/ (hash-table-get types (car bigram) 0.0) tokencount)]
 212:                          [pxy (/ (hash-table-get bigramstota bigram 0.0) bigramcount)])
 213:                      (set! avleft (+ avleft (mutual-information px py pxy)))))
 214:                  val)
 215:        (unless (= 0 lenval)
 216:          (set! avleft (/ avleft (length val)))))
 217:      (list avleft avright))))
 218:
 219:
 220:
 221:; split token into list with (word tag)
 222:(define prepare-brown
 223:  (lambda (tokens)
 224:    (map (lambda (token)
 225:           (list (substring token 0 (string-index token #\/))
 226:                 (substring token (+ (string-index token #\/) 1))))
 227:         tokens)))
 228:
 229:
 230:
 231:(begin
 232:  (vector-for-each (lambda (i fname)
 233:                     (printf "Processing file ~a (~a of ~a)\n" fname (+ i 1) (vector-length argv))
 234:                     (let ([text (load-file fname)])
 235:                       (string-lowercase! text)
 236:                       (add-data (prepare-brown (string-tokenize text)))))
 237:                   argv)
 238:  
 239:  
 240:  (let ([result (sort-by-value types)])
 241:    (printf "token\tfreq\tlMI\tlvMI\trMI\trvMI\n")
 242:    (for-each (lambda (token)
 243:                (let ([MI (get-average-lrmi-tag (car token))])
 244:                  (printf "~a\t~a\t~a\t~a\t~a\t~a\n"
 245:                          (car token)
 246:                          (/ (cadr token) tokencount)
 247:                          (if (< (car MI) (cadr MI))
 248:                              "-"
 249:                              "+")
 250:                          (car MI)
 251:                          (if (< (cadr MI) (car MI))
 252:                              "-"
 253:                              "+")
 254:                          (cadr MI))))
 255:              (reverse result)))
 256:  (printf "---------------------------------------------------------\n")
 257:  (printf "Number of tokens: ~a\n"  tokencount)
 258:  (printf "Number of types: ~a\n"   (hash-table-count types))
 259:  (printf "Type/Token ratio: ~a\n"  (/ (hash-table-count types) tokencount))
 260:  (printf "Number of bigrams: ~a\n" bigramcount)
 261:  (printf "Number of bigram types: ~a\n" (hash-table-count bigramstoto))
 262:  (printf "Bigram type/token ratio: ~a\n"  (/ (hash-table-count bigramstoto) bigramcount))
 263:  (printf "---------------------------------------------------------\n")
 264:  (let ([result (sort-by-value tags)])
 265:    (printf "Number of tags: ~a\n"   (hash-table-count tags))
 266:    (printf "token\tfreq\tlMI\tlvMI\trMI\trvMI\n")
 267:    (for-each (lambda (token)
 268:                (let ([MI (get-average-lrmi-token (car token))])
 269:                  (printf "~a\t~a\t~a\t~a\t~a\t~a\n"
 270:                          (car token)
 271:                          (/ (cadr token) tokencount)
 272:                          (if (< (car MI) (cadr MI))
 273:                              "-"
 274:                              "+")
 275:                          (car MI)
 276:                          (if (< (cadr MI) (car MI))
 277:                              "-"
 278:                              "+")
 279:                          (cadr MI))))
 280:              (reverse result)))
 281:  (printf "---------------------------------------------------------\n")
 282:  (let ([result (sort-by-value bigramstoto)])
 283:    (printf "token\tfreq\tMI\n")
 284:    (for-each (lambda (token)
 285:                (let* ([bigram (car token)]
 286:                       [MI (mutual-information (/ (hash-table-get types (car bigram)) tokencount)
 287:                                               (/ (hash-table-get types (cadr bigram)) tokencount)
 288:                                               (/ (cadr token) bigramcount))])
 289:                  (printf "~a ~a\t~a\t~a\n" (car bigram) (cadr bigram) (/ (cadr token) tokencount) MI)))
 290:              (reverse result)))
 291:  (printf "---------------------------------------------------------\n")
 292:  (let ([result (sort-by-value bigramstota)])
 293:    (printf "token\tfreq\tMI\n")
 294:    (for-each (lambda (token)
 295:                (let* ([bigram (car token)]
 296:                       [MI (mutual-information (/ (hash-table-get types (car bigram)) tokencount)
 297:                                               (/ (hash-table-get tags (cadr bigram)) tokencount)
 298:                                               (/ (cadr token) bigramcount))])
 299:                  (printf "~a ~a\t~a\t~a\n" (car bigram) (cadr bigram) (/ (cadr token) tokencount) MI)))
 300:              (reverse result)))
 301:  (printf "---------------------------------------------------------\n")
 302:  (let ([result (sort-by-value bigramstato)])
 303:    (printf "token\tfreq\tMI\n")
 304:    (for-each (lambda (token)
 305:                (let* ([bigram (car token)]
 306:                       [MI (mutual-information (/ (hash-table-get tags (car bigram)) tokencount)
 307:                                               (/ (hash-table-get types (cadr bigram)) tokencount)
 308:                                               (/ (cadr token) bigramcount))])
 309:                  (printf "~a ~a\t~a\t~a\n" (car bigram) (cadr bigram) (/ (cadr token) tokencount) MI)))
 310:              (reverse result)))
 311:  (printf "---------------------------------------------------------\n")
 312:  (let ([result (sort-by-value bigramstata)])
 313:    (printf "token\tfreq\tMI\n")
 314:    (for-each (lambda (token)
 315:                (let* ([bigram (car token)]
 316:                       [MI (mutual-information (/ (hash-table-get tags (car bigram)) tokencount)
 317:                                               (/ (hash-table-get tags (cadr bigram)) tokencount)
 318:                                               (/ (cadr token) bigramcount))])
 319:                  (printf "~a ~a\t~a\t~a\n" (car bigram) (cadr bigram) (/ (cadr token) tokencount) MI)))
 320:              (reverse result)))
 321:  )