1:":"; exec mzscheme -r $0 "$@"
   2:
   3:;;; ----------------------------------------------------
   4:;;; Filename: countbigrams4.scm
   5:;;; Author:   Damir Cavar <dcavar@me.com>
   6:;;;
   7:;;; (C) 2006 by Damir Cavar
   8:;;;
   9:;;; This code is published under the restrictive GPL!
  10:;;; Please find the text of the GPL here:
  11:;;; http://www.gnu.org/licenses/gpl.txt
  12:;;; 
  13:;;; It is free for use, change, etc. as long as the copyright
  14:;;; note above is included in any modified version of the code.
  15:;;; 
  16:;;; This script assumes that the text is raw and encoded in UTF8.
  17:;;;
  18:;;; Functions:
  19:;;; 1. The text file is loaded into memory.
  20:;;; 2. The text is tokenized, i.e. converted into a list of tokens.
  21:;;; 3. Two adjacent tokens are placed into a hash-table as keys, the value
  22:;;;    is the absolute frequency (i.e. count) of each token in the
  23:;;;    text.
  24:;;; 4. The hash-table is converted into a list of key-value tuples.
  25:;;; 5. The key-values are sorted by value, and a list of bigrams
  26:;;;    and their frequency is printed out.
  27:;;;
  28:;;; If the command line parameters contain more than one text file,
  29:;;; the above results are accumulated over all the input text files.
  30:;;;
  31:;;; Usage:
  32:;;; mzscheme -r countbigrams4.scm test1.txt test2.txt ...
  33:;;; ----------------------------------------------------
  34:
  35:
  36:;;; all necessary libraries and functions
  37:(require (lib "list.ss"))
  38:(require (lib "string.ss"))   ; for string-uppercase!
  39:(require (lib "string.ss"     "srfi" "13"))
  40:(require (lib "vector-lib.ss" "srfi" "43"))  ; for vector-for-each
  41:(require (lib "pregexp.ss"))  ; for Perl compatible regular expressions
  42:
  43:(load "english.scm")
  44:
  45:;;; Global variables
  46:(define bigramcount 0.0)                      ; total number of bigrams
  47:(define bigrams     (make-hash-table 'equal)) ; hash-table container for bigram-count pairs
  48:(define tokencount  0.0)                      ; total number of tokens
  49:(define types       (make-hash-table 'equal)) ; hash-table container for type-count pairs
  50:
  51:
  52:;;; sort-by-value
  53:;;; <- hash-table
  54:;;; -> list of key-value tuples (lists)
  55:;;; ----------------------------------------------------
  56:;;; Sort a hash-table of key-value pairs by value, by converting it
  57:;;; into a list of key-value tuples and sorting on the value.
  58:(define sort-by-value
  59:  (lambda (table)
  60:    (let ([keyval (hash-table-map table (lambda (key val)
  61:                                          (list key val)))])
  62:      (sort keyval (lambda (a b)
  63:                     (< (cadr a) (cadr b)))))))
  64:
  65:
  66:;;; log2 of value
  67:;;; Base transformation:
  68:;;;   log2 is the natural log divided by the natural log of 2 (the base)
  69:(define log2
  70:  (lambda (x)
  71:    (/ (log x) (log 2))))
  72:
  73:
  74:;;; mutual-information of P(x), P(y), P(xy)
  75:;;; calculate pointwise MI as
  76:;;; P(XY) * log2( P(XY) / (P(X) * P(Y)) )
  77:(define mutual-information
  78:  (lambda (px py pxy)
  79:    (* pxy (log2 (/ pxy (* px py))))))
  80:
  81:
  82:
  83:;;; add-bigrams
  84:;;; <- list of strings, i.e. token list
  85:;;; !-> updated hash-table bigram-hash
  86:;;; !-> updated count-bigrams counter
  87:;;; ----------------------------------------------------
  88:;;; Add word-bigrams from an ordered list of tokens to the hash-table
  89:;;; container and keep track of their count.
  90:(define add-bigrams
  91:  (lambda (words)
  92:    (let ([count (- (length words) 1)])          ; how many bigrams
  93:      (set! bigramcount (+ bigramcount count))     ; remember total count of bigrams
  94:      (set! tokencount  (+ tokencount  (length words))) ; remember total count of tokens
  95:      ; add first token to hash-table
  96:      (let ([val (hash-table-get types (car words) 0.0)])
  97:        (hash-table-put! types (car words) (+ val 1)))
  98:      ; loop over the rest and create tokens and bigrams
  99:      (let loop ([i 1])
 100:        ; add next token to hash-table
 101:        (let ([val (hash-table-get types (list-ref words i) 0.0)])
 102:          (hash-table-put! types (list-ref words i) (+ val 1)))
 103:        ; add next bigram to hash-table
 104:        (let* ([bigram (list (list-ref words (- i 1)) (list-ref words i))]  ; create bigram
 105:               [value  (hash-table-get bigrams bigram 0.0)])             ; get the calue for bigram
 106:          (hash-table-put! bigrams bigram (+ value 1)))
 107:        (if (< i count)
 108:            (loop (+ i 1)))))))
 109:
 110:
 111:;;; load-file
 112:;;; <- string filename
 113:;;; -> string file content
 114:;;; ----------------------------------------------------
 115:;;; Load text from file into a string variable and return it.
 116:(define load-file
 117:  (lambda (name)
 118:    (call-with-input-file name
 119:      (lambda (p)
 120:        (read-string (file-size name) p)))))
 121:
 122:
 123:;;; main steps
 124:(begin 
 125:  (vector-for-each (lambda (i fname)
 126:                     (let ([text (load-file fname)])
 127:                       (string-lowercase! text)
 128:                       (add-bigrams (string-tokenize (pregexp-replace* "[`'-.,!?;:<>()|\"\\]\\[$%/]+" text " ")))
 129:                       ;(add-bigrams (string-tokenize text))
 130:                       ))
 131:                   argv)
 132:  (printf "bigram\tfrequency\trelative frequency\tmutual information\n")
 133:  (let ([result (sort-by-value bigrams)])
 134:    (for-each (lambda (a)
 135:                (let ([bigram (car a)])
 136:                  ;(unless (or (member (car bigram) stopwords) (member (cadr bigram) stopwords))
 137:                    (printf "~a ~a\t~a\t~a\t~a\n"
 138:                            (car bigram)
 139:                            (cadr bigram)
 140:                            (cadr a)
 141:                            (/ (cadr a) bigramcount)
 142:                            (mutual-information (/ (hash-table-get types (car bigram)) tokencount)
 143:                                                (/ (hash-table-get types (cadr bigram)) tokencount)
 144:                                                (/ (cadr a) bigramcount)))
 145:                    ;)
 146:                  ))
 147:              (reverse result)))
 148:  (printf "---------------------------------------------------------\n")
 149:  (printf "Number of bigrams\t~a\n" bigramcount)
 150:  (printf "Number of bigram-types\t~a\n" (hash-table-count bigrams))
 151:  (printf "Number of tokens\t~a\n" tokencount)
 152:  (printf "Number of types\t~a\n" (hash-table-count types)))