1:":"; exec mzscheme -r $0 "$@"
   2:
   3:;;; ----------------------------------------------------
   4:;;; Filename: countunibigrams2.scm
   5:;;; Author:   Damir Cavar <dcavar@unizd.hr>
   6:;;;
   7:;;; (C) 2006 by Damir Cavar
   8:;;;
   9:;;; This code is published under the restrictive GPL!
  10:;;; Please find the text of the GPL here:
  11:;;; http://www.gnu.org/licenses/gpl.txt
  12:;;; 
  13:;;; It is free for use, change, etc. as long as the copyright
  14:;;; note above is included in any modified version of the code.
  15:;;; 
  16:;;; This script assumes that the text is raw and encoded in UTF8.
  17:;;;
  18:;;; Functions:
  19:;;; 1. The text file is loaded into memory.
  20:;;; 2. Punctuation marks are eliminated from the text.
  21:;;; 3. The text is tokenized.
;;; 4. Each pair of adjacent tokens (a bigram) is placed into a
;;;    hash-table as a key; the value is the absolute frequency
;;;    (i.e. count) of that bigram in the text.
;;; 5. Single tokens are placed into a second hash-table, again with
;;;    their absolute frequencies as values.
;;; 6. Bigrams that do not contain stopwords are printed out, together
;;;    with their Mutual Information score (a small worked example
;;;    follows below this header).
;;;
;;; If the command line parameters contain more than one text file,
;;; the above results are accumulated over all the input text files.
;;;
;;; Usage:
;;; mzscheme -r countunibigrams2.scm test1.txt test2.txt ...
;;; ----------------------------------------------------
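
;;; Illustrative sketch of the processing steps (not executed by this
;;; script): given the input line "The cat sat on the mat.", lowercasing
;;; and punctuation removal yield "the cat sat on the mat ", which
;;; tokenizes to ("the" "cat" "sat" "on" "the" "mat"). The hash-tables
;;; would then hold
;;;   types:   "the" -> 2.0, "cat" -> 1.0, "sat" -> 1.0, "on" -> 1.0, "mat" -> 1.0
;;;   bigrams: ("the" "cat") -> 1.0, ("cat" "sat") -> 1.0, ("sat" "on") -> 1.0,
;;;            ("on" "the") -> 1.0, ("the" "mat") -> 1.0
;;; Assuming english.scm lists common function words such as "the" and
;;; "on" as stopwords, only bigrams free of such words appear in the
;;; final listing.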

(require (lib "list.ss"))
(require (lib "string.ss"))   ; for string-lowercase!
(require (lib "string.ss"     "srfi" "13"))   ; for string-tokenize
(require (lib "vector-lib.ss" "srfi" "43"))   ; for vector-for-each
(require (lib "pregexp.ss"))  ; for Perl compatible regular expressions
(load "english.scm")          ; provides the stopwords list


;;; global counters
(define tokencount  0.0) ; total number of words (a float, so derived ratios print as decimals)
(define bigramcount 0.0) ; total number of bigrams

;;; hash-table containers
(define types   (make-hash-table 'equal)) ; words and their counts
(define bigrams (make-hash-table 'equal)) ; bigrams and their counts


;;; sort hash table by value
;;; assuming values = reals/ints
;;; returns a list of key-value tuples (lists), sorted by value in ascending order
(define sort-by-value
  (lambda (table)
    (let ([keyval (hash-table-map table (lambda (key val) (list key val)))])
      (sort keyval (lambda (a b)
                     (< (cadr a) (cadr b)))))))
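
;;; For example (illustrative, not executed here): if the bigrams table
;;; held ("the" "cat") -> 2.0 and ("cat" "sat") -> 1.0, then
;;;   (sort-by-value bigrams)
;;; would return the ascending list
;;;   ((("cat" "sat") 1.0) (("the" "cat") 2.0))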


(define add-data
  (lambda (tokenlist)
    ; a bigram needs at least two tokens; skip empty or one-token lists
    (when (> (length tokenlist) 1)
      (let ([count (- (length tokenlist) 1)])
        (set! tokencount  (+ tokencount  (length tokenlist)))
        (set! bigramcount (+ bigramcount count))
        ; count the first token in the list
        (let ([value (hash-table-get types (car tokenlist) 0.0)])
          (hash-table-put! types (car tokenlist) (+ value 1)))
        ; loop over the rest of the tokens
        (let loop ([i 1])
          (let* ([token  (list-ref       tokenlist i)]              ; token = second element of bigram
                 [bigram (list (list-ref tokenlist (- i 1)) token)] ; bigram = previous and current token as list
                 [wvalue (hash-table-get types   token  0.0)]       ; get value for token
                 [bvalue (hash-table-get bigrams bigram 0.0)])      ; get value for bigram
            (hash-table-put! types   token  (+ wvalue 1))    ; increment counter for token
            (hash-table-put! bigrams bigram (+ bvalue 1)))   ; increment counter for bigram
          (if (< i count)
              (loop (+ i 1))))))))
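
;;; For example (illustrative only, since a real call would add to the
;;; global counts): (add-data '("the" "cat" "sat")) would add 3 to
;;; tokencount, 2 to bigramcount, set the counts of "the", "cat", and
;;; "sat" to 1.0 in types, and the counts of ("the" "cat") and
;;; ("cat" "sat") to 1.0 in bigrams.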


;;; log2 of value
;;; Base transformation:
;;;   log2 is the natural log divided by the natural log of 2 (the base)
(define log2
  (lambda (x)
    (/ (log x) (log 2))))
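
;;; For example (illustrative): (log2 8) evaluates to 3.0 and (log2 2)
;;; to 1.0, up to floating-point rounding.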


;;; mutual-information of P(x), P(y), P(xy)
;;; calculates the pointwise MI term weighted by the bigram probability:
;;;   P(XY) * log2( P(XY) / (P(X) * P(Y)) )
(define mutual-information
  (lambda (px py pxy)
    (* pxy (log2 (/ pxy (* px py))))))
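
;;; For example (illustrative): with px = 0.01, py = 0.02 and pxy = 0.005,
;;; (mutual-information 0.01 0.02 0.005) is 0.005 * log2(25), roughly 0.023.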


;;; load-file (filename)
;;; load text from file into a string variable and return it
(define load-file
  (lambda (name)
    (call-with-input-file name
      (lambda (p)
        (read-string (file-size name) p)))))
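
;;; For example (illustrative): (load-file "test1.txt") returns the
;;; entire contents of test1.txt as one string.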


(begin
  (vector-for-each (lambda (i fname)
                     (printf "Loading file: ~a\n" fname)
                     (let ([text (load-file fname)])
                       (string-lowercase! text)
                       ; strip punctuation, then split the text on whitespace
                       (add-data (string-tokenize
                                  (pregexp-replace* "[`'-.,!?;:<>()|\"\\]\\[$%/]+" text " ")))))
                   argv)
  (let ((result (sort-by-value bigrams)))
    (printf "bigram\tfreq\trel freq\tMI\n")
    (for-each (lambda (a)
                ; skip bigrams that contain a stopword
                (unless (or (member (caar a) stopwords)
                            (member (cadar a) stopwords))
                  (printf "~a ~a\t~a\t~a\t~a\n"
                          (caar a)
                          (cadar a)
                          (cadr a)
                          (/ (cadr a) bigramcount)
                          (mutual-information
                           (/ (hash-table-get types (caar a)) tokencount)
                           (/ (hash-table-get types (cadar a)) tokencount)
                           (/ (cadr a) bigramcount)))))
              (reverse result)))   ; reverse: print the most frequent bigrams first
  (printf "---------------------------------------------------------\n")
  (printf "Number of tokens: ~a\n"  tokencount)
  (printf "Number of types: ~a\n"   (hash-table-count types))
  (printf "Type/Token ratio: ~a\n"  (/ (hash-table-count types) tokencount))
  (printf "Number of bigrams: ~a\n" bigramcount)
  (printf "Number of bigram types: ~a\n" (hash-table-count bigrams))
  (printf "Bigram type/token ratio: ~a\n"  (/ (hash-table-count bigrams) bigramcount)))