1:":"; exec mzscheme -r $0 "$@"
   2:
   3:;;; ----------------------------------------------------
   4:;;; Filename: countbigrams3.scm
   5:;;; Author:   Damir Cavar <dcavar@me.com>
   6:;;;
   7:;;; (C) 2006 by Damir Cavar
   8:;;;
   9:;;; This code is published under the restrictive GPL!
  10:;;; Please find the text of the GPL here:
  11:;;; http://www.gnu.org/licenses/gpl.txt
  12:;;; 
  13:;;; It is free for use, change, etc. as long as the copyright
  14:;;; note above is included in any modified version of the code.
  15:;;; 
  16:;;; This script assumes that the text is raw and encoded in UTF8.
  17:;;;
  18:;;; Functions:
  19:;;; 1. The text file is loaded into memory.
  20:;;; 2. The text is tokenized, i.e. converted into a list of tokens.
  21:;;; 3. Two adjacent tokens are placed into a hash-table as keys, the value
  22:;;;    is the absolute frequency (i.e. count) of each token in the
  23:;;;    text.
  24:;;; 4. The hash-table is converted into a list of key-value tuples.
  25:;;; 5. The key-values are sorted by value, and a list of bigrams
  26:;;;    and their frequency is printed out.
  27:;;;
  28:;;; If the command line parameters contain more than one text file,
  29:;;; the above results are accumulated over all the input text files.
  30:;;;
  31:;;; Usage:
  32:;;; mzscheme -r countbigrams3.scm test1.txt test2.txt ...
  33:;;; ----------------------------------------------------
  34:
  35:
  36:;;; all necessary libraries and functions
  37:(require (lib "list.ss"))
  38:(require (lib "string.ss"))   ; for string-uppercase!
  39:(require (lib "string.ss"     "srfi" "13"))
  40:(require (lib "vector-lib.ss" "srfi" "43"))  ; for vector-for-each
  41:(require (lib "pregexp.ss"))  ; for Perl compatible regular expressions
  42:
  43:(load "english.scm")
  44:
  45:;;; Global variables
  46:(define bigramcount 0.0)                      ; total number of bigrams
  47:(define bigrams     (make-hash-table 'equal)) ; hash-table container for bigram-count pairs
  48:
  49:
  50:;;; sort-by-value
  51:;;; <- hash-table
  52:;;; -> list of key-value tuples (lists)
  53:;;; ----------------------------------------------------
  54:;;; Sort a hash-table of key-value pairs by value, by converting it
  55:;;; into a list of key-value tuples and sorting on the value.
  56:(define sort-by-value
  57:  (lambda (table)
  58:    (let ([keyval (hash-table-map table (lambda (key val) (list key val)))])
  59:      (sort keyval (lambda (a b)
  60:                     (< (cadr a) (cadr b)))))))
  61:
  62:
  63:;;; add-bigrams
  64:;;; <- list of strings, i.e. token list
  65:;;; !-> updated hash-table bigram-hash
  66:;;; !-> updated count-bigrams counter
  67:;;; ----------------------------------------------------
  68:;;; Add word-bigrams from an ordered list of tokens to the hash-table
  69:;;; container and keep track of their count.
  70:(define add-bigrams
  71:  (lambda (words)
  72:    (let ([count (- (length words) 1)])          ; how many bigrams
  73:      (set! bigramcount (+ bigramcount count)) ; remember the total count
  74:      (let loop ([i 1])
  75:        (let* ([bigram (list (list-ref words (- i 1)) (list-ref words i))]  ; create bigram
  76:               [value  (hash-table-get bigrams bigram 0.0)])             ; get the calue for bigram
  77:          (hash-table-put! bigrams bigram (+ value 1)))
  78:        (if (< i count)
  79:            (loop (+ i 1)))))))
  80:
  81:
  82:;;; load-file
  83:;;; <- string filename
  84:;;; -> string file content
  85:;;; ----------------------------------------------------
  86:;;; Load text from file into a string variable and return it.
  87:(define load-file
  88:  (lambda (name)
  89:    (call-with-input-file name
  90:      (lambda (p)
  91:        (read-string (file-size name) p)))))
  92:
  93:
  94:;;; main steps
  95:(begin 
  96:  (vector-for-each (lambda (i fname)
  97:                     (let ([text (load-file fname)])
  98:                       (string-lowercase! text) ; normalize the tokens to lowercase
  99:                       ; replace punctuation marks in text before tokenization
 100:                       (add-bigrams (string-tokenize (pregexp-replace* "[`'-.,!?;:<>()|\"\\]\\[$%/]+" text " ")))))
 101:                   argv)
 102:  (let ([result (sort-by-value bigrams)])
 103:    (for-each (lambda (a)
 104:                (let ([bigram (car a)])
 105:                  (unless (or (member (car bigram) stopwords) (member (cadr bigram) stopwords))
 106:                    (printf "~a ~a\t~a\n" (car bigram) (cadr bigram) (cadr a)))))
 107:              (reverse result)))
 108:  (printf "---------------------------------------------------------\n")
 109:  (printf "Number of tokens\t~a\n" bigramcount)
 110:  (printf "Number of types\t~a\n" (hash-table-count bigrams)))