1:":"; exec mzscheme -r $0 "$@"
   2:
   3:;;; ----------------------------------------------------
   4:;;; Filename: countbigrams2.scm
   5:;;; Author:   Damir Cavar <dcavar@unizd.hr>
   6:;;;
   7:;;; (C) 2006 by Damir Cavar
   8:;;;
   9:;;; This code is published under the restrictive GPL!
  10:;;; Please find the text of the GPL here:
  11:;;; http://www.gnu.org/licenses/gpl.txt
  12:;;; 
  13:;;; It is free for use, change, etc. as long as the copyright
  14:;;; note above is included in any modified version of the code.
  15:;;; 
  16:;;; This script assumes that the text is raw and encoded in UTF8.
  17:;;;
  18:;;; Functions:
  19:;;; 1. The text file is loaded into memory.
  20:;;; 2. The text is tokenized, i.e. converted into a list of tokens.
  21:;;; 3. Two adjacent tokens are placed into a hash-table as keys, the value
  22:;;;    is the absolute frequency (i.e. count) of each token in the
  23:;;;    text.
  24:;;; 4. The hash-table is converted into a list of key-value tuples.
  25:;;; 5. The key-values are sorted by value, and a list of bigrams
  26:;;;    and their frequency is printed out.
  27:;;;
  28:;;; If the command line parameters contain more than one text file,
  29:;;; the above results are accumulated over all the input text files.
  30:;;;
  31:;;; Usage:
  32:;;; mzscheme -r countbigrams2.scm test1.txt test2.txt ...
  33:;;; ----------------------------------------------------
  34:
  35:
  36:;;; all necessary libraries and functions
  37:(require (lib "list.ss"))
  38:(require (lib "string.ss"))   ; for string-uppercase!
  39:(require (lib "string.ss"     "srfi" "13"))
  40:(require (lib "vector-lib.ss" "srfi" "43"))  ; for vector-for-each
  41:(require (lib "pregexp.ss"))  ; for Perl compatible regular expressions
  42:
  43:
  44:;;; Global variables
  45:(define bigramcount 0.0)                      ; total number of bigrams
  46:(define bigrams     (make-hash-table 'equal)) ; hash-table container for bigram-count pairs
  47:
  48:
  49:;;; print-wordlist
  50:;;; <- hash-table of key-value pairs
  51:;;; side effect: print out of tab-delimited key-value pairs per line
  52:;;; ----------------------------------------------------
  53:;;; Pretty print wordlist as tab-delimited key-value lines.
  54:(define print-bigramlist!
  55:  (lambda (table)
  56:    (hash-table-for-each table
  57:                         (lambda (key value)
  58:                           (printf "~a ~a\t~a\n" (car key) (cadr key) value)))))
  59:
  60:
  61:;;; sort-by-value
  62:;;; <- hash-table
  63:;;; -> list of key-value tuples (lists)
  64:;;; ----------------------------------------------------
  65:;;; Sort a hash-table of key-value pairs by value, by converting it
  66:;;; into a list of key-value tuples and sorting on the value.
  67:(define sort-by-value
  68:  (lambda (table)
  69:    (let ([keyval (hash-table-map table (lambda (key val)
  70:                                          (list key val)))])
  71:      (sort keyval (lambda (a b)
  72:                     (< (cadr a) (cadr b)))))))
  73:
  74:
  75:;;; add-bigrams
  76:;;; <- list of strings, i.e. token list
  77:;;; !-> updated hash-table bigram-hash
  78:;;; !-> updated count-bigrams counter
  79:;;; ----------------------------------------------------
  80:;;; Add word-bigrams from an ordered list of tokens to the hash-table
  81:;;; container and keep track of their count.
  82:(define add-bigrams
  83:  (lambda (words)
  84:    (let ([count (- (length words) 1)])      ; how many bigrams
  85:      (set! bigramcount (+ bigramcount count)) ; remember the total count
  86:      (let loop ([i 1])
  87:        (let* ([bigram (list (list-ref words (- i 1)) (list-ref words i))]  ; create bigram
  88:               [value  (hash-table-get bigrams bigram 0.0)])             ; get the calue for bigram
  89:          (hash-table-put! bigrams bigram (+ value 1)))
  90:        (if (< i count)
  91:            (loop (+ i 1)))))))
  92:
  93:
  94:;;; load-file
  95:;;; <- string filename
  96:;;; -> string file content
  97:;;; ----------------------------------------------------
  98:;;; Load text from file into a string variable and return it.
  99:(define load-file
 100:  (lambda (name)
 101:      (call-with-input-file name
 102:        (lambda (p)
 103:          (read-string (file-size name) p)))))
 104:
 105:
 106:;;; main steps
 107:(begin 
 108:  (vector-for-each (lambda (i fname)
 109:                     (printf "Loading file: ~a\n" fname)
 110:                     (let ([text (load-file fname)])
 111:                       (string-lowercase! text)
 112:                       (add-bigrams (string-tokenize (pregexp-replace* "[`'-.,!?;:<>()|\"\\]\\[$%/]+" text " ")))))
 113:                   argv)
 114:  (printf "Number of tokens: ~a\n" bigramcount)
 115:  (printf "Number of types: ~a\n" (hash-table-count bigrams))
 116:  (printf "---------------------------------------------------------\n")
 117:  (print-bigramlist! bigrams)
 118:  (let ([result (sort-by-value bigrams)])
 119:    (printf "Sorted decreasing:\n")
 120:    (for-each (lambda (a)
 121:                (let ([bigram (car a)])
 122:                  (printf "~a ~a\t~a\n" (car bigram) (cadr bigram) (cadr a))))
 123:              (reverse result))))