1:":"; exec mzscheme -r $0 "$@"
   2:
   3:;;; ----------------------------------------------------
   4:;;; Filename: countbigrams1.scm
   5:;;; Author:   Damir Cavar <dcavar@me.com>
   6:;;;
   7:;;; (C) 2006 by Damir Cavar
   8:;;;
   9:;;; This code is published under the restrictive GPL!
  10:;;; Please find the text of the GPL here:
  11:;;; http://www.gnu.org/licenses/gpl.txt
  12:;;; 
  13:;;; It is free for use, change, etc. as long as the copyright
  14:;;; note above is included in any modified version of the code.
  15:;;; 
  16:;;; This script assumes that the text is raw and encoded in UTF8.
  17:;;;
  18:;;; Functions:
  19:;;; 1. The text file is loaded into memory.
  20:;;; 2. The text is tokenized, i.e. converted into a list of tokens.
  21:;;; 3. Two adjacent tokens are placed into a hash-table as keys, the value
  22:;;;    is the absolute frequency (i.e. count) of each token in the
  23:;;;    text.
  24:;;; 4. The hash-table is converted into a list of key-value tuples.
  25:;;; 5. The key-values are sorted by value, and a list of bigrams
  26:;;;    and their frequency is printed out.
  27:;;;
  28:;;; If the command line parameters contain more than one text file,
  29:;;; the above results are accumulated over all the input text files.
  30:;;;
  31:;;; Usage:
  32:;;; mzscheme -r countbigrams1.scm test1.txt test2.txt ...
  33:;;; ----------------------------------------------------
  34:
  35:
  36:;;; all necessary libraries and functions
  37:(require (lib "list.ss"))
  38:(require (lib "string.ss"     "srfi" "13"))
  39:(require (lib "vector-lib.ss" "srfi" "43"))
  40:
  41:
  42:;;; Global variables
  43:(define bigramcount 0.0)                      ; total number of bigrams
  44:(define bigrams     (make-hash-table 'equal)) ; hash-table container for bigram-count pairs
  45:
  46:
  47:;;; print-wordlist
  48:;;; <- hash-table of key-value pairs
  49:;;; side effect: print out of tab-delimited key-value pairs per line
  50:;;; ----------------------------------------------------
  51:;;; Pretty print wordlist as tab-delimited key-value lines.
  52:(define print-bigramlist!
  53:  (lambda (table)
  54:    (hash-table-for-each table
  55:                         (lambda (key value)
  56:                           (printf "~a ~a\t~a\n" (car key) (cadr key) value)))))
  57:
  58:
  59:;;; sort-by-value
  60:;;; <- hash-table
  61:;;; -> list of key-value tuples (lists)
  62:;;; ----------------------------------------------------
  63:;;; Sort a hash-table of key-value pairs by value, by converting it
  64:;;; into a list of key-value tuples and sorting on the value.
  65:(define sort-by-value
  66:  (lambda (table)
  67:    (let ([keyval (hash-table-map table (lambda (key val) (list key val)))])
  68:      (sort keyval (lambda (a b)
  69:                     (< (cadr a) (cadr b)))))))
  70:
  71:
  72:;;; add-bigrams
  73:;;; <- list of strings, i.e. token list
  74:;;; !-> updated hash-table bigram-hash
  75:;;; !-> updated count-bigrams counter
  76:;;; ----------------------------------------------------
  77:;;; Add word-bigrams from an ordered list of tokens to the hash-table
  78:;;; container and keep track of their count.
  79:(define add-bigrams
  80:  (lambda (words)
  81:    (let ([count (- (length words) 1)])      ; how many bigrams
  82:      (set! bigramcount (+ bigramcount count)) ; remember the total count
  83:      (let loop ([i 1])
  84:        (let* ([bigram (list (list-ref words (- i 1)) (list-ref words i))]  ; create bigram
  85:               [value  (hash-table-get bigrams bigram 0.0)])             ; get the calue for bigram
  86:          (hash-table-put! bigrams bigram (+ value 1)))
  87:        (if (< i count)
  88:            (loop (+ i 1)))))))
  89:
  90:
  91:;;; load-file
  92:;;; <- string filename
  93:;;; -> string file content
  94:;;; ----------------------------------------------------
  95:;;; Load text from file into a string variable and return it.
  96:(define load-file
  97:  (lambda (name)
  98:      (call-with-input-file name
  99:        (lambda (p)
 100:          (read-string (file-size name) p)))))
 101:
 102:
 103:;;; main steps
 104:(begin 
 105:  (vector-for-each (lambda (i fname)
 106:                     (printf "Loading file: ~a\n" fname)
 107:                     (add-bigrams (string-tokenize (load-file fname))))
 108:                   argv)
 109:  (printf "Number of tokens: ~a\n" bigramcount)
 110:  (printf "Number of types: ~a\n" (hash-table-count bigrams))
 111:  (printf "---------------------------------------------------------\n")
 112:  ;(print-bigramlist! bigrams)
 113:  (let ([result (sort-by-value bigrams)])
 114:    (printf "Decreasing frequency profile:\n")
 115:    (for-each (lambda (a)
 116:                (let ([bigram (car a)])
 117:                  (printf "~a ~a\t~a\n" (car bigram) (cadr bigram) (cadr a))))
 118:              (reverse result))))