1:":"; exec mzscheme -r $0 "$@"
   2:
   3:;;; ----------------------------------------------------
   4:;;; Filename: countbigrams4.scm
   5:;;; Author:   Damir Cavar <dcavar@unizd.hr>
   6:;;;
   7:;;; (C) 2006 by Damir Cavar
   8:;;;
   9:;;; This code is published under the restrictive GPL!
  10:;;; Please find the text of the GPL here:
  11:;;; http://www.gnu.org/licenses/gpl.txt
  12:;;; 
  13:;;; It is free for use, change, etc. as long as the copyright
  14:;;; note above is included in any modified version of the code.
  15:;;; 
  16:;;; This script assumes that the text is raw and encoded in UTF8.
  17:;;;
  18:;;; Functions:
  19:;;; 1. The text file is loaded into memory.
  20:;;; 2. The text is tokenized, i.e. converted into a list of tokens.
  21:;;; 3. Two adjacent tokens are placed into a hash-table as keys, the value
  22:;;;    is the absolute frequency (i.e. count) of each token in the
  23:;;;    text.
  24:;;; 4. The hash-table is converted into a list of key-value tuples.
  25:;;; 5. The key-values are sorted by value, and a list of bigrams
  26:;;;    and their frequency is printed out.
  27:;;;
  28:;;; If the command line parameters contain more than one text file,
  29:;;; the above results are accumulated over all the input text files.
  30:;;;
  31:;;; Usage:
  32:;;; mzscheme -r countbigrams4.scm test1.txt test2.txt ...
  33:;;; ----------------------------------------------------
  34:
  35:
  36:;;; all necessary libraries and functions
  37:(require (lib "list.ss"))
  38:(require (lib "string.ss"))   ; for string-uppercase!
  39:(require (lib "string.ss"     "srfi" "13"))
  40:(require (lib "vector-lib.ss" "srfi" "43"))  ; for vector-for-each
  41:(require (lib "pregexp.ss"))  ; for Perl compatible regular expressions
  42:
  43:(load "english.scm")
  44:
  45:;;; Global variables
  46:(define bigramcount 0.0)                      ; total number of bigrams
  47:(define bigrams     (make-hash-table 'equal)) ; hash-table container for bigram-count pairs
  48:(define tokencount  0.0)                      ; total number of tokens
  49:(define types       (make-hash-table 'equal)) ; hash-table container for type-count pairs
  50:
  51:
  52:;;; sort-by-value
  53:;;; <- hash-table
  54:;;; -> list of key-value tuples (lists)
  55:;;; ----------------------------------------------------
  56:;;; Sort a hash-table of key-value pairs by value, by converting it
  57:;;; into a list of key-value tuples and sorting on the value.
  58:(define sort-by-value
  59:  (lambda (table)
  60:    (let ([keyval (hash-table-map table (lambda (key val)
  61:                                          (list key val)))])
  62:      (sort keyval (lambda (a b)
  63:                     (< (cadr a) (cadr b)))))))
  64:
  65:
  66:;;; add-bigrams
  67:;;; <- list of strings, i.e. token list
  68:;;; !-> updated hash-table bigram-hash
  69:;;; !-> updated count-bigrams counter
  70:;;; ----------------------------------------------------
  71:;;; Add word-bigrams from an ordered list of tokens to the hash-table
  72:;;; container and keep track of their count.
  73:(define add-bigrams
  74:  (lambda (words)
  75:    (let ([count (- (length words) 1)])          ; how many bigrams
  76:      (set! bigramcount (+ bigramcount count))     ; remember total count of bigrams
  77:      (set! tokencount  (+ tokencount  (length words))) ; remember total count of tokens
  78:      ; add first token to hash-table
  79:      (let ([val (hash-table-get types (car words) 0.0)])
  80:        (hash-table-put! types (car words) (+ val 1)))
  81:      ; loop over the rest and create tokens and bigrams
  82:      (let loop ([i 1])
  83:        ; add next token to hash-table
  84:        (let ([val (hash-table-get types (list-ref words i) 0.0)])
  85:          (hash-table-put! types (list-ref words i) (+ val 1)))
  86:        ; add next bigram to hash-table
  87:        (let* ([bigram (list (list-ref words (- i 1)) (list-ref words i))]  ; create bigram
  88:               [value  (hash-table-get bigrams bigram 0.0)])             ; get the calue for bigram
  89:          (hash-table-put! bigrams bigram (+ value 1)))
  90:        (if (< i count)
  91:            (loop (+ i 1)))))))
  92:
  93:
  94:;;; load-file
  95:;;; <- string filename
  96:;;; -> string file content
  97:;;; ----------------------------------------------------
  98:;;; Load text from file into a string variable and return it.
  99:(define load-file
 100:  (lambda (name)
 101:    (call-with-input-file name
 102:      (lambda (p)
 103:        (read-string (file-size name) p)))))
 104:
 105:
 106:;;; main steps
 107:(begin 
 108:  (vector-for-each (lambda (i fname)
 109:                     (let ([text (load-file fname)])
 110:                       (string-lowercase! text)
 111:                       (add-bigrams (string-tokenize (pregexp-replace* "[`'-.,!?;:<>()|\"\\]\\[$%/]+" text " ")))))
 112:                   argv)
 113:  (let ([result (sort-by-value bigrams)])
 114:    (for-each (lambda (a)
 115:                (let ([bigram (car a)])
 116:                  (unless (or (member (car bigram) stopwords) (member (cadr bigram) stopwords))
 117:                    (printf "~a ~a\t~a\n" (car bigram) (cadr bigram) (cadr a)))))
 118:              (reverse result)))
 119:  (printf "---------------------------------------------------------\n")
 120:  (printf "Number of bigrams\t~a\n" bigramcount)
 121:  (printf "Number of bigram-types\t~a\n" (hash-table-count bigrams))
 122:  (printf "Number of tokens\t~a\n" tokencount)
 123:  (printf "Number of types\t~a\n" (hash-table-count types)))