1:":"; exec mzscheme -r $0 "$@"
   2:
   3:;;; ----------------------------------------------------
   4:;;; Filename: countunibigrams1.scm
   5:;;; Author:   Damir Cavar <dcavar@unizd.hr>
   6:;;;
   7:;;; (C) 2006 by Damir Cavar
   8:;;;
   9:;;; This code is published under the restrictive GPL!
  10:;;; Please find the text of the GPL here:
  11:;;; http://www.gnu.org/licenses/gpl.txt
  12:;;; 
  13:;;; It is free for use, change, etc. as long as the copyright
  14:;;; note above is included in any modified version of the code.
  15:;;; 
  16:;;; This script assumes that the text is raw and encoded in UTF8.
  17:;;;
  18:;;; Functions:
  19:;;; 1. The text file is loaded into memory.
  20:;;; 2. The text is tokenized, i.e. converted into a list of tokens.
  21:;;; 3. Two adjacent tokens are placed into a hash-table as keys, the value
  22:;;;    is the absolute frequency (i.e. count) of each token in the
  23:;;;    text.
  24:;;; 4. Single tokens are placed in a second hash-table.
  25:;;;
  26:;;; If the command line parameters contain more than one text file,
  27:;;; the above results are accumulated over all the input text files.
  28:;;;
  29:;;; Usage:
  30:;;; mzscheme -r countunibigrams1.scm test1.txt test2.txt ...
  31:;;; ----------------------------------------------------
  32:
  33:
  34:(require (lib "list.ss"))
  35:(require (lib "string.ss")) ; for string-uppercase!
  36:(require (lib "string.ss"     "srfi" "13"))
  37:(require (lib "vector-lib.ss" "srfi" "43"))
  38:
  39:
  40:;;; global counters
  41:(define tokencount  0.0) ; total number of words
  42:(define bigramcount 0.0) ; total number of bigrams
  43:
  44:;;; hash-table containers
  45:(define types   (make-hash-table 'equal)) ; words and their counts
  46:(define bigrams (make-hash-table 'equal)) ; bigrams and their counts
  47:
  48:
  49:;;; sort hash table with by value
  50:;;; assuming values = reals/ints
  51:;;; returning a sorted list of key-value tuples (lists)
  52:(define sort-by-value
  53:  (lambda (table)
  54:    (let ([keyval (hash-table-map table (lambda (key val) (list key val)))])
  55:      (sort keyval (lambda (a b)
  56:                     (< (cadr a) (cadr b)))))))
  57:
  58:
  59:(define add-data
  60:  (lambda (tokenlist)
  61:    (let ([count (- (length tokenlist) 1)])
  62:
  63:      ; remember the total count of tokens and bigrams
  64:      (set! tokencount  (+ tokencount  (length tokenlist)))
  65:      (set! bigramcount (+ bigramcount count))
  66:
  67:      ; count the first token in the list
  68:      (let ([value (hash-table-get types (car tokenlist) 0.0)])
  69:        (hash-table-put! types (car tokenlist) (+ value 1)))
  70:
  71:      ; loop over the rest of the tokens
  72:      (let loop ([i 1])
  73:        (let* ([token  (list-ref       tokenlist i)]              ; token = second element of bigram 
  74:               [bigram (list (list-ref tokenlist (- i 1)) token)] ; bigram = previous and current token as list
  75:               [wvalue (hash-table-get types   token  0.0)]       ; get value for token
  76:               [bvalue (hash-table-get bigrams bigram 0.0)])      ; get value for bigram
  77:          (hash-table-put! types   token  (+ wvalue 1))    ; increment counter for token
  78:          (hash-table-put! bigrams bigram (+ bvalue 1)))   ; increment counter for bigram
  79:        (if (< i count)
  80:            (loop (+ i 1)))))))
  81:
  82:
  83:;;; load text from file into a string variable and return it
  84:(define load-file
  85:  (lambda (name)
  86:      (call-with-input-file name
  87:        (lambda (p)
  88:          (read-string (file-size name) p)))))
  89:
  90:
  91:(begin 
  92:  (vector-for-each (lambda (i fname)
  93:                     ; (printf "Loading file: ~a\n" fname)
  94:                     (add-data (string-tokenize (load-file fname))))
  95:                   argv)
  96:  (let ([result (sort-by-value bigrams)])
  97:    (printf "token\tabs. freq.\trel. freq.\n")
  98:    (for-each (lambda (a)
  99:                (printf "~a\t~a\t~a\n" (car a) (cadr a) (/ (cadr a) bigramcount)))
 100:              (reverse result)))
 101:  (printf "---------------------------------------------------------\n")
 102:  (printf "Number of tokens: ~a\n"  tokencount)
 103:  (printf "Number of types: ~a\n"   (hash-table-count types))
 104:  (printf "Type/Token ratio: ~a\n"  (/ (hash-table-count types) tokencount))
 105:  (printf "Number of bigrams: ~a\n" bigramcount)
 106:  (printf "Number of bigram types: ~a\n" (hash-table-count bigrams))
 107:  (printf "Bigram type/token ratio: ~a\n"  (/ (hash-table-count bigrams) bigramcount)))