1:":"; exec mzscheme -r $0 "$@"

;;; ----------------------------------------------------
;;; Filename: countwords1.scm
;;; Author:   Damir Cavar <dcavar@me.com>
;;;
;;; (C) 2006 by Damir Cavar
;;;
;;; This code is published under the GNU General Public License (GPL).
;;; Please find the text of the GPL here:
;;; http://www.gnu.org/licenses/gpl.txt
;;;
;;; It is free to use, modify, and redistribute, as long as the
;;; copyright notice above is included in any modified version of
;;; the code.
;;;
;;; This script assumes that the text is raw and encoded in UTF-8.
;;;
;;; Functions:
;;; 1. The text file is loaded into memory.
;;; 2. The text is tokenized, i.e. converted into a list of tokens.
;;; 3. The tokens are placed into a hash-table as keys; the value
;;;    is the absolute frequency (i.e. count) of each token in the
;;;    text.
;;; 4. The hash-table is converted into a list of key-value tuples.
;;; 5. The key-value tuples are sorted by value, and the tokens with
;;;    their frequencies are printed out in increasing as well as in
;;;    decreasing order.
;;; 6. A list of tokens with their absolute and relative frequencies
;;;    is printed out.
;;;
;;; If the command-line arguments contain more than one text file,
;;; the above results are accumulated over all the input text files.
;;;
;;; Usage:
;;; mzscheme -r countwords1.scm test1.txt test2.txt ...
;;; (An illustrative example of the output is sketched below.)
;;; ----------------------------------------------------
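
;;; Illustrative example (a sketch, not real output; the file name and
;;; its contents are hypothetical -- assume test1.txt contains only the
;;; five words "the cat saw the dog"):
;;;
;;;   $ mzscheme -r countwords1.scm test1.txt
;;;   Loading file: test1.txt
;;;   Number of tokens: 5.0
;;;   Number of types: 4
;;;   Type/Token ratio: 0.8
;;;   ---------------------------------------------------------
;;;   the     2.0
;;;   cat     1.0
;;;   ...
;;; (followed by the increasing, decreasing, and relative-frequency
;;; listings in the same tab-delimited format)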

;;; all necessary libraries and functions
(require (lib "list.ss"))                     ; list utilities: sort
(require (lib "string.ss"     "srfi" "13"))   ; SRFI-13 strings: string-tokenize
(require (lib "vector-lib.ss" "srfi" "43"))   ; SRFI-43 vectors: vector-for-each


;;; Global variables
(define count-words 0.0)                      ; total number of tokens (kept inexact so ratios print as decimals)
(define word-hash   (make-hash-table 'equal)) ; hash-table container for token-count pairs


;;; print-wordlist!
;;; <- hash-table of key-value pairs
;;; side effect: prints one tab-delimited key-value pair per line
;;; ----------------------------------------------------
;;; Pretty print the wordlist as tab-delimited key-value lines.
(define print-wordlist!
  (lambda (table)
    (hash-table-for-each table
                         (lambda (key value)
                           (printf "~a\t~a\n" key value)))))
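
;;; Example (illustrative; assume word-hash currently maps "the" to 2.0
;;; and "cat" to 1.0 -- hypothetical contents):
;;;   (print-wordlist! word-hash)
;;;   ;; prints, in unspecified hash-table order:
;;;   ;;   the     2.0
;;;   ;;   cat     1.0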


;;; sort-by-value
;;; <- hash-table
;;; -> list of key-value tuples (lists), sorted by value in increasing order
;;; ----------------------------------------------------
;;; Sort a hash-table of key-value pairs by value, by converting it
;;; into a list of key-value tuples and sorting on the value.
(define sort-by-value
  (lambda (table)
    (let ([keyval (hash-table-map table (lambda (key val) (list key val)))])
      (sort keyval (lambda (a b)
                     (< (cadr a) (cadr b)))))))
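
;;; Example (illustrative; same hypothetical table contents as above):
;;;   (sort-by-value word-hash)
;;;   ;; => (("cat" 1.0) ("the" 2.0))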


;;; add-words
;;; <- list of strings, i.e. token list
;;; !-> updated hash-table word-hash
;;; !-> updated count-words counter
;;; ----------------------------------------------------
;;; Add words/tokens from an ordered list of tokens to the hash-table
;;; container and keep track of their count.
(define add-words
  (lambda (words)
    (set! count-words (+ count-words (length words)))  ; increment the total token counter
    (for-each (lambda (token)
                ;; look up the current count, defaulting to 0.0 for unseen
                ;; tokens (given as a failure thunk), and increment it
                (let ((value (hash-table-get word-hash token (lambda () 0.0))))
                  (hash-table-put! word-hash token (+ value 1))))
              words)))
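
;;; Example (illustrative; assume word-hash starts out empty):
;;;   (add-words (list "the" "cat" "saw" "the" "dog"))
;;;   ;; word-hash now maps "the" -> 2.0 and "cat", "saw", "dog" -> 1.0;
;;;   ;; count-words is 5.0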


;;; load-file
;;; <- string filename
;;; -> string file content
;;; ----------------------------------------------------
;;; Load text from file into a string variable and return it.
(define load-file
  (lambda (name)
    ;; file-size is the size in bytes, an upper bound on the number of
    ;; characters, so read-string reads up to the end of the file
    (let ([size (file-size name)])
      (call-with-input-file name
        (lambda (p)
          (read-string size p))))))
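
;;; Example (illustrative; test1.txt is a hypothetical file name):
;;;   (string-tokenize (load-file "test1.txt"))
;;;   ;; => the file's contents split on whitespace, e.g.
;;;   ;;    ("the" "cat" "saw" "the" "dog")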


;;; ----------------------------------------------------
;;; main steps
(begin
  ;; argv is the vector of command-line arguments, i.e. the input file names
  (vector-for-each (lambda (i fname)
                     (printf "Loading file: ~a\n" fname)
                     (add-words (string-tokenize (load-file fname))))
                   argv)
  (printf "Number of tokens: ~a\n" count-words)
  (printf "Number of types: ~a\n" (hash-table-count word-hash))
  (printf "Type/Token ratio: ~a\n" (/ (hash-table-count word-hash) count-words))
  (printf "---------------------------------------------------------\n")
  (print-wordlist! word-hash)
  (let ((result (sort-by-value word-hash)))
    (printf "---------------------------------------------------------\n")
    (printf "Sorted increasing:\n")
    (for-each (lambda (a)
                (printf "~a\t~a\n" (car a) (cadr a)))
              result)
    (printf "---------------------------------------------------------\n")
    (printf "Sorted decreasing:\n")
    (for-each (lambda (a)
                (printf "~a\t~a\n" (car a) (cadr a)))
              (reverse result))
    (printf "---------------------------------------------------------\n")
    (printf "Sorted decreasing with relative frequency:\n")
    (printf "token\tabsolute frequency\trelative frequency\n")
    (for-each (lambda (a)
                ;; relative frequency = token count / total number of tokens
                (printf "~a\t~a\t~a\n" (car a) (cadr a) (/ (cadr a) count-words)))
              (reverse result))))