1:":"; exec mzscheme -r $0 "$@"
   2:
   3:;;; ----------------------------------------------------
   4:;;; Filename: chartrigrams.scm
   5:;;; Author:   Damir Cavar <dcavar@me.com>
   6:;;;
   7:;;; (C) 2006 by Damir Cavar
   8:;;;
   9:;;; This code is published under the restrictive GPL!
  10:;;; Please find the text of the GPL here:
  11:;;; http://www.gnu.org/licenses/gpl.txt
  12:;;; 
  13:;;; It is free for use, change, etc. as long as the copyright
  14:;;; note above is included in any modified version of the code.
  15:;;; 
  16:;;; This script assumes that the text is raw and encoded in UTF8.
  17:;;;
  18:;;; Functions:
  19:;;; 1. The text file is loaded into memory.
  20:;;; 2. Trigrams of characters are created from the corpus.
  21:;;; 3. The hash-table is converted into a list of key-value tuples.
  22:;;; 4. The key-values are sorted by value, and a list of tokens
  23:;;;    and their relative frequency is printed out.
  24:;;;
  25:;;; If the command line parameters contain more than one text file,
  26:;;; the above results are accumulated over all the input text files.
  27:;;;
  28:;;; Usage:
  29:;;; mzscheme -r chartrigrams.scm test1.txt test2.txt ...
  30:;;; ----------------------------------------------------
  31:
  32:
  33:;;; all required libraries and functions
  34:(require (lib "vector-lib.ss" "srfi" "43")) ; for vector-for-each
  35:(require (lib "list.ss"))                   ; for sort
  36:
  37:
  38:;;; Global variables
  39:(define trigramcount 0.0)                      ; counter of total number tokens
  40:(define trigrams     (make-hash-table 'equal)) ; hash-table for tokens and counts
  41:
  42:
  43:;;; sort-by-value
  44:;;; <- hash-table
  45:;;; -> list of key-value tuples (lists)
  46:;;; ----------------------------------------------------
  47:;;; Sort a hash-table of key-value pairs by value, by converting it
  48:;;; into a list of key-value tuples and sorting on the value.
  49:(define sort-by-value
  50:  (lambda (table)
  51:    (let ([keyval (hash-table-map table (lambda (key val) (list key val)))])
  52:      (sort keyval (lambda (a b)
  53:                     (< (cadr a) (cadr b)))))))
  54:
  55:
  56:;;; add-words
  57:;;; <- list of characters, i.e. string
  58:;;; !-> updated hash-table trigrams
  59:;;; !-> updated trigramcount counter
  60:;;; ----------------------------------------------------
  61:;;; Add words/tokens from an ordered list of tokens to the hash-table
  62:;;; container and keep track of their count.
  63:(define add-trigrams
  64:  (lambda (text)
  65:    (let ([max (- (string-length text) 2)])
  66:      (set! trigramcount (+ trigramcount max))  ; increment the total number of tokens
  67:      (let loop ([i 0])
  68:        (let* ([token (substring text i (+ i 3))]
  69:               [value (hash-table-get trigrams token 0.0)])
  70:          (hash-table-put! trigrams token (+ value 1)))
  71:        (if (< i (- max 1))
  72:            (loop (+ i 1)))))))
  73:
  74:
  75:;;; load-file
  76:;;; <- string filename
  77:;;; -> string file content
  78:;;; ----------------------------------------------------
  79:;;; Load text from file into a string variable and return it.
  80:(define load-file
  81:  (lambda (name)
  82:      (call-with-input-file name
  83:        (lambda (p)
  84:          (read-string (file-size name) p)))))
  85:
  86:
  87:;;; ----------------------------------------------------
  88:;;; main steps
  89:(begin 
  90:  (vector-for-each (lambda (i fname)
  91:                     (printf "Loading file: ~a\n" fname)
  92:                     (add-trigrams (load-file fname)))
  93:                   argv)
  94:  (printf "Number of tokens: ~a\n" trigramcount)
  95:  (printf "Number of types: ~a\n" (hash-table-count trigrams))
  96:  (printf "Type/Token ratio: ~a\n" (/ (hash-table-count trigrams) trigramcount))
  97:  (let ([result (sort-by-value trigrams)])
  98:    (printf "---------------------------------------------------------\n")
  99:    (printf "Sorted decreasing with relative frequency:\n")
 100:    (printf "token\tabsolute frequency\trelative frequency\n")
 101:    (for-each (lambda (a)
 102:                (write (car a))
 103:                (printf "\t~a\t~a\n" (cadr a) (/ (cadr a) trigramcount)))
 104:              (reverse result))))
 105: