1:":"; exec mzscheme -r $0 "$@"
2:
3:;;; ----------------------------------------------------
4:;;; Filename: chartrigrams.scm
5:;;; Author: Damir Cavar <dcavar@me.com>
6:;;;
7:;;; (C) 2006 by Damir Cavar
8:;;;
9:;;; This code is published under the restrictive GPL!
10:;;; Please find the text of the GPL here:
11:;;; http://www.gnu.org/licenses/gpl.txt
12:;;;
13:;;; It is free for use, change, etc. as long as the copyright
14:;;; note above is included in any modified version of the code.
15:;;;
16:;;; This script assumes that the text is raw and encoded in UTF8.
17:;;;
18:;;; Functions:
19:;;; 1. The text file is loaded into memory.
20:;;; 2. Trigrams of characters are created from the corpus.
21:;;; 3. The hash-table is converted into a list of key-value tuples.
22:;;; 4. The key-values are sorted by value, and a list of tokens
23:;;; and their relative frequency is printed out.
24:;;;
25:;;; If the command line parameters contain more than one text file,
26:;;; the above results are accumulated over all the input text files.
27:;;;
28:;;; Usage:
29:;;; mzscheme -r chartrigrams.scm test1.txt test2.txt ...
30:;;; ----------------------------------------------------
31:
32:
33:;;; all required libraries and functions
34:(require (lib "vector-lib.ss" "srfi" "43")) ; for vector-for-each
35:(require (lib "list.ss")) ; for sort
36:
37:
38:;;; Global variables
39:(define trigramcount 0.0) ; counter of total number tokens
40:(define trigrams (make-hash-table 'equal)) ; hash-table for tokens and counts
41:
42:
43:;;; sort-by-value
44:;;; <- hash-table
45:;;; -> list of key-value tuples (lists)
46:;;; ----------------------------------------------------
47:;;; Sort a hash-table of key-value pairs by value, by converting it
48:;;; into a list of key-value tuples and sorting on the value.
49:(define sort-by-value
50: (lambda (table)
51: (let ([keyval (hash-table-map table (lambda (key val) (list key val)))])
52: (sort keyval (lambda (a b)
53: (< (cadr a) (cadr b)))))))
54:
55:
56:;;; add-words
57:;;; <- list of characters, i.e. string
58:;;; !-> updated hash-table trigrams
59:;;; !-> updated trigramcount counter
60:;;; ----------------------------------------------------
61:;;; Add words/tokens from an ordered list of tokens to the hash-table
62:;;; container and keep track of their count.
63:(define add-trigrams
64: (lambda (text)
65: (let ([max (- (string-length text) 2)])
66: (set! trigramcount (+ trigramcount max)) ; increment the total number of tokens
67: (let loop ([i 0])
68: (let* ([token (substring text i (+ i 3))]
69: [value (hash-table-get trigrams token 0.0)])
70: (hash-table-put! trigrams token (+ value 1)))
71: (if (< i (- max 1))
72: (loop (+ i 1)))))))
73:
74:
75:;;; load-file
76:;;; <- string filename
77:;;; -> string file content
78:;;; ----------------------------------------------------
79:;;; Load text from file into a string variable and return it.
80:(define load-file
81: (lambda (name)
82: (call-with-input-file name
83: (lambda (p)
84: (read-string (file-size name) p)))))
85:
86:
87:;;; ----------------------------------------------------
88:;;; main steps
89:(begin
90: (vector-for-each (lambda (i fname)
91: (printf "Loading file: ~a\n" fname)
92: (add-trigrams (load-file fname)))
93: argv)
94: (printf "Number of tokens: ~a\n" trigramcount)
95: (printf "Number of types: ~a\n" (hash-table-count trigrams))
96: (printf "Type/Token ratio: ~a\n" (/ (hash-table-count trigrams) trigramcount))
97: (let ([result (sort-by-value trigrams)])
98: (printf "---------------------------------------------------------\n")
99: (printf "Sorted decreasing with relative frequency:\n")
100: (printf "token\tabsolute frequency\trelative frequency\n")
101: (for-each (lambda (a)
102: (write (car a))
103: (printf "\t~a\t~a\n" (cadr a) (/ (cadr a) trigramcount)))
104: (reverse result))))
105: