1:":"; exec mzscheme -r $0 "$@"
2:
3:;;; ----------------------------------------------------
4:;;; Filename: countwords1.scm
5:;;; Author: Damir Cavar <dcavar@me.com>
6:;;;
7:;;; (C) 2006 by Damir Cavar
8:;;;
9:;;; This code is published under the restrictive GPL!
10:;;; Please find the text of the GPL here:
11:;;; http://www.gnu.org/licenses/gpl.txt
12:;;;
13:;;; It is free for use, change, etc. as long as the copyright
14:;;; note above is included in any modified version of the code.
15:;;;
16:;;; This script assumes that the text is raw and encoded in UTF8.
17:;;;
18:;;; Functions:
19:;;; 1. The text file is loaded into memory.
20:;;; 2. The text is tokenized, i.e. converted into a list of tokens.
21:;;; 3. The tokens are placed into a hash-table as keys, the value
22:;;; is the absolute frequency (i.e. count) of each token in the
23:;;; text.
24:;;; 4. The hash-table is converted into a list of key-value tuples.
25:;;; 5. The key-values are sorted by value, and a list of tokens
26:;;; and their frequency is printed out, as well as a reversed
27:;;; frequency list.
28:;;; 6. A list of tokens and their relative frequency is printed out.
29:;;;
30:;;; If the command line parameters contain more than one text file,
31:;;; the above results are accumulated over all the input text files.
32:;;;
33:;;; Usage:
34:;;; mzscheme -r countwords1.scm test1.txt test2.txt ...
35:;;; ----------------------------------------------------
36:
37:;;; all necessary libraries and functions
38:(require (lib "list.ss"))
39:(require (lib "string.ss" "srfi" "13"))
40:(require (lib "vector-lib.ss" "srfi" "43"))
41:
42:
43:;;; Global variables
44:(define count-words 0.0) ; total number of tokens
45:(define word-hash (make-hash-table 'equal)) ; hash-table container for token-count pairs
46:
47:
48:;;; print-wordlist
49:;;; <- hash-table of key-value pairs
50:;;; side effect: print out of tab-delimited key-value pairs per line
51:;;; ----------------------------------------------------
52:;;; Pretty print wordlist as tab-delimited key-value lines.
53:(define print-wordlist!
54: (lambda (table)
55: (hash-table-for-each table
56: (lambda (key value)
57: (printf "~a\t~a\n" key value)))))
58:
59:
60:;;; sort-by-value
61:;;; <- hash-table
62:;;; -> list of key-value tuples (lists)
63:;;; ----------------------------------------------------
64:;;; Sort a hash-table of key-value pairs by value, by converting it
65:;;; into a list of key-value tuples and sorting on the value.
66:(define sort-by-value
67: (lambda (table)
68: (let ([keyval (hash-table-map table (lambda (key val) (list key val)))])
69: (sort keyval (lambda (a b)
70: (< (cadr a) (cadr b)))))))
71:
72:
73:;;; add-words
74:;;; <- list of strings, i.e. token list
75:;;; <- hash-table
76:;;; !-> updated hash-table word-hash
77:;;; !-> updated count-words counter
78:;;; ----------------------------------------------------
79:;;; Add words/tokens from an ordered list of tokens to the hash-table
80:;;; container and keep track of their count.
81:(define add-words
82: (lambda (words)
83: (set! count-words (+ count-words (length words))) ; increment the total number of words counter
84: (for-each (lambda (token)
85: (let ((value (hash-table-get word-hash token 0.0)))
86: (hash-table-put! word-hash token (+ value 1))))
87: words)))
88:
89:
90:;;; load-file
91:;;; <- string filename
92:;;; -> string file content
93:;;; ----------------------------------------------------
94:;;; Load text from file into a string variable and return it.
95:(define load-file
96: (lambda (name)
97: (let ([size (file-size name)])
98: (call-with-input-file name
99: (lambda (p)
100: (read-string size p))))))
101:
102:
103:;;; ----------------------------------------------------
104:;;; main steps
105:(begin
106: (vector-for-each (lambda (i fname)
107: (printf "Loading file: ~a\n" fname)
108: (add-words (string-tokenize (load-file fname))))
109: argv)
110: (printf "Number of tokens: ~a\n" count-words)
111: (printf "Number of types: ~a\n" (hash-table-count word-hash))
112: (printf "Type/Token ratio: ~a\n" (/ (hash-table-count word-hash) count-words))
113: (printf "---------------------------------------------------------\n")
114: (print-wordlist! word-hash)
115: (let ((result (sort-by-value word-hash)))
116: (printf "---------------------------------------------------------\n")
117: (printf "Sorted increasing:\n")
118: (for-each (lambda (a)
119: (printf "~a\t~a\n" (car a) (cadr a)))
120: result)
121: (printf "---------------------------------------------------------\n")
122: (printf "Sorted decreasing:\n")
123: (for-each (lambda (a)
124: (printf "~a\t~a\n" (car a) (cadr a)))
125: (reverse result))
126: (printf "---------------------------------------------------------\n")
127: (printf "Sorted decreasing with relative frequency:\n")
128: (printf "token\tabsolute frequency\trelative frequency\n")
129: (for-each (lambda (a)
130: (printf "~a\t~a\t~a\n" (car a) (cadr a) (/ (cadr a) count-words)))
131: (reverse result))))