1:":"; exec mzscheme -r $0 "$@"
2:
3:;;; ----------------------------------------------------
4:;;; Filename: countunibigrams1.scm
5:;;; Author: Damir Cavar <dcavar@unizd.hr>
6:;;;
7:;;; (C) 2006 by Damir Cavar
8:;;;
9:;;; This code is published under the restrictive GPL!
10:;;; Please find the text of the GPL here:
11:;;; http://www.gnu.org/licenses/gpl.txt
12:;;;
13:;;; It is free for use, change, etc. as long as the copyright
14:;;; note above is included in any modified version of the code.
15:;;;
16:;;; This script assumes that the text is raw and encoded in UTF8.
17:;;;
18:;;; Functions:
19:;;; 1. The text file is loaded into memory.
20:;;; 2. The text is tokenized, i.e. converted into a list of tokens.
21:;;; 3. Two adjacent tokens are placed into a hash-table as keys, the value
22:;;; is the absolute frequency (i.e. count) of each token in the
23:;;; text.
24:;;; 4. Single tokens are placed in a second hash-table.
25:;;;
26:;;; If the command line parameters contain more than one text file,
27:;;; the above results are accumulated over all the input text files.
28:;;;
29:;;; Usage:
30:;;; mzscheme -r countunibigrams1.scm test1.txt test2.txt ...
31:;;; ----------------------------------------------------
32:
33:
34:(require (lib "list.ss"))
35:(require (lib "string.ss")) ; for string-uppercase!
36:(require (lib "string.ss" "srfi" "13"))
37:(require (lib "vector-lib.ss" "srfi" "43"))
38:
39:
40:;;; global counters
41:(define tokencount 0.0) ; total number of words
42:(define bigramcount 0.0) ; total number of bigrams
43:
44:;;; hash-table containers
45:(define types (make-hash-table 'equal)) ; words and their counts
46:(define bigrams (make-hash-table 'equal)) ; bigrams and their counts
47:
48:
49:;;; sort hash table with by value
50:;;; assuming values = reals/ints
51:;;; returning a sorted list of key-value tuples (lists)
52:(define sort-by-value
53: (lambda (table)
54: (let ([keyval (hash-table-map table (lambda (key val) (list key val)))])
55: (sort keyval (lambda (a b)
56: (< (cadr a) (cadr b)))))))
57:
58:
59:(define add-data
60: (lambda (tokenlist)
61: (let ([count (- (length tokenlist) 1)])
62:
63: ; remember the total count of tokens and bigrams
64: (set! tokencount (+ tokencount (length tokenlist)))
65: (set! bigramcount (+ bigramcount count))
66:
67: ; count the first token in the list
68: (let ([value (hash-table-get types (car tokenlist) 0.0)])
69: (hash-table-put! types (car tokenlist) (+ value 1)))
70:
71: ; loop over the rest of the tokens
72: (let loop ([i 1])
73: (let* ([token (list-ref tokenlist i)] ; token = second element of bigram
74: [bigram (list (list-ref tokenlist (- i 1)) token)] ; bigram = previous and current token as list
75: [wvalue (hash-table-get types token 0.0)] ; get value for token
76: [bvalue (hash-table-get bigrams bigram 0.0)]) ; get value for bigram
77: (hash-table-put! types token (+ wvalue 1)) ; increment counter for token
78: (hash-table-put! bigrams bigram (+ bvalue 1))) ; increment counter for bigram
79: (if (< i count)
80: (loop (+ i 1)))))))
81:
82:
83:;;; load text from file into a string variable and return it
84:(define load-file
85: (lambda (name)
86: (call-with-input-file name
87: (lambda (p)
88: (read-string (file-size name) p)))))
89:
90:
91:(begin
92: (vector-for-each (lambda (i fname)
93: ; (printf "Loading file: ~a\n" fname)
94: (add-data (string-tokenize (load-file fname))))
95: argv)
96: (let ([result (sort-by-value bigrams)])
97: (printf "token\tabs. freq.\trel. freq.\n")
98: (for-each (lambda (a)
99: (printf "~a\t~a\t~a\n" (car a) (cadr a) (/ (cadr a) bigramcount)))
100: (reverse result)))
101: (printf "---------------------------------------------------------\n")
102: (printf "Number of tokens: ~a\n" tokencount)
103: (printf "Number of types: ~a\n" (hash-table-count types))
104: (printf "Type/Token ratio: ~a\n" (/ (hash-table-count types) tokencount))
105: (printf "Number of bigrams: ~a\n" bigramcount)
106: (printf "Number of bigram types: ~a\n" (hash-table-count bigrams))
107: (printf "Bigram type/token ratio: ~a\n" (/ (hash-table-count bigrams) bigramcount)))