1:":"; exec mzscheme -r $0 "$@"
2:
3:;;; ----------------------------------------------------
4:;;; Filename: countbigrams2.scm
5:;;; Author: Damir Cavar <dcavar@me.com>
6:;;;
7:;;; (C) 2006 by Damir Cavar
8:;;;
9:;;; This code is published under the restrictive GPL!
10:;;; Please find the text of the GPL here:
11:;;; http://www.gnu.org/licenses/gpl.txt
12:;;;
13:;;; It is free for use, change, etc. as long as the copyright
14:;;; note above is included in any modified version of the code.
15:;;;
16:;;; This script assumes that the text is raw and encoded in UTF8.
17:;;;
18:;;; Functions:
19:;;; 1. The text file is loaded into memory.
20:;;; 2. The text is tokenized, i.e. converted into a list of tokens.
21:;;; 3. Two adjacent tokens are placed into a hash-table as keys, the value
22:;;; is the absolute frequency (i.e. count) of each token in the
23:;;; text.
24:;;; 4. The hash-table is converted into a list of key-value tuples.
25:;;; 5. The key-values are sorted by value, and a list of bigrams
26:;;; and their frequency is printed out.
27:;;;
28:;;; If the command line parameters contain more than one text file,
29:;;; the above results are accumulated over all the input text files.
30:;;;
31:;;; Usage:
32:;;; mzscheme -r countbigrams2.scm test1.txt test2.txt ...
33:;;; ----------------------------------------------------
34:
35:
36:;;; all necessary libraries and functions
37:(require (lib "list.ss"))
38:(require (lib "string.ss")) ; for string-uppercase!
39:(require (lib "string.ss" "srfi" "13"))
40:(require (lib "vector-lib.ss" "srfi" "43")) ; for vector-for-each
41:(require (lib "pregexp.ss")) ; for Perl compatible regular expressions
42:
43:
44:;;; Global variables
45:(define bigramcount 0.0) ; total number of bigrams
46:(define bigrams (make-hash-table 'equal)) ; hash-table container for bigram-count pairs
47:
48:
49:;;; print-wordlist
50:;;; <- hash-table of key-value pairs
51:;;; side effect: print out of tab-delimited key-value pairs per line
52:;;; ----------------------------------------------------
53:;;; Pretty print wordlist as tab-delimited key-value lines.
54:(define print-bigramlist!
55: (lambda (table)
56: (hash-table-for-each table
57: (lambda (key value)
58: (printf "~a ~a\t~a\n" (car key) (cadr key) value)))))
59:
60:
61:;;; sort-by-value
62:;;; <- hash-table
63:;;; -> list of key-value tuples (lists)
64:;;; ----------------------------------------------------
65:;;; Sort a hash-table of key-value pairs by value, by converting it
66:;;; into a list of key-value tuples and sorting on the value.
67:(define sort-by-value
68: (lambda (table)
69: (let ([keyval (hash-table-map table (lambda (key val)
70: (list key val)))])
71: (sort keyval (lambda (a b)
72: (< (cadr a) (cadr b)))))))
73:
74:
75:;;; add-bigrams
76:;;; <- list of strings, i.e. token list
77:;;; !-> updated hash-table bigram-hash
78:;;; !-> updated count-bigrams counter
79:;;; ----------------------------------------------------
80:;;; Add word-bigrams from an ordered list of tokens to the hash-table
81:;;; container and keep track of their count.
82:(define add-bigrams
83: (lambda (words)
84: (let ([count (- (length words) 1)]) ; how many bigrams
85: (set! bigramcount (+ bigramcount count)) ; remember the total count
86: (let loop ([i 1])
87: (let* ([bigram (list (list-ref words (- i 1)) (list-ref words i))] ; create bigram
88: [value (hash-table-get bigrams bigram 0.0)]) ; get the calue for bigram
89: (hash-table-put! bigrams bigram (+ value 1)))
90: (if (< i count)
91: (loop (+ i 1)))))))
92:
93:
94:;;; load-file
95:;;; <- string filename
96:;;; -> string file content
97:;;; ----------------------------------------------------
98:;;; Load text from file into a string variable and return it.
99:(define load-file
100: (lambda (name)
101: (call-with-input-file name
102: (lambda (p)
103: (read-string (file-size name) p)))))
104:
105:
106:;;; main steps
107:(begin
108: (vector-for-each (lambda (i fname)
109: (printf "Loading file: ~a\n" fname)
110: (let ([text (load-file fname)])
111: (string-lowercase! text)
112: (add-bigrams (string-tokenize (pregexp-replace* "[`'-.,!?;:<>()|\"\\]\\[$%/]+" text " ")))))
113: argv)
114: (printf "Number of tokens: ~a\n" bigramcount)
115: (printf "Number of types: ~a\n" (hash-table-count bigrams))
116: (printf "---------------------------------------------------------\n")
117: (print-bigramlist! bigrams)
118: (let ([result (sort-by-value bigrams)])
119: (printf "Sorted decreasing:\n")
120: (for-each (lambda (a)
121: (let ([bigram (car a)])
122: (printf "~a ~a\t~a\n" (car bigram) (cadr bigram) (cadr a))))
123: (reverse result))))