1:":"; exec mzscheme -r $0 "$@"
2:
3:;;; ----------------------------------------------------
4:;;; Filename: countbigrams3.scm
5:;;; Author: Damir Cavar <dcavar@me.com>
6:;;;
7:;;; (C) 2006 by Damir Cavar
8:;;;
9:;;; This code is published under the restrictive GPL!
10:;;; Please find the text of the GPL here:
11:;;; http://www.gnu.org/licenses/gpl.txt
12:;;;
13:;;; It is free for use, change, etc. as long as the copyright
14:;;; note above is included in any modified version of the code.
15:;;;
16:;;; This script assumes that the text is raw and encoded in UTF8.
17:;;;
18:;;; Functions:
19:;;; 1. The text file is loaded into memory.
20:;;; 2. The text is tokenized, i.e. converted into a list of tokens.
21:;;; 3. Two adjacent tokens are placed into a hash-table as keys, the value
22:;;; is the absolute frequency (i.e. count) of each token in the
23:;;; text.
24:;;; 4. The hash-table is converted into a list of key-value tuples.
25:;;; 5. The key-values are sorted by value, and a list of bigrams
26:;;; and their frequency is printed out.
27:;;;
28:;;; If the command line parameters contain more than one text file,
29:;;; the above results are accumulated over all the input text files.
30:;;;
31:;;; Usage:
32:;;; mzscheme -r countbigrams3.scm test1.txt test2.txt ...
33:;;; ----------------------------------------------------
34:
35:
36:;;; all necessary libraries and functions
37:(require (lib "list.ss"))
38:(require (lib "string.ss")) ; for string-uppercase!
39:(require (lib "string.ss" "srfi" "13"))
40:(require (lib "vector-lib.ss" "srfi" "43")) ; for vector-for-each
41:(require (lib "pregexp.ss")) ; for Perl compatible regular expressions
42:
43:(load "english.scm")
44:
45:;;; Global variables
46:(define bigramcount 0.0) ; total number of bigrams
47:(define bigrams (make-hash-table 'equal)) ; hash-table container for bigram-count pairs
48:
49:
50:;;; sort-by-value
51:;;; <- hash-table
52:;;; -> list of key-value tuples (lists)
53:;;; ----------------------------------------------------
54:;;; Sort a hash-table of key-value pairs by value, by converting it
55:;;; into a list of key-value tuples and sorting on the value.
56:(define sort-by-value
57: (lambda (table)
58: (let ([keyval (hash-table-map table (lambda (key val) (list key val)))])
59: (sort keyval (lambda (a b)
60: (< (cadr a) (cadr b)))))))
61:
62:
63:;;; add-bigrams
64:;;; <- list of strings, i.e. token list
65:;;; !-> updated hash-table bigram-hash
66:;;; !-> updated count-bigrams counter
67:;;; ----------------------------------------------------
68:;;; Add word-bigrams from an ordered list of tokens to the hash-table
69:;;; container and keep track of their count.
70:(define add-bigrams
71: (lambda (words)
72: (let ([count (- (length words) 1)]) ; how many bigrams
73: (set! bigramcount (+ bigramcount count)) ; remember the total count
74: (let loop ([i 1])
75: (let* ([bigram (list (list-ref words (- i 1)) (list-ref words i))] ; create bigram
76: [value (hash-table-get bigrams bigram 0.0)]) ; get the calue for bigram
77: (hash-table-put! bigrams bigram (+ value 1)))
78: (if (< i count)
79: (loop (+ i 1)))))))
80:
81:
82:;;; load-file
83:;;; <- string filename
84:;;; -> string file content
85:;;; ----------------------------------------------------
86:;;; Load text from file into a string variable and return it.
87:(define load-file
88: (lambda (name)
89: (call-with-input-file name
90: (lambda (p)
91: (read-string (file-size name) p)))))
92:
93:
94:;;; main steps
95:(begin
96: (vector-for-each (lambda (i fname)
97: (let ([text (load-file fname)])
98: (string-lowercase! text) ; normalize the tokens to lowercase
99: ; replace punctuation marks in text before tokenization
100: (add-bigrams (string-tokenize (pregexp-replace* "[`'-.,!?;:<>()|\"\\]\\[$%/]+" text " ")))))
101: argv)
102: (let ([result (sort-by-value bigrams)])
103: (for-each (lambda (a)
104: (let ([bigram (car a)])
105: (unless (or (member (car bigram) stopwords) (member (cadr bigram) stopwords))
106: (printf "~a ~a\t~a\n" (car bigram) (cadr bigram) (cadr a)))))
107: (reverse result)))
108: (printf "---------------------------------------------------------\n")
109: (printf "Number of tokens\t~a\n" bigramcount)
110: (printf "Number of types\t~a\n" (hash-table-count bigrams)))