1:":"; exec mzscheme -r $0 "$@"
2:
3:;;; ----------------------------------------------------
4:;;; Filename: countbigrams4.scm
5:;;; Author: Damir Cavar <dcavar@me.com>
6:;;;
7:;;; (C) 2006 by Damir Cavar
8:;;;
9:;;; This code is published under the restrictive GPL!
10:;;; Please find the text of the GPL here:
11:;;; http://www.gnu.org/licenses/gpl.txt
12:;;;
13:;;; It is free for use, change, etc. as long as the copyright
14:;;; note above is included in any modified version of the code.
15:;;;
16:;;; This script assumes that the text is raw and encoded in UTF8.
17:;;;
18:;;; Functions:
19:;;; 1. The text file is loaded into memory.
20:;;; 2. The text is tokenized, i.e. converted into a list of tokens.
21:;;; 3. Two adjacent tokens are placed into a hash-table as keys, the value
22:;;; is the absolute frequency (i.e. count) of each token in the
23:;;; text.
24:;;; 4. The hash-table is converted into a list of key-value tuples.
25:;;; 5. The key-values are sorted by value, and a list of bigrams
26:;;; and their frequency is printed out.
27:;;;
28:;;; If the command line parameters contain more than one text file,
29:;;; the above results are accumulated over all the input text files.
30:;;;
31:;;; Usage:
32:;;; mzscheme -r countbigrams4.scm test1.txt test2.txt ...
33:;;; ----------------------------------------------------
34:
35:
36:;;; all necessary libraries and functions
37:(require (lib "list.ss"))
38:(require (lib "string.ss")) ; for string-uppercase!
39:(require (lib "string.ss" "srfi" "13"))
40:(require (lib "vector-lib.ss" "srfi" "43")) ; for vector-for-each
41:(require (lib "pregexp.ss")) ; for Perl compatible regular expressions
42:
43:(load "english.scm")
44:
45:;;; Global variables
46:(define bigramcount 0.0) ; total number of bigrams
47:(define bigrams (make-hash-table 'equal)) ; hash-table container for bigram-count pairs
48:(define tokencount 0.0) ; total number of tokens
49:(define types (make-hash-table 'equal)) ; hash-table container for type-count pairs
50:
51:
52:;;; sort-by-value
53:;;; <- hash-table
54:;;; -> list of key-value tuples (lists)
55:;;; ----------------------------------------------------
56:;;; Sort a hash-table of key-value pairs by value, by converting it
57:;;; into a list of key-value tuples and sorting on the value.
58:(define sort-by-value
59: (lambda (table)
60: (let ([keyval (hash-table-map table (lambda (key val)
61: (list key val)))])
62: (sort keyval (lambda (a b)
63: (< (cadr a) (cadr b)))))))
64:
65:
66:;;; log2 of value
67:;;; Base transformation:
68:;;; log2 is the natural log divided by the natural log of 2 (the base)
69:(define log2
70: (lambda (x)
71: (/ (log x) (log 2))))
72:
73:
74:;;; mutual-information of P(x), P(y), P(xy)
75:;;; calculate pointwise MI as
76:;;; P(XY) * log2( P(XY) / (P(X) * P(Y)) )
77:(define mutual-information
78: (lambda (px py pxy)
79: (* pxy (log2 (/ pxy (* px py))))))
80:
81:
82:
83:;;; add-bigrams
84:;;; <- list of strings, i.e. token list
85:;;; !-> updated hash-table bigram-hash
86:;;; !-> updated count-bigrams counter
87:;;; ----------------------------------------------------
88:;;; Add word-bigrams from an ordered list of tokens to the hash-table
89:;;; container and keep track of their count.
90:(define add-bigrams
91: (lambda (words)
92: (let ([count (- (length words) 1)]) ; how many bigrams
93: (set! bigramcount (+ bigramcount count)) ; remember total count of bigrams
94: (set! tokencount (+ tokencount (length words))) ; remember total count of tokens
95: ; add first token to hash-table
96: (let ([val (hash-table-get types (car words) 0.0)])
97: (hash-table-put! types (car words) (+ val 1)))
98: ; loop over the rest and create tokens and bigrams
99: (let loop ([i 1])
100: ; add next token to hash-table
101: (let ([val (hash-table-get types (list-ref words i) 0.0)])
102: (hash-table-put! types (list-ref words i) (+ val 1)))
103: ; add next bigram to hash-table
104: (let* ([bigram (list (list-ref words (- i 1)) (list-ref words i))] ; create bigram
105: [value (hash-table-get bigrams bigram 0.0)]) ; get the calue for bigram
106: (hash-table-put! bigrams bigram (+ value 1)))
107: (if (< i count)
108: (loop (+ i 1)))))))
109:
110:
111:;;; load-file
112:;;; <- string filename
113:;;; -> string file content
114:;;; ----------------------------------------------------
115:;;; Load text from file into a string variable and return it.
116:(define load-file
117: (lambda (name)
118: (call-with-input-file name
119: (lambda (p)
120: (read-string (file-size name) p)))))
121:
122:
123:;;; main steps
124:(begin
125: (vector-for-each (lambda (i fname)
126: (let ([text (load-file fname)])
127: (string-lowercase! text)
128: (add-bigrams (string-tokenize (pregexp-replace* "[`'-.,!?;:<>()|\"\\]\\[$%/]+" text " ")))
129: ;(add-bigrams (string-tokenize text))
130: ))
131: argv)
132: (printf "bigram\tfrequency\trelative frequency\tmutual information\n")
133: (let ([result (sort-by-value bigrams)])
134: (for-each (lambda (a)
135: (let ([bigram (car a)])
136: ;(unless (or (member (car bigram) stopwords) (member (cadr bigram) stopwords))
137: (printf "~a ~a\t~a\t~a\t~a\n"
138: (car bigram)
139: (cadr bigram)
140: (cadr a)
141: (/ (cadr a) bigramcount)
142: (mutual-information (/ (hash-table-get types (car bigram)) tokencount)
143: (/ (hash-table-get types (cadr bigram)) tokencount)
144: (/ (cadr a) bigramcount)))
145: ;)
146: ))
147: (reverse result)))
148: (printf "---------------------------------------------------------\n")
149: (printf "Number of bigrams\t~a\n" bigramcount)
150: (printf "Number of bigram-types\t~a\n" (hash-table-count bigrams))
151: (printf "Number of tokens\t~a\n" tokencount)
152: (printf "Number of types\t~a\n" (hash-table-count types)))