1:":"; exec mzscheme -r $0 "$@"
2:
3:;;; ----------------------------------------------------
4:;;; Filename: countunibigrams2.scm
5:;;; Author: Damir Cavar <dcavar@unizd.hr>
6:;;;
7:;;; (C) 2006 by Damir Cavar
8:;;;
9:;;; This code is published under the restrictive GPL!
10:;;; Please find the text of the GPL here:
11:;;; http://www.gnu.org/licenses/gpl.txt
12:;;;
13:;;; It is free for use, change, etc. as long as the copyright
14:;;; note above is included in any modified version of the code.
15:;;;
16:;;; This script assumes that the text is raw and encoded in UTF8.
17:;;;
18:;;; Functions:
19:;;; 1. The text file is loaded into memory.
20:;;; 2. Punctuation marks are eliminated from the text.
21:;;; 3. The text is tokenized.
22:;;; 4. Two adjacent tokens are placed into a hash-table as keys, the value
23:;;; is the absolute frequency (i.e. count) of each token in the
24:;;; text.
25:;;; 5. Single tokens are placed in a second hash-table.
26:;;; 6. Bigrams that do not contain stopwords are printed out, together
27:;;; with their Mutual Information score.
28:;;;
29:;;; If the command line parameters contain more than one text file,
30:;;; the above results are accumulated over all the input text files.
31:;;;
32:;;; Usage:
33:;;; mzscheme -r countunibigrams2.scm test1.txt test2.txt ...
34:;;; ----------------------------------------------------
35:
36:(require (lib "list.ss"))
37:(require (lib "string.ss")) ; for string-uppercase!
38:(require (lib "string.ss" "srfi" "13"))
39:(require (lib "vector-lib.ss" "srfi" "43"))
40:(require (lib "pregexp.ss")) ; for Perl compatible regular expressions
41:(load "english.scm")
42:
43:
44:;;; global counters
45:(define tokencount 0.0) ; total number of words
46:(define bigramcount 0.0) ; total number of bigrams
47:
48:;;; hash-table containers
49:(define types (make-hash-table 'equal)) ; words and their counts
50:(define bigrams (make-hash-table 'equal)) ; bigrams and their counts
51:
52:
53:;;; sort hash table with by value
54:;;; assuming values = reals/ints
55:;;; returning a sorted list of key-value tuples (lists)
56:(define sort-by-value
57: (lambda (table)
58: (let ([keyval (hash-table-map table (lambda (key val) (list key val)))])
59: (sort keyval (lambda (a b)
60: (< (cadr a) (cadr b)))))))
61:
62:
63:(define add-data
64: (lambda (tokenlist)
65: (let ([count (- (length tokenlist) 1)])
66: (set! tokencount (+ tokencount (length tokenlist)))
67: (set! bigramcount (+ bigramcount count))
68: ; count the first token in the list
69: (let ([value (hash-table-get types (car tokenlist) 0.0)])
70: (hash-table-put! types (car tokenlist) (+ value 1)))
71: ; loop over the rest of the tokens
72: (let loop ([i 1])
73: (let* ([token (list-ref tokenlist i)] ; token = second element of bigram
74: [bigram (list (list-ref tokenlist (- i 1)) token)] ; bigram = previous and current token as list
75: [wvalue (hash-table-get types token 0.0)] ; get value for token
76: [bvalue (hash-table-get bigrams bigram 0.0)]) ; get value for bigram
77: (hash-table-put! types token (+ wvalue 1)) ; increment counter for token
78: (hash-table-put! bigrams bigram (+ bvalue 1))) ; increment counter for bigram
79: (if (< i count)
80: (loop (+ i 1)))))))
81:
82:
83:;;; log2 of value
84:;;; Base transformation:
85:;;; log2 is the natural log divided by the natural log of 2 (the base)
86:(define log2
87: (lambda (x)
88: (/ (log x) (log 2))))
89:
90:
91:;;; mutual-information of P(x), P(y), P(xy)
92:;;; calculate pointwise MI as
93:;;; P(XY) * log2( P(XY) / (P(X) * P(Y)) )
94:(define mutual-information
95: (lambda (px py pxy)
96: (* pxy (log2 (/ pxy (* px py))))))
97:
98:
99:;;; load-file (filename)
100:;;; load text from file into a string variable and return it
101:(define load-file
102: (lambda (name)
103: (call-with-input-file name
104: (lambda (p)
105: (read-string (file-size name) p)))))
106:
107:
108:(begin
109: (vector-for-each (lambda (i fname)
110: (printf "Loading file: ~a\n" fname)
111: (let ([text (load-file fname)])
112: (string-lowercase! text)
113: (add-data (string-tokenize (pregexp-replace* "[`'-.,!?;:<>()|\"\\]\\[$%/]+" text " ")))))
114: argv)
115: (let ((result (sort-by-value bigrams)))
116: (printf "bigram\tfreq\trel freq\tMI\n")
117: (for-each (lambda (a)
118: (unless (or (member (caar a) stopwords)
119: (member (cadar a) stopwords))
120: (printf "~a ~a\t~a\t~a\t~a\n"
121: (caar a)
122: (cadar a)
123: (cadr a)
124: (/ (cadr a) bigramcount)
125: (mutual-information
126: (/ (hash-table-get types (caar a)) tokencount)
127: (/ (hash-table-get types (cadar a)) tokencount)
128: (/ (cadr a) bigramcount)))))
129: (reverse result)))
130: (printf "---------------------------------------------------------\n")
131: (printf "Number of tokens: ~a\n" tokencount)
132: (printf "Number of types: ~a\n" (hash-table-count types))
133: (printf "Type/Token ratio: ~a\n" (/ (hash-table-count types) tokencount))
134: (printf "Number of bigrams: ~a\n" bigramcount)
135: (printf "Number of bigram types: ~a\n" (hash-table-count bigrams))
136: (printf "Bigram type/token ratio: ~a\n" (/ (hash-table-count bigrams) bigramcount)))