1:":"; exec mzscheme -r $0 "$@"
2:
3:;;; ----------------------------------------------------
4:;;; Filename: countbigrams4.scm
5:;;; Author: Damir Cavar <dcavar@me.com>
6:;;;
7:;;; (C) 2006 by Damir Cavar
8:;;;
9:;;; This code is published under the restrictive GPL!
10:;;; Please find the text of the GPL here:
11:;;; http://www.gnu.org/licenses/gpl.txt
12:;;;
13:;;; It is free for use, change, etc. as long as the copyright
14:;;; note above is included in any modified version of the code.
15:;;;
16:;;; This script assumes that the text is raw and encoded in UTF8.
17:;;;
18:;;; Functions:
19:;;; 1. The text file is loaded into memory.
20:;;; 2. The text is tokenized, i.e. converted into a list of tokens.
21:;;; 3. Two adjacent tokens are placed into a hash-table as keys, the value
22:;;; is the absolute frequency (i.e. count) of each token in the
23:;;; text.
24:;;; 4. The hash-table is converted into a list of key-value tuples.
25:;;; 5. The key-values are sorted by value, and a list of bigrams
26:;;; and their frequency is printed out.
27:;;;
28:;;; If the command line parameters contain more than one text file,
29:;;; the above results are accumulated over all the input text files.
30:;;;
31:;;; Usage:
32:;;; mzscheme -r countbigrams4.scm test1.txt test2.txt ...
33:;;; ----------------------------------------------------
34:
35:
36:;;; all necessary libraries and functions
37:(require (lib "list.ss"))
38:(require (lib "string.ss")) ; for string-uppercase!
39:(require (lib "string.ss" "srfi" "13"))
40:(require (lib "vector-lib.ss" "srfi" "43")) ; for vector-for-each
41:(require (lib "pregexp.ss")) ; for Perl compatible regular expressions
42:
43:(load "english.scm")
44:
45:;;; Global variables
46:(define bigramcount 0.0) ; total number of bigrams
47:(define bigrams (make-hash-table 'equal)) ; hash-table container for bigram-count pairs
48:(define tokencount 0.0) ; total number of tokens
49:(define types (make-hash-table 'equal)) ; hash-table container for type-count pairs
50:
51:
52:;;; sort-by-value
53:;;; <- hash-table
54:;;; -> list of key-value tuples (lists)
55:;;; ----------------------------------------------------
56:;;; Sort a hash-table of key-value pairs by value, by converting it
57:;;; into a list of key-value tuples and sorting on the value.
58:(define sort-by-value
59: (lambda (table)
60: (let ([keyval (hash-table-map table (lambda (key val)
61: (list key val)))])
62: (sort keyval (lambda (a b)
63: (< (cadr a) (cadr b)))))))
64:
65:
66:;;; add-bigrams
67:;;; <- list of strings, i.e. token list
68:;;; !-> updated hash-table bigram-hash
69:;;; !-> updated count-bigrams counter
70:;;; ----------------------------------------------------
71:;;; Add word-bigrams from an ordered list of tokens to the hash-table
72:;;; container and keep track of their count.
73:(define add-bigrams
74: (lambda (words)
75: (let ([count (- (length words) 1)]) ; how many bigrams
76: (set! bigramcount (+ bigramcount count)) ; remember total count of bigrams
77: (set! tokencount (+ tokencount (length words))) ; remember total count of tokens
78: ; add first token to hash-table
79: (let ([val (hash-table-get types (car words) 0.0)])
80: (hash-table-put! types (car words) (+ val 1)))
81: ; loop over the rest and create tokens and bigrams
82: (let loop ([i 1])
83: ; add next token to hash-table
84: (let ([val (hash-table-get types (list-ref words i) 0.0)])
85: (hash-table-put! types (list-ref words i) (+ val 1)))
86: ; add next bigram to hash-table
87: (let* ([bigram (list (list-ref words (- i 1)) (list-ref words i))] ; create bigram
88: [value (hash-table-get bigrams bigram 0.0)]) ; get the calue for bigram
89: (hash-table-put! bigrams bigram (+ value 1)))
90: (if (< i count)
91: (loop (+ i 1)))))))
92:
93:
94:;;; load-file
95:;;; <- string filename
96:;;; -> string file content
97:;;; ----------------------------------------------------
98:;;; Load text from file into a string variable and return it.
99:(define load-file
100: (lambda (name)
101: (call-with-input-file name
102: (lambda (p)
103: (read-string (file-size name) p)))))
104:
105:
106:;;; main steps
107:(begin
108: (vector-for-each (lambda (i fname)
109: (let ([text (load-file fname)])
110: (string-lowercase! text)
111: (add-bigrams (string-tokenize (pregexp-replace* "[`'-.,!?;:<>()|\"\\]\\[$%/]+" text " ")))))
112: argv)
113: (let ([result (sort-by-value bigrams)])
114: (for-each (lambda (a)
115: (let ([bigram (car a)])
116: (unless (or (member (car bigram) stopwords) (member (cadr bigram) stopwords))
117: (printf "~a ~a\t~a\n" (car bigram) (cadr bigram) (cadr a)))))
118: (reverse result)))
119: (printf "---------------------------------------------------------\n")
120: (printf "Number of bigrams\t~a\n" bigramcount)
121: (printf "Number of bigram-types\t~a\n" (hash-table-count bigrams))
122: (printf "Number of tokens\t~a\n" tokencount)
123: (printf "Number of types\t~a\n" (hash-table-count types)))