1:":"; exec mzscheme -r $0 "$@"
2:
3:;;; ----------------------------------------------------
4:;;; Filename: countbigrams1.scm
5:;;; Author: Damir Cavar <dcavar@me.com>
6:;;;
7:;;; (C) 2006 by Damir Cavar
8:;;;
9:;;; This code is published under the restrictive GPL!
10:;;; Please find the text of the GPL here:
11:;;; http://www.gnu.org/licenses/gpl.txt
12:;;;
13:;;; It is free for use, change, etc. as long as the copyright
14:;;; note above is included in any modified version of the code.
15:;;;
16:;;; This script assumes that the text is raw and encoded in UTF8.
17:;;;
18:;;; Functions:
19:;;; 1. The text file is loaded into memory.
20:;;; 2. The text is tokenized, i.e. converted into a list of tokens.
21:;;; 3. Two adjacent tokens are placed into a hash-table as keys, the value
22:;;; is the absolute frequency (i.e. count) of each token in the
23:;;; text.
24:;;; 4. The hash-table is converted into a list of key-value tuples.
25:;;; 5. The key-values are sorted by value, and a list of bigrams
26:;;; and their frequency is printed out.
27:;;;
28:;;; If the command line parameters contain more than one text file,
29:;;; the above results are accumulated over all the input text files.
30:;;;
31:;;; Usage:
32:;;; mzscheme -r countbigrams1.scm test1.txt test2.txt ...
33:;;; ----------------------------------------------------
34:
35:
36:;;; all necessary libraries and functions
37:(require (lib "list.ss"))
38:(require (lib "string.ss" "srfi" "13"))
39:(require (lib "vector-lib.ss" "srfi" "43"))
40:
41:
42:;;; Global variables
43:(define bigramcount 0.0) ; total number of bigrams
44:(define bigrams (make-hash-table 'equal)) ; hash-table container for bigram-count pairs
45:
46:
47:;;; print-wordlist
48:;;; <- hash-table of key-value pairs
49:;;; side effect: print out of tab-delimited key-value pairs per line
50:;;; ----------------------------------------------------
51:;;; Pretty print wordlist as tab-delimited key-value lines.
52:(define print-bigramlist!
53: (lambda (table)
54: (hash-table-for-each table
55: (lambda (key value)
56: (printf "~a ~a\t~a\n" (car key) (cadr key) value)))))
57:
58:
59:;;; sort-by-value
60:;;; <- hash-table
61:;;; -> list of key-value tuples (lists)
62:;;; ----------------------------------------------------
63:;;; Sort a hash-table of key-value pairs by value, by converting it
64:;;; into a list of key-value tuples and sorting on the value.
65:(define sort-by-value
66: (lambda (table)
67: (let ([keyval (hash-table-map table (lambda (key val) (list key val)))])
68: (sort keyval (lambda (a b)
69: (< (cadr a) (cadr b)))))))
70:
71:
72:;;; add-bigrams
73:;;; <- list of strings, i.e. token list
74:;;; !-> updated hash-table bigram-hash
75:;;; !-> updated count-bigrams counter
76:;;; ----------------------------------------------------
77:;;; Add word-bigrams from an ordered list of tokens to the hash-table
78:;;; container and keep track of their count.
79:(define add-bigrams
80: (lambda (words)
81: (let ([count (- (length words) 1)]) ; how many bigrams
82: (set! bigramcount (+ bigramcount count)) ; remember the total count
83: (let loop ([i 1])
84: (let* ([bigram (list (list-ref words (- i 1)) (list-ref words i))] ; create bigram
85: [value (hash-table-get bigrams bigram 0.0)]) ; get the calue for bigram
86: (hash-table-put! bigrams bigram (+ value 1)))
87: (if (< i count)
88: (loop (+ i 1)))))))
89:
90:
91:;;; load-file
92:;;; <- string filename
93:;;; -> string file content
94:;;; ----------------------------------------------------
95:;;; Load text from file into a string variable and return it.
96:(define load-file
97: (lambda (name)
98: (call-with-input-file name
99: (lambda (p)
100: (read-string (file-size name) p)))))
101:
102:
103:;;; main steps
104:(begin
105: (vector-for-each (lambda (i fname)
106: (printf "Loading file: ~a\n" fname)
107: (add-bigrams (string-tokenize (load-file fname))))
108: argv)
109: (printf "Number of tokens: ~a\n" bigramcount)
110: (printf "Number of types: ~a\n" (hash-table-count bigrams))
111: (printf "---------------------------------------------------------\n")
112: ;(print-bigramlist! bigrams)
113: (let ([result (sort-by-value bigrams)])
114: (printf "Decreasing frequency profile:\n")
115: (for-each (lambda (a)
116: (let ([bigram (car a)])
117: (printf "~a ~a\t~a\n" (car bigram) (cadr bigram) (cadr a))))
118: (reverse result))))