":"; exec mzscheme -r $0 "$@"
;;; ----------------------------------------------------
;;; Filename: average-re.scm
;;; Author: Damir Cavar <dcavar@me.com>
;;;
;;; (C) 2006 by Damir Cavar
;;;
;;; This code is published under the restrictive GPL!
;;; Please find the text of the GPL here:
;;; http://www.gnu.org/licenses/gpl.txt
;;;
;;; It is free for use, change, etc. as long as the copyright
;;; note above is included in any modified version of the code.
;;;
;;; This script assumes that the text is raw and encoded in UTF8.
;;;
;;; If the command line parameters contain more than one text file,
;;; the above results are accumulated over all the input text files.
;;;
;;; Usage:
;;; mzscheme -r average-re.scm test1.txt test2.txt ...
;;; ----------------------------------------------------
(require (lib "list.ss"))
(require (lib "string.ss")) ; for string-uppercase!
(require (lib "string.ss" "srfi" "13"))
(require (lib "vector-lib.ss" "srfi" "43"))
(require (lib "pregexp.ss")) ; for Perl compatible regular expressions
(load "english.scm")
;;; global counters
(define tokencount 0.0) ; total number of words
(define bigramcount 0.0) ; total number of bigrams
;;; hash-table containers
(define types (make-hash-table 'equal)) ; words and their counts
(define bigrams (make-hash-table 'equal)) ; bigrams and their counts
;;; extra hash-tables
(define lefttoken (make-hash-table 'equal)) ; key = left token in bigram, value = list of bigrams with key left
(define righttoken (make-hash-table 'equal)) ; key = right token in bigram, value = list of bigrams with key right
;;; sort hash table with by value
;;; assuming values = reals/ints
;;; returning a sorted list of key-value tuples (lists)
(define sort-by-value
(lambda (table)
(let ([keyval (hash-table-map table (lambda (key val) (list key val)))])
(sort keyval (lambda (a b)
(< (cadr a) (cadr b)))))))
(define add-data
(lambda (tokenlist)
(let ([count (- (length tokenlist) 1)])
; remember the total counts of tokens and bigrams
(set! tokencount (+ tokencount (length tokenlist)))
(set! bigramcount (+ bigramcount count))
; count the first token in the list
(let ([value (hash-table-get types (car tokenlist) 0.0)])
(hash-table-put! types (car tokenlist) (+ value 1)))
; loop over the rest of the tokens
(let loop ([i 1])
(let* ([token (list-ref tokenlist i)] ; token = second element of bigram
[bigram (list (list-ref tokenlist (- i 1)) token)] ; bigram = previous and current token as list
[wvalue (hash-table-get types token 0.0)] ; get value for token
[bvalue (hash-table-get bigrams bigram 0.0)] ; get value for bigram
[leftval (hash-table-get lefttoken (car bigram) '())] ; list of bigrams where token is left
[rightval (hash-table-get righttoken (cadr bigram) '())]) ; list of bigrams where token is right
; store the bigram in the value for left and right
(unless (member leftval bigram)
(set! leftval (cons bigram leftval))
(hash-table-put! lefttoken (car bigram) leftval))
(unless (member rightval bigram)
(set! rightval (cons bigram rightval))
(hash-table-put! righttoken (cadr bigram) rightval))
; store tokens and bigrams in their hash-tables
(hash-table-put! types token (+ wvalue 1)) ; increment counter for token
(hash-table-put! bigrams bigram (+ bvalue 1))) ; increment counter for bigram
; go back to loop, if more tokens left
(if (< i count)
(loop (+ i 1)))))))
;;; log2 of value
;;; Base transformation:
;;; log2 is the natural log divided by the natural log of 2 (the base)
(define log2
(lambda (x)
(/ (log x) (log 2))))
;;; mutual-information of P(x), P(y), P(xy)
;;; calculate pointwise MI as
;;; P(XY) * log2( P(XY) / (P(X) * P(Y)) )
(define mutual-information
(lambda (px py pxy)
(* pxy (log2 (/ pxy (* px py))))))
;;; relative-entropy of P(y), P(x), P(y)
;;; p(y) log2 (P(y) / P(y|x))
;;; P(y) log2 (P(y) P(x)/P(xy))
(define relative-entropy
(lambda (px py pxy)
(* py (log2 (/ (* px py) pxy)))))
;;; load-file (filename)
;;; load text from file into a string variable and return it
(define load-file
(lambda (name)
(call-with-input-file name
(lambda (p)
(read-string (file-size name) p)))))
(begin
(vector-for-each (lambda (i fname)
(let ([text (load-file fname)])
(string-lowercase! text)
(add-data (string-tokenize (pregexp-replace* "([`'-.,!?;:<>()|\"\\]\\[$%/]+)" text " ")))))
argv)
; print out the bigram statistics
(let ([result (sort-by-value bigrams)])
(printf "bigram\tfreq\trel freq\tlRE\trRE\n")
(for-each (lambda (a)
(unless (or (member (caar a) stopwords)
(member (cadar a) stopwords))
(let ([px (/ (hash-table-get types (caar a)) tokencount)]
[py (/ (hash-table-get types (cadar a)) tokencount)]
[pxy (/ (cadr a) bigramcount)])
(printf "~a ~a\t~a\t~a\t~a\t~a\n"
(caar a)
(cadar a)
(cadr a)
pxy
(relative-entropy px py pxy)
(relative-entropy py px pxy)))))
(reverse result)))
(printf "---------------------------------------------------------\n"))