":"; exec mzscheme -r $0 "$@"

;;; ----------------------------------------------------
;;; Filename: lid.ss
;;; Author:   Damir Cavar <dcavar@me.com>
;;;
;;; (C) 2006 by Damir Cavar
;;;
;;; This code is published under the restrictive GPL!
;;; Please find the text of the GPL here:
;;; http://www.gnu.org/licenses/gpl.txt
;;;
;;; It is free for use, change, etc. as long as the copyright
;;; note above is included in any modified version of the code.
;;;
;;; This script assumes that the text is raw and encoded in UTF8.
;;;
;;; Functions:
;;; 1. loads language models (trigraph statistics)
;;; 2. loads a text and creates a trigraph model
;;; 3. calculates the distance between the trigraph model
;;;    of the unknow text and the trained models
;;; 4. returns the language ID, which corresponds to
;;;    a part of the filename of the closest model
;;;
;;; If the command line parameters contain more than one text file,
;;; the above steps (2.-3.) are performed for each file.
;;;
;;; Usage:
;;; mzscheme -r lid.ss test1.txt test2.txt ...
;;; ----------------------------------------------------

;;; all required libraries and functions
(require (lib "vector-lib.ss" "srfi" "43")) ; for vector-for-each
(require (lib "list.ss"))                   ; for sort
(require (lib "list.ss"       "srfi" "1" ))
(require (lib "serialize.ss"))              ; for serialize
(require (lib "file.ss"))                   ; for fold-files
(require (lib "pregexp.ss"))  ; for Perl compatible regular expressions

;;; Global variables
(define models (make-vector 0)) ; initial vector for language models


;;; load-model
(define load-model
  (lambda (fname)
    (let* ([data  (read (open-input-file fname))]
           [model (deserialize data)]
           [lang  (path->string fname)])
      (printf "Deserialized file: ~a  model for language: ~a\n" lang (substring lang 5 (- (string-length lang) 3)))
      (set! models (vector-append models (vector (list (substring lang 5 (- (string-length lang) 3)) model)))))))


;;; load-file
;;; <- string filename
;;; -> string file content
;;; ----------------------------------------------------
;;; Load text from file into a string variable and return it.
(define load-file
  (lambda (name)
      (call-with-input-file name
        (lambda (p)
          (read-string (file-size name) p)))))


;;; relativize
;;; <- hash-table with keys and their absolute count
;;; <- total number of tokens
;;; ----------------------------------------------------
;;; side effect: overwrites the value with the relativized count.
(define relativize!
  (lambda (table total)
    (hash-table-for-each table (lambda (key value)
                                 (hash-table-put! table key (/ value total))))))


;;; check-lang
;;; <- string text
;;; -> string id of language
;;; ----------------------------------------------------
(define check-lang
  (lambda (text)
    (let* ([trigramcount (- (string-length text) 2)]
           [trigrams     (make-hash-table 'equal)]
           [countmodels  (vector-length models)]
           [distances (make-vector countmodels)])

      ; create trigraph model of current text
      (let loop ([0])
        (let* ([token (substring text i (+ i 3))]
               [value (hash-table-get trigrams token 0.0)])
          (hash-table-put! trigrams token (+ value 1)))
        (if (< i (- trigramcount 1))
            (loop (+ i 1))))
      (relativize! trigrams trigramcount)

      ; calculate distance to each model
      (let modelloop ([0])
        (let ([curmodel (cadr (vector-ref models j))]
              [distance 0.0])
          (hash-table-for-each
           trigrams
           (lambda (key value)
             (let ([val (hash-table-get curmodel key -1.0)])
               (if (= val -1.0)
                   (set! distance (+ distance 1))
                   (set! distance (+ distance (abs (- value val))))))))
          (vector-set! distances j distance))
        (if (< j (- countmodels 1))
            (modelloop (+ j 1))))
      (printf "~a\n" distances)

      ; find the smallest distance
      (let* ([counter -1]
             [distl   (map (lambda (a)
                             (set! counter (+ counter 1))
                             (list a counter))
                           (vector->list distances))]
             [smallest (car (sort distl (lambda (a b)
                                               (< (car a) (car b)))))])
        (car (vector-ref models (cadr smallest)))))))


; main procedures
(begin
  ; find language models in the file system (current working directory)
  (for-each (lambda (a)
              (if (pregexp-match "lang-.+\\.ss" (path->string (file-name-from-path a)))
                  (load-model a)))
            (directory-list))
  (printf "Loaded ~a language models.\n" (vector-length models))
  (vector-for-each (lambda (i fname)
                     (printf "File ~a has language ID ~a\n" fname (check-lang (load-file fname))))
                   argv))