~/Documents/Teaching/DGfS Herbstschule 2005/Code/KMeans.py.html

#!/usr/bin/env python

"""
VectorSpace.py
(C) 2005 by Damir Cavar

This code is free; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This is an implementation of a simple vector space generator for documents and
words in documents.

The columns are words and their relative frequency. The rows are documents.
The program only keeps track of which word appears in which document
how many times. The vectors contain relative frequencies for each word
occurance. The sum over all numbers in a vector is 1.
"""


import sys, math, glob, os.path
from VectorSpace import *


def EuclideanDistance(v1, v2):
   """Returns the Euclidean distance between two vectors."""
   distance = 0.0
   for m in range(len(v1)):
      distance += math.pow(float(v1[m]-v2[m]), 2)
   return math.sqrt(distance)


def findMostDistant(vectorspace, k):
   """Find the most distant k vectors."""
   distances = [ ]
   for x in range(len(vectorspace)):
      dv = [ ]
      for y in range(x + 1, len(vectorspace)):
         dv.append(EuclideanDistance(vectorspace[x], vectorspace[y]))
      distances.append(dv)

   dv = [ ]
   for x in distances:
      dv += x
   dv.sort()

   vectors = [ ]
   for x in range(len(distances)):
      for y in range(len(distances[x])):
         if distances[x][y] in dv[-k:]:
            if x not in vectors:
               vectors.append(x)
            if y not in vectors:
               vectors.append(y)
   return vectors[:k]


def getCentroid(v1, v2):
   """Return the centroid of two vectors."""
   res = []
   for i in range(len(v1)):
      res.append(float(v1[i] + v2[i])/2.0)
   return res
   # return [ (i+j)/2 for i in v1 for j in v2 ]


def recalcCentroid(cval, vectorspace):
   """Recalculate centroid."""
   tmp = vectorspace[cval[0]][:]
   for i in cval[1:]:
      tmp = getCentroid(tmp, vectorspace[i])
   return tmp


def kMeansCluster(vectorspace, k):
   """K-Means clustering."""
   # no vector is assigned to a centroid
   notassigned = range(len(vectorspace))
   assigned    = {}
   cvectors    = {}
   # create centroid lists
   centroids   = []
   for x in range(k):
      centroids.append([])

   print "Initialization..."
   # initialize centroids
   c = findMostDistant(vectorspace, k)
   for x in range(len(c)):
      del notassigned[notassigned.index(c[x])]
      assigned[c[x]] = x        # remember which vector belongs to which cluster
      cvectors[x]    = [ c[x] ] # remember which cluster has which vectors
      centroids[x]   = vectorspace[c[x]]

   print "Initial assignment..."
   # assign the rest
   for x in notassigned:
      distances = []
      for i in centroids:
         distances.append(EuclideanDistance(vectorspace[x], i))
      c = distances.index(min(distances))
      assigned[x] = c
      value       = cvectors[c]
      value.append(x)
      cvectors[c]  = value
      centroids[c] = getCentroid(vectorspace[x], centroids[c])
   notassigned = [ ]

   print assigned
   print "Optimization loop..."
   # loop over the vectors and check whether they are still closest to their centroid
   change = True
   while change:
      change = False
      for x in assigned.keys():
         distances = []
         for i in centroids:
            distances.append(EuclideanDistance(vectorspace[x], i))
         c = distances.index(min(distances))
         if c != assigned[x]:
            print "Moving centroid..."
            change = True
            value  = assigned[x]
            cval   = cvectors[value]
            cval.remove(x)
            cvectors[value] = cval
            # recalculate centroid 1
            centroids[value] = recalcCentroid(cval, vectorspace)
            assigned[x]  = c
            value        = cvectors[c]
            value.append(x)
            cvectors[c]  = value
            # recalculate centroid 2
            centroids[c] = recalcCentroid(value, vectorspace)
            centroids[c] = getCentroid(vectorspace[x], centroids[c])
   return cvectors


if __name__ == "__main__":
   if len(sys.argv) > 1:
      docnames = []
      for x in sys.argv[1:]:
         for y in glob.glob(os.path.normcase(x)):
            print "Loading file:", y
            docnames.append(y)
            collectWords(y)
      vectorspace = makeVectorSpace()
      documents = {}  # clean up memory
      k = 4
      clusters = kMeansCluster(vectorspace, k)
      for i in range(k):
         print "Cluster", i
         for x in clusters[i]:
            print docnames[x]
         print ""
   else:
      print "Usage:"
      print "python KMeans.py filename[s]"