```#!/usr/bin/env python

"""
VectorSpace.py
(C) 2005 by Damir Cavar

This code is free; you can redistribute it and/or modify
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This is an implementation of a simple vector space generator for documents and
words in documents.

The columns are words and their relative frequency. The rows are documents.
The program only keeps track of which word appears in which document
how many times. The vectors contain relative frequencies for each word
occurance. The sum over all numbers in a vector is 1.
"""

import sys, math, glob, os.path
from VectorSpace import *

def EuclideanDistance(v1, v2):
"""Returns the Euclidean distance between two vectors."""
distance = 0.0
for m in range(len(v1)):
distance += math.pow(float(v1[m]-v2[m]), 2)
return math.sqrt(distance)

def findMostDistant(vectorspace, k):
"""Find the most distant k vectors."""
distances = [ ]
for x in range(len(vectorspace)):
dv = [ ]
for y in range(x + 1, len(vectorspace)):
dv.append(EuclideanDistance(vectorspace[x], vectorspace[y]))
distances.append(dv)

dv = [ ]
for x in distances:
dv += x
dv.sort()

vectors = [ ]
for x in range(len(distances)):
for y in range(len(distances[x])):
if distances[x][y] in dv[-k:]:
if x not in vectors:
vectors.append(x)
if y not in vectors:
vectors.append(y)
return vectors[:k]

def getCentroid(v1, v2):
"""Return the centroid of two vectors."""
res = []
for i in range(len(v1)):
res.append(float(v1[i] + v2[i])/2.0)
return res
# return [ (i+j)/2 for i in v1 for j in v2 ]

def recalcCentroid(cval, vectorspace):
"""Recalculate centroid."""
tmp = vectorspace[cval[0]][:]
for i in cval[1:]:
tmp = getCentroid(tmp, vectorspace[i])
return tmp

def kMeansCluster(vectorspace, k):
"""K-Means clustering."""
# no vector is assigned to a centroid
notassigned = range(len(vectorspace))
assigned    = {}
cvectors    = {}
# create centroid lists
centroids   = []
for x in range(k):
centroids.append([])

print "Initialization..."
# initialize centroids
c = findMostDistant(vectorspace, k)
for x in range(len(c)):
del notassigned[notassigned.index(c[x])]
assigned[c[x]] = x        # remember which vector belongs to which cluster
cvectors[x]    = [ c[x] ] # remember which cluster has which vectors
centroids[x]   = vectorspace[c[x]]

print "Initial assignment..."
# assign the rest
for x in notassigned:
distances = []
for i in centroids:
distances.append(EuclideanDistance(vectorspace[x], i))
c = distances.index(min(distances))
assigned[x] = c
value       = cvectors[c]
value.append(x)
cvectors[c]  = value
centroids[c] = getCentroid(vectorspace[x], centroids[c])
notassigned = [ ]

print assigned
print "Optimization loop..."
# loop over the vectors and check whether they are still closest to their centroid
change = True
while change:
change = False
for x in assigned.keys():
distances = []
for i in centroids:
distances.append(EuclideanDistance(vectorspace[x], i))
c = distances.index(min(distances))
if c != assigned[x]:
print "Moving centroid..."
change = True
value  = assigned[x]
cval   = cvectors[value]
cval.remove(x)
cvectors[value] = cval
# recalculate centroid 1
centroids[value] = recalcCentroid(cval, vectorspace)
assigned[x]  = c
value        = cvectors[c]
value.append(x)
cvectors[c]  = value
# recalculate centroid 2
centroids[c] = recalcCentroid(value, vectorspace)
centroids[c] = getCentroid(vectorspace[x], centroids[c])
return cvectors

if __name__ == "__main__":
if len(sys.argv) > 1:
docnames = []
for x in sys.argv[1:]:
for y in glob.glob(os.path.normcase(x)):
docnames.append(y)
collectWords(y)
vectorspace = makeVectorSpace()
documents = {}  # clean up memory
k = 4
clusters = kMeansCluster(vectorspace, k)
for i in range(k):
print "Cluster", i
for x in clusters[i]:
print docnames[x]
print ""
else:
print "Usage:"
print "python KMeans.py filename[s]"
```