#!/usr/bin/env python """ VectorSpace.py (C) 2005 by Damir Cavar This code is free; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This is an implementation of a simple vector space generator for documents and words in documents. The columns are words and their relative frequency. The rows are documents. The program only keeps track of which word appears in which document how many times. The vectors contain relative frequencies for each word occurance. The sum over all numbers in a vector is 1. """ import sys, math, glob, os.path from VectorSpace import *defEuclideanDistance(v1, v2): """Returns the Euclidean distance between two vectors.""" distance = 0.0forminrange(len(v1)): distance += math.pow(float(v1[m]-v2[m]), 2)returnmath.sqrt(distance)deffindMostDistant(vectorspace, k): """Find the most distant k vectors.""" distances = [ ]forxinrange(len(vectorspace)): dv = [ ]foryinrange(x + 1, len(vectorspace)): dv.append(EuclideanDistance(vectorspace[x], vectorspace[y])) distances.append(dv) dv = [ ]forxindistances: dv += x dv.sort() vectors = [ ]forxinrange(len(distances)):foryinrange(len(distances[x])):ifdistances[x][y]indv[-k:]:ifxnotinvectors: vectors.append(x)ifynotinvectors: vectors.append(y)returnvectors[:k]defgetCentroid(v1, v2): """Return the centroid of two vectors.""" res = []foriinrange(len(v1)): res.append(float(v1[i] + v2[i])/2.0)returnres # return [ (i+j)/2 for i in v1 for j in v2 ]defrecalcCentroid(cval, vectorspace): """Recalculate centroid.""" tmp = vectorspace[cval[0]][:]foriincval[1:]: tmp = getCentroid(tmp, vectorspace[i])returntmpdefkMeansCluster(vectorspace, k): """K-Means clustering.""" # no vector is assigned to a centroid notassigned = range(len(vectorspace)) assigned = {} cvectors = {} # create centroid lists centroids = []forxinrange(k): centroids.append([])forxinrange(len(c)):delnotassigned[notassigned.index(c[x])] assigned[c[x]] = x # remember which vector belongs to which cluster cvectors[x] = [ c[x] ] # remember which cluster has which vectors centroids[x] = vectorspace[c[x]]forxinnotassigned: distances = []foriincentroids: distances.append(EuclideanDistance(vectorspace[x], i)) c = distances.index(min(distances)) assigned[x] = c value = cvectors[c] value.append(x) cvectors[c] = value centroids[c] = getCentroid(vectorspace[x], centroids[c]) notassigned = [ ]whilechange: change = Falseforxinassigned.keys(): distances = []foriincentroids: distances.append(EuclideanDistance(vectorspace[x], i)) c = distances.index(min(distances))ifc != assigned[x]:returncvectorsif__name__ == "__main__":iflen(sys.argv) > 1: docnames = []forxinsys.argv[1:]:foryinglob.glob(os.path.normcase(x)):foriinrange(k):forxinclusters[i]:else: