SHOGUN  v3.0.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
KMeans.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2008 Gunnar Raetsch
8  * Written (W) 2007-2009 Soeren Sonnenburg
9  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
10  */
11 
14 #include <shogun/labels/Labels.h>
17 #include <shogun/base/Parallel.h>
18 
19 #ifdef HAVE_PTHREAD
20 #include <pthread.h>
21 #endif
22 
23 #define MUSRECALC
24 
25 #define PAR_THRESH 10
26 
27 using namespace shogun;
28 
31 {
32  init();
33 }
34 
35 CKMeans::CKMeans(int32_t k_, CDistance* d)
37 {
38  init();
39  k=k_;
40  set_distance(d);
41 }
42 
43 CKMeans::CKMeans(int32_t k_i, CDistance* d_i, SGVector<float64_t> centers_i)
45 {
46  init();
47  k = k_i;
48  set_distance(d_i);
49  set_initial_centers(centers_i);
50 }
51 
53 {
54 }
55 
57 {
58  dimensions = ((CDenseFeatures<float64_t>*) distance->get_lhs())->get_num_features();
59  REQUIRE(centers.vlen == k*dimensions, "Vector dimension of initial cluster centers supplied does not match expectation");
60  mus_initial = centers;
61  return true;
62 }
63 
65 {
67 
68  if (data)
69  distance->init(data, data);
70 
72 
75  ASSERT(lhs)
76  int32_t num=lhs->get_num_vectors();
77  SG_UNREF(lhs);
78 
79  Weights=SGVector<float64_t>(num);
80  for (int32_t i=0; i<num; i++)
81  Weights.vector[i]=1.0;
82 
83  if (mus_initial.vlen>0)
84  clustknb(true, mus_initial);
85  else
86  clustknb(false, NULL);
87 
88  return true;
89 }
90 
91 bool CKMeans::load(FILE* srcfile)
92 {
95  return false;
96 }
97 
98 bool CKMeans::save(FILE* dstfile)
99 {
102  return false;
103 }
104 
105 
106 void CKMeans::set_k(int32_t p_k)
107 {
108  ASSERT(p_k>0)
109  this->k=p_k;
110 }
111 
112 int32_t CKMeans::get_k()
113 {
114  return k;
115 }
116 
117 void CKMeans::set_max_iter(int32_t iter)
118 {
119  ASSERT(iter>0)
120  max_iter=iter;
121 }
122 
124 {
125  return max_iter;
126 }
127 
129 {
130  return R;
131 }
132 
134 {
135  if (!R.vector)
136  return SGMatrix<float64_t>();
137 
140  SGMatrix<float64_t> centers=lhs->get_feature_matrix();
141  SG_UNREF(lhs);
142  return centers;
143 }
144 
146 {
147  return dimensions;
148 }
149 
150 #ifndef DOXYGEN_SHOULD_SKIP_THIS
151 struct thread_data
152 {
153  float64_t* x;
155  float64_t* z;
156  int32_t n1, n2, m;
157  int32_t js, je; /* defines the matrix stripe */
158  int32_t offs;
159 };
160 #endif // DOXYGEN_SHOULD_SKIP_THIS
161 
162 namespace shogun
163 {
164 void *sqdist_thread_func(void * P)
165 {
166  struct thread_data *TD=(struct thread_data*) P;
167  float64_t* x=TD->x;
168  CDenseFeatures<float64_t>* y=TD->y;
169  float64_t* z=TD->z;
170  int32_t n1=TD->n1,
171  m=TD->m,
172  js=TD->js,
173  je=TD->je,
174  offs=TD->offs,
175  j,i,k;
176 
177  for (j=js; j<je; j++)
178  {
179  int32_t vlen=0;
180  bool vfree=false;
181  float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
182 
183  for (i=0; i<n1; i++)
184  {
185  float64_t sum=0;
186  for (k=0; k<m; k++)
187  sum = sum + CMath::sq(x[i*m + k] - vec[k]);
188  z[j*n1 + i] = sum;
189  }
190 
191  y->free_feature_vector(vec, j, vfree);
192  }
193  return NULL;
194 }
195 }
196 
197 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
198 {
201  ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0)
202 
203  int32_t XSize=lhs->get_num_vectors();
204  dimensions=lhs->get_num_features();
205  int32_t i, changed=1;
206  const int32_t XDimk=dimensions*k;
207  int32_t iter=0;
208 
210 
212 
213  int32_t *ClList=SG_CALLOC(int32_t, XSize);
214  float64_t *weights_set=SG_CALLOC(float64_t, k);
215  float64_t *dists=SG_CALLOC(float64_t, k*XSize);
216 
219  CFeatures* rhs_cache = distance->replace_rhs(rhs_mus);
220 
221  int32_t vlen=0;
222  bool vfree=false;
223  float64_t* vec=NULL;
224 
225  /* ClList=zeros(XSize,1) ; */
226  memset(ClList, 0, sizeof(int32_t)*XSize);
227  /* weights_set=zeros(k,1) ; */
228  memset(weights_set, 0, sizeof(float64_t)*k);
229 
230  /* cluster_centers=zeros(dimensions, k) ; */
231  memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
232 
233  if (!use_old_mus)
234  {
235  for (i=0; i<XSize; i++)
236  {
237  const int32_t Cl=CMath::random(0, k-1);
238  int32_t j;
239  float64_t weight=Weights.vector[i];
240 
241  weights_set[Cl]+=weight;
242  ClList[i]=Cl;
243 
244  vec=lhs->get_feature_vector(i, vlen, vfree);
245 
246  for (j=0; j<dimensions; j++)
247  mus.matrix[Cl*dimensions+j] += weight*vec[j];
248 
249  lhs->free_feature_vector(vec, i, vfree);
250  }
251  for (i=0; i<k; i++)
252  {
253  int32_t j;
254 
255  if (weights_set[i]!=0.0)
256  for (j=0; j<dimensions; j++)
257  mus.matrix[i*dimensions+j] /= weights_set[i];
258  }
259  }
260  else
261  {
262  ASSERT(mus_start)
263 
264 
265  rhs_mus->copy_feature_matrix(SGMatrix<float64_t>(mus_start,dimensions,k,false));
266 
267  for(int32_t idx=0;idx<XSize;idx++)
268  {
269  for(int32_t m=0;m<k;m++)
270  dists[k*idx+m] = distance->distance(idx,m);
271  }
272 
273  for (i=0; i<XSize; i++)
274  {
275  float64_t mini=dists[i*k];
276  int32_t Cl = 0, j;
277 
278  for (j=1; j<k; j++)
279  {
280  if (dists[i*k+j]<mini)
281  {
282  Cl=j;
283  mini=dists[i*k+j];
284  }
285  }
286  ClList[i]=Cl;
287  }
288 
289  /* Compute the sum of all points belonging to a cluster
290  * and count the points */
291  for (i=0; i<XSize; i++)
292  {
293  const int32_t Cl = ClList[i];
294  float64_t weight=Weights.vector[i];
295  weights_set[Cl]+=weight;
296 #ifndef MUSRECALC
297  vec=lhs->get_feature_vector(i, vlen, vfree);
298 
299  for (j=0; j<dimensions; j++)
300  mus.matrix[Cl*dimensions+j] += weight*vec[j];
301 
302  lhs->free_feature_vector(vec, i, vfree);
303 #endif
304  }
305 #ifndef MUSRECALC
306  /* normalization to get the mean */
307  for (i=0; i<k; i++)
308  {
309  if (weights_set[i]!=0.0)
310  {
311  int32_t j;
312  for (j=0; j<dimensions; j++)
313  mus.matrix[i*dimensions+j] /= weights_set[i];
314  }
315  }
316 #endif
317  }
318 
319 
320 
321  while (changed && (iter<max_iter))
322  {
323  iter++;
324  if (iter==max_iter-1)
325  SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1)
326 
327  if (iter%1000 == 0)
328  SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed)
329  changed=0;
330 
331 #ifdef MUSRECALC
332  /* mus=zeros(dimensions, k) ; */
333  memset(mus.matrix, 0, sizeof(float64_t)*XDimk);
334 
335  for (i=0; i<XSize; i++)
336  {
337  int32_t j;
338  int32_t Cl=ClList[i];
339  float64_t weight=Weights.vector[i];
340 
341  vec=lhs->get_feature_vector(i, vlen, vfree);
342 
343  for (j=0; j<dimensions; j++)
344  mus.matrix[Cl*dimensions+j] += weight*vec[j];
345 
346  lhs->free_feature_vector(vec, i, vfree);
347  }
348  for (i=0; i<k; i++)
349  {
350  int32_t j;
351 
352  if (weights_set[i]!=0.0)
353  for (j=0; j<dimensions; j++)
354  mus.matrix[i*dimensions+j] /= weights_set[i];
355  }
356 #endif
357  rhs_mus->copy_feature_matrix(mus);
359 
360  for (i=0; i<XSize; i++)
361  {
362  /* ks=ceil(rand(1,XSize)*XSize) ; */
363  const int32_t Pat= CMath::random(0, XSize-1);
364  const int32_t ClList_Pat=ClList[Pat];
365  int32_t imini, j;
366  float64_t mini, weight;
367 
368  weight=Weights.vector[Pat];
369 
370  /* compute the distance of this point to all centers */
371  for(int32_t idx_k=0;idx_k<k;idx_k++)
372  dists[idx_k]=distance->distance(Pat,idx_k);
373 
374  /* [mini,imini]=min(dists(:,i)) ; */
375  imini=0 ; mini=dists[0];
376  for (j=1; j<k; j++)
377  if (dists[j]<mini)
378  {
379  mini=dists[j];
380  imini=j;
381  }
382 
383  if (imini!=ClList_Pat)
384  {
385  changed= changed + 1;
386 
387  /* weights_set(imini) = weights_set(imini) + weight ; */
388  weights_set[imini]+= weight;
389  /* weights_set(j) = weights_set(j) - weight ; */
390  weights_set[ClList_Pat]-= weight;
391 
392  vec=lhs->get_feature_vector(Pat, vlen, vfree);
393 
394  for (j=0; j<dimensions; j++)
395  {
396  mus.matrix[imini*dimensions+j]-=(vec[j]
397  -mus.matrix[imini*dimensions+j])
398  *(weight/weights_set[imini]);
399  }
400 
401  lhs->free_feature_vector(vec, Pat, vfree);
402 
403  /* mu_new = mu_old - (x - mu_old)/(n-1) */
404  /* if weights_set(j)~=0 */
405  if (weights_set[ClList_Pat]!=0.0)
406  {
407  vec=lhs->get_feature_vector(Pat, vlen, vfree);
408 
409  for (j=0; j<dimensions; j++)
410  {
411  mus.matrix[ClList_Pat*dimensions+j]-=
412  (vec[j]
413  -mus.matrix[ClList_Pat
414  *dimensions+j])
415  *(weight/weights_set[ClList_Pat]);
416  }
417  lhs->free_feature_vector(vec, Pat, vfree);
418  }
419  else
420  /* mus(:,j)=zeros(dimensions,1) ; */
421  for (j=0; j<dimensions; j++)
422  mus.matrix[ClList_Pat*dimensions+j]=0;
423 
424  /* ClList(i)= imini ; */
425  ClList[Pat] = imini;
426  }
427  }
428  }
429 
430  /* compute the ,,variances'' of the clusters */
431  for (i=0; i<k; i++)
432  {
433  float64_t rmin1=0;
434  float64_t rmin2=0;
435 
436  bool first_round=true;
437 
438  for (int32_t j=0; j<k; j++)
439  {
440  if (j!=i)
441  {
442  int32_t l;
443  float64_t dist = 0;
444 
445  for (l=0; l<dimensions; l++)
446  {
447  dist+=CMath::sq(
448  mus.matrix[i*dimensions+l]
449  -mus.matrix[j*dimensions+l]);
450  }
451 
452  if (first_round)
453  {
454  rmin1=dist;
455  rmin2=dist;
456  first_round=false;
457  }
458  else
459  {
460  if ((dist<rmin2) && (dist>=rmin1))
461  rmin2=dist;
462 
463  if (dist<rmin1)
464  {
465  rmin2=rmin1;
466  rmin1=dist;
467  }
468  }
469  }
470  }
471 
472  R.vector[i]=(0.7*CMath::sqrt(rmin1)+0.3*CMath::sqrt(rmin2));
473  }
474 
475  distance->replace_rhs(rhs_cache);
476  delete rhs_mus;
477  SG_FREE(ClList);
478  SG_FREE(weights_set);
479  SG_FREE(dists);
480  SG_UNREF(lhs);
481 }
482 
484 {
485  /* set lhs of underlying distance to cluster centers */
487  mus);
488 
489  /* store cluster centers in lhs of distance variable */
490  CFeatures* rhs=distance->get_rhs();
491  distance->init(cluster_centers, rhs);
492  SG_UNREF(rhs);
493 }
494 
495 void CKMeans::init()
496 {
497  max_iter=10000;
498  k=3;
499  dimensions=0;
500 
501  SG_ADD(&max_iter, "max_iter", "Maximum number of iterations", MS_AVAILABLE);
502  SG_ADD(&k, "k", "k, the number of clusters", MS_AVAILABLE);
503  SG_ADD(&dimensions, "dimensions", "Dimensions of data", MS_NOT_AVAILABLE);
504  SG_ADD(&R, "R", "Cluster radiuses", MS_NOT_AVAILABLE);
505 }
506 
#define SG_INFO(...)
Definition: SGIO.h:120
#define SG_RESET_LOCALE
Definition: SGIO.h:88
virtual bool save(FILE *dstfile)
Definition: KMeans.cpp:98
Class Distance, a base class for all the distances used in the Shogun toolbox.
Definition: Distance.h:80
int32_t max_iter
maximum number of iterations
Definition: KMeans.h:159
CFeatures * get_lhs()
Definition: Distance.h:194
int32_t dimensions
number of dimensions
Definition: KMeans.h:165
SGVector< float64_t > R
radi of the clusters (size k)
Definition: KMeans.h:168
static T sq(T x)
x^2
Definition: Math.h:246
#define REQUIRE(x,...)
Definition: SGIO.h:208
CFeatures * get_rhs()
Definition: Distance.h:200
void set_k(int32_t p_k)
Definition: KMeans.cpp:106
virtual bool set_initial_centers(SGVector< float64_t > centers)
Definition: KMeans.cpp:56
int32_t get_dimensions()
Definition: KMeans.cpp:145
#define SG_SET_LOCALE_C
Definition: SGIO.h:87
A generic DistanceMachine interface.
ST * get_feature_vector(int32_t num, int32_t &len, bool &dofree)
static uint64_t random()
Definition: Math.h:534
virtual ~CKMeans()
Definition: KMeans.cpp:52
void clustknb(bool use_old_mus, float64_t *mus_start)
Definition: KMeans.cpp:197
#define ASSERT(x)
Definition: SGIO.h:203
SGVector< float64_t > get_radiuses()
Definition: KMeans.cpp:128
SGVector< float64_t > mus_initial
initial centers supplied
Definition: KMeans.h:171
float64_t get_max_iter()
Definition: KMeans.cpp:123
double float64_t
Definition: common.h:48
virtual void copy_feature_matrix(SGMatrix< ST > src)
virtual bool load(FILE *srcfile)
Definition: KMeans.cpp:91
void free_feature_vector(ST *feat_vec, int32_t num, bool dofree)
void set_max_iter(int32_t iter)
Definition: KMeans.cpp:117
CFeatures * replace_rhs(CFeatures *rhs)
Definition: Distance.cpp:145
virtual float64_t distance(int32_t idx_a, int32_t idx_b)
Definition: Distance.cpp:189
#define SG_UNREF(x)
Definition: SGObject.h:54
The class Features is the base class of all feature objects.
Definition: Features.h:62
void set_distance(CDistance *d)
virtual void store_model_features()
Definition: KMeans.cpp:483
virtual EFeatureType get_feature_type()=0
#define SG_WARNING(...)
Definition: SGIO.h:130
#define SG_ADD(...)
Definition: SGObject.h:83
static float32_t sqrt(float32_t x)
x^0.5
Definition: Math.h:252
virtual bool train_machine(CFeatures *data=NULL)
Definition: KMeans.cpp:64
int32_t get_k()
Definition: KMeans.cpp:112
virtual bool init(CFeatures *lhs, CFeatures *rhs)
Definition: Distance.cpp:78
int32_t k
the k parameter in KMeans
Definition: KMeans.h:162
index_t vlen
Definition: SGVector.h:706
void * sqdist_thread_func(void *P)
Definition: KMeans.cpp:164
SGMatrix< float64_t > get_cluster_centers()
Definition: KMeans.cpp:133

SHOGUN Machine Learning Toolbox - Documentation