SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
MultidimensionalScaling.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2011 Sergey Lisitsyn
8  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
9  */
10 
12 #ifdef HAVE_LAPACK
17 #include <shogun/lib/common.h>
19 #include <shogun/io/SGIO.h>
21 
22 #ifdef HAVE_PTHREAD
23 #include <pthread.h>
24 #endif
25 
26 using namespace shogun;
27 
28 #ifndef DOXYGEN_SHOULD_SKIP_THIS
29 struct TRIANGULATION_THREAD_PARAM
30 {
32  int32_t idx_start;
34  int32_t idx_stop;
36  int32_t idx_step;
38  int32_t lmk_N;
40  int32_t total_N;
42  int32_t m_target_dim;
44  float64_t* current_dist_to_lmks;
46  float64_t* lmk_feature_matrix;
48  float64_t* new_feature_matrix;
50  const float64_t* distance_matrix;
52  const float64_t* mean_sq_dist_vector;
54  const int32_t* lmk_idxs;
56  const bool* to_process;
57 };
58 #endif
59 
61 {
62  m_eigenvalues = SGVector<float64_t>(NULL,0,false);
64  m_landmark = false;
65 
66  init();
67 }
68 
70 {
71  m_parameters->add(&m_eigenvalues, "eigenvalues", "eigenvalues of last embedding");
72  m_parameters->add(&m_landmark, "landmark", "indicates if landmark approximation should be used");
73  m_parameters->add(&m_landmark_number, "landmark number", "the number of landmarks for approximation");
74 }
75 
77 {
79 }
80 
82 {
83  return m_eigenvalues;
84 }
85 
87 {
88  if (num<3)
89  SG_ERROR("Number of landmarks should be greater than 3 to make triangulation possible while %d given.",
90  num);
91  m_landmark_number = num;
92 }
93 
95 {
96  return m_landmark_number;
97 }
98 
100 {
101  m_landmark = landmark;
102 }
103 
105 {
106  return m_landmark;
107 }
108 
110 {
111  return "MultidimensionalScaling";
112 };
113 
115 {
116  ASSERT(distance);
117 
118  // compute feature_matrix by landmark or classic embedding of distance matrix
119  SGMatrix<float64_t> distance_matrix = process_distance_matrix(distance->get_distance_matrix());
120  SGMatrix<float64_t> feature_matrix;
121  if (m_landmark)
122  feature_matrix = landmark_embedding(distance_matrix);
123  else
124  feature_matrix = classic_embedding(distance_matrix);
125 
126  distance_matrix.destroy_matrix();
127 
128  return new CSimpleFeatures<float64_t>(feature_matrix);
129 }
130 
132 {
133  return distance_matrix;
134 }
135 
137 {
138  SG_REF(features);
140 
141  m_distance->init(features,features);
144 
145  SG_UNREF(features);
146  return (CFeatures*)embedding;
147 }
148 
150 {
151  ASSERT(m_target_dim>0);
152  ASSERT(distance_matrix.num_cols==distance_matrix.num_rows);
153  int32_t N = distance_matrix.num_cols;
154 
155  // loop variables
156  int32_t i,j;
157 
158  // double center distance_matrix
159  float64_t dsq;
160  for (i=0; i<N; i++)
161  {
162  for (j=i; j<N; j++)
163  {
164  dsq = CMath::sq(distance_matrix[i*N+j]);
165  distance_matrix[i*N+j] = dsq;
166  distance_matrix[j*N+i] = dsq;
167  }
168  }
169  CMath::center_matrix(distance_matrix.matrix,N,N);
170  for (i=0; i<N; i++)
171  {
172  distance_matrix[i*N+i] *= -0.5;
173  for (j=i+1; j<N; j++)
174  {
175  distance_matrix[i*N+j] *= -0.5;
176  distance_matrix[j*N+i] *= -0.5;
177  }
178  }
179 
180  // feature matrix representing given distance
181  float64_t* replace_feature_matrix = SG_MALLOC(float64_t, N*m_target_dim);
182 
183  // status of eigenproblem to be solved
184  int eigenproblem_status = 0;
185 #ifdef HAVE_ARPACK
186  // using ARPACK
187  float64_t* eigenvalues_vector = SG_MALLOC(float64_t, m_target_dim);
188  // solve eigenproblem with ARPACK (faster)
189  arpack_dsxupd(distance_matrix.matrix,NULL,false,N,m_target_dim,"LM",false,1,false,0.0,0.0,
190  eigenvalues_vector,replace_feature_matrix,eigenproblem_status);
191  // check for failure
192  ASSERT(eigenproblem_status == 0);
193  // reverse eigenvectors order
194  float64_t tmp;
195  for (j=0; j<N; j++)
196  {
197  for (i=0; i<m_target_dim/2; i++)
198  {
199  tmp = replace_feature_matrix[j*m_target_dim+i];
200  replace_feature_matrix[j*m_target_dim+i] =
201  replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)];
202  replace_feature_matrix[j*m_target_dim+(m_target_dim-i-1)] = tmp;
203  }
204  }
205  // reverse eigenvalues order
206  for (i=0; i<m_target_dim/2; i++)
207  {
208  tmp = eigenvalues_vector[i];
209  eigenvalues_vector[i] = eigenvalues_vector[m_target_dim-i-1];
210  eigenvalues_vector[m_target_dim-i-1] = tmp;
211  }
212 
213  // finally construct embedding
214  for (i=0; i<m_target_dim; i++)
215  {
216  for (j=0; j<N; j++)
217  replace_feature_matrix[j*m_target_dim+i] *=
218  CMath::sqrt(eigenvalues_vector[i]);
219  }
220 
221  // set eigenvalues vector
223  m_eigenvalues = SGVector<float64_t>(eigenvalues_vector,m_target_dim,true);
224 #else /* not HAVE_ARPACK */
225  // using LAPACK
226  float64_t* eigenvalues_vector = SG_MALLOC(float64_t, N);
227  float64_t* eigenvectors = SG_MALLOC(float64_t, m_target_dim*N);
228  // solve eigenproblem with LAPACK
229  wrap_dsyevr('V','U',N,distance_matrix.matrix,N,N-m_target_dim+1,N,eigenvalues_vector,eigenvectors,&eigenproblem_status);
230  // check for failure
231  ASSERT(eigenproblem_status==0);
232 
233  // set eigenvalues vector
236  m_eigenvalues.do_free = false;
237 
238  // fill eigenvalues vector in backwards order
239  for (i=0; i<m_target_dim; i++)
240  m_eigenvalues.vector[i] = eigenvalues_vector[m_target_dim-i-1];
241 
242  SG_FREE(eigenvalues_vector);
243 
244  // construct embedding
245  for (i=0; i<m_target_dim; i++)
246  {
247  for (j=0; j<N; j++)
248  {
249  replace_feature_matrix[j*m_target_dim+i] =
250  eigenvectors[(m_target_dim-i-1)*N+j] * CMath::sqrt(m_eigenvalues.vector[i]);
251  }
252  }
253  SG_FREE(eigenvectors);
254 #endif /* HAVE_ARPACK else */
255 
256  // warn user if there are negative or zero eigenvalues
257  for (i=0; i<m_eigenvalues.vlen; i++)
258  {
259  if (m_eigenvalues.vector[i]<=0.0)
260  {
261  SG_WARNING("Embedding is not consistent (got neg eigenvalues): features %d-%d are wrong",
262  i, m_eigenvalues.vlen-1);
263  break;
264  }
265  }
266 
267  return SGMatrix<float64_t>(replace_feature_matrix,m_target_dim,N);
268 }
269 
271 {
272  ASSERT(m_target_dim>0);
273  ASSERT(distance_matrix.num_cols==distance_matrix.num_rows);
274  int32_t lmk_N = m_landmark_number;
275  int32_t i,j,t;
276  int32_t total_N = distance_matrix.num_cols;
277  if (lmk_N<3)
278  {
279  SG_ERROR("Number of landmarks (%d) should be greater than 3 for proper triangulation.\n",
280  lmk_N);
281  }
282  if (lmk_N>total_N)
283  {
284  SG_ERROR("Number of landmarks (%d) should be less than total number of vectors (%d).\n",
285  lmk_N, total_N);
286  }
287 
288  // get landmark indexes with random permutation
289  SGVector<int32_t> lmk_idxs = shuffle(lmk_N,total_N);
290  // compute distances between landmarks
291  float64_t* lmk_dist_matrix = SG_MALLOC(float64_t, lmk_N*lmk_N);
292  for (i=0; i<lmk_N; i++)
293  {
294  for (j=0; j<lmk_N; j++)
295  lmk_dist_matrix[i*lmk_N+j] =
296  distance_matrix[lmk_idxs.vector[i]*total_N+lmk_idxs.vector[j]];
297  }
298 
299  // get landmarks embedding
300  SGMatrix<float64_t> lmk_dist_sgmatrix(lmk_dist_matrix,lmk_N,lmk_N);
301  // compute mean vector of squared distances
302  float64_t* mean_sq_dist_vector = SG_CALLOC(float64_t, lmk_N);
303  for (i=0; i<lmk_N; i++)
304  {
305  for (j=0; j<lmk_N; j++)
306  mean_sq_dist_vector[i] += CMath::sq(lmk_dist_matrix[i*lmk_N+j]);
307 
308  mean_sq_dist_vector[i] /= lmk_N;
309  }
310  SGMatrix<float64_t> lmk_feature_matrix = classic_embedding(lmk_dist_sgmatrix);
311 
312  lmk_dist_sgmatrix.destroy_matrix();
313 
314  // construct new feature matrix
315  float64_t* new_feature_matrix = SG_CALLOC(float64_t, m_target_dim*total_N);
316 
317  // fill new feature matrix with embedded landmarks
318  for (i=0; i<lmk_N; i++)
319  {
320  for (j=0; j<m_target_dim; j++)
321  new_feature_matrix[lmk_idxs.vector[i]*m_target_dim+j] =
322  lmk_feature_matrix[i*m_target_dim+j];
323  }
324 
325  // get exactly defined pseudoinverse of landmarks feature matrix
327  for (i=0; i<lmk_N; i++)
328  {
329  for (j=0; j<m_target_dim; j++)
330  lmk_feature_matrix[i*m_target_dim+j] /= m_eigenvalues.vector[j];
331  }
332 
333 
334  // set to_process els true if should be processed
335  bool* to_process = SG_MALLOC(bool, total_N);
336  for (j=0; j<total_N; j++)
337  to_process[j] = true;
338  for (j=0; j<lmk_N; j++)
339  to_process[lmk_idxs.vector[j]] = false;
340 
341  // get embedding for non-landmark vectors
342 #ifdef HAVE_PTHREAD
343  int32_t num_threads = parallel->get_num_threads();
344  ASSERT(num_threads>0);
345  // allocate threads and it's parameters
346  pthread_t* threads = SG_MALLOC(pthread_t, num_threads);
347  TRIANGULATION_THREAD_PARAM* parameters = SG_MALLOC(TRIANGULATION_THREAD_PARAM, num_threads);
348  pthread_attr_t attr;
349  pthread_attr_init(&attr);
350  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
351  float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N*num_threads);
352  // run threads
353  for (t=0; t<num_threads; t++)
354  {
355  parameters[t].idx_start = t;
356  parameters[t].idx_stop = total_N;
357  parameters[t].idx_step = num_threads;
358  parameters[t].lmk_N = lmk_N;
359  parameters[t].total_N = total_N;
360  parameters[t].m_target_dim = m_target_dim;
361  parameters[t].current_dist_to_lmks = current_dist_to_lmks+t*lmk_N;
362  parameters[t].distance_matrix = distance_matrix.matrix;
363  parameters[t].mean_sq_dist_vector = mean_sq_dist_vector;
364  parameters[t].lmk_idxs = lmk_idxs.vector;
365  parameters[t].lmk_feature_matrix = lmk_feature_matrix.matrix;
366  parameters[t].new_feature_matrix = new_feature_matrix;
367  parameters[t].to_process = to_process;
368  pthread_create(&threads[t], &attr, run_triangulation_thread, (void*)&parameters[t]);
369  }
370  // join threads
371  for (t=0; t<num_threads; t++)
372  pthread_join(threads[t], NULL);
373  pthread_attr_destroy(&attr);
374  SG_FREE(parameters);
375  SG_FREE(threads);
376 #else
377  // run single 'thread'
378  float64_t* current_dist_to_lmks = SG_MALLOC(float64_t, lmk_N);
379  TRIANGULATION_THREAD_PARAM single_thread_param;
380  single_thread_param.idx_start = 0;
381  single_thread_param.idx_stop = total_N;
382  single_thread_param.idx_step = 1;
383  single_thread_param.lmk_N = lmk_N;
384  single_thread_param.total_N = total_N;
385  single_thread_param.m_target_dim = m_target_dim;
386  single_thread_param.current_dist_to_lmks = current_dist_to_lmks;
387  single_thread_param.distance_matrix = distance_matrix.matrix;
388  single_thread_param.mean_sq_dist_vector = mean_sq_dist_vector;
389  single_thread_param.lmk_idxs = lmk_idxs.vector;
390  single_thread_param.lmk_feature_matrix = lmk_feature_matrix.matrix;
391  single_thread_param.new_feature_matrix = new_feature_matrix;
392  single_thread_param.to_process = to_process;
393  run_triangulation_thread((void*)&single_thread_param);
394 #endif
395  // cleanup
396  lmk_feature_matrix.destroy_matrix();
397  SG_FREE(current_dist_to_lmks);
398  lmk_idxs.destroy_vector();
399  SG_FREE(mean_sq_dist_vector);
400  SG_FREE(to_process);
401  lmk_idxs.destroy_vector();
402 
403  return SGMatrix<float64_t>(new_feature_matrix,m_target_dim,total_N);
404 }
405 
407 {
408  TRIANGULATION_THREAD_PARAM* parameters = (TRIANGULATION_THREAD_PARAM*)p;
409  int32_t idx_start = parameters->idx_start;
410  int32_t idx_step = parameters->idx_step;
411  int32_t idx_stop = parameters->idx_stop;
412  const int32_t* lmk_idxs = parameters->lmk_idxs;
413  const float64_t* distance_matrix = parameters->distance_matrix;
414  const float64_t* mean_sq_dist_vector = parameters->mean_sq_dist_vector;
415  float64_t* current_dist_to_lmks = parameters->current_dist_to_lmks;
416  int32_t m_target_dim = parameters->m_target_dim;
417  int32_t lmk_N = parameters->lmk_N;
418  int32_t total_N = parameters->total_N;
419  const bool* to_process = parameters->to_process;
420  float64_t* lmk_feature_matrix = parameters->lmk_feature_matrix;
421  float64_t* new_feature_matrix = parameters->new_feature_matrix;
422 
423  int32_t i,k;
424  for (i=idx_start; i<idx_stop; i+=idx_step)
425  {
426  // skip if landmark
427  if (!to_process[i])
428  continue;
429 
430  // compute difference from mean landmark distance vector
431  for (k=0; k<lmk_N; k++)
432  {
433  current_dist_to_lmks[k] =
434  CMath::sq(distance_matrix[i*total_N+lmk_idxs[k]]) -
435  mean_sq_dist_vector[k];
436  }
437  // compute embedding
438  cblas_dgemv(CblasColMajor,CblasNoTrans,
439  m_target_dim,lmk_N,
440  -0.5,lmk_feature_matrix,m_target_dim,
441  current_dist_to_lmks,1,
442  0.0,(new_feature_matrix+i*m_target_dim),1);
443  }
444  return NULL;
445 }
446 
447 
448 SGVector<int32_t> CMultidimensionalScaling::shuffle(int32_t count, int32_t total_count)
449 {
450  int32_t* idxs = SG_MALLOC(int32_t, total_count);
451  int32_t i,rnd;
452  int32_t* permuted_idxs = SG_MALLOC(int32_t, count);
453 
454  // reservoir sampling
455  for (i=0; i<total_count; i++)
456  idxs[i] = i;
457  for (i=0; i<count; i++)
458  permuted_idxs[i] = idxs[i];
459  for (i=count; i<total_count; i++)
460  {
461  rnd = CMath::random(1,i);
462  if (rnd<count)
463  permuted_idxs[rnd] = idxs[i];
464  }
465  SG_FREE(idxs);
466 
467  CMath::qsort(permuted_idxs,count);
468  return SGVector<int32_t>(permuted_idxs, count);
469 }
470 
471 #endif /* HAVE_LAPACK */

SHOGUN Machine Learning Toolbox - Documentation