SHOGUN  v3.0.1
 All  Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros  
KNN.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2006 Christian Gehl
8  * Written (W) 2006-2009 Soeren Sonnenburg
9  * Written (W) 2011 Sergey Lisitsyn
10  * Written (W) 2012 Fernando José Iglesias García, cover tree support
11  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society
12  */
13 
14 #include <shogun/multiclass/KNN.h>
15 #include <shogun/labels/Labels.h>
18 #include <shogun/lib/Signal.h>
19 #include <shogun/lib/JLCoverTree.h>
20 #include <shogun/lib/Time.h>
21 #include <shogun/base/Parameter.h>
22 
23 //#define BENCHMARK_KNN
24 //#define DEBUG_KNN
25 
26 using namespace shogun;
27 
// Default constructor. NOTE(review): the signature line (embedded lines 28-29)
// was lost in the extraction — presumably `CKNN::CKNN() : CDistanceMachine()`;
// confirm against the original repository. All member initialization is
// delegated to init().
30 {
31  init();
32 }
33 
// Constructor taking the KNN hyper-parameter, the distance measure and the
// training labels.
//
// @param k        number of nearest neighbors used for classification
// @param d        distance measure between feature vectors (asserted non-NULL)
// @param trainlab labels of the training examples (asserted non-NULL)
//
// NOTE(review): embedded lines 35 and 46 are missing from this listing — line
// 35 is presumably the base-class initializer `: CDistanceMachine()` and line
// 46 a trailing statement; confirm against the original source before editing.
34 CKNN::CKNN(int32_t k, CDistance* d, CLabels* trainlab)
36 {
37  init();
38 
39  m_k=k;
40 
41  ASSERT(d)
42  ASSERT(trainlab)
43 
44  set_distance(d);
45  set_labels(trainlab);
47 }
48 
// Set default values for all parameters and register them with the Shogun
// parameter framework (SG_ADD), which enables serialization and, where marked
// MS_AVAILABLE, model selection.
//
// NOTE(review): embedded line 53 is missing from this listing; judging by the
// comment directly above it, it was probably a call disabling model-feature
// storage (e.g. `set_store_model_features(false);`) — TODO confirm.
49 void CKNN::init()
50 {
51  /* do not store model features by default (CDistanceMachine::apply(...) is
52  * overwritten */
54 
55  m_k=3;
56  m_q=1.0;
57  m_use_covertree=false;
58  m_num_classes=0;
59 
60  /* use the method classify_multiply_k to experiment with different values
61  * of k */
62  SG_ADD(&m_k, "m_k", "Parameter k", MS_NOT_AVAILABLE);
63  SG_ADD(&m_q, "m_q", "Parameter q", MS_AVAILABLE);
64  SG_ADD(&m_use_covertree, "m_use_covertree", "Parameter use_covertree", MS_NOT_AVAILABLE);
65  SG_ADD(&m_num_classes, "m_num_classes", "Number of classes", MS_NOT_AVAILABLE);
66 }
67 
// Destructor (signature line, embedded line 68 `CKNN::~CKNN()`, was dropped by
// the extraction — see the trailing Doxygen index, which places ~CKNN at
// KNN.cpp:68). Empty body: members are released by their own destructors and
// by the base class.
69 {
70 }
71 
// Train the classifier: store the training labels shifted into the range
// 0..num_classes-1 and record the smallest label (m_min_label) so predictions
// can be shifted back. The signature (embedded line 72) was dropped by the
// extraction; the trailing Doxygen index gives it as
// `bool CKNN::train_machine(CFeatures* data=NULL)` at KNN.cpp:72.
//
// NOTE(review): embedded lines 74-75 (presumably ASSERTs on m_labels/distance)
// and 85-87 (presumably cloning `lab` into m_train_labels) are missing from
// this listing — confirm against the original source.
73 {
76 
77  if (data)
78  {
79  if (m_labels->get_num_labels() != data->get_num_vectors())
80  SG_ERROR("Number of training vectors does not match number of labels\n")
81  distance->init(data, data);
82  }
83 
84  SGVector<int32_t> lab=((CMulticlassLabels*) m_labels)->get_int_labels();
88 
89  int32_t max_class=m_train_labels.vector[0];
90  int32_t min_class=m_train_labels.vector[0];
91 
92  // single pass over the training labels to find their range
93  {
94  max_class=CMath::max(max_class, m_train_labels.vector[i]);
95  min_class=CMath::min(min_class, m_train_labels.vector[i]);
96  }
97 
98  // shift all labels so the smallest becomes 0 (undone at prediction time)
99  m_train_labels.vector[i]-=min_class;
100 
101  m_min_label=min_class;
102  m_num_classes=max_class-min_class+1;
103 
104  SG_INFO("m_num_classes: %d (%+d to %+d) num_train: %d\n", m_num_classes,
105  min_class, max_class, m_train_labels.vlen);
106 
107  return true;
108 }
109 
// Compute, by brute force, the indices of the k nearest training examples for
// every vector on the distance's right-hand side. Returns an m_k x n matrix
// NN where NN(j,i) is the index (into the training set) of the j-th nearest
// neighbor of test example i, ordered by increasing distance (qsort_index
// sorts the distances and permutes the index array alongside). The signature
// (embedded line 110) was dropped; the Doxygen index gives it as
// `SGMatrix<index_t> CKNN::nearest_neighbors()` at KNN.cpp:110.
111 {
112  //number of examples to which kNN is applied
113  int32_t n=distance->get_num_vec_rhs();
114  //distances to train data
115  float64_t* dists=SG_MALLOC(float64_t, m_train_labels.vlen);
116  //indices to train data
117  index_t* train_idxs=SG_MALLOC(index_t, m_train_labels.vlen);
118  //pre-allocation of the nearest neighbors
119  SGMatrix<index_t> NN(m_k, n);
120 
121  //for each test example (loop aborts early if the user cancels)
122  for (int32_t i=0; i<n && (!CSignal::cancel_computations()); i++)
123  {
124  SG_PROGRESS(i, 0, n)
125 
126  //lhs idx 0..num train examples-1 (i.e., all train examples) and rhs idx i
127  distances_lhs(dists,0,m_train_labels.vlen-1,i);
128 
129  //fill in an array with 0..num train examples-1
130  for (int32_t j=0; j<m_train_labels.vlen; j++)
131  train_idxs[j]=j;
132 
133  //sort the distance vector between test example i and all train examples
134  CMath::qsort_index(dists, train_idxs, m_train_labels.vlen);
135 
136 #ifdef DEBUG_KNN
137  SG_PRINT("\nQuick sort query %d\n", i)
138  for (int32_t j=0; j<m_k; j++)
139  SG_PRINT("%d ", train_idxs[j])
140  SG_PRINT("\n")
141 #endif
142 
143  //fill in the output the indices of the nearest neighbors
144  for (int32_t j=0; j<m_k; j++)
145  NN(j,i) = train_idxs[j];
146  }
147 
148  SG_FREE(train_idxs);
149  SG_FREE(dists);
150 
151  return NN;
152 }
153 
// Classify all test examples (the distance's rhs features) by majority vote
// among the k nearest training examples, via either brute force (default) or
// a JL cover tree (m_use_covertree). The signature (embedded line 154) was
// dropped; the Doxygen index gives it as
// `CMulticlassLabels* CKNN::apply_multiclass(CFeatures* data=NULL)`.
//
// NOTE(review): several hyperlinked lines are missing from this listing:
//  - lines 163-165 and 176 (presumably ASSERTs / progress setup),
//  - line 189 (presumably `SGMatrix<index_t> NN = nearest_neighbors();` —
//    `NN` is used at line 196 but never declared here),
//  - line 216 (presumably `m_q = 1.0;`, matching the comment at 211-215),
//  - lines 220/222 (the `parse_points(...)` calls initializing the two
//    v_arrays),
//  - line 230 (the declaration of `r`, used at 232-233),
//  - line 243 (the declaration of `res`, used at 244).
// Confirm against the original repository before editing this function.
155 {
156  if (data)
157  init_distance(data);
158 
159  //redirecting to fast (without sorting) classify if k==1
160  if (m_k == 1)
161  return classify_NN();
162 
166 
167  int32_t num_lab=distance->get_num_vec_rhs();
168  ASSERT(m_k<=distance->get_num_vec_lhs())
169 
170  CMulticlassLabels* output=new CMulticlassLabels(num_lab);
171 
172  //labels of the k nearest neighbors
173  int32_t* train_lab=SG_MALLOC(int32_t, m_k);
174 
175  SG_INFO("%d test examples\n", num_lab)
177 
178  //histogram of classes and returned output
179  float64_t* classes=SG_MALLOC(float64_t, m_num_classes);
180 
181 #ifdef BENCHMARK_KNN
182  CTime tstart;
183  float64_t tfinish, tparsed, tcreated, tqueried;
184 #endif
185 
186  if ( ! m_use_covertree )
187  {
188  //get the k nearest neighbors of each example
190 
191  //from the indices to the nearest neighbors, compute the class labels
192  for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
193  {
194  //write the labels of the k nearest neighbors from theirs indices
195  for (int32_t j=0; j<m_k; j++)
196  train_lab[j] = m_train_labels[ NN(j,i) ];
197 
198  //get the index of the 'nearest' class
199  int32_t out_idx = choose_class(classes, train_lab);
200  //write the label of 'nearest' in the output
201  output->set_label(i, out_idx + m_min_label);
202  }
203 
204 #ifdef BENCHMARK_KNN
205  SG_PRINT(">>>> Quick sort applied in %9.4f\n",
206  (tfinish = tstart.cur_time_diff(false)));
207 #endif
208  }
209  else // Use cover tree
210  {
211  // m_q != 1.0 not supported with cover tree because the neighbors
212  // are not retrieved in increasing order of distance to the query
213  float64_t old_q = m_q;
214  if ( old_q != 1.0 )
215  SG_INFO("q != 1.0 not supported with cover tree, using q = 1\n")
216 
217  // From the sets of features (lhs and rhs) stored in distance,
218  // build arrays of cover tree points
219  v_array< CJLCoverTreePoint > set_of_points =
221  v_array< CJLCoverTreePoint > set_of_queries =
223 
224 #ifdef BENCHMARK_KNN
225  SG_PRINT(">>>> JL parsed in %9.4f\n",
226  ( tparsed = tstart.cur_time_diff(false) ) - tfinish);
227 #endif
228  // Build the cover trees, one for the test vectors (rhs features)
229  // and another for the training vectors (lhs features)
231  node< CJLCoverTreePoint > top = batch_create(set_of_points);
232  CFeatures* l = distance->replace_lhs(r);
233  distance->replace_rhs(r);
234  node< CJLCoverTreePoint > top_query = batch_create(set_of_queries);
235 
236 #ifdef BENCHMARK_KNN
237  SG_PRINT(">>>> Cover trees created in %9.4f\n",
238  (tcreated = tstart.cur_time_diff(false)) - tparsed);
239 #endif
240 
241  // Get the k nearest neighbors to all the test vectors (batch method)
242  distance->replace_lhs(l);
244  k_nearest_neighbor(top, top_query, res, m_k);
245 
246 #ifdef BENCHMARK_KNN
247  SG_PRINT(">>>> Query finished in %9.4f\n",
248  (tqueried = tstart.cur_time_diff(false)) - tcreated);
249 #endif
250 
251 #ifdef DEBUG_KNN
252  SG_PRINT("\nJL Results:\n")
253  for ( int32_t i = 0 ; i < res.index ; ++i )
254  {
255  for ( int32_t j = 0 ; j < res[i].index ; ++j )
256  {
257  printf("%d ", res[i][j].m_index);
258  }
259  printf("\n");
260  }
261  SG_PRINT("\n")
262 #endif
263 
264  for ( int32_t i = 0 ; i < res.index ; ++i )
265  {
266  // Translate from indices to labels of the nearest neighbors
267  for ( int32_t j = 0; j < m_k; ++j )
268  // The first index in res[i] points to the test vector
269  train_lab[j] = m_train_labels.vector[ res[i][j+1].m_index ];
270 
271  // Get the index of the 'nearest' class
272  int32_t out_idx = choose_class(classes, train_lab);
273  output->set_label(res[i][0].m_index, out_idx+m_min_label);
274  }
275 
276  m_q = old_q;
277 
278 #ifdef BENCHMARK_KNN
279  SG_PRINT(">>>> JL applied in %9.4f\n", tstart.cur_time_diff(false))
280 #endif
281  }
282 
283  SG_FREE(classes);
284  SG_FREE(train_lab);
285 
286  return output;
287 }
288 
// Fast 1-NN classification: no sorting, a single linear scan per test example
// to find the minimum distance. Used by apply_multiclass when m_k == 1. The
// signature (embedded line 289) was dropped; the Doxygen index gives it as
// `CMulticlassLabels* CKNN::classify_NN()` at KNN.cpp:289.
// NOTE(review): embedded lines 291-293 and 301 are missing (presumably
// ASSERTs and progress setup) — confirm against the original source.
290 {
293 
294  int32_t num_lab = distance->get_num_vec_rhs();
295  ASSERT(num_lab)
296 
297  CMulticlassLabels* output = new CMulticlassLabels(num_lab);
298  float64_t* distances = SG_MALLOC(float64_t, m_train_labels.vlen);
299 
300  SG_INFO("%d test examples\n", num_lab)
302 
303  // for each test example (aborts early if the user cancels)
304  for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
305  {
306  SG_PROGRESS(i,0,num_lab)
307 
308  // get distances from i-th test example to 0..num_m_train_labels-1 train examples
309  distances_lhs(distances,0,m_train_labels.vlen-1,i);
310  int32_t j;
311 
312  // assuming 0th train examples as nearest to i-th test example
313  int32_t out_idx = 0;
314  float64_t min_dist = distances[0];
315 
316  // searching for nearest neighbor by comparing distances
317  for (j=0; j<m_train_labels.vlen; j++)
318  {
319  if (distances[j]<min_dist)
320  {
321  min_dist = distances[j];
322  out_idx = j;
323  }
324  }
325 
326  // label i-th test example with label of nearest neighbor with out_idx index
327  output->set_label(i,m_train_labels.vector[out_idx]+m_min_label);
328  }
329 
330  SG_FREE(distances);
331  return output;
332 }
333 
// Classify all test examples for EVERY k in 1..m_k at once: the returned
// num_lab x m_k matrix holds, in column j, the predictions obtained using the
// j+1 nearest neighbors. The signature (embedded line 334) was dropped; the
// Doxygen index gives it as `SGMatrix<int32_t> CKNN::classify_for_multiple_k()`
// at KNN.cpp:334.
//
// NOTE(review): missing hyperlinked lines in this listing:
//  - 336-338, 352 (presumably ASSERTs / setup),
//  - 357 (presumably `SGMatrix<index_t> NN = nearest_neighbors();` — `NN` is
//    used at line 363 but never declared here),
//  - 376/378 (the `parse_points(...)` calls), 382 (declaration of `r`, used
//    at 384-385), 390 (declaration of `res`, used at 391).
// Confirm against the original repository before editing.
335 {
339 
340  int32_t num_lab=distance->get_num_vec_rhs();
341  ASSERT(m_k<=num_lab)
342 
343  int32_t* output=SG_MALLOC(int32_t, m_k*num_lab);
344 
345  //working buffer of m_train_labels
346  int32_t* train_lab=SG_MALLOC(int32_t, m_k);
347 
348  //histogram of classes and returned output
349  int32_t* classes=SG_MALLOC(int32_t, m_num_classes);
350 
351  SG_INFO("%d test examples\n", num_lab)
353 
354  if ( ! m_use_covertree )
355  {
356  //get the k nearest neighbors of each example
358 
359  for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++)
360  {
361  //write the labels of the k nearest neighbors from theirs indices
362  for (int32_t j=0; j<m_k; j++)
363  train_lab[j] = m_train_labels[ NN(j,i) ];
364 
365  choose_class_for_multiple_k(output+i, classes, train_lab, num_lab);
366  }
367  }
368  else // Use cover tree
369  {
370  //allocation for distances to nearest neighbors
371  float64_t* dists=SG_MALLOC(float64_t, m_k);
372 
373  // From the sets of features (lhs and rhs) stored in distance,
374  // build arrays of cover tree points
375  v_array< CJLCoverTreePoint > set_of_points =
377  v_array< CJLCoverTreePoint > set_of_queries =
379 
380  // Build the cover trees, one for the test vectors (rhs features)
381  // and another for the training vectors (lhs features)
383  node< CJLCoverTreePoint > top = batch_create(set_of_points);
384  CFeatures* l = distance->replace_lhs(r);
385  distance->replace_rhs(r);
386  node< CJLCoverTreePoint > top_query = batch_create(set_of_queries);
387 
388  // Get the k nearest neighbors to all the test vectors (batch method)
389  distance->replace_lhs(l);
391  k_nearest_neighbor(top, top_query, res, m_k);
392 
393  for ( int32_t i = 0 ; i < res.index ; ++i )
394  {
395  // Handle the fact that cover tree doesn't return neighbors
396  // ordered by distance
397 
398  for ( int32_t j = 0 ; j < m_k ; ++j )
399  {
400  // The first index in res[i] points to the test vector
401  dists[j] = distance->distance(res[i][j+1].m_index,
402  res[i][0].m_index);
403  train_lab[j] = m_train_labels.vector[
404  res[i][j+1].m_index ];
405  }
406 
407  // Now we get the indices to the neighbors sorted by distance
408  CMath::qsort_index(dists, train_lab, m_k);
409 
410  choose_class_for_multiple_k(output+res[i][0].m_index, classes,
411  train_lab, num_lab);
412  }
413 
414  SG_FREE(dists);
415  }
416 
417  SG_FREE(train_lab);
418  SG_FREE(classes);
419 
420  return SGMatrix<int32_t>(output,num_lab,m_k,true);
421 }
422 
// Re-initialize the distance so its lhs stays the stored training features
// and its rhs becomes `data` (the test features). Errors out if no distance
// was assigned or the lhs is empty. The signature (embedded line 423) was
// dropped; the Doxygen index gives it as
// `void CKNN::init_distance(CFeatures* data)` at KNN.cpp:423.
424 {
425  if (!distance)
426  SG_ERROR("No distance assigned!\n")
427  CFeatures* lhs=distance->get_lhs();
428  if (!lhs || !lhs->get_num_vectors())
429  {
430  SG_UNREF(lhs);
431  SG_ERROR("No vectors on left hand side\n")
432  }
433  distance->init(lhs, data);
434  SG_UNREF(lhs);  // init() took its own reference; drop ours
435 }
436 
// Deserialization from a raw FILE* — not implemented; always returns false.
// NOTE(review): embedded lines 439-440 are missing (presumably the
// SG_SET_LOCALE_C / SG_RESET_LOCALE pair, cf. the macros in the index).
437 bool CKNN::load(FILE* srcfile)
438 {
441  return false;
442 }
443 
// Serialization to a raw FILE* — not implemented; always returns false.
// NOTE(review): embedded lines 446-447 are missing (presumably the
// SG_SET_LOCALE_C / SG_RESET_LOCALE pair, mirroring load()).
444 bool CKNN::save(FILE* dstfile)
445 {
448  return false;
449 }
450 
// Make the model self-contained: replace the distance's lhs (the training
// features) with a duplicate owned by this machine, so the model survives the
// original features going away. The signature (embedded line 451) was dropped;
// the Doxygen index gives it as `void CKNN::store_model_features()` at
// KNN.cpp:451.
452 {
453  CFeatures* d_lhs=distance->get_lhs();
454  CFeatures* d_rhs=distance->get_rhs();
455 
456  /* copy lhs of underlying distance */
457  distance->init(d_lhs->duplicate(), d_rhs);
458 
459  SG_UNREF(d_lhs);  // drop the references taken by get_lhs()/get_rhs()
460  SG_UNREF(d_rhs);
461 }
462 
463 int32_t CKNN::choose_class(float64_t* classes, int32_t* train_lab)
464 {
465  memset(classes, 0, sizeof(float64_t)*m_num_classes);
466 
467  float64_t multiplier = m_q;
468  for (int32_t j=0; j<m_k; j++)
469  {
470  classes[train_lab[j]]+= multiplier;
471  multiplier*= multiplier;
472  }
473 
474  //choose the class that got 'outputted' most often
475  int32_t out_idx=0;
476  float64_t out_max=0;
477 
478  for (int32_t j=0; j<m_num_classes; j++)
479  {
480  if (out_max< classes[j])
481  {
482  out_idx= j;
483  out_max= classes[j];
484  }
485  }
486 
487  return out_idx;
488 }
489 
490 void CKNN::choose_class_for_multiple_k(int32_t* output, int32_t* classes, int32_t* train_lab, int32_t step)
491 {
492  //compute histogram of class outputs of the first k nearest neighbours
493  memset(classes, 0, sizeof(int32_t)*m_num_classes);
494 
495  for (int32_t j=0; j<m_k; j++)
496  {
497  classes[train_lab[j]]++;
498 
499  //choose the class that got 'outputted' most often
500  int32_t out_idx=0;
501  int32_t out_max=0;
502 
503  for (int32_t c=0; c<m_num_classes; c++)
504  {
505  if (out_max< classes[c])
506  {
507  out_idx= c;
508  out_max= classes[c];
509  }
510  }
511 
512  output[j*step]=out_idx+m_min_label;
513  }
514 }
Class Time that implements a stopwatch based on either cpu time or wall clock time.
Definition: Time.h:46
virtual void store_model_features()
Definition: KNN.cpp:451
#define SG_INFO(...)
Definition: SGIO.h:120
#define SG_RESET_LOCALE
Definition: SGIO.h:88
static T * clone_vector(const T *vec, int32_t len)
Definition: SGVector.cpp:263
virtual bool save(FILE *dstfile)
Definition: KNN.cpp:444
Class Distance, a base class for all the distances used in the Shogun toolbox.
Definition: Distance.h:80
int32_t index_t
Definition: common.h:60
#define SG_PROGRESS(...)
Definition: SGIO.h:144
void init_distance(CFeatures *data)
Definition: KNN.cpp:423
CFeatures * get_lhs()
Definition: Distance.h:194
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:35
virtual int32_t get_num_labels() const =0
static void qsort_index(T1 *output, T2 *index, uint32_t size)
Definition: Math.h:1499
node< P > batch_create(v_array< P > points)
Definition: JLCoverTree.h:292
CFeatures * replace_lhs(CFeatures *lhs)
Definition: Distance.cpp:167
SGMatrix< int32_t > classify_for_multiple_k()
Definition: KNN.cpp:334
Class v_array taken directly from JL's implementation.
virtual int32_t get_num_vectors() const =0
CLabels * m_labels
Definition: Machine.h:356
void distances_lhs(float64_t *result, int32_t idx_a1, int32_t idx_a2, int32_t idx_b)
#define SG_ERROR(...)
Definition: SGIO.h:131
CFeatures * get_rhs()
Definition: Distance.h:200
virtual CFeatures * duplicate() const =0
int32_t m_min_label
smallest label, i.e. -1
Definition: KNN.h:243
virtual bool train_machine(CFeatures *data=NULL)
Definition: KNN.cpp:72
SGMatrix< index_t > nearest_neighbors()
Definition: KNN.cpp:110
#define SG_SET_LOCALE_C
Definition: SGIO.h:87
A generic DistanceMachine interface.
bool set_label(int32_t idx, float64_t label)
virtual bool load(FILE *srcfile)
Definition: KNN.cpp:437
v_array< CJLCoverTreePoint > parse_points(CDistance *distance, EFeaturesContainer fc)
int32_t m_num_classes
number of classes (i.e. number of values labels can take)
Definition: KNN.h:240
Multiclass Labels for multi-class classification.
float64_t cur_time_diff(bool verbose=false)
Definition: Time.cpp:67
int32_t m_k
the k parameter in KNN
Definition: KNN.h:231
#define SG_PRINT(...)
Definition: SGIO.h:139
virtual void set_store_model_features(bool store_model)
Definition: Machine.cpp:117
#define ASSERT(x)
Definition: SGIO.h:203
static void clear_cancel()
Definition: Signal.cpp:128
double float64_t
Definition: common.h:48
static T max(T a, T b)
return the maximum of two integers
Definition: Math.h:167
virtual int32_t get_num_vec_rhs()
Definition: Distance.h:291
static bool cancel_computations()
Definition: Signal.h:85
CFeatures * replace_rhs(CFeatures *rhs)
Definition: Distance.cpp:145
float64_t m_q
parameter q of rank weighting
Definition: KNN.h:234
SGVector< int32_t > m_train_labels
Definition: KNN.h:246
shogun matrix
Definition: DenseFeatures.h:27
virtual float64_t distance(int32_t idx_a, int32_t idx_b)
Definition: Distance.cpp:189
#define SG_UNREF(x)
Definition: SGObject.h:54
virtual ~CKNN()
Definition: KNN.cpp:68
The class Features is the base class of all feature objects.
Definition: Features.h:62
static T min(T a, T b)
return the minimum of two integers
Definition: Math.h:160
void set_distance(CDistance *d)
virtual CMulticlassLabels * classify_NN()
Definition: KNN.cpp:289
virtual CMulticlassLabels * apply_multiclass(CFeatures *data=NULL)
Definition: KNN.cpp:154
void k_nearest_neighbor(const node< P > &top_node, const node< P > &query, v_array< v_array< P > > &results, int k)
Definition: JLCoverTree.h:825
#define SG_ADD(...)
Definition: SGObject.h:83
virtual bool init(CFeatures *lhs, CFeatures *rhs)
Definition: Distance.cpp:78
virtual void set_labels(CLabels *lab)
Definition: Machine.cpp:75
bool m_use_covertree
parameter to enable cover tree support
Definition: KNN.h:237
index_t vlen
Definition: SGVector.h:706

SHOGUN Machine Learning Toolbox - Documentation