SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
SparseFeatures.cpp
Go to the documentation of this file.
4 #include <shogun/lib/DataType.h>
5 #include <shogun/io/SGIO.h>
6 
7 #include <string.h>
8 #include <stdlib.h>
9 
10 namespace shogun
11 {
12 
13 template<class ST> CSparseFeatures<ST>::CSparseFeatures(int32_t size)
14 : CDotFeatures(size), num_vectors(0), num_features(0),
15  sparse_feature_matrix(NULL), feature_cache(NULL)
16 {
17  init();
18 }
19 
21  int32_t num_feat, int32_t num_vec, bool copy)
22 : CDotFeatures(0), num_vectors(0), num_features(0),
23  sparse_feature_matrix(NULL), feature_cache(NULL)
24 {
25  init();
26 
27  if (!copy)
28  set_sparse_feature_matrix(SGSparseMatrix<ST>(src, num_feat, num_vec));
29  else
30  {
32  memcpy(sparse_feature_matrix, src, sizeof(SGSparseVector<ST>)*num_vec);
33  for (int32_t i=0; i< num_vec; i++)
34  {
36  memcpy(sparse_feature_matrix[i].features, src[i].features, sizeof(SGSparseVectorEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
37 
38  }
39  }
40 }
41 
43 : CDotFeatures(0), num_vectors(0), num_features(0),
44  sparse_feature_matrix(NULL), feature_cache(NULL)
45 {
46  init();
47 
49 }
50 
52 : CDotFeatures(0), num_vectors(0), num_features(0),
53  sparse_feature_matrix(NULL), feature_cache(NULL)
54 {
55  init();
56 
58 }
59 
60 template<class ST> CSparseFeatures<ST>::CSparseFeatures(const CSparseFeatures & orig)
61 : CDotFeatures(orig), num_vectors(orig.num_vectors),
62  num_features(orig.num_features),
63  sparse_feature_matrix(orig.sparse_feature_matrix),
64  feature_cache(orig.feature_cache)
65 {
66  init();
67 
68  if (orig.sparse_feature_matrix)
69  {
73  for (int32_t i=0; i< num_vectors; i++)
74  {
77 
78  }
79  }
80 
81  m_subset=orig.m_subset->duplicate();
82 }
83 template<class ST> CSparseFeatures<ST>::CSparseFeatures(CFile* loader)
84 : CDotFeatures(loader), num_vectors(0), num_features(0),
85  sparse_feature_matrix(NULL), feature_cache(NULL)
86 {
87  init();
88 
89  load(loader);
90 }
91 
93 {
94  free_sparse_features();
95 }
97 {
98  clean_tsparse(sparse_feature_matrix, num_vectors);
99  sparse_feature_matrix = NULL;
100  num_vectors=0;
101  num_features=0;
102  remove_subset();
103 }
105 {
106  free_sparse_feature_matrix();
107  delete feature_cache;
108  feature_cache = NULL;
109 }
/* Clone this feature object via the copy constructor. */
110 template<class ST> CFeatures* CSparseFeatures<ST>::duplicate() const
111 {
112  return new CSparseFeatures<ST>(*this);
113 }
114 
115 template<class ST> ST CSparseFeatures<ST>::get_feature(int32_t num, int32_t index)
116 {
117  ASSERT(index>=0 && index<num_features) ;
118  ASSERT(num>=0 && num<get_num_vectors()) ;
119 
120  int32_t i;
121  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
122  ST ret = 0 ;
123 
124  if (sv.features)
125  {
126  for (i=0; i<sv.num_feat_entries; i++)
127  if (sv.features[i].feat_index==index)
128  ret+=sv.features[i].entry ;
129  }
130 
131  free_sparse_feature_vector(sv, num);
132 
133  return ret ;
134 }
135 
136 template<class ST> ST* CSparseFeatures<ST>::get_full_feature_vector(int32_t num, int32_t& len)
137 {
138  int32_t i;
139  len=0;
140  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
141  ST* fv=NULL;
142 
143  if (sv.features)
144  {
145  len=num_features;
146  fv=SG_MALLOC(ST, num_features);
147 
148  for (i=0; i<num_features; i++)
149  fv[i]=0;
150 
151  for (i=0; i<sv.num_feat_entries; i++)
152  fv[sv.features[i].feat_index]= sv.features[i].entry;
153  }
154 
155  free_sparse_feature_vector(sv, num);
156 
157  return fv;
158 }
159 
161 {
162  if (num>=num_vectors)
163  {
164  SG_ERROR("Index out of bounds (number of vectors %d, you "
165  "requested %d)\n", num_vectors, num);
166  }
167 
168  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
169 
170  SGVector<ST> dense;
171 
172  if (sv.features)
173  {
174  dense.do_free=true;
175  dense.vlen=num_features;
176  dense.vector=SG_MALLOC(ST, num_features);
177  memset(dense.vector, 0, sizeof(ST)*num_features);
178 
179  for (int32_t i=0; i<sv.num_feat_entries; i++)
180  {
181  dense.vector[sv.features[i].feat_index]= sv.features[i].entry;
182  }
183  }
184 
185  free_sparse_feature_vector(sv, num);
186 
187  return dense;
188 }
189 
190 template<class ST> int32_t CSparseFeatures<ST>::get_nnz_features_for_vector(int32_t num)
191 {
192  SGSparseVector<ST> sv = get_sparse_feature_vector(num);
193  int32_t len=sv.num_feat_entries;
194  free_sparse_feature_vector(sv, num);
195  return len;
196 }
197 
199 {
200  ASSERT(num<get_num_vectors());
201 
202  index_t real_num=subset_idx_conversion(num);
203 
204  SGSparseVector<ST> result;
205 
206  if (sparse_feature_matrix)
207  {
208  result=sparse_feature_matrix[real_num];
209  result.do_free=false;
210  return result;
211  }
212  else
213  {
214  result.do_free=false;
215 
216  if (feature_cache)
217  {
218  result.features=feature_cache->lock_entry(num);
219 
220  if (result.features)
221  return result;
222  else
223  {
224  result.features=feature_cache->set_entry(num);
225  }
226  }
227 
228  if (!result.features)
229  result.do_free=true;
230 
231  result.features=compute_sparse_feature_vector(num,
232  result.num_feat_entries, result.features);
233 
234 
235  if (get_num_preprocessors())
236  {
237  int32_t tmp_len=result.num_feat_entries;
238  SGSparseVectorEntry<ST>* tmp_feat_before=result.features;
239  SGSparseVectorEntry<ST>* tmp_feat_after = NULL;
240 
241  for (int32_t i=0; i<get_num_preprocessors(); i++)
242  {
243  //tmp_feat_after=((CSparsePreprocessor<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len);
244 
245  if (i!=0) // delete feature vector, except for the the first one, i.e., feat
246  SG_FREE(tmp_feat_before);
247  tmp_feat_before=tmp_feat_after;
248  }
249 
250  memcpy(result.features, tmp_feat_after,
251  sizeof(SGSparseVectorEntry<ST>)*tmp_len);
252 
253  SG_FREE(tmp_feat_after);
254  result.num_feat_entries=tmp_len ;
255  SG_DEBUG( "len: %d len2: %d\n", result.num_feat_entries, num_features);
256  }
257  result.vec_index=num;
258  return result ;
259  }
260 }
261 
262 template<class ST> ST CSparseFeatures<ST>::sparse_dot(ST alpha, SGSparseVectorEntry<ST>* avec, int32_t alen, SGSparseVectorEntry<ST>* bvec, int32_t blen)
263 {
264  ST result=0;
265 
266  //result remains zero when one of the vectors is non existent
267  if (avec && bvec)
268  {
269  if (alen<=blen)
270  {
271  int32_t j=0;
272  for (int32_t i=0; i<alen; i++)
273  {
274  int32_t a_feat_idx=avec[i].feat_index;
275 
276  while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
277  j++;
278 
279  if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
280  {
281  result+= avec[i].entry * bvec[j].entry;
282  j++;
283  }
284  }
285  }
286  else
287  {
288  int32_t j=0;
289  for (int32_t i=0; i<blen; i++)
290  {
291  int32_t b_feat_idx=bvec[i].feat_index;
292 
293  while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
294  j++;
295 
296  if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
297  {
298  result+= bvec[i].entry * avec[j].entry;
299  j++;
300  }
301  }
302  }
303 
304  result*=alpha;
305  }
306 
307  return result;
308 }
309 
310 template<class ST> ST CSparseFeatures<ST>::dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
311 {
312  ASSERT(vec);
313  ASSERT(dim==num_features);
314  ST result=b;
315 
316  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
317 
318  if (sv.features)
319  {
320  for (int32_t i=0; i<sv.num_feat_entries; i++)
321  {
322  result+=alpha*vec[sv.features[i].feat_index]
323  *sv.features[i].entry;
324  }
325  }
326 
327  free_sparse_feature_vector(sv, num);
328  return result;
329 }
330 
331 template<class ST> void CSparseFeatures<ST>::add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val)
332 {
333  ASSERT(vec);
334  if (dim!=num_features)
335  {
336  SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
337  dim, num_features);
338  }
339 
340  SGSparseVector<ST> sv=get_sparse_feature_vector(num);
341 
342  if (sv.features)
343  {
344  if (abs_val)
345  {
346  for (int32_t i=0; i<sv.num_feat_entries; i++)
347  {
348  vec[sv.features[i].feat_index]+=alpha
349  *CMath::abs(sv.features[i].entry);
350  }
351  }
352  else
353  {
354  for (int32_t i=0; i<sv.num_feat_entries; i++)
355  {
356  vec[sv.features[i].feat_index]+=alpha
357  *sv.features[i].entry;
358  }
359  }
360  }
361 
362  free_sparse_feature_vector(sv, num);
363 }
364 
366 {
367  if (feature_cache)
368  feature_cache->unlock_entry(subset_idx_conversion(num));
369 
370  vec.free_vector();
371 }
372 
373 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
374 {
375  if (m_subset)
376  SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");
377 
378  num_feat=num_features;
379  num_vec=num_vectors;
380 
381  return sparse_feature_matrix;
382 }
383 
385 {
386  if (m_subset)
387  SG_ERROR("get_sparse_feature_matrix() not allowed with subset\n");
388 
390  sm.sparse_matrix=get_sparse_feature_matrix(sm.num_features, sm.num_vectors);
391  return sm;
392 }
393 
394 template<class ST> void CSparseFeatures<ST>::clean_tsparse(SGSparseVector<ST>* sfm, int32_t num_vec)
395 {
396  if (sfm)
397  {
398  for (int32_t i=0; i<num_vec; i++)
399  SG_FREE(sfm[i].features);
400 
401  SG_FREE(sfm);
402  }
403 }
404 
406 {
407  int32_t num_feat;
408  int32_t num_vec;
409  SGSparseVector<ST>* s=get_transposed(num_feat, num_vec);
410  return new CSparseFeatures<ST>(s, num_feat, num_vec);
411 }
412 
413 template<class ST> SGSparseVector<ST>* CSparseFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
414 {
415  num_feat=get_num_vectors();
416  num_vec=num_features;
417 
418  int32_t* hist=SG_MALLOC(int32_t, num_features);
419  memset(hist, 0, sizeof(int32_t)*num_features);
420 
421  // count how lengths of future feature vectors
422  for (int32_t v=0; v<num_feat; v++)
423  {
424  SGSparseVector<ST> sv=get_sparse_feature_vector(v);
425 
426  for (int32_t i=0; i<sv.num_feat_entries; i++)
427  hist[sv.features[i].feat_index]++;
428 
429  free_sparse_feature_vector(sv, v);
430  }
431 
432  // allocate room for future feature vectors
434  for (int32_t v=0; v<num_vec; v++)
435  {
436  sfm[v].features= SG_MALLOC(SGSparseVectorEntry<ST>, hist[v]);
437  sfm[v].num_feat_entries=hist[v];
438  sfm[v].vec_index=v;
439  }
440 
441  // fill future feature vectors with content
442  memset(hist,0,sizeof(int32_t)*num_features);
443  for (int32_t v=0; v<num_feat; v++)
444  {
445  SGSparseVector<ST> sv=get_sparse_feature_vector(v);
446 
447  for (int32_t i=0; i<sv.num_feat_entries; i++)
448  {
449  int32_t vidx=sv.features[i].feat_index;
450  int32_t fidx=v;
451  sfm[vidx].features[hist[vidx]].feat_index=fidx;
452  sfm[vidx].features[hist[vidx]].entry=sv.features[i].entry;
453  hist[vidx]++;
454  }
455 
456  free_sparse_feature_vector(sv, v);
457  }
458 
459  SG_FREE(hist);
460  return sfm;
461 }
462 
464 {
465  if (m_subset)
466  SG_ERROR("set_sparse_feature_matrix() not allowed with subset\n");
467 
468 
469  free_sparse_feature_matrix();
470  sm.own_matrix();
471 
472  sparse_feature_matrix=sm.sparse_matrix;
473  num_features=sm.num_features;
474  num_vectors=sm.num_vectors;
475 }
476 
478 {
479  SGMatrix<ST> full;
480 
481  SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
482  full.num_rows=num_features;
483  full.num_cols=get_num_vectors();
484  full.do_free=true;
485  full.matrix=SG_MALLOC(ST, int64_t(num_features)*get_num_vectors());
486 
487  memset(full.matrix, 0, size_t(num_features)*size_t(get_num_vectors())*sizeof(ST));
488 
489  for (int32_t v=0; v<full.num_cols; v++)
490  {
491  SGSparseVector<ST> current=
492  sparse_feature_matrix[subset_idx_conversion(v)];
493 
494  for (int32_t f=0; f<current.num_feat_entries; f++)
495  {
496  int64_t offs=(current.vec_index*num_features)
497  +current.features[f].feat_index;
498 
499  full.matrix[offs]=current.features[f].entry;
500  }
501  }
502 
503  return full;
504 }
505 
507 {
508  remove_subset();
509 
510  ST* src=full.matrix;
511  int32_t num_feat=full.num_rows;
512  int32_t num_vec=full.num_cols;
513 
514  free_sparse_feature_matrix();
515  bool result=true;
516  num_features=num_feat;
517  num_vectors=num_vec;
518 
519  SG_INFO("converting dense feature matrix to sparse one\n");
520  int32_t* num_feat_entries=SG_MALLOC(int, num_vectors);
521 
522  if (num_feat_entries)
523  {
524  int64_t num_total_entries=0;
525 
526  // count nr of non sparse features
527  for (int32_t i=0; i< num_vec; i++)
528  {
529  num_feat_entries[i]=0;
530  for (int32_t j=0; j< num_feat; j++)
531  {
532  if (src[i*((int64_t) num_feat) + j] != 0)
533  num_feat_entries[i]++;
534  }
535  }
536 
537  if (num_vec>0)
538  {
539  sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vec);
540 
541  if (sparse_feature_matrix)
542  {
543  for (int32_t i=0; i< num_vec; i++)
544  {
545  sparse_feature_matrix[i].vec_index=i;
546  sparse_feature_matrix[i].num_feat_entries=0;
547  sparse_feature_matrix[i].features= NULL;
548 
549  if (num_feat_entries[i]>0)
550  {
551  sparse_feature_matrix[i].features= SG_MALLOC(SGSparseVectorEntry<ST>, num_feat_entries[i]);
552 
553  if (!sparse_feature_matrix[i].features)
554  {
555  SG_INFO( "allocation of features failed\n");
556  return false;
557  }
558 
559  sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
560  int32_t sparse_feat_idx=0;
561 
562  for (int32_t j=0; j< num_feat; j++)
563  {
564  int64_t pos= i*num_feat + j;
565 
566  if (src[pos] != 0)
567  {
568  sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
569  sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
570  sparse_feat_idx++;
571  num_total_entries++;
572  }
573  }
574  }
575  }
576  }
577  else
578  {
579  SG_ERROR( "allocation of sparse feature matrix failed\n");
580  result=false;
581  }
582 
583  SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
584  num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec));
585  }
586  else
587  {
588  SG_ERROR( "huh ? zero size matrix given ?\n");
589  result=false;
590  }
591  }
592  SG_FREE(num_feat_entries);
593  return result;
594 }
595 
596 template<class ST> bool CSparseFeatures<ST>::apply_preprocessor(bool force_preprocessing)
597 {
598  SG_INFO( "force: %d\n", force_preprocessing);
599 
600  if ( sparse_feature_matrix && get_num_preprocessors() )
601  {
602  for (int32_t i=0; i<get_num_preprocessors(); i++)
603  {
604  if ( (!is_preprocessed(i) || force_preprocessing) )
605  {
606  set_preprocessed(i);
607  SG_INFO( "preprocessing using preproc %s\n", get_preprocessor(i)->get_name());
608  if (((CSparsePreprocessor<ST>*) get_preprocessor(i))->apply_to_sparse_feature_matrix(this) == NULL)
609  return false;
610  }
611  return true;
612  }
613  return true;
614  }
615  else
616  {
617  SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
618  return false;
619  }
620 }
621 
/* Size in bytes of a single feature element of type ST. */
622 template<class ST> int32_t CSparseFeatures<ST>::get_size()
623 {
624  return sizeof(ST);
625 }
626 
628 {
630  ASSERT(fm.matrix && fm.num_cols>0 && fm.num_rows>0);
631 
632  return set_full_feature_matrix(fm);
633 }
634 
635 template<class ST> int32_t CSparseFeatures<ST>::get_num_vectors() const
636 {
637  return m_subset ? m_subset->get_size() : num_vectors;
638 }
639 
/* Number of feature dimensions (subset-independent). */
640 template<class ST> int32_t CSparseFeatures<ST>::get_num_features()
641 {
642  return num_features;
643 }
644 
645 template<class ST> int32_t CSparseFeatures<ST>::set_num_features(int32_t num)
646 {
647  int32_t n=num_features;
648  ASSERT(n<=num);
649  num_features=num;
650  return num_features;
651 }
652 
654 {
655  return C_SPARSE;
656 }
657 
/* Release a feature vector: unlock its cache entry (if caching) and free
 * the vector's storage when it owns it (vec.do_free). */
658 template<class ST> void CSparseFeatures<ST>::free_feature_vector(SGSparseVector<ST> vec, int32_t num)
659 {
660  if (feature_cache)
661  feature_cache->unlock_entry(subset_idx_conversion(num));
662 
663  vec.free_vector();
664 }
665 
667 {
668  int64_t num=0;
669  index_t num_vec=get_num_vectors();
670  for (int32_t i=0; i<num_vec; i++)
671  num+=sparse_feature_matrix[subset_idx_conversion(i)].num_feat_entries;
672 
673  return num;
674 }
675 
677 {
678  ASSERT(sq);
679 
680  index_t num_vec=get_num_vectors();
681  for (int32_t i=0; i<num_vec; i++)
682  {
683  sq[i]=0;
684  SGSparseVector<ST> vec=get_sparse_feature_vector(i);
685 
686  for (int32_t j=0; j<vec.num_feat_entries; j++)
687  sq[i]+=vec.features[j].entry*vec.features[j].entry;
688 
689  free_feature_vector(vec, i);
690  }
691 
692  return sq;
693 }
694 
696  CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a,
697  CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
698 {
699  int32_t i,j;
700  ASSERT(lhs);
701  ASSERT(rhs);
702 
705  ASSERT(avec.features);
706  ASSERT(bvec.features);
707 
708  float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];
709 
710  if (avec.num_feat_entries<=bvec.num_feat_entries)
711  {
712  j=0;
713  for (i=0; i<avec.num_feat_entries; i++)
714  {
715  int32_t a_feat_idx=avec.features[i].feat_index;
716 
717  while ((j<bvec.num_feat_entries)
718  &&(bvec.features[j].feat_index<a_feat_idx))
719  j++;
720 
721  if ((j<bvec.num_feat_entries)
722  &&(bvec.features[j].feat_index==a_feat_idx))
723  {
724  result-=2*(avec.features[i].entry*bvec.features[j].entry);
725  j++;
726  }
727  }
728  }
729  else
730  {
731  j=0;
732  for (i=0; i<bvec.num_feat_entries; i++)
733  {
734  int32_t b_feat_idx=bvec.features[i].feat_index;
735 
736  while ((j<avec.num_feat_entries)
737  &&(avec.features[j].feat_index<b_feat_idx))
738  j++;
739 
740  if ((j<avec.num_feat_entries)
741  &&(avec.features[j].feat_index==b_feat_idx))
742  {
743  result-=2*(bvec.features[i].entry*avec.features[j].entry);
744  j++;
745  }
746  }
747  }
748 
749  ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a);
750  ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b);
751 
752  return CMath::abs(result);
753 }
754 
755 template<class ST> CLabels* CSparseFeatures<ST>::load_svmlight_file(char* fname,
756  bool do_sort_features)
757 {
758  remove_subset();
759 
760  CLabels* lab=NULL;
761 
762  size_t blocksize=1024*1024;
763  size_t required_blocksize=blocksize;
764  uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
765  FILE* f=fopen(fname, "ro");
766 
767  if (f)
768  {
769  free_sparse_feature_matrix();
770  num_vectors=0;
771  num_features=0;
772 
773  SG_INFO("counting line numbers in file %s\n", fname);
774  size_t sz=blocksize;
775  size_t block_offs=0;
776  size_t old_block_offs=0;
777  fseek(f, 0, SEEK_END);
778  size_t fsize=ftell(f);
779  rewind(f);
780 
781  while (sz == blocksize)
782  {
783  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
784  for (size_t i=0; i<sz; i++)
785  {
786  block_offs++;
787  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
788  {
789  num_vectors++;
790  required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
791  old_block_offs=block_offs;
792  }
793  }
794  SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
795  }
796 
797  SG_INFO("found %d feature vectors\n", num_vectors);
798  SG_FREE(dummy);
799  blocksize=required_blocksize;
800  dummy = SG_MALLOC(uint8_t, blocksize+1); //allow setting of '\0' at EOL
801 
802  lab=new CLabels(num_vectors);
803  sparse_feature_matrix=SG_MALLOC(SGSparseVector<ST>, num_vectors);
804 
805  rewind(f);
806  sz=blocksize;
807  int32_t lines=0;
808  while (sz == blocksize)
809  {
810  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
811 
812  size_t old_sz=0;
813  for (size_t i=0; i<sz; i++)
814  {
815  if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
816  {
817  size_t len=i-old_sz+1;
818  uint8_t* data=&dummy[old_sz];
819 
820  for (size_t j=0; j<len; j++)
821  dummy[j]=data[j];
822 
823  sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
824  i=0;
825  old_sz=0;
826  sz+=len;
827  }
828 
829  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
830  {
831 
832  size_t len=i-old_sz;
833  uint8_t* data=&dummy[old_sz];
834 
835  int32_t dims=0;
836  for (size_t j=0; j<len; j++)
837  {
838  if (data[j]==':')
839  dims++;
840  }
841 
842  if (dims<=0)
843  {
844  SG_ERROR("Error in line %d - number of"
845  " dimensions is %d line is %d characters"
846  " long\n line_content:'%.*s'\n", lines,
847  dims, len, len, (const char*) data);
848  }
849 
851  size_t j=0;
852  for (; j<len; j++)
853  {
854  if (data[j]==' ')
855  {
856  data[j]='\0';
857 
858  lab->set_label(lines, atof((const char*) data));
859  break;
860  }
861  }
862 
863  int32_t d=0;
864  j++;
865  uint8_t* start=&data[j];
866  for (; j<len; j++)
867  {
868  if (data[j]==':')
869  {
870  data[j]='\0';
871 
872  feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
873  num_features=CMath::max(num_features, feat[d].feat_index+1);
874 
875  j++;
876  start=&data[j];
877  for (; j<len; j++)
878  {
879  if (data[j]==' ' || data[j]=='\n')
880  {
881  data[j]='\0';
882  feat[d].entry=(ST) atof((const char*) start);
883  d++;
884  break;
885  }
886  }
887 
888  if (j==len)
889  {
890  data[j]='\0';
891  feat[dims-1].entry=(ST) atof((const char*) start);
892  }
893 
894  j++;
895  start=&data[j];
896  }
897  }
898 
899  sparse_feature_matrix[lines].vec_index=lines;
900  sparse_feature_matrix[lines].num_feat_entries=dims;
901  sparse_feature_matrix[lines].features=feat;
902 
903  old_sz=i+1;
904  lines++;
905  SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
906  }
907  }
908  }
909  SG_INFO("file successfully read\n");
910  fclose(f);
911  }
912 
913  SG_FREE(dummy);
914 
915  if (do_sort_features)
916  sort_features();
917 
918  return lab;
919 }
920 
921 template<class ST> void CSparseFeatures<ST>::sort_features()
922 {
923  if (m_subset)
924  SG_ERROR("sort_features() not allowed with subset\n");
925 
926  ASSERT(get_num_preprocessors()==0);
927 
928  if (!sparse_feature_matrix)
929  SG_ERROR("Requires sparse feature matrix to be available in-memory\n");
930 
931  for (int32_t i=0; i<num_vectors; i++)
932  {
933  int32_t len=sparse_feature_matrix[i].num_feat_entries;
934 
935  if (!len)
936  continue;
937 
938  SGSparseVectorEntry<ST>* sf_orig=sparse_feature_matrix[i].features;
939  int32_t* feat_idx=SG_MALLOC(int32_t, len);
940  int32_t* orig_idx=SG_MALLOC(int32_t, len);
941 
942  for (int j=0; j<len; j++)
943  {
944  feat_idx[j]=sf_orig[j].feat_index;
945  orig_idx[j]=j;
946  }
947 
948  CMath::qsort_index(feat_idx, orig_idx, len);
949 
951  for (int j=0; j<len; j++)
952  sf_new[j]=sf_orig[orig_idx[j]];
953 
954  sparse_feature_matrix[i].features=sf_new;
955 
956  // sanity check
957  for (int j=0; j<len-1; j++)
958  ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);
959 
960  SG_FREE(orig_idx);
961  SG_FREE(feat_idx);
962  SG_FREE(sf_orig);
963  }
964 }
965 
966 template<class ST> bool CSparseFeatures<ST>::write_svmlight_file(char* fname,
967  CLabels* label)
968 {
969  if (m_subset)
970  SG_ERROR("write_svmlight_file() not allowed with subset\n");
971 
972  ASSERT(label);
973  int32_t num=label->get_num_labels();
974  ASSERT(num>0);
975  ASSERT(num==num_vectors);
976 
977  FILE* f=fopen(fname, "wb");
978 
979  if (f)
980  {
981  for (int32_t i=0; i<num; i++)
982  {
983  fprintf(f, "%d ", (int32_t) label->get_int_label(i));
984 
985  SGSparseVectorEntry<ST>* vec = sparse_feature_matrix[i].features;
986  int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
987 
988  for (int32_t j=0; j<num_feat; j++)
989  {
990  if (j<num_feat-1)
991  fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
992  else
993  fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
994  }
995  }
996 
997  fclose(f);
998  return true;
999  }
1000  return false;
1001 }
1002 
/* Dimensionality of the (dense) feature space spanned by these features. */
1003 template<class ST> int32_t CSparseFeatures<ST>::get_dim_feature_space() const
1004 {
1005  return num_features;
1006 }
1007 
1008 template<class ST> float64_t CSparseFeatures<ST>::dot(int32_t vec_idx1,
1009  CDotFeatures* df, int32_t vec_idx2)
1010 {
1011  ASSERT(df);
1012  ASSERT(df->get_feature_type() == get_feature_type());
1013  ASSERT(df->get_feature_class() == get_feature_class());
1015 
1016  SGSparseVector<ST> avec=get_sparse_feature_vector(vec_idx1);
1017  SGSparseVector<ST> bvec=sf->get_sparse_feature_vector(vec_idx2);
1018 
1019  float64_t result=sparse_dot(1, avec.features, avec.num_feat_entries,
1020  bvec.features, bvec.num_feat_entries);
1021 
1022  free_sparse_feature_vector(avec, vec_idx1);
1023  sf->free_sparse_feature_vector(bvec, vec_idx2);
1024 
1025  return result;
1026 }
1027 template<class ST> float64_t CSparseFeatures<ST>::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
1028 {
1029  ASSERT(vec2);
1030  if (vec2_len!=num_features)
1031  {
1032  SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
1033  vec2_len, num_features);
1034  }
1035  float64_t result=0;
1036 
1037  SGSparseVector<ST> sv=get_sparse_feature_vector(vec_idx1);
1038 
1039  if (sv.features)
1040  {
1041  for (int32_t i=0; i<sv.num_feat_entries; i++)
1042  result+=vec2[sv.features[i].feat_index]*sv.features[i].entry;
1043  }
1044 
1045  free_sparse_feature_vector(sv, vec_idx1);
1046 
1047  return result;
1048 }
1049 
1050 template<class ST> void* CSparseFeatures<ST>::get_feature_iterator(int32_t vector_index)
1051 {
1052  if (vector_index>=get_num_vectors())
1053  {
1054  SG_ERROR("Index out of bounds (number of vectors %d, you "
1055  "requested %d)\n", get_num_vectors(), vector_index);
1056  }
1057 
1058  if (!sparse_feature_matrix)
1059  SG_ERROR("Requires a in-memory feature matrix\n");
1060 
1061  sparse_feature_iterator* it=SG_MALLOC(sparse_feature_iterator, 1);
1062  it->sv=get_sparse_feature_vector(vector_index);
1063  it->index=0;
1064 
1065  return it;
1066 }
1067 
1068 template<class ST> bool CSparseFeatures<ST>::get_next_feature(int32_t& index, float64_t& value, void* iterator)
1069 {
1070  sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
1071  if (!it || it->index>=it->sv.num_feat_entries)
1072  return false;
1073 
1074  int32_t i=it->index++;
1075 
1076  index=it->sv.features[i].feat_index;
1077  value=(float64_t) it->sv.features[i].entry;
1078 
1079  return true;
1080 }
1081 
1082 template<class ST> void CSparseFeatures<ST>::free_feature_iterator(void* iterator)
1083 {
1084  if (!iterator)
1085  return;
1086 
1087  sparse_feature_iterator* it=(sparse_feature_iterator*) iterator;
1088  free_sparse_feature_vector(it->sv, it->sv.vec_index);
1089  SG_FREE(it);
1090 }
1091 
1093 {
1094  SGSparseMatrix<ST> matrix_copy=SGSparseMatrix<ST>(indices.vlen,
1095  get_dim_feature_space());
1096 
1097  for (index_t i=0; i<indices.vlen; ++i)
1098  {
1099  /* index to copy */
1100  index_t index=indices.vector[i];
1101 
1102  /* copy sparse vector TODO THINK ABOUT VECTOR INDEX (i or vec.index*/
1103  SGSparseVector<ST> current=get_sparse_feature_vector(index);
1104  matrix_copy.sparse_matrix[i]=SGSparseVector<ST>(
1105  current.num_feat_entries, current.vec_index);
1106 
1107  /* copy entries */
1108  memcpy(matrix_copy.sparse_matrix[i].features, current.features,
1109  sizeof(SGSparseVectorEntry<ST>)*current.num_feat_entries);
1110 
1111  free_sparse_feature_vector(current, index);
1112  }
1113 
1114  return new CSparseFeatures<ST>(matrix_copy);
1115 }
1116 
1118  int32_t& len, SGSparseVectorEntry<ST>* target)
1119 {
1121 
1122  len=0;
1123  return NULL;
1124 }
1125 
/* Register the template type and the serializable parameters
 * (sparse matrix + feature count) with the parameter framework. */
1126 template<class ST> void CSparseFeatures<ST>::init()
1127 {
1128  set_generic<ST>();
1129 
1130  m_parameters->add_vector(&sparse_feature_matrix, &num_vectors,
1131  "sparse_feature_matrix",
1132  "Array of sparse vectors.");
1133  m_parameters->add(&num_features, "num_features",
1134  "Total number of features.");
1135 }
1136 
1137 #define GET_FEATURE_TYPE(sg_type, f_type) \
1138 template<> EFeatureType CSparseFeatures<sg_type>::get_feature_type() \
1139 { \
1140  return f_type; \
1141 }
1142 GET_FEATURE_TYPE(bool, F_BOOL)
1143 GET_FEATURE_TYPE(char, F_CHAR)
1144 GET_FEATURE_TYPE(uint8_t, F_BYTE)
1145 GET_FEATURE_TYPE(int8_t, F_BYTE)
1146 GET_FEATURE_TYPE(int16_t, F_SHORT)
1147 GET_FEATURE_TYPE(uint16_t, F_WORD)
1148 GET_FEATURE_TYPE(int32_t, F_INT)
1149 GET_FEATURE_TYPE(uint32_t, F_UINT)
1150 GET_FEATURE_TYPE(int64_t, F_LONG)
1151 GET_FEATURE_TYPE(uint64_t, F_ULONG)
1155 #undef GET_FEATURE_TYPE
1156 
/* load() specializations: each reads a sparse matrix from CFile via the
 * type-specific reader named by `fname`, then installs it (taking
 * ownership) through set_sparse_feature_matrix(). Locale is pinned to "C"
 * around the read so numeric parsing is consistent. */
1157 #define LOAD(fname, sg_type) \
1158 template<> void CSparseFeatures<sg_type>::load(CFile* loader) \
1159 { \
1160  remove_subset(); \
1161  SG_SET_LOCALE_C; \
1162  ASSERT(loader); \
1163  SGSparseVector<sg_type>* matrix=NULL; \
1164  int32_t num_feat=0; \
1165  int32_t num_vec=0; \
1166  loader->fname(matrix, num_feat, num_vec); \
1167  set_sparse_feature_matrix(SGSparseMatrix<sg_type>(matrix, num_feat, num_vec)); \
1168  SG_RESET_LOCALE; \
1169 }
1170 LOAD(get_sparse_matrix, bool)
1171 LOAD(get_sparse_matrix, char)
1172 LOAD(get_sparse_matrix, uint8_t)
1173 LOAD(get_int8_sparsematrix, int8_t)
1174 LOAD(get_sparse_matrix, int16_t)
1175 LOAD(get_sparse_matrix, uint16_t)
1176 LOAD(get_sparse_matrix, int32_t)
1177 LOAD(get_uint_sparsematrix, uint32_t)
1178 LOAD(get_long_sparsematrix, int64_t)
1179 LOAD(get_ulong_sparsematrix, uint64_t)
1180 LOAD(get_sparse_matrix, float32_t)
1181 LOAD(get_sparse_matrix, float64_t)
1182 LOAD(get_longreal_sparsematrix, floatmax_t)
1183 #undef LOAD
1184 
/* save() specializations: each writes the in-memory sparse matrix through
 * the type-specific CFile writer named by `fname`. Saving is refused while
 * a subset is active; locale is pinned to "C" around the write. */
1185 #define WRITE(fname, sg_type) \
1186 template<> void CSparseFeatures<sg_type>::save(CFile* writer) \
1187 { \
1188  if (m_subset) \
1189  SG_ERROR("save() not allowed with subset\n"); \
1190  SG_SET_LOCALE_C; \
1191  ASSERT(writer); \
1192  writer->fname(sparse_feature_matrix, num_features, num_vectors); \
1193  SG_RESET_LOCALE; \
1194 }
1195 WRITE(set_sparse_matrix, bool)
1196 WRITE(set_sparse_matrix, char)
1197 WRITE(set_sparse_matrix, uint8_t)
1198 WRITE(set_int8_sparsematrix, int8_t)
1199 WRITE(set_sparse_matrix, int16_t)
1200 WRITE(set_sparse_matrix, uint16_t)
1201 WRITE(set_sparse_matrix, int32_t)
1202 WRITE(set_uint_sparsematrix, uint32_t)
1203 WRITE(set_long_sparsematrix, int64_t)
1204 WRITE(set_ulong_sparsematrix, uint64_t)
1205 WRITE(set_sparse_matrix, float32_t)
1206 WRITE(set_sparse_matrix, float64_t)
1207 WRITE(set_longreal_sparsematrix, floatmax_t)
1208 #undef WRITE
1209 
/* Explicit template instantiations for every supported element type. */
1210 template class CSparseFeatures<bool>;
1211 template class CSparseFeatures<char>;
1212 template class CSparseFeatures<int8_t>;
1213 template class CSparseFeatures<uint8_t>;
1214 template class CSparseFeatures<int16_t>;
1215 template class CSparseFeatures<uint16_t>;
1216 template class CSparseFeatures<int32_t>;
1217 template class CSparseFeatures<uint32_t>;
1218 template class CSparseFeatures<int64_t>;
1219 template class CSparseFeatures<uint64_t>;
1220 template class CSparseFeatures<float32_t>;
1221 template class CSparseFeatures<float64_t>;
1222 template class CSparseFeatures<floatmax_t>;
1223 } // namespace shogun

SHOGUN Machine Learning Toolbox - Documentation