SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
StringFeatures.cpp
Go to the documentation of this file.
5 #include <shogun/io/SGIO.h>
8 
9 #include <sys/types.h>
10 #include <sys/stat.h>
11 #include <dirent.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <unistd.h>
15 
16 
17 namespace shogun
18 {
19 
21 {
22  init();
23  alphabet=new CAlphabet();
24 }
25 
27 {
28  init();
29 
30  alphabet=new CAlphabet(alpha);
34 }
35 
36 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, EAlphabet alpha)
37 : CFeatures(0)
38 {
39  init();
40 
41  alphabet=new CAlphabet(alpha);
45  set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
46 }
47 
48 template<class ST> CStringFeatures<ST>::CStringFeatures(SGStringList<ST> string_list, CAlphabet* alpha)
49 : CFeatures(0)
50 {
51  init();
52 
53  alphabet=new CAlphabet(alpha);
57  set_features(string_list.strings, string_list.num_strings, string_list.max_string_length);
58 }
59 
61 : CFeatures(0)
62 {
63  init();
64 
65  ASSERT(alpha);
66  SG_REF(alpha);
67  alphabet=alpha;
70 }
71 
72 template<class ST> CStringFeatures<ST>::CStringFeatures(const CStringFeatures & orig)
73 : CFeatures(orig), num_vectors(orig.num_vectors),
74  single_string(orig.single_string),
75  length_of_single_string(orig.length_of_single_string),
76  max_string_length(orig.max_string_length),
77  num_symbols(orig.num_symbols),
78  original_num_symbols(orig.original_num_symbols),
79  order(orig.order), preprocess_on_get(false),
80  feature_cache(NULL)
81 {
82  init();
83 
84  ASSERT(orig.single_string == NULL); //not implemented
85 
86  alphabet=orig.alphabet;
88 
89  if (orig.features)
90  {
92 
93  for (int32_t i=0; i<num_vectors; i++)
94  {
95  features[i].string=SG_MALLOC(ST, orig.features[i].slen);
96  features[i].slen=orig.features[i].slen;
97  memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].slen);
98  }
99  }
100 
101  if (orig.symbol_mask_table)
102  {
103  symbol_mask_table=SG_MALLOC(ST, 256);
104  for (int32_t i=0; i<256; i++)
106  }
107 
108  m_subset=orig.m_subset->duplicate();
109 }
110 
111 template<class ST> CStringFeatures<ST>::CStringFeatures(CFile* loader, EAlphabet alpha)
112 : CFeatures(loader), num_vectors(0),
113  features(NULL), single_string(NULL), length_of_single_string(0),
114  max_string_length(0), order(0),
115  symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
116 {
117  init();
118 
119  alphabet=new CAlphabet(alpha);
120  SG_REF(alphabet);
123  load(loader);
124 }
125 
127 {
128  cleanup();
129 
130  SG_UNREF(alphabet);
131 }
132 
133 template<class ST> void CStringFeatures<ST>::cleanup()
134 {
135  remove_subset();
136 
137  if (single_string)
138  {
139  SG_FREE(single_string);
140  single_string=NULL;
141  }
142  else
143  cleanup_feature_vectors(0, num_vectors-1);
144 
145  num_vectors=0;
146  SG_FREE(features);
147  SG_FREE(symbol_mask_table);
148  features=NULL;
149  symbol_mask_table=NULL;
150 
151  /* start with a fresh alphabet, but instead of emptying the histogram
152  * create a new object (to leave the alphabet object alone if it is used
153  * by others)
154  */
155  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
156  SG_UNREF(alphabet);
157  alphabet=alpha;
158  SG_REF(alphabet);
159 }
160 
161 template<class ST> void CStringFeatures<ST>::cleanup_feature_vector(int32_t num)
162 {
163  ASSERT(num<get_num_vectors());
164 
165  if (features)
166  {
167  int32_t real_num=subset_idx_conversion(num);
168  SG_FREE(features[real_num].string);
169  features[real_num].string=NULL;
170  features[real_num].slen=0;
171 
172  determine_maximum_string_length();
173  }
174 }
175 
176 template<class ST> void CStringFeatures<ST>::cleanup_feature_vectors(int32_t start, int32_t stop)
177 {
178  if (features && get_num_vectors())
179  {
180  ASSERT(start<get_num_vectors());
181  ASSERT(stop<get_num_vectors());
182 
183  for (int32_t i=start; i<=stop; i++)
184  {
185  int32_t real_num=subset_idx_conversion(i);
186  SG_FREE(features[real_num].string);
187  features[real_num].string=NULL;
188  features[real_num].slen=0;
189  }
190  determine_maximum_string_length();
191  }
192 }
193 
195 
197 
199 {
200  SG_REF(alphabet);
201  return alphabet;
202 }
203 
204 template<class ST> CFeatures* CStringFeatures<ST>::duplicate() const
205 {
206  return new CStringFeatures<ST>(*this);
207 }
208 
210 {
211  ASSERT(features);
212  if (num>=get_num_vectors())
213  {
214  SG_ERROR("Index out of bounds (number of strings %d, you "
215  "requested %d)\n", get_num_vectors(), num);
216  }
217 
218  int32_t l;
219  bool free_vec;
220  ST* vec=get_feature_vector(num, l, free_vec);
221  ST* dst=SG_MALLOC(ST, l);
222  memcpy(dst, vec, l*sizeof(ST));
223  free_feature_vector(vec, num, free_vec);
224  return SGVector<ST>(dst, l);
225 }
226 
227 template<class ST> void CStringFeatures<ST>::set_feature_vector(SGVector<ST> vector, int32_t num)
228 {
229  ASSERT(features);
230 
231  if (m_subset)
232  SG_ERROR("A subset is set, cannot set feature vector\n");
233 
234  if (num>=num_vectors)
235  {
236  SG_ERROR("Index out of bounds (number of strings %d, you "
237  "requested %d)\n", num_vectors, num);
238  }
239 
240  if (vector.vlen<=0)
241  SG_ERROR("String has zero or negative length\n");
242 
243  cleanup_feature_vector(num);
244  features[num].slen=vector.vlen;
245  features[num].string=SG_MALLOC(ST, vector.vlen);
246  memcpy(features[num].string, vector.vector, vector.vlen*sizeof(ST));
247 
248  determine_maximum_string_length();
249 }
250 
252 {
253  preprocess_on_get=true;
254 }
255 
257 {
258  preprocess_on_get=false;
259 }
260 
261 template<class ST> ST* CStringFeatures<ST>::get_feature_vector(int32_t num, int32_t& len, bool& dofree)
262 {
263  ASSERT(features);
264  ASSERT(num<get_num_vectors());
265 
266 
267  int32_t real_num=subset_idx_conversion(num);
268 
269  if (!preprocess_on_get)
270  {
271  dofree=false;
272  len=features[real_num].slen;
273  return features[real_num].string;
274  }
275  else
276  {
277  SG_DEBUG( "computing feature vector!\n") ;
278  ST* feat=compute_feature_vector(num, len);
279  dofree=true;
280 
281  if (get_num_preprocessors())
282  {
283  ST* tmp_feat_before=feat;
284 
285  for (int32_t i=0; i<get_num_preprocessors(); i++)
286  {
287  CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
288  feat=p->apply_to_string(tmp_feat_before, len);
289  SG_UNREF(p);
290  SG_FREE(tmp_feat_before);
291  tmp_feat_before=feat;
292  }
293  }
294  // TODO: implement caching
295  return feat;
296  }
297 }
298 
300 {
301  int32_t num_feat;
302  int32_t num_vec;
303  SGString<ST>* s=get_transposed(num_feat, num_vec);
304  SGStringList<ST> string_list;
305  string_list.strings = s;
306  string_list.num_strings = num_vec;
307  string_list.max_string_length = num_feat;
308 
309  return new CStringFeatures<ST>(string_list, alphabet);
310 }
311 
312 template<class ST> SGString<ST>* CStringFeatures<ST>::get_transposed(int32_t &num_feat, int32_t &num_vec)
313 {
314  num_feat=get_num_vectors();
315  num_vec=get_max_vector_length();
316  ASSERT(have_same_length());
317 
318  SG_DEBUG("Allocating memory for transposed string features of size %ld\n",
319  int64_t(num_feat)*num_vec);
320 
321  SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
322 
323  for (int32_t i=0; i<num_vec; i++)
324  {
325  sf[i].string=SG_MALLOC(ST, num_feat);
326  sf[i].slen=num_feat;
327  }
328 
329  for (int32_t i=0; i<num_feat; i++)
330  {
331  int32_t len=0;
332  bool free_vec=false;
333  ST* vec=get_feature_vector(i, len, free_vec);
334 
335  for (int32_t j=0; j<num_vec; j++)
336  sf[j].string[i]=vec[j];
337 
338  free_feature_vector(vec, i, free_vec);
339  }
340  return sf;
341 }
342 
343 template<class ST> void CStringFeatures<ST>::free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
344 {
345  if (num>=get_num_vectors())
346  {
347  SG_ERROR(
348  "Trying to access string[%d] but num_str=%d\n", num,
349  get_num_vectors());
350  }
351 
352  int32_t real_num=subset_idx_conversion(num);
353 
354  if (feature_cache)
355  feature_cache->unlock_entry(real_num);
356 
357  if (dofree)
358  SG_FREE(feat_vec);
359 }
360 
361 template<class ST> void CStringFeatures<ST>::free_feature_vector(SGVector<ST> feat_vec, int32_t num)
362 {
363  if (num>=get_num_vectors())
364  {
365  SG_ERROR(
366  "Trying to access string[%d] but num_str=%d\n", num,
367  get_num_vectors());
368  }
369 
370  int32_t real_num=subset_idx_conversion(num);
371 
372  if (feature_cache)
373  feature_cache->unlock_entry(real_num);
374 
375  if (feat_vec.do_free)
376  SG_FREE(feat_vec.vector);
377 }
378 
379 template<class ST> ST CStringFeatures<ST>::get_feature(int32_t vec_num, int32_t feat_num)
380 {
381  ASSERT(vec_num<get_num_vectors());
382 
383  int32_t len;
384  bool free_vec;
385  ST* vec=get_feature_vector(vec_num, len, free_vec);
386  ASSERT(feat_num<len);
387  ST result=vec[feat_num];
388  free_feature_vector(vec, vec_num, free_vec);
389 
390  return result;
391 }
392 
393 template<class ST> int32_t CStringFeatures<ST>::get_vector_length(int32_t vec_num)
394 {
395  ASSERT(vec_num<get_num_vectors());
396 
397  int32_t len;
398  bool free_vec;
399  ST* vec=get_feature_vector(vec_num, len, free_vec);
400  free_feature_vector(vec, vec_num, free_vec);
401  return len;
402 }
403 
404 template<class ST> int32_t CStringFeatures<ST>::get_max_vector_length()
405 {
406  return max_string_length;
407 }
408 
409 template<class ST> int32_t CStringFeatures<ST>::get_num_vectors() const
410 {
411  return m_subset ? m_subset->get_size() : num_vectors;
412 }
413 
414 template<class ST> floatmax_t CStringFeatures<ST>::get_num_symbols() { return num_symbols; }
415 
416 template<class ST> floatmax_t CStringFeatures<ST>::get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
417 
418 template<class ST> floatmax_t CStringFeatures<ST>::get_original_num_symbols() { return original_num_symbols; }
419 
420 template<class ST> int32_t CStringFeatures<ST>::get_order() { return order; }
421 
422 template<class ST> ST CStringFeatures<ST>::get_masked_symbols(ST symbol, uint8_t mask)
423 {
424  ASSERT(symbol_mask_table);
425  return symbol_mask_table[mask] & symbol;
426 }
427 
428 template<class ST> ST CStringFeatures<ST>::shift_offset(ST offset, int32_t amount)
429 {
430  ASSERT(alphabet);
431  return (offset << (amount*alphabet->get_num_bits()));
432 }
433 
434 template<class ST> ST CStringFeatures<ST>::shift_symbol(ST symbol, int32_t amount)
435 {
436  ASSERT(alphabet);
437  return (symbol >> (amount*alphabet->get_num_bits()));
438 }
439 
440 template<class ST> void CStringFeatures<ST>::load_ascii_file(char* fname, bool remap_to_bin,
441  EAlphabet ascii_alphabet, EAlphabet binary_alphabet)
442 {
443  remove_subset();
444 
445  size_t blocksize=1024*1024;
446  size_t required_blocksize=0;
447  uint8_t* dummy=SG_MALLOC(uint8_t, blocksize);
448  uint8_t* overflow=NULL;
449  int32_t overflow_len=0;
450 
451  cleanup();
452 
453  CAlphabet* alpha=new CAlphabet(ascii_alphabet);
454  CAlphabet* alpha_bin=new CAlphabet(binary_alphabet);
455 
456  FILE* f=fopen(fname, "ro");
457 
458  if (f)
459  {
460  num_vectors=0;
461  max_string_length=0;
462 
463  SG_INFO("counting line numbers in file %s\n", fname);
464  size_t block_offs=0;
465  size_t old_block_offs=0;
466  fseek(f, 0, SEEK_END);
467  size_t fsize=ftell(f);
468  rewind(f);
469 
470  if (blocksize>fsize)
471  blocksize=fsize;
472 
473  SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
474 
475  size_t sz=blocksize;
476  while (sz == blocksize)
477  {
478  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
479  for (size_t i=0; i<sz; i++)
480  {
481  block_offs++;
482  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
483  {
484  num_vectors++;
485  required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
486  old_block_offs=block_offs;
487  }
488  }
489  SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
490  }
491 
492  SG_INFO("found %d strings\n", num_vectors);
493  SG_FREE(dummy);
494  blocksize=required_blocksize;
495  dummy=SG_MALLOC(uint8_t, blocksize);
496  overflow=SG_MALLOC(uint8_t, blocksize);
497  features=SG_MALLOC(SGString<ST>, num_vectors);
498 
499  rewind(f);
500  sz=blocksize;
501  int32_t lines=0;
502  while (sz == blocksize)
503  {
504  sz=fread(dummy, sizeof(uint8_t), blocksize, f);
505 
506  size_t old_sz=0;
507  for (size_t i=0; i<sz; i++)
508  {
509  if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
510  {
511  int32_t len=i-old_sz;
512  //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz);
513  max_string_length=CMath::max(max_string_length, len+overflow_len);
514 
515  features[lines].slen=len;
516  features[lines].string=SG_MALLOC(ST, len);
517 
518  if (remap_to_bin)
519  {
520  for (int32_t j=0; j<overflow_len; j++)
521  features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
522  for (int32_t j=0; j<len; j++)
523  features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
524  alpha->add_string_to_histogram(&dummy[old_sz], len);
525  alpha_bin->add_string_to_histogram(features[lines].string, features[lines].slen);
526  }
527  else
528  {
529  for (int32_t j=0; j<overflow_len; j++)
530  features[lines].string[j]=overflow[j];
531  for (int32_t j=0; j<len; j++)
532  features[lines].string[j+overflow_len]=dummy[old_sz+j];
533  alpha->add_string_to_histogram(&dummy[old_sz], len);
534  alpha->add_string_to_histogram(features[lines].string, features[lines].slen);
535  }
536 
537  // clear overflow
538  overflow_len=0;
539 
540  //CMath::display_vector(features[lines].string, len);
541  old_sz=i+1;
542  lines++;
543  SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
544  }
545  }
546  for (size_t i=old_sz; i<sz; i++)
547  overflow[i-old_sz]=dummy[i];
548 
549  overflow_len=sz-old_sz;
550  }
551 
552  if (alpha->check_alphabet_size() && alpha->check_alphabet())
553  {
554  SG_INFO("file successfully read\n");
555  SG_INFO("max_string_length=%d\n", max_string_length);
556  SG_INFO("num_strings=%d\n", num_vectors);
557  }
558  fclose(f);
559  }
560 
561  SG_FREE(dummy);
562 
563  SG_UNREF(alphabet);
564 
565  if (remap_to_bin)
566  alphabet=alpha_bin;
567  else
568  alphabet=alpha;
569  SG_REF(alphabet);
570  num_symbols=alphabet->get_num_symbols();
571 }
572 
573 template<class ST> bool CStringFeatures<ST>::load_fasta_file(const char* fname, bool ignore_invalid)
574 {
575  remove_subset();
576 
577  int32_t i=0;
578  uint64_t len=0;
579  uint64_t offs=0;
580  int32_t num=0;
581  int32_t max_len=0;
582 
583  CMemoryMappedFile<char> f(fname);
584 
585  while (true)
586  {
587  char* s=f.get_line(len, offs);
588  if (!s)
589  break;
590 
591  if (len>0 && s[0]=='>')
592  num++;
593  }
594 
595  if (num==0)
596  SG_ERROR("No fasta hunks (lines starting with '>') found\n");
597 
598  cleanup();
599  SG_UNREF(alphabet);
600  alphabet=new CAlphabet(DNA);
601  num_symbols=alphabet->get_num_symbols();
602 
603  SGString<ST>* strings=SG_MALLOC(SGString<ST>, num);
604  offs=0;
605 
606  for (i=0;i<num; i++)
607  {
608  uint64_t id_len=0;
609  char* id=f.get_line(id_len, offs);
610 
611  char* fasta=f.get_line(len, offs);
612  char* s=fasta;
613  int32_t fasta_len=0;
614  int32_t spanned_lines=0;
615 
616  while (true)
617  {
618  if (!s || len==0)
619  SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
620 
621  if (s[0]=='>' || offs==f.get_size())
622  {
623  offs-=len+1; // seek to beginning
624  if (offs==f.get_size())
625  {
626  SG_DEBUG("at EOF\n");
627  fasta_len+=len;
628  }
629 
630  len=fasta_len-spanned_lines;
631  strings[i].string=SG_MALLOC(ST, len);
632  strings[i].slen=len;
633 
634  ST* str=strings[i].string;
635  int32_t idx=0;
636  SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
637 
638  for (int32_t j=0; j<fasta_len; j++)
639  {
640  if (fasta[j]=='\n')
641  continue;
642 
643  ST c=(ST) fasta[j];
644 
645  if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
646  c=(ST) 'A';
647 
648  if (uint64_t(idx)>=len)
649  SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
650  str[idx++]=c;
651  }
652  max_len=CMath::max(max_len, strings[i].slen);
653 
654 
655  break;
656  }
657 
658  spanned_lines++;
659  fasta_len+=len+1; // including '\n'
660  s=f.get_line(len, offs);
661  }
662  }
663  return set_features(strings, num, max_len);
664 }
665 
666 template<class ST> bool CStringFeatures<ST>::load_fastq_file(const char* fname,
667  bool ignore_invalid, bool bitremap_in_single_string)
668 {
669  remove_subset();
670 
671  CMemoryMappedFile<char> f(fname);
672 
673  int32_t i=0;
674  uint64_t len=0;
675  uint64_t offs=0;
676 
677  int32_t num=f.get_num_lines();
678  int32_t max_len=0;
679 
680  if (num%4)
681  SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
682  num/=4;
683 
684  cleanup();
685  SG_UNREF(alphabet);
686  alphabet=new CAlphabet(DNA);
687 
688  SGString<ST>* strings;
689 
690  ST* str=NULL;
691  if (bitremap_in_single_string)
692  {
693  strings=SG_MALLOC(SGString<ST>, 1);
694  strings[0].string=SG_MALLOC(ST, num);
695  strings[0].slen=num;
696  f.get_line(len, offs);
697  f.get_line(len, offs);
698  order=len;
699  max_len=num;
700  offs=0;
701  original_num_symbols=alphabet->get_num_symbols();
702  str=SG_MALLOC(ST, len);
703  }
704  else
705  strings=SG_MALLOC(SGString<ST>, num);
706 
707  for (i=0;i<num; i++)
708  {
709  if (!f.get_line(len, offs))
710  SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
711 
712  char* s=f.get_line(len, offs);
713  if (!s || len==0)
714  SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
715 
716  if (bitremap_in_single_string)
717  {
718  if (len!=(uint64_t) order)
719  SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
720  for (int32_t j=0; j<order; j++)
721  str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
722 
723  strings[0].string[i]=embed_word(str, order);
724  }
725  else
726  {
727  strings[i].string=SG_MALLOC(ST, len);
728  strings[i].slen=len;
729  str=strings[i].string;
730 
731  if (ignore_invalid)
732  {
733  for (uint64_t j=0; j<len; j++)
734  {
735  if (alphabet->is_valid((uint8_t) s[j]))
736  str[j]= (ST) s[j];
737  else
738  str[j]= (ST) 'A';
739  }
740  }
741  else
742  {
743  for (uint64_t j=0; j<len; j++)
744  str[j]= (ST) s[j];
745  }
746  max_len=CMath::max(max_len, (int32_t) len);
747  }
748 
749 
750  if (!f.get_line(len, offs))
751  SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
752 
753  if (!f.get_line(len, offs))
754  SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
755  }
756 
757  if (bitremap_in_single_string)
758  num=1;
759 
760  num_vectors=num;
761  max_string_length=max_len;
762  features=strings;
763 
764  return true;
765 }
766 
767 template<class ST> bool CStringFeatures<ST>::load_from_directory(char* dirname)
768 {
769  remove_subset();
770 
771  struct dirent **namelist;
772  int32_t n;
773 
774  SGIO::set_dirname(dirname);
775 
776  SG_DEBUG("dirname '%s'\n", dirname);
777 
778  n=scandir(dirname, &namelist, &SGIO::filter, alphasort);
779  if (n <= 0)
780  {
781  SG_ERROR("error calling scandir - no files found\n");
782  return false;
783  }
784  else
785  {
786  SGString<ST>* strings=NULL;
787 
788  int32_t num=0;
789  int32_t max_len=-1;
790 
791  //usually n==num_vec, but it might not in race conditions
792  //(file perms modified, file erased)
793  strings=SG_MALLOC(SGString<ST>, n);
794 
795  for (int32_t i=0; i<n; i++)
796  {
797  char* fname=SGIO::concat_filename(namelist[i]->d_name);
798 
799  struct stat s;
800  off_t filesize=0;
801 
802  if (!stat(fname, &s) && s.st_size>0)
803  {
804  filesize=s.st_size/sizeof(ST);
805 
806  FILE* f=fopen(fname, "ro");
807  if (f)
808  {
809  ST* str=SG_MALLOC(ST, filesize);
810  SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
811  if (fread(str, sizeof(ST), filesize, f)!=(size_t) filesize)
812  SG_ERROR("failed to read file\n");
813  strings[num].string=str;
814  strings[num].slen=filesize;
815  max_len=CMath::max(max_len, strings[num].slen);
816 
817  num++;
818  fclose(f);
819  }
820  }
821  else
822  SG_ERROR("empty or non readable file \'%s\'\n", fname);
823 
824  SG_FREE(namelist[i]);
825  }
826  SG_FREE(namelist);
827 
828  if (num>0 && strings)
829  {
830  set_features(strings, num, max_len);
831  return true;
832  }
833  }
834  return false;
835 }
836 
838 {
839  set_features(feats.strings, feats.num_strings, feats.max_string_length);
840 }
841 
842 template<class ST> bool CStringFeatures<ST>::set_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
843 {
844  if (m_subset)
845  SG_ERROR("Cannot call set_features() with subset.\n");
846 
847  if (p_features)
848  {
849  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
850 
851  //compute histogram for char/byte
852  for (int32_t i=0; i<p_num_vectors; i++)
853  alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
854 
855  SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
856  SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
857 
858  if (alpha->check_alphabet_size() && alpha->check_alphabet())
859  {
860  cleanup();
861  SG_UNREF(alphabet);
862 
863  alphabet=alpha;
864  SG_REF(alphabet);
865 
866  features=p_features;
867  num_vectors=p_num_vectors;
868  max_string_length=p_max_string_length;
869 
870  return true;
871  }
872  else
873  SG_UNREF(alpha);
874  }
875 
876  return false;
877 }
878 
880 {
881  ASSERT(sf);
882 
883  if (m_subset)
884  SG_ERROR("Cannot call set_features() with subset.\n");
885 
886  SGString<ST>* new_features=SG_MALLOC(SGString<ST>, sf->get_num_vectors());
887 
888  index_t sf_num_str=sf->get_num_vectors();
889  for (int32_t i=0; i<sf_num_str; i++)
890  {
891  int32_t real_i = sf->subset_idx_conversion(i);
892  int32_t length=sf->features[real_i].slen;
893  new_features[i].string=SG_MALLOC(ST, length);
894  memcpy(new_features[i].string, sf->features[real_i].string, length);
895  new_features[i].slen=length;
896  }
897  return append_features(new_features, sf_num_str,
898  sf->max_string_length);
899 }
900 
901 template<class ST> bool CStringFeatures<ST>::append_features(SGString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
902 {
903  if (m_subset)
904  SG_ERROR("Cannot call set_features() with subset.\n");
905 
906  if (!features)
907  return set_features(p_features, p_num_vectors, p_max_string_length);
908 
909  CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
910 
911  //compute histogram for char/byte
912  for (int32_t i=0; i<p_num_vectors; i++)
913  alpha->add_string_to_histogram( p_features[i].string, p_features[i].slen);
914 
915  SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
916  SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
917 
918  if (alpha->check_alphabet_size() && alpha->check_alphabet())
919  {
920  SG_UNREF(alpha);
921  for (int32_t i=0; i<p_num_vectors; i++)
922  alphabet->add_string_to_histogram( p_features[i].string, p_features[i].slen);
923 
924  int32_t old_num_vectors=num_vectors;
925  num_vectors=old_num_vectors+p_num_vectors;
926  SGString<ST>* new_features=SG_MALLOC(SGString<ST>, num_vectors);
927 
928  for (int32_t i=0; i<num_vectors; i++)
929  {
930  if (i<old_num_vectors)
931  {
932  new_features[i].string=features[i].string;
933  new_features[i].slen=features[i].slen;
934  }
935  else
936  {
937  new_features[i].string=p_features[i-old_num_vectors].string;
938  new_features[i].slen=p_features[i-old_num_vectors].slen;
939  }
940  }
941  SG_FREE(features);
942  SG_FREE(p_features); // free now obsolete features
943 
944  this->features=new_features;
945  max_string_length=CMath::max(max_string_length, p_max_string_length);
946 
947  return true;
948  }
949  SG_UNREF(alpha);
950 
951  return false;
952 }
953 
955 {
956  SGStringList<ST> sl;
957 
958  sl.strings=get_features(sl.num_strings, sl.max_string_length);
959  return sl;
960 }
961 
962 template<class ST> SGString<ST>* CStringFeatures<ST>::get_features(int32_t& num_str, int32_t& max_str_len)
963 {
964  if (m_subset)
965  SG_ERROR("get features() is not possible on subset");
966 
967  num_str=num_vectors;
968  max_str_len=max_string_length;
969  return features;
970 }
971 
972 template<class ST> SGString<ST>* CStringFeatures<ST>::copy_features(int32_t& num_str, int32_t& max_str_len)
973 {
974  ASSERT(num_vectors>0);
975 
976  num_str=get_num_vectors();
977  max_str_len=max_string_length;
978  SGString<ST>* new_feat=SG_MALLOC(SGString<ST>, num_str);
979 
980  for (int32_t i=0; i<num_str; i++)
981  {
982  int32_t len;
983  bool free_vec;
984  ST* vec=get_feature_vector(i, len, free_vec);
985  new_feat[i].string=SG_MALLOC(ST, len);
986  new_feat[i].slen=len;
987  memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
988  free_feature_vector(vec, i, free_vec);
989  }
990 
991  return new_feat;
992 }
993 
994 template<class ST> void CStringFeatures<ST>::get_features(SGString<ST>** dst, int32_t* num_str)
995 {
996  int32_t num_vec;
997  int32_t max_str_len;
998  *dst=copy_features(num_vec, max_str_len);
999  *num_str=num_vec;
1000 }
1001 
1002 template<class ST> bool CStringFeatures<ST>::load_compressed(char* src, bool decompress)
1003 {
1004  remove_subset();
1005 
1006  FILE* file=NULL;
1007 
1008  if (!(file=fopen(src, "r")))
1009  return false;
1010  cleanup();
1011 
1012  // header shogun v0
1013  char id[4];
1014  if (fread(&id[0], sizeof(char), 1, file)!=1)
1015  SG_ERROR("failed to read header");
1016  ASSERT(id[0]=='S');
1017  if (fread(&id[1], sizeof(char), 1, file)!=1)
1018  SG_ERROR("failed to read header");
1019  ASSERT(id[1]=='G');
1020  if (fread(&id[2], sizeof(char), 1, file)!=1)
1021  SG_ERROR("failed to read header");
1022  ASSERT(id[2]=='V');
1023  if (fread(&id[3], sizeof(char), 1, file)!=1)
1024  SG_ERROR("failed to read header");
1025  ASSERT(id[3]=='0');
1026 
1027  //compression type
1028  uint8_t c;
1029  if (fread(&c, sizeof(uint8_t), 1, file)!=1)
1030  SG_ERROR("failed to read compression type");
1031  CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
1032  //alphabet
1033  uint8_t a;
1034  delete alphabet;
1035  if (fread(&a, sizeof(uint8_t), 1, file)!=1)
1036  SG_ERROR("failed to read compression alphabet");
1037  alphabet=new CAlphabet((EAlphabet) a);
1038  // number of vectors
1039  if (fread(&num_vectors, sizeof(int32_t), 1, file)!=1)
1040  SG_ERROR("failed to read compression number of vectors");
1041  ASSERT(num_vectors>0);
1042  // maximum string length
1043  if (fread(&max_string_length, sizeof(int32_t), 1, file)!=1)
1044  SG_ERROR("failed to read maximum string length");
1045  ASSERT(max_string_length>0);
1046 
1047  features=SG_MALLOC(SGString<ST>, num_vectors);
1048 
1049  // vectors
1050  for (int32_t i=0; i<num_vectors; i++)
1051  {
1052  // vector len compressed
1053  int32_t len_compressed;
1054  if (fread(&len_compressed, sizeof(int32_t), 1, file)!=1)
1055  SG_ERROR("failed to read vector length compressed");
1056  // vector len uncompressed
1057  int32_t len_uncompressed;
1058  if (fread(&len_uncompressed, sizeof(int32_t), 1, file)!=1)
1059  SG_ERROR("failed to read vector length uncompressed");
1060 
1061  // vector raw data
1062  if (decompress)
1063  {
1064  features[i].string=SG_MALLOC(ST, len_uncompressed);
1065  features[i].slen=len_uncompressed;
1066  uint8_t* compressed=SG_MALLOC(uint8_t, len_compressed);
1067  if (fread(compressed, sizeof(uint8_t), len_compressed, file)!=(size_t) len_compressed)
1068  SG_ERROR("failed to read compressed data (expected %d bytes)", len_compressed);
1069  uint64_t uncompressed_size=len_uncompressed;
1070  uncompressed_size*=sizeof(ST);
1071  compressor->decompress(compressed, len_compressed,
1072  (uint8_t*) features[i].string, uncompressed_size);
1073  SG_FREE(compressed);
1074  ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST));
1075  }
1076  else
1077  {
1078  int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
1079  features[i].string=SG_MALLOC(ST, len_compressed+offs);
1080  features[i].slen=len_compressed+offs;
1081  int32_t* feat32ptr=((int32_t*) (features[i].string));
1082  memset(features[i].string, 0, offs*sizeof(ST));
1083  feat32ptr[0]=(int32_t) len_compressed;
1084  feat32ptr[1]=(int32_t) len_uncompressed;
1085  uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
1086  if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
1087  SG_ERROR("failed to read uncompressed data");
1088  }
1089  }
1090 
1091  delete compressor;
1092  fclose(file);
1093 
1094  return false;
1095 }
1096 
1097 template<class ST> bool CStringFeatures<ST>::save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
1098 {
1099  if (m_subset)
1100  SG_ERROR("save_compressed() is not possible on subset");
1101 
1102  FILE* file=NULL;
1103 
1104  if (!(file=fopen(dest, "wb")))
1105  return false;
1106 
1107  CCompressor* compressor= new CCompressor(compression);
1108 
1109  // header shogun v0
1110  const char* id="SGV0";
1111  fwrite(&id[0], sizeof(char), 1, file);
1112  fwrite(&id[1], sizeof(char), 1, file);
1113  fwrite(&id[2], sizeof(char), 1, file);
1114  fwrite(&id[3], sizeof(char), 1, file);
1115 
1116  //compression type
1117  uint8_t c=(uint8_t) compression;
1118  fwrite(&c, sizeof(uint8_t), 1, file);
1119  //alphabet
1120  uint8_t a=(uint8_t) alphabet->get_alphabet();
1121  fwrite(&a, sizeof(uint8_t), 1, file);
1122  // number of vectors
1123  fwrite(&num_vectors, sizeof(int32_t), 1, file);
1124  // maximum string length
1125  fwrite(&max_string_length, sizeof(int32_t), 1, file);
1126 
1127  // vectors
1128  for (int32_t i=0; i<num_vectors; i++)
1129  {
1130  int32_t len=-1;
1131  bool vfree;
1132  ST* vec=get_feature_vector(i, len, vfree);
1133 
1134  uint8_t* compressed=NULL;
1135  uint64_t compressed_size=0;
1136 
1137  compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
1138  compressed, compressed_size, level);
1139 
1140  int32_t len_compressed=(int32_t) compressed_size;
1141  // vector len compressed in bytes
1142  fwrite(&len_compressed, sizeof(int32_t), 1, file);
1143  // vector len uncompressed in number of elements of type ST
1144  fwrite(&len, sizeof(int32_t), 1, file);
1145  // vector raw data
1146  fwrite(compressed, compressed_size, 1, file);
1147  SG_FREE(compressed);
1148 
1149  free_feature_vector(vec, i, vfree);
1150  }
1151 
1152  delete compressor;
1153  fclose(file);
1154  return true;
1155 }
1156 
1157 template<class ST> int32_t CStringFeatures<ST>::get_size() { return sizeof(ST); }
1158 
1159 template<class ST> bool CStringFeatures<ST>::apply_preprocessor(bool force_preprocessing)
1160 {
1161  SG_DEBUG( "force: %d\n", force_preprocessing);
1162 
1163  for (int32_t i=0; i<get_num_preprocessors(); i++)
1164  {
1165  if ( (!is_preprocessed(i) || force_preprocessing) )
1166  {
1167  set_preprocessed(i);
1168  CStringPreprocessor<ST>* p=(CStringPreprocessor<ST>*) get_preprocessor(i);
1169  SG_INFO( "preprocessing using preproc %s\n", p->get_name());
1170 
1171  if (!p->apply_to_string_features(this))
1172  {
1173  SG_UNREF(p);
1174  return false;
1175  }
1176  else
1177  SG_UNREF(p);
1178  }
1179  }
1180  return true;
1181 }
1182 
1183 template<class ST> int32_t CStringFeatures<ST>::obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip)
1184 {
1185  if (m_subset)
1187 
1188  ASSERT(step_size>0);
1189  ASSERT(window_size>0);
1190  ASSERT(num_vectors==1 || single_string);
1191  ASSERT(max_string_length>=window_size ||
1192  (single_string && length_of_single_string>=window_size));
1193 
1194  //in case we are dealing with a single remapped string
1195  //allow remapping
1196  if (single_string)
1197  num_vectors= (length_of_single_string-window_size)/step_size + 1;
1198  else if (num_vectors==1)
1199  {
1200  num_vectors= (max_string_length-window_size)/step_size + 1;
1201  length_of_single_string=max_string_length;
1202  }
1203 
1204  SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
1205  int32_t offs=0;
1206  for (int32_t i=0; i<num_vectors; i++)
1207  {
1208  f[i].string=&features[0].string[offs+skip];
1209  f[i].slen=window_size-skip;
1210  offs+=step_size;
1211  }
1212  single_string=features[0].string;
1213  SG_FREE(features);
1214  features=f;
1215  max_string_length=window_size-skip;
1216 
1217  return num_vectors;
1218 }
1219 
1220 template<class ST> int32_t CStringFeatures<ST>::obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions,
1221  int32_t skip)
1222 {
1223  if (m_subset)
1225 
1226  ASSERT(positions);
1227  ASSERT(window_size>0);
1228  ASSERT(num_vectors==1 || single_string);
1229  ASSERT(max_string_length>=window_size ||
1230  (single_string && length_of_single_string>=window_size));
1231 
1232  num_vectors= positions->get_num_elements();
1233  ASSERT(num_vectors>0);
1234 
1235  int32_t len;
1236 
1237  //in case we are dealing with a single remapped string
1238  //allow remapping
1239  if (single_string)
1240  len=length_of_single_string;
1241  else
1242  {
1243  single_string=features[0].string;
1244  len=max_string_length;
1245  length_of_single_string=max_string_length;
1246  }
1247 
1248  SGString<ST>* f=SG_MALLOC(SGString<ST>, num_vectors);
1249  for (int32_t i=0; i<num_vectors; i++)
1250  {
1251  int32_t p=positions->get_element(i);
1252 
1253  if (p>=0 && p<=len-window_size)
1254  {
1255  f[i].string=&features[0].string[p+skip];
1256  f[i].slen=window_size-skip;
1257  }
1258  else
1259  {
1260  num_vectors=1;
1261  max_string_length=len;
1262  features[0].slen=len;
1263  single_string=NULL;
1264  SG_FREE(f);
1265  SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
1266  window_size, i, p, len);
1267  return -1;
1268  }
1269  }
1270 
1271  SG_FREE(features);
1272  features=f;
1273  max_string_length=window_size-skip;
1274 
1275  return num_vectors;
1276 }
1277 
1278 template<class ST> bool CStringFeatures<ST>::obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1279 {
1280  return obtain_from_char_features(sf, start, p_order, gap, rev);
1281 }
1282 
1283 template<class ST> bool CStringFeatures<ST>::have_same_length(int32_t len)
1284 {
1285  if (len!=-1)
1286  {
1287  if (len!=max_string_length)
1288  return false;
1289  }
1290  len=max_string_length;
1291 
1292  index_t num_str=get_num_vectors();
1293  for (int32_t i=0; i<num_str; i++)
1294  {
1295  if (get_vector_length(i)!=len)
1296  return false;
1297  }
1298 
1299  return true;
1300 }
1301 
1302 template<class ST> void CStringFeatures<ST>::embed_features(int32_t p_order)
1303 {
1304  if (m_subset)
1306 
1307  ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
1308 
1309  order=p_order;
1310  original_num_symbols=alphabet->get_num_symbols();
1311  int32_t max_val=alphabet->get_num_bits();
1312 
1313  if (p_order>1)
1314  num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
1315  else
1316  num_symbols=original_num_symbols;
1317 
1318  SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
1319 
1320  if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
1321  SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
1322 
1323  ST mask=0;
1324  for (int32_t i=0; i<p_order*max_val; i++)
1325  mask= (mask<<1) | ((ST) 1);
1326 
1327  for (int32_t i=0; i<num_vectors; i++)
1328  {
1329  int32_t len=features[i].slen;
1330 
1331  if (len < p_order)
1332  SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
1333 
1334  ST* str=features[i].string;
1335 
1336  // convert first word
1337  for (int32_t j=0; j<p_order; j++)
1338  str[j]=(ST) alphabet->remap_to_bin(str[j]);
1339  str[0]=embed_word(&str[0], p_order);
1340 
1341  // convert the rest
1342  int32_t idx=0;
1343  for (int32_t j=p_order; j<len; j++)
1344  {
1345  str[j]=(ST) alphabet->remap_to_bin(str[j]);
1346  str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
1347  idx++;
1348  }
1349 
1350  features[i].slen=len-p_order+1;
1351  }
1352 
1353  compute_symbol_mask_table(max_val);
1354 }
1355 
1356 template<class ST> void CStringFeatures<ST>::compute_symbol_mask_table(int64_t max_val)
1357 {
1358  if (m_subset)
1360 
1361  SG_FREE(symbol_mask_table);
1362  symbol_mask_table=SG_MALLOC(ST, 256);
1363 
1364  uint64_t mask=0;
1365  for (int32_t i=0; i< (int64_t) max_val; i++)
1366  mask=(mask<<1) | 1;
1367 
1368  for (int32_t i=0; i<256; i++)
1369  {
1370  uint8_t bits=(uint8_t) i;
1371  symbol_mask_table[i]=0;
1372 
1373  for (int32_t j=0; j<8; j++)
1374  {
1375  if (bits & 1)
1376  symbol_mask_table[i]|=mask<<(max_val*j);
1377 
1378  bits>>=1;
1379  }
1380  }
1381 }
1382 
1383 template<class ST> void CStringFeatures<ST>::unembed_word(ST word, uint8_t* seq, int32_t len)
1384 {
1385  uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1386 
1387  ST mask=0;
1388  for (uint32_t i=0; i<nbits; i++)
1389  mask=(mask<<1) | (ST) 1;
1390 
1391  for (int32_t i=0; i<len; i++)
1392  {
1393  ST w=(word & mask);
1394  seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
1395  word>>=nbits;
1396  }
1397 }
1398 
1399 template<class ST> ST CStringFeatures<ST>::embed_word(ST* seq, int32_t len)
1400 {
1401  ST value=(ST) 0;
1402  uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1403  for (int32_t i=0; i<len; i++)
1404  {
1405  value<<=nbits;
1406  value|=seq[i];
1407  }
1408 
1409  return value;
1410 }
1411 
1413 {
1414  max_string_length=0;
1415  index_t num_str=get_num_vectors();
1416 
1417  for (int32_t i=0; i<num_str; i++)
1418  {
1419  max_string_length=CMath::max(max_string_length,
1420  features[subset_idx_conversion(i)].slen);
1421  }
1422 }
1423 
1425 {
1426  int32_t l=str.slen;
1427  ST* s=SG_MALLOC(ST, l+1);
1428  memcpy(s, str.string, sizeof(ST)*l);
1429  s[l]='\0';
1430  return s;
1431 }
1432 
1433 template<class ST> void CStringFeatures<ST>::set_feature_vector(int32_t num, ST* string, int32_t len)
1434 {
1435  ASSERT(features);
1436  ASSERT(num<get_num_vectors());
1437 
1438  int32_t real_num=subset_idx_conversion(num);
1439 
1440 
1441  features[real_num].slen=len ;
1442  features[real_num].string=string ;
1443 
1444  max_string_length=CMath::max(len, max_string_length);
1445 }
1446 
1447 template<class ST> void CStringFeatures<ST>::get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize)
1448 {
1449  int32_t nsym=get_num_symbols();
1450  int32_t slen=get_max_vector_length();
1451  int64_t sz=int64_t(nsym)*slen*sizeof(float64_t);
1452  float64_t* h= SG_MALLOC(float64_t, sz);
1453  memset(h, 0, sz);
1454 
1455  float64_t* h_normalizer=SG_MALLOC(float64_t, slen);
1456  memset(h_normalizer, 0, slen*sizeof(float64_t));
1457  int32_t num_str=get_num_vectors();
1458  for (int32_t i=0; i<num_str; i++)
1459  {
1460  int32_t len;
1461  bool free_vec;
1462  ST* vec=get_feature_vector(i, len, free_vec);
1463  for (int32_t j=0; j<len; j++)
1464  {
1465  h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
1466  h_normalizer[j]++;
1467  }
1468  free_feature_vector(vec, i, free_vec);
1469  }
1470 
1471  if (normalize)
1472  {
1473  for (int32_t i=0; i<slen; i++)
1474  {
1475  for (int32_t j=0; j<nsym; j++)
1476  {
1477  if (h_normalizer && h_normalizer[i])
1478  h[int64_t(i)*nsym+j]/=h_normalizer[i];
1479  }
1480  }
1481  }
1482  SG_FREE(h_normalizer);
1483 
1484  *hist=h;
1485  *rows=nsym;
1486  *cols=slen;
1487 }
1488 
1489 template<class ST> void CStringFeatures<ST>::create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec)
1490 {
1491  ASSERT(rows == get_num_symbols());
1492  cleanup();
1493  float64_t* randoms=SG_MALLOC(float64_t, cols);
1494  SGString<ST>* sf=SG_MALLOC(SGString<ST>, num_vec);
1495 
1496  for (int32_t i=0; i<num_vec; i++)
1497  {
1498  sf[i].string=SG_MALLOC(ST, cols);
1499  sf[i].slen=cols;
1500 
1501  CMath::random_vector(randoms, cols, 0.0, 1.0);
1502 
1503  for (int32_t j=0; j<cols; j++)
1504  {
1505  float64_t lik=hist[int64_t(j)*rows+0];
1506 
1507  int32_t c;
1508  for (c=0; c<rows-1; c++)
1509  {
1510  if (randoms[j]<=lik)
1511  break;
1512  lik+=hist[int64_t(j)*rows+c+1];
1513  }
1514  sf[i].string[j]=alphabet->remap_to_char(c);
1515  }
1516  }
1517  SG_FREE(randoms);
1518  set_features(sf, num_vec, cols);
1519 }
1520 
1521 /*
1522 CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2)
1523 {
1524  int *s;
1525  int32_t nStr=get_num_vectors();
1526 
1527  int32_t nfeat=0;
1528  for (int32_t i=0; i < nStr; ++i)
1529  nfeat += get_vector_length[i] - d1 -d2;
1530  SGString<SSKFeature>* F= SG_MALLOC(SGString<SSKFeature>, nfeat);
1531  int32_t c=0;
1532  for (int32_t i=0; i < nStr; ++i)
1533  {
1534  int32_t len;
1535  bool free_vec;
1536  ST* S=get_feature_vector(vec_num, len, free_vec);
1537  free_feature_vector(vec, vec_num, free_vec);
1538  int32_t n=len - d1 - d2;
1539  s=S[i];
1540  for (int32_t j=0; j < n; ++j)
1541  {
1542  F[c].feature1=s[j];
1543  F[c].feature2=s[j+d1];
1544  F[c].feature3=s[j+d1+d2];
1545  F[c].group=i;
1546  c++;
1547  }
1548  }
1549  ASSERT(nfeat==c);
1550  return F;
1551 }
1552 
1553 CStringFeatures<SSKFeature>* obtain_sssk_double_from_char(int **S, int *len, int nStr, int d1)
1554 {
1555  int i, j;
1556  int n, nfeat;
1557  int *group;
1558  int *features;
1559  int *s;
1560  int c;
1561  SSKFeatures *F;
1562 
1563  nfeat=0;
1564  for (i=0; i < nStr; ++i)
1565  nfeat += len[i] - d1;
1566  group=(int *)SG_MALLOC(nfeat*sizeof(int));
1567  features=(int *)SG_MALLOC(nfeat*2*sizeof(int *));
1568  c=0;
1569  for (i=0; i < nStr; ++i)
1570  {
1571  n=len[i] - d1;
1572  s=S[i];
1573  for (j=0; j < n; ++j)
1574  {
1575  features[c]=s[j];
1576  features[c+nfeat]=s[j+d1];
1577  group[c]=i;
1578  c++;
1579  }
1580  }
1581  if (nfeat!=c)
1582  printf("Something is wrong...\n");
1583  F=(SSKFeatures *)SG_MALLOC(sizeof(SSKFeatures));
1584  (*F).features=features;
1585  (*F).group=group;
1586  (*F).n=nfeat;
1587  return F;
1588 }
1589 */
1590 
1592 {
1593  /* string list to create new CStringFeatures from */
1594  SGStringList<ST> list_copy(indices.vlen, max_string_length);
1595 
1596  /* copy all features */
1597  for (index_t i=0; i<indices.vlen; ++i)
1598  {
1599  /* index with respect to possible subset */
1600  index_t real_idx=subset_idx_conversion(indices.vector[i]);
1601 
1602  /* copy string */
1603  SGString<ST> current_string=features[real_idx];
1604  SGString<ST> string_copy(current_string.slen);
1605  memcpy(string_copy.string, current_string.string,
1606  current_string.slen*sizeof(ST));
1607  list_copy.strings[i]=string_copy;
1608  }
1609 
1610  /* create copy instance */
1611  CStringFeatures* result=new CStringFeatures(list_copy, alphabet);
1612 
1613  /* max string length may have changed */
1615 
1616  return result;
1617 }
1618 
1620 {
1621  /* max string length has to be updated */
1622  determine_maximum_string_length();
1623 }
1624 
1625 template<class ST> ST* CStringFeatures<ST>::compute_feature_vector(int32_t num, int32_t& len)
1626 {
1627  ASSERT(features && num<get_num_vectors());
1628 
1629  int32_t real_num=subset_idx_conversion(num);
1630 
1631  len=features[real_num].slen;
1632  if (len<=0)
1633  return NULL;
1634 
1635  ST* target=SG_MALLOC(ST, len);
1636  memcpy(target, features[real_num].string, len*sizeof(ST));
1637  return target;
1638 }
1639 
1640 template<class ST> void CStringFeatures<ST>::init()
1641 {
1642  set_generic<ST>();
1643 
1644  alphabet=NULL;
1645  num_vectors=0;
1646  features=NULL;
1647  single_string=NULL;
1648  length_of_single_string=0;
1649  max_string_length=0;
1650  order=0;
1651  symbol_mask_table=0;
1652  preprocess_on_get=false;
1653  feature_cache=NULL;
1654 
1655  m_parameters->add((CSGObject**) &alphabet, "alphabet");
1656  m_parameters->add_vector(&features, &num_vectors, "features",
1657  "This contains the array of features.");
1658  m_parameters->add_vector(&single_string,
1659  &length_of_single_string,
1660  "single_string",
1661  "Created by sliding window.");
1662  m_parameters->add(&max_string_length, "max_string_length",
1663  "Length of longest string.");
1664  m_parameters->add(&num_symbols, "num_symbols",
1665  "Number of used symbols.");
1666  m_parameters->add(&original_num_symbols, "original_num_symbols",
1667  "Original number of used symbols.");
1668  m_parameters->add(&order, "order",
1669  "Order used in higher order mapping.");
1670  m_parameters->add(&preprocess_on_get, "preprocess_on_get",
1671  "Preprocess on-the-fly?");
1672 
1673  /* TODO M_PARAMETERS->ADD?
1674  * /// order used in higher order mapping
1675  * ST* symbol_mask_table;
1676  */
1677 }
1678 
1684 {
1685  return F_BOOL;
1686 }
1687 
1693 {
1694  return F_CHAR;
1695 }
1696 
1702 {
1703  return F_BYTE;
1704 }
1705 
1711 {
1712  return F_SHORT;
1713 }
1714 
1720 {
1721  return F_WORD;
1722 }
1723 
1729 {
1730  return F_INT;
1731 }
1732 
1738 {
1739  return F_UINT;
1740 }
1741 
1747 {
1748  return F_LONG;
1749 }
1750 
1756 {
1757  return F_ULONG;
1758 }
1759 
1765 {
1766  return F_SHORTREAL;
1767 }
1768 
1774 {
1775  return F_DREAL;
1776 }
1777 
1783 {
1784  return F_LONGREAL;
1785 }
1786 
/** Specialization for bool symbols: the mask is ignored and the symbol is
 * returned unchanged.
 *
 * @param symbol symbol to be masked
 * @param mask unused
 * @return symbol
 */
1787 template<> bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
1788 {
1789  return symbol;
1790 }
1792 {
1793  return symbol;
1794 }
1796 {
1797  return symbol;
1798 }
1800 {
1801  return symbol;
1802 }
1803 
/** Specialization for bool symbols: both arguments are ignored.
 *
 * @param symbol unused
 * @param amount unused
 * @return always false
 */
1804 template<> bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
1805 {
1806  return false;
1807 }
1809 {
1810  return 0;
1811 }
1813 {
1814  return 0;
1815 }
1817 {
1818  return 0;
1819 }
1820 
/** Specialization for bool symbols: no shifting takes place, the symbol is
 * returned unchanged.
 *
 * @param symbol symbol to be shifted
 * @param amount unused
 * @return symbol
 */
1821 template<> bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
1822 {
1823  return symbol;
1824 }
1826 {
1827  return symbol;
1828 }
1830 {
1831  return symbol;
1832 }
1834 {
1835  return symbol;
1836 }
1837 
1838 #ifndef SUNOS
/* Specializations for the floating point symbol types float32_t, float64_t
 * and floatmax_t: no conversion is performed, all arguments are ignored and
 * false is returned. */
1839 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1840 {
1841  return false;
1842 }
1843 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1844 {
1845  return false;
1846 }
1847 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
1848 {
1849  return false;
1850 }
1851 #endif
1852 
/* Specializations for the floating point symbol types float32_t, float64_t
 * and floatmax_t: embedding is a no-op, p_order is ignored. */
1853 template<> void CStringFeatures<float32_t>::embed_features(int32_t p_order)
1854 {
1855 }
1856 template<> void CStringFeatures<float64_t>::embed_features(int32_t p_order)
1857 {
1858 }
1859 template<> void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
1860 {
1861 }
1862 
1864 {
1865 }
1867 {
1868 }
1870 {
1871 }
1872 
1874 {
1875  return 0;
1876 }
1878 {
1879  return 0;
1880 }
1882 {
1883  return 0;
1884 }
1885 
/* Specializations for the floating point symbol types float32_t, float64_t
 * and floatmax_t: nothing is unpacked, seq is left untouched. */
1886 template<> void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
1887 {
1888 }
1889 template<> void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
1890 {
1891 }
1892 template<> void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
1893 {
1894 }
1895 #define LOAD(f_load, sg_type) \
1896 template<> void CStringFeatures<sg_type>::load(CFile* loader) \
1897 { \
1898  SG_INFO( "loading...\n"); \
1899  \
1900  SG_SET_LOCALE_C; \
1901  SGString<sg_type>* strs; \
1902  int32_t num_str; \
1903  int32_t max_len; \
1904  loader->f_load(strs, num_str, max_len); \
1905  set_features(strs, num_str, max_len); \
1906  SG_RESET_LOCALE; \
1907 }
1908 
1909 LOAD(get_string_list, bool)
1910 LOAD(get_string_list, char)
1911 LOAD(get_int8_string_list, int8_t)
1912 LOAD(get_string_list, uint8_t)
1913 LOAD(get_string_list, int16_t)
1914 LOAD(get_string_list, uint16_t)
1915 LOAD(get_string_list, int32_t)
1916 LOAD(get_uint_string_list, uint32_t)
1917 LOAD(get_long_string_list, int64_t)
1918 LOAD(get_ulong_string_list, uint64_t)
1919 LOAD(get_string_list, float32_t)
1920 LOAD(get_string_list, float64_t)
1921 LOAD(get_longreal_string_list, floatmax_t)
1922 #undef LOAD
1923 
1924 #define SAVE(f_write, sg_type) \
1925 template<> void CStringFeatures<sg_type>::save(CFile* writer) \
1926 { \
1927  if (m_subset) \
1928  SG_ERROR("save() is not possible on subset"); \
1929  SG_SET_LOCALE_C; \
1930  ASSERT(writer); \
1931  writer->f_write(features, num_vectors); \
1932  SG_RESET_LOCALE; \
1933 }
1934 
1935 SAVE(set_string_list, bool)
1936 SAVE(set_string_list, char)
1937 SAVE(set_int8_string_list, int8_t)
1938 SAVE(set_string_list, uint8_t)
1939 SAVE(set_string_list, int16_t)
1940 SAVE(set_string_list, uint16_t)
1941 SAVE(set_string_list, int32_t)
1942 SAVE(set_uint_string_list, uint32_t)
1943 SAVE(set_long_string_list, int64_t)
1944 SAVE(set_ulong_string_list, uint64_t)
1945 SAVE(set_string_list, float32_t)
1946 SAVE(set_string_list, float64_t)
1947 SAVE(set_longreal_string_list, floatmax_t)
1948 #undef SAVE
1949 
1950 template <class ST> template <class CT>
1952  int32_t p_order, int32_t gap, bool rev)
1953 {
1954  remove_subset();
1955  ASSERT(sf);
1956 
1957  CAlphabet* alpha=sf->get_alphabet();
1958  ASSERT(alpha->get_num_symbols_in_histogram() > 0);
1959 
1960  this->order=p_order;
1961  cleanup();
1962 
1963  num_vectors=sf->get_num_vectors();
1964  ASSERT(num_vectors>0);
1965  max_string_length=sf->get_max_vector_length()-start;
1966  features=SG_MALLOC(SGString<ST>, num_vectors);
1967 
1968  SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
1969  alpha->get_num_symbols_in_histogram());
1970 
1971  for (int32_t i=0; i<num_vectors; i++)
1972  {
1973  int32_t len=-1;
1974  bool vfree;
1975  CT* c=sf->get_feature_vector(i, len, vfree);
1976  ASSERT(!vfree); // won't work when preprocessors are attached
1977 
1978  features[i].string=SG_MALLOC(ST, len);
1979  features[i].slen=len;
1980 
1981  ST* str=features[i].string;
1982  for (int32_t j=0; j<len; j++)
1983  str[j]=(ST) alpha->remap_to_bin(c[j]);
1984  }
1985 
1986  original_num_symbols=alpha->get_num_symbols();
1987  int32_t max_val=alpha->get_num_bits();
1988 
1989  SG_UNREF(alpha);
1990 
1991  if (p_order>1)
1992  num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
1993  else
1994  num_symbols=original_num_symbols;
1995  SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
1996 
1997  if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
1998  {
1999  SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
2000  return false;
2001  }
2002 
2003  SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
2004  for (int32_t line=0; line<num_vectors; line++)
2005  {
2006  int32_t len=0;
2007  bool vfree;
2008  ST* fv=get_feature_vector(line, len, vfree);
2009  ASSERT(!vfree); // won't work when preprocessors are attached
2010 
2011  if (rev)
2012  CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
2013  else
2014  CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
2015 
2016  /* fix the length of the string -- hacky */
2017  features[line].slen-=start+gap ;
2018  if (features[line].slen<0)
2019  features[line].slen=0 ;
2020  }
2021 
2022  compute_symbol_mask_table(max_val);
2023 
2024  return true;
2025 }
2026 
2027 template class CStringFeatures<bool>;
2028 template class CStringFeatures<char>;
2029 template class CStringFeatures<int8_t>;
2030 template class CStringFeatures<uint8_t>;
2031 template class CStringFeatures<int16_t>;
2032 template class CStringFeatures<uint16_t>;
2033 template class CStringFeatures<int32_t>;
2034 template class CStringFeatures<uint32_t>;
2035 template class CStringFeatures<int64_t>;
2036 template class CStringFeatures<uint64_t>;
2037 template class CStringFeatures<float32_t>;
2038 template class CStringFeatures<float64_t>;
2039 template class CStringFeatures<floatmax_t>;
2040 
2041 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2042 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2043 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint8_t>(CStringFeatures<uint8_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2044 
2045 template bool CStringFeatures<uint16_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2046 template bool CStringFeatures<uint32_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2047 template bool CStringFeatures<uint64_t>::obtain_from_char_features<uint16_t>(CStringFeatures<uint16_t>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev);
2048 }

SHOGUN Machine Learning Toolbox - Documentation