73 :
CFeatures(orig), num_vectors(orig.num_vectors),
74 single_string(orig.single_string),
75 length_of_single_string(orig.length_of_single_string),
76 max_string_length(orig.max_string_length),
77 num_symbols(orig.num_symbols),
78 original_num_symbols(orig.original_num_symbols),
79 order(orig.order), preprocess_on_get(false),
104 for (int32_t i=0; i<256; i++)
113 features(NULL), single_string(NULL), length_of_single_string(0),
114 max_string_length(0), order(0),
115 symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
143 cleanup_feature_vectors(0, num_vectors-1);
149 symbol_mask_table=NULL;
163 ASSERT(num<get_num_vectors());
167 int32_t real_num=subset_idx_conversion(num);
168 SG_FREE(features[real_num].
string);
169 features[real_num].string=NULL;
170 features[real_num].slen=0;
172 determine_maximum_string_length();
178 if (features && get_num_vectors())
180 ASSERT(start<get_num_vectors());
181 ASSERT(stop<get_num_vectors());
183 for (int32_t i=start; i<=stop; i++)
185 int32_t real_num=subset_idx_conversion(i);
186 SG_FREE(features[real_num].
string);
187 features[real_num].string=NULL;
188 features[real_num].slen=0;
190 determine_maximum_string_length();
212 if (num>=get_num_vectors())
214 SG_ERROR(
"Index out of bounds (number of strings %d, you "
215 "requested %d)\n", get_num_vectors(), num);
220 ST* vec=get_feature_vector(num, l, free_vec);
222 memcpy(dst, vec, l*
sizeof(ST));
223 free_feature_vector(vec, num, free_vec);
232 SG_ERROR(
"A subset is set, cannot set feature vector\n");
234 if (num>=num_vectors)
236 SG_ERROR(
"Index out of bounds (number of strings %d, you "
237 "requested %d)\n", num_vectors, num);
241 SG_ERROR(
"String has zero or negative length\n");
243 cleanup_feature_vector(num);
244 features[num].slen=vector.
vlen;
246 memcpy(features[num].
string, vector.
vector, vector.
vlen*
sizeof(ST));
248 determine_maximum_string_length();
253 preprocess_on_get=
true;
258 preprocess_on_get=
false;
264 ASSERT(num<get_num_vectors());
267 int32_t real_num=subset_idx_conversion(num);
269 if (!preprocess_on_get)
272 len=features[real_num].slen;
273 return features[real_num].string;
277 SG_DEBUG(
"computing feature vector!\n") ;
278 ST* feat=compute_feature_vector(num, len);
281 if (get_num_preprocessors())
283 ST* tmp_feat_before=feat;
285 for (int32_t i=0; i<get_num_preprocessors(); i++)
291 tmp_feat_before=feat;
314 num_feat=get_num_vectors();
315 num_vec=get_max_vector_length();
316 ASSERT(have_same_length());
318 SG_DEBUG(
"Allocating memory for transposed string features of size %ld\n",
319 int64_t(num_feat)*num_vec);
323 for (int32_t i=0; i<num_vec; i++)
329 for (int32_t i=0; i<num_feat; i++)
333 ST* vec=get_feature_vector(i, len, free_vec);
335 for (int32_t j=0; j<num_vec; j++)
336 sf[j].
string[i]=vec[j];
338 free_feature_vector(vec, i, free_vec);
345 if (num>=get_num_vectors())
348 "Trying to access string[%d] but num_str=%d\n", num,
352 int32_t real_num=subset_idx_conversion(num);
355 feature_cache->unlock_entry(real_num);
363 if (num>=get_num_vectors())
366 "Trying to access string[%d] but num_str=%d\n", num,
370 int32_t real_num=subset_idx_conversion(num);
373 feature_cache->unlock_entry(real_num);
381 ASSERT(vec_num<get_num_vectors());
385 ST* vec=get_feature_vector(vec_num, len, free_vec);
387 ST result=vec[feat_num];
388 free_feature_vector(vec, vec_num, free_vec);
395 ASSERT(vec_num<get_num_vectors());
399 ST* vec=get_feature_vector(vec_num, len, free_vec);
400 free_feature_vector(vec, vec_num, free_vec);
406 return max_string_length;
411 return m_subset ? m_subset->get_size() : num_vectors;
424 ASSERT(symbol_mask_table);
425 return symbol_mask_table[mask] & symbol;
431 return (offset << (amount*alphabet->get_num_bits()));
437 return (symbol >> (amount*alphabet->get_num_bits()));
445 size_t blocksize=1024*1024;
446 size_t required_blocksize=0;
447 uint8_t* dummy=
SG_MALLOC(uint8_t, blocksize);
448 uint8_t* overflow=NULL;
449 int32_t overflow_len=0;
456 FILE* f=fopen(fname,
"ro");
463 SG_INFO(
"counting line numbers in file %s\n", fname);
465 size_t old_block_offs=0;
466 fseek(f, 0, SEEK_END);
467 size_t fsize=ftell(f);
473 SG_DEBUG(
"block_size=%ld file_size=%ld\n", blocksize, fsize);
476 while (sz == blocksize)
478 sz=fread(dummy,
sizeof(uint8_t), blocksize, f);
479 for (
size_t i=0; i<sz; i++)
482 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
485 required_blocksize=
CMath::max(required_blocksize, block_offs-old_block_offs);
486 old_block_offs=block_offs;
489 SG_PROGRESS(block_offs, 0, fsize, 1,
"COUNTING:\t");
492 SG_INFO(
"found %d strings\n", num_vectors);
494 blocksize=required_blocksize;
502 while (sz == blocksize)
504 sz=fread(dummy,
sizeof(uint8_t), blocksize, f);
507 for (
size_t i=0; i<sz; i++)
509 if (dummy[i]==
'\n' || (i==sz-1 && sz<blocksize))
511 int32_t len=i-old_sz;
513 max_string_length=
CMath::max(max_string_length, len+overflow_len);
515 features[lines].slen=len;
516 features[lines].string=
SG_MALLOC(ST, len);
520 for (int32_t j=0; j<overflow_len; j++)
521 features[lines].
string[j]=alpha->
remap_to_bin(overflow[j]);
522 for (int32_t j=0; j<len; j++)
523 features[lines].
string[j+overflow_len]=alpha->
remap_to_bin(dummy[old_sz+j]);
529 for (int32_t j=0; j<overflow_len; j++)
530 features[lines].
string[j]=overflow[j];
531 for (int32_t j=0; j<len; j++)
532 features[lines].
string[j+overflow_len]=dummy[old_sz+j];
543 SG_PROGRESS(lines, 0, num_vectors, 1,
"LOADING:\t");
546 for (
size_t i=old_sz; i<sz; i++)
547 overflow[i-old_sz]=dummy[i];
549 overflow_len=sz-old_sz;
554 SG_INFO(
"file successfully read\n");
555 SG_INFO(
"max_string_length=%d\n", max_string_length);
556 SG_INFO(
"num_strings=%d\n", num_vectors);
570 num_symbols=alphabet->get_num_symbols();
591 if (len>0 && s[0]==
'>')
596 SG_ERROR(
"No fasta hunks (lines starting with '>') found\n");
601 num_symbols=alphabet->get_num_symbols();
614 int32_t spanned_lines=0;
619 SG_ERROR(
"Error reading fasta entry in line %d len=%ld", 4*i+1, len);
621 if (s[0]==
'>' || offs==f.
get_size())
630 len=fasta_len-spanned_lines;
634 ST* str=strings[i].
string;
636 SG_DEBUG(
"'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len,
id, (int32_t) len, (int32_t) spanned_lines);
638 for (int32_t j=0; j<fasta_len; j++)
645 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
648 if (uint64_t(idx)>=len)
649 SG_ERROR(
"idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
663 return set_features(strings, num, max_len);
667 bool ignore_invalid,
bool bitremap_in_single_string)
681 SG_ERROR(
"Number of lines must be divisible by 4 in fastq files\n");
691 if (bitremap_in_single_string)
701 original_num_symbols=alphabet->get_num_symbols();
710 SG_ERROR(
"Error reading 'read' identifier in line %d", 4*i);
714 SG_ERROR(
"Error reading 'read' in line %d len=%ld", 4*i+1, len);
716 if (bitremap_in_single_string)
718 if (len!=(uint64_t) order)
719 SG_ERROR(
"read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
720 for (int32_t j=0; j<order; j++)
721 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
723 strings[0].
string[i]=embed_word(str, order);
733 for (uint64_t j=0; j<len; j++)
735 if (alphabet->is_valid((uint8_t) s[j]))
743 for (uint64_t j=0; j<len; j++)
751 SG_ERROR(
"Error reading 'read' quality identifier in line %d", 4*i+2);
754 SG_ERROR(
"Error reading 'read' quality in line %d", 4*i+3);
757 if (bitremap_in_single_string)
761 max_string_length=max_len;
771 struct dirent **namelist;
776 SG_DEBUG(
"dirname '%s'\n", dirname);
778 n=scandir(dirname, &namelist, &
SGIO::filter, alphasort);
781 SG_ERROR(
"error calling scandir - no files found\n");
795 for (int32_t i=0; i<n; i++)
802 if (!stat(fname, &s) && s.st_size>0)
804 filesize=s.st_size/
sizeof(ST);
806 FILE* f=fopen(fname,
"ro");
810 SG_DEBUG(
"%s:%ld\n", fname, (int64_t) filesize);
811 if (fread(str,
sizeof(ST), filesize, f)!=(
size_t) filesize)
814 strings[num].
slen=filesize;
815 max_len=
CMath::max(max_len, strings[num].slen);
822 SG_ERROR(
"empty or non readable file \'%s\'\n", fname);
828 if (num>0 && strings)
830 set_features(strings, num, max_len);
845 SG_ERROR(
"Cannot call set_features() with subset.\n");
852 for (int32_t i=0; i<p_num_vectors; i++)
867 num_vectors=p_num_vectors;
868 max_string_length=p_max_string_length;
884 SG_ERROR(
"Cannot call set_features() with subset.\n");
889 for (int32_t i=0; i<sf_num_str; i++)
894 memcpy(new_features[i].
string, sf->
features[real_i].
string, length);
895 new_features[i].
slen=length;
897 return append_features(new_features, sf_num_str,
904 SG_ERROR(
"Cannot call set_features() with subset.\n");
907 return set_features(p_features, p_num_vectors, p_max_string_length);
912 for (int32_t i=0; i<p_num_vectors; i++)
921 for (int32_t i=0; i<p_num_vectors; i++)
922 alphabet->add_string_to_histogram( p_features[i].
string, p_features[i].
slen);
924 int32_t old_num_vectors=num_vectors;
925 num_vectors=old_num_vectors+p_num_vectors;
928 for (int32_t i=0; i<num_vectors; i++)
930 if (i<old_num_vectors)
932 new_features[i].
string=features[i].string;
933 new_features[i].
slen=features[i].slen;
937 new_features[i].
string=p_features[i-old_num_vectors].
string;
938 new_features[i].
slen=p_features[i-old_num_vectors].
slen;
944 this->features=new_features;
945 max_string_length=
CMath::max(max_string_length, p_max_string_length);
965 SG_ERROR(
"get features() is not possible on subset");
968 max_str_len=max_string_length;
976 num_str=get_num_vectors();
977 max_str_len=max_string_length;
980 for (int32_t i=0; i<num_str; i++)
984 ST* vec=get_feature_vector(i, len, free_vec);
986 new_feat[i].
slen=len;
987 memcpy(new_feat[i].
string, vec, ((
size_t) len) *
sizeof(ST));
988 free_feature_vector(vec, i, free_vec);
998 *dst=copy_features(num_vec, max_str_len);
1008 if (!(file=fopen(src,
"r")))
1014 if (fread(&
id[0],
sizeof(
char), 1, file)!=1)
1017 if (fread(&
id[1],
sizeof(
char), 1, file)!=1)
1020 if (fread(&
id[2],
sizeof(
char), 1, file)!=1)
1023 if (fread(&
id[3],
sizeof(
char), 1, file)!=1)
1029 if (fread(&c,
sizeof(uint8_t), 1, file)!=1)
1030 SG_ERROR(
"failed to read compression type");
1035 if (fread(&a,
sizeof(uint8_t), 1, file)!=1)
1036 SG_ERROR(
"failed to read compression alphabet");
1039 if (fread(&num_vectors,
sizeof(int32_t), 1, file)!=1)
1040 SG_ERROR(
"failed to read compression number of vectors");
1043 if (fread(&max_string_length,
sizeof(int32_t), 1, file)!=1)
1044 SG_ERROR(
"failed to read maximum string length");
1045 ASSERT(max_string_length>0);
1050 for (int32_t i=0; i<num_vectors; i++)
1053 int32_t len_compressed;
1054 if (fread(&len_compressed,
sizeof(int32_t), 1, file)!=1)
1055 SG_ERROR(
"failed to read vector length compressed");
1057 int32_t len_uncompressed;
1058 if (fread(&len_uncompressed,
sizeof(int32_t), 1, file)!=1)
1059 SG_ERROR(
"failed to read vector length uncompressed");
1064 features[i].string=
SG_MALLOC(ST, len_uncompressed);
1065 features[i].slen=len_uncompressed;
1066 uint8_t* compressed=
SG_MALLOC(uint8_t, len_compressed);
1067 if (fread(compressed,
sizeof(uint8_t), len_compressed, file)!=(
size_t) len_compressed)
1068 SG_ERROR(
"failed to read compressed data (expected %d bytes)", len_compressed);
1069 uint64_t uncompressed_size=len_uncompressed;
1070 uncompressed_size*=
sizeof(ST);
1071 compressor->
decompress(compressed, len_compressed,
1072 (uint8_t*) features[i].
string, uncompressed_size);
1074 ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*
sizeof(ST));
1078 int32_t offs=
CMath::ceil(2.0*
sizeof(int32_t)/
sizeof(ST));
1079 features[i].string=
SG_MALLOC(ST, len_compressed+offs);
1080 features[i].slen=len_compressed+offs;
1081 int32_t* feat32ptr=((int32_t*) (features[i].
string));
1082 memset(features[i].
string, 0, offs*
sizeof(ST));
1083 feat32ptr[0]=(int32_t) len_compressed;
1084 feat32ptr[1]=(int32_t) len_uncompressed;
1085 uint8_t* compressed=(uint8_t*) (&features[i].
string[offs]);
1086 if (fread(compressed, 1, len_compressed, file)!=(size_t) len_compressed)
1087 SG_ERROR(
"failed to read uncompressed data");
1100 SG_ERROR(
"save_compressed() is not possible on subset");
1104 if (!(file=fopen(dest,
"wb")))
1110 const char*
id=
"SGV0";
1111 fwrite(&
id[0],
sizeof(
char), 1, file);
1112 fwrite(&
id[1],
sizeof(
char), 1, file);
1113 fwrite(&
id[2],
sizeof(
char), 1, file);
1114 fwrite(&
id[3],
sizeof(
char), 1, file);
1117 uint8_t c=(uint8_t) compression;
1118 fwrite(&c,
sizeof(uint8_t), 1, file);
1120 uint8_t a=(uint8_t) alphabet->get_alphabet();
1121 fwrite(&a,
sizeof(uint8_t), 1, file);
1123 fwrite(&num_vectors,
sizeof(int32_t), 1, file);
1125 fwrite(&max_string_length,
sizeof(int32_t), 1, file);
1128 for (int32_t i=0; i<num_vectors; i++)
1132 ST* vec=get_feature_vector(i, len, vfree);
1134 uint8_t* compressed=NULL;
1135 uint64_t compressed_size=0;
1137 compressor->
compress((uint8_t*) vec, ((uint64_t) len)*
sizeof(ST),
1138 compressed, compressed_size, level);
1140 int32_t len_compressed=(int32_t) compressed_size;
1142 fwrite(&len_compressed,
sizeof(int32_t), 1, file);
1144 fwrite(&len,
sizeof(int32_t), 1, file);
1146 fwrite(compressed, compressed_size, 1, file);
1149 free_feature_vector(vec, i, vfree);
1161 SG_DEBUG(
"force: %d\n", force_preprocessing);
1163 for (int32_t i=0; i<get_num_preprocessors(); i++)
1165 if ( (!is_preprocessed(i) || force_preprocessing) )
1167 set_preprocessed(i);
1190 ASSERT(num_vectors==1 || single_string);
1191 ASSERT(max_string_length>=window_size ||
1192 (single_string && length_of_single_string>=window_size));
1197 num_vectors= (length_of_single_string-window_size)/step_size + 1;
1198 else if (num_vectors==1)
1200 num_vectors= (max_string_length-window_size)/step_size + 1;
1201 length_of_single_string=max_string_length;
1206 for (int32_t i=0; i<num_vectors; i++)
1208 f[i].
string=&features[0].string[offs+skip];
1209 f[i].
slen=window_size-skip;
1212 single_string=features[0].string;
1215 max_string_length=window_size-skip;
1228 ASSERT(num_vectors==1 || single_string);
1229 ASSERT(max_string_length>=window_size ||
1230 (single_string && length_of_single_string>=window_size));
1240 len=length_of_single_string;
1243 single_string=features[0].string;
1244 len=max_string_length;
1245 length_of_single_string=max_string_length;
1249 for (int32_t i=0; i<num_vectors; i++)
1253 if (p>=0 && p<=len-window_size)
1255 f[i].
string=&features[0].string[p+skip];
1256 f[i].
slen=window_size-skip;
1261 max_string_length=len;
1262 features[0].slen=len;
1265 SG_ERROR(
"window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
1266 window_size, i, p, len);
1273 max_string_length=window_size-skip;
1280 return obtain_from_char_features(sf, start, p_order, gap, rev);
1287 if (len!=max_string_length)
1290 len=max_string_length;
1292 index_t num_str=get_num_vectors();
1293 for (int32_t i=0; i<num_str; i++)
1295 if (get_vector_length(i)!=len)
1307 ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
1310 original_num_symbols=alphabet->get_num_symbols();
1311 int32_t max_val=alphabet->get_num_bits();
1316 num_symbols=original_num_symbols;
1318 SG_INFO(
"max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
1321 SG_WARNING(
"symbols did not fit into datatype \"%c\" (%d)\n", (
char) max_val, (
int) max_val);
1324 for (int32_t i=0; i<p_order*max_val; i++)
1325 mask= (mask<<1) | ((ST) 1);
1327 for (int32_t i=0; i<num_vectors; i++)
1329 int32_t len=features[i].slen;
1332 SG_ERROR(
"Sequence must be longer than order (%d vs. %d)\n", len, p_order);
1334 ST* str=features[i].string;
1337 for (int32_t j=0; j<p_order; j++)
1338 str[j]=(ST) alphabet->remap_to_bin(str[j]);
1339 str[0]=embed_word(&str[0], p_order);
1343 for (int32_t j=p_order; j<len; j++)
1345 str[j]=(ST) alphabet->remap_to_bin(str[j]);
1346 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
1350 features[i].slen=len-p_order+1;
1353 compute_symbol_mask_table(max_val);
1365 for (int32_t i=0; i< (int64_t) max_val; i++)
1368 for (int32_t i=0; i<256; i++)
1370 uint8_t bits=(uint8_t) i;
1371 symbol_mask_table[i]=0;
1373 for (int32_t j=0; j<8; j++)
1376 symbol_mask_table[i]|=mask<<(max_val*j);
1385 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1388 for (uint32_t i=0; i<nbits; i++)
1389 mask=(mask<<1) | (ST) 1;
1391 for (int32_t i=0; i<len; i++)
1394 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
1402 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
1403 for (int32_t i=0; i<len; i++)
1414 max_string_length=0;
1415 index_t num_str=get_num_vectors();
1417 for (int32_t i=0; i<num_str; i++)
1419 max_string_length=
CMath::max(max_string_length,
1420 features[subset_idx_conversion(i)].slen);
1428 memcpy(s, str.
string,
sizeof(ST)*l);
1436 ASSERT(num<get_num_vectors());
1438 int32_t real_num=subset_idx_conversion(num);
1441 features[real_num].slen=len ;
1442 features[real_num].string=string ;
1444 max_string_length=
CMath::max(len, max_string_length);
1449 int32_t nsym=get_num_symbols();
1450 int32_t slen=get_max_vector_length();
1451 int64_t sz=int64_t(nsym)*slen*
sizeof(
float64_t);
1456 memset(h_normalizer, 0, slen*
sizeof(
float64_t));
1457 int32_t num_str=get_num_vectors();
1458 for (int32_t i=0; i<num_str; i++)
1462 ST* vec=get_feature_vector(i, len, free_vec);
1463 for (int32_t j=0; j<len; j++)
1465 h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++;
1468 free_feature_vector(vec, i, free_vec);
1473 for (int32_t i=0; i<slen; i++)
1475 for (int32_t j=0; j<nsym; j++)
1477 if (h_normalizer && h_normalizer[i])
1478 h[int64_t(i)*nsym+j]/=h_normalizer[i];
1491 ASSERT(rows == get_num_symbols());
1496 for (int32_t i=0; i<num_vec; i++)
1503 for (int32_t j=0; j<cols; j++)
1508 for (c=0; c<rows-1; c++)
1510 if (randoms[j]<=lik)
1512 lik+=hist[int64_t(j)*rows+c+1];
1514 sf[i].
string[j]=alphabet->remap_to_char(c);
1518 set_features(sf, num_vec, cols);
1606 current_string.
slen*
sizeof(ST));
1607 list_copy.
strings[i]=string_copy;
1622 determine_maximum_string_length();
1627 ASSERT(features && num<get_num_vectors());
1629 int32_t real_num=subset_idx_conversion(num);
1631 len=features[real_num].slen;
1636 memcpy(target, features[real_num].
string, len*
sizeof(ST));
1648 length_of_single_string=0;
1649 max_string_length=0;
1651 symbol_mask_table=0;
1652 preprocess_on_get=
false;
1655 m_parameters->add((
CSGObject**) &alphabet,
"alphabet");
1656 m_parameters->add_vector(&features, &num_vectors,
"features",
1657 "This contains the array of features.");
1658 m_parameters->add_vector(&single_string,
1659 &length_of_single_string,
1661 "Created by sliding window.");
1662 m_parameters->add(&max_string_length,
"max_string_length",
1663 "Length of longest string.");
1664 m_parameters->add(&num_symbols,
"num_symbols",
1665 "Number of used symbols.");
1666 m_parameters->add(&original_num_symbols,
"original_num_symbols",
1667 "Original number of used symbols.");
1668 m_parameters->add(&order,
"order",
1669 "Order used in higher order mapping.");
1670 m_parameters->add(&preprocess_on_get,
"preprocess_on_get",
1671 "Preprocess on-the-fly?");
1895 #define LOAD(f_load, sg_type) \
1896 template<> void CStringFeatures<sg_type>::load(CFile* loader) \
1898 SG_INFO( "loading...\n"); \
1901 SGString<sg_type>* strs; \
1904 loader->f_load(strs, num_str, max_len); \
1905 set_features(strs, num_str, max_len); \
1909 LOAD(get_string_list,
bool)
1910 LOAD(get_string_list,
char)
1911 LOAD(get_int8_string_list, int8_t)
1912 LOAD(get_string_list, uint8_t)
1913 LOAD(get_string_list, int16_t)
1914 LOAD(get_string_list, uint16_t)
1915 LOAD(get_string_list, int32_t)
1916 LOAD(get_uint_string_list, uint32_t)
1917 LOAD(get_long_string_list, int64_t)
1918 LOAD(get_ulong_string_list, uint64_t)
1924 #define SAVE(f_write, sg_type) \
1925 template<> void CStringFeatures<sg_type>::save(CFile* writer) \
1928 SG_ERROR("save() is not possible on subset"); \
1931 writer->f_write(features, num_vectors); \
1935 SAVE(set_string_list,
bool)
1936 SAVE(set_string_list,
char)
1937 SAVE(set_int8_string_list, int8_t)
1938 SAVE(set_string_list, uint8_t)
1939 SAVE(set_string_list, int16_t)
1940 SAVE(set_string_list, uint16_t)
1941 SAVE(set_string_list, int32_t)
1942 SAVE(set_uint_string_list, uint32_t)
1943 SAVE(set_long_string_list, int64_t)
1944 SAVE(set_ulong_string_list, uint64_t)
1950 template <
class ST>
template <
class CT>
1952 int32_t p_order, int32_t gap,
bool rev)
1960 this->order=p_order;
1971 for (int32_t i=0; i<num_vectors; i++)
1979 features[i].slen=len;
1981 ST* str=features[i].string;
1982 for (int32_t j=0; j<len; j++)
1994 num_symbols=original_num_symbols;
1995 SG_INFO(
"max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
1999 SG_ERROR(
"symbol does not fit into datatype \"%c\" (%d)\n", (
char) max_val, (
int) max_val);
2003 SG_DEBUG(
"translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap,
sizeof(ST)) ;
2004 for (int32_t line=0; line<num_vectors; line++)
2008 ST* fv=get_feature_vector(line, len, vfree);
2012 CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
2014 CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
2017 features[line].slen-=start+gap ;
2018 if (features[line].slen<0)
2019 features[line].slen=0 ;
2022 compute_symbol_mask_table(max_val);