SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
HashedWDFeatures.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2010 Soeren Sonnenburg
8  * Copyright (C) 2010 Berlin Institute of Technology
9  */
10 
12 #include <shogun/io/SGIO.h>
13 
14 using namespace shogun;
15 
17 {
18  SG_UNSTABLE("CHashedWDFeatures::CHashedWDFeatures()", "\n");
19 
20  strings = NULL;
21 
22  degree = 0;
23  start_degree = 0;
24  from_degree = 0;
25  string_length = 0;
26  num_strings = 0;
27  alphabet_size = 0;
28  w_dim = 0;
29  partial_w_dim = 0;
30  wd_weights = NULL;
31  mask = 0;
32  m_hash_bits = 0;
33 
34  normalization_const = 0.0;
35 }
36 
38  int32_t start_order, int32_t order, int32_t from_order,
39  int32_t hash_bits) : CDotFeatures()
40 {
41  ASSERT(start_order>=0);
42  ASSERT(start_order<order);
43  ASSERT(order<=from_order);
44  ASSERT(hash_bits>0);
45  ASSERT(str);
46  ASSERT(str->have_same_length());
47  SG_REF(str);
48 
49  strings=str;
52  CAlphabet* alpha=str->get_alphabet();
54  SG_UNREF(alpha);
55 
56  degree=order;
57  start_degree=start_order;
58  from_degree=from_order;
59  m_hash_bits=hash_bits;
62 }
63 
65  : CDotFeatures(orig), strings(orig.strings),
66  degree(orig.degree), start_degree(orig.start_degree),
67  from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
68  normalization_const(orig.normalization_const)
69 {
70  SG_REF(strings);
73  CAlphabet* alpha=strings->get_alphabet();
75  SG_UNREF(alpha);
76 
78 }
79 
81 {
84 }
85 
86 float64_t CHashedWDFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
87 {
88  ASSERT(df);
92 
93  int32_t len1, len2;
94  bool free_vec1, free_vec2;
95 
96  uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
97  uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
98 
99  ASSERT(len1==len2);
100 
101  float64_t sum=0.0;
102 
103  for (int32_t i=0; i<len1; i++)
104  {
105  for (int32_t j=0; (i+j<len1) && (j<degree); j++)
106  {
107  if (vec1[i+j]!=vec2[i+j])
108  break;
109  if (j>=start_degree)
110  sum += wd_weights[j]*wd_weights[j];
111  }
112  }
113  strings->free_feature_vector(vec1, vec_idx1, free_vec1);
114  wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
115  return sum/CMath::sq(normalization_const);
116 }
117 
118 float64_t CHashedWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
119 {
120  if (vec2_len != w_dim)
121  SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
122 
123  float64_t sum=0;
124  int32_t lim=CMath::min(degree, string_length);
125  int32_t len;
126  bool free_vec1;
127  uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
128  uint32_t* val=SG_MALLOC(uint32_t, len);
129 
130  uint32_t offs=0;
131 
132  if (start_degree>0)
133  {
134  // compute hash for strings of length start_degree-1
135  for (int32_t i=0; i+start_degree < len; i++)
136  val[i]=CHash::MurmurHash2(&vec[i], start_degree, 0xDEADBEAF);
137  }
138  else
139  CMath::fill_vector(val, len, 0xDEADBEAF);
140 
141  for (int32_t k=start_degree; k<lim; k++)
142  {
143  float64_t wd = wd_weights[k];
144 
145  uint32_t o=offs;
146  for (int32_t i=0; i+k < len; i++)
147  {
148  const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]);
149  val[i]=h;
150 #ifdef DEBUG_HASHEDWD
151  SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h);
152 #endif
153  sum+=vec2[o+(h & mask)]*wd;
154  o+=partial_w_dim;
155  }
156  offs+=partial_w_dim*len;
157  }
158  SG_FREE(val);
159  strings->free_feature_vector(vec, vec_idx1, free_vec1);
160 
161  return sum/normalization_const;
162 }
163 
164 void CHashedWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
165 {
166  if (vec2_len != w_dim)
167  SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
168 
169  int32_t lim=CMath::min(degree, string_length);
170  int32_t len;
171  bool free_vec1;
172  uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
173  uint32_t* val=SG_MALLOC(uint32_t, len);
174 
175  uint32_t offs=0;
176 
177  if (start_degree>0)
178  {
179  // compute hash for strings of length start_degree-1
180  for (int32_t i=0; i+start_degree < len; i++)
181  val[i]=CHash::MurmurHash2(&vec[i], start_degree, 0xDEADBEAF);
182  }
183  else
184  CMath::fill_vector(val, len, 0xDEADBEAF);
185 
186  for (int32_t k=start_degree; k<lim; k++)
187  {
189 
190  if (abs_val)
191  wd=CMath::abs(wd);
192 
193  uint32_t o=offs;
194  for (int32_t i=0; i+k < len; i++)
195  {
196  const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]);
197  val[i]=h;
198 
199 #ifdef DEBUG_HASHEDWD
200  SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h);
201  SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h);
202 #endif
203  vec2[o+(h & mask)]+=wd;
204  o+=partial_w_dim;
205  }
206  offs+=partial_w_dim*len;
207  }
208 
209  SG_FREE(val);
210  strings->free_feature_vector(vec, vec_idx1, free_vec1);
211 }
212 
214 {
215  ASSERT(degree>0);
216 
217  mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
220 
222 
223  for (int32_t i=0; i<degree; i++)
224  wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
225 
226  SG_DEBUG("created HashedWDFeatures with d=%d (%d), alphabetsize=%d, "
227  "dim=%d partial_dim=%d num=%d, len=%d\n",
228  degree, from_degree, alphabet_size,
230 }
231 
232 
234 {
235  if (n==0)
236  {
238  for (int32_t i=0; i<degree; i++)
240 
242  }
243  else
245 
246  SG_DEBUG("normalization_const:%f\n", normalization_const);
247 }
248 
250 {
251  return new CHashedWDFeatures(*this);
252 }
253 
254 
256 {
257  int32_t vlen=-1;
258  bool free_vec;
259  uint8_t* vec=strings->get_feature_vector(num, vlen, free_vec);
260  strings->free_feature_vector(vec, num, free_vec);
261  return degree*vlen;
262 }
263 
264 void* CHashedWDFeatures::get_feature_iterator(int32_t vector_index)
265 {
267  return NULL;
268 }
269 
270 bool CHashedWDFeatures::get_next_feature(int32_t& index, float64_t& value,
271  void* iterator)
272 {
274  return NULL;
275 }
276 
278 {
280 }

SHOGUN Machine Learning Toolbox - Documentation