BFGraph
BlockedBloomFilter.hpp
1 #ifndef BFG_BLOCKEDBLOOMFILTER_HPP
2 #define BFG_BLOCKEDBLOOMFILTER_HPP
3 
4 #include <cmath>
5 #include <iostream>
6 #include <cstdlib>
7 
8 //#include "hash.hpp"
9 #include "libdivide.h"
10 
11 #include <vector>
12 
13 
14 // ------ TEST ------
15 #include <algorithm>
16 #include <fstream>
17 #include <random>
18 
19 #include "KmerHashTable.h"
20 #include "Kmer.hpp"
21 #include "RepHash.hpp"
22 
23 #include "libpopcnt.h"
24 
25 #define NB_BITS_BLOCK (0x800ULL) // Number of bits in one block (0x800 = 2048)
26 #define MASK_BITS_BLOCK (0x7ffULL) // NB_BITS_BLOCK - 1: masking a hash with it yields a bit position inside a block
27 #define NB_ELEM_BLOCK (32) // Number of 64-bit words in one block (32 * 64 = NB_BITS_BLOCK)
28 
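29 /* Short description:
30  * - Extended BloomFilter whose bit array is split into blocks of NB_BITS_BLOCK bits
31  * (NB_ELEM_BLOCK 64-bit words): all the bits of an element are stored within one block that can be accessed very fast from the CPU cache
32  * */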
34 
35  private:
36 
37  uint64_t* table_; // Bit array: blocks_ consecutive blocks of NB_ELEM_BLOCK 64-bit words each (NULL when no array is held)
38 
39  uint64_t size_table_; // Size of the bit array, in bits
40  uint64_t blocks_; // Number of blocks in the bit array
41  int k_; // Number of hash functions, i.e. number of bits tested or set per element
42 
43  libdivide::divider<uint64_t> fast_div_; // libdivide divider built from blocks_ in init_table(); used to reduce a hash modulo blocks_
44 
45  public:
46 
47  BlockedBloomFilter() : table_(NULL), size_table_(0), blocks_(0), k_(0), fast_div_() {}
48 
49  BlockedBloomFilter(size_t nb_elem, size_t bits_per_elem) : table_(NULL), size_table_(0), blocks_(0), k_(0), fast_div_() {
50 
51  size_table_ = ((bits_per_elem * nb_elem + MASK_BITS_BLOCK) / NB_BITS_BLOCK) * NB_BITS_BLOCK;
52  blocks_ = size_table_ / NB_BITS_BLOCK;
53 
54  init_table();
55 
56  k_ = (int) (bits_per_elem * log(2));
57  if (fpp(bits_per_elem, k_) >= fpp(bits_per_elem, k_+1)) k_++;
58  }
59 
61 
62  clear();
63  }
64 
65  inline std::pair<uint64_t*,uint64_t*> getBlock(uint64_t min_hash) const{
66 
67  uint64_t min_hash_2 = (min_hash * 49157) % (1610612741ULL);
68 
69  min_hash -= (min_hash / fast_div_) * blocks_;
70  min_hash_2 -= (min_hash_2 / fast_div_) * blocks_;
71 
72  return std::make_pair(table_ + NB_ELEM_BLOCK * min_hash, table_ + NB_ELEM_BLOCK * min_hash_2);
73  }
74 
75  bool contains(uint64_t kmer_hash, const uint64_t min_hash) const {
76 
77  int i = 0;
78 
79  const int k = k_;
80 
81  uint64_t kmer_hash_2 = kmer_hash;
82 
83  uint64_t* table = table_ + ((min_hash - (min_hash / fast_div_) * blocks_) * NB_ELEM_BLOCK);
84 
85  __builtin_prefetch(table, 0, 1);
86 
87  for (; i < k; i++) {
88 
89  if ((table[(kmer_hash & MASK_BITS_BLOCK) >> 6] & (1ULL << (kmer_hash & 0x3fULL))) == 0) break;
90  kmer_hash = (kmer_hash * 49157) % (1610612741ULL);
91  }
92 
93  if (i != k){
94 
95  const uint64_t min_hash_2 = (min_hash * 49157) % (1610612741ULL);
96 
97  table = table_ + ((min_hash_2 - (min_hash_2 / fast_div_) * blocks_) * NB_ELEM_BLOCK);
98 
99  __builtin_prefetch(table, 0, 1);
100 
101  for (i = 0; i < k; i++) {
102 
103  if ((table[(kmer_hash_2 & MASK_BITS_BLOCK) >> 6] & (1ULL << (kmer_hash_2 & 0x3fULL))) == 0) break;
104  kmer_hash_2 = (kmer_hash_2 * 49157) % (1610612741ULL);
105  }
106  }
107 
108  return i == k;
109  }
110 
111  inline bool contains(uint64_t kmer_hash, const std::pair<uint64_t*, uint64_t*> block_ptr) const {
112 
113  return (contains_block(kmer_hash, block_ptr) != 0);
114  }
115 
116  size_t contains_block(uint64_t kmer_hash, const std::pair<const uint64_t* const, const uint64_t* const> block_ptr) const {
117 
118  uint64_t kmer_hash_2 = kmer_hash;
119 
120  int i = 0;
121 
122  const int k = k_;
123 
124  __builtin_prefetch(block_ptr.first, 0, 1);
125 
126  for (; i != k; i++) {
127 
128  if ((block_ptr.first[(kmer_hash & MASK_BITS_BLOCK) >> 6] & (1ULL << (kmer_hash & 0x3fULL))) == 0) break;
129  kmer_hash = (kmer_hash * 49157) % (1610612741ULL);
130  }
131 
132  if (i != k){
133 
134  __builtin_prefetch(block_ptr.second, 0, 1);
135 
136  for (i = 0; i != k; i++) {
137 
138  if ((block_ptr.second[(kmer_hash_2 & MASK_BITS_BLOCK) >> 6] & (1ULL << (kmer_hash_2 & 0x3fULL))) == 0) break;
139  kmer_hash_2 = (kmer_hash_2 * 49157) % (1610612741ULL);
140  }
141 
142  return (i == k ? 2 : 0);
143  }
144 
145  return 1;
146  }
147 
148  bool search_and_insert(uint64_t kmer_hash, const uint64_t min_hash, const bool multi_threaded = false) {
149 
150  int i = 0, j = 0;
151 
152  const int k = k_;
153 
154  uint64_t kmer_hash_2 = kmer_hash;
155 
156  uint64_t* table = table_ + ((min_hash - (min_hash / fast_div_) * blocks_) * NB_ELEM_BLOCK);
157 
158  __builtin_prefetch(table, 0, 1);
159 
160  for (; i != k; i++) {
161 
162  if ((table[(kmer_hash & MASK_BITS_BLOCK) >> 6] & (1ULL << (kmer_hash & 0x3fULL))) == 0) break;
163  kmer_hash = (kmer_hash * 49157) % (1610612741ULL);
164  }
165 
166  if (i != k){
167 
168  const uint64_t min_hash_2 = (min_hash * 49157) % (1610612741ULL);
169 
170  uint64_t* table2 = table_ + ((min_hash_2 - (min_hash_2 / fast_div_) * blocks_) * NB_ELEM_BLOCK);
171 
172  __builtin_prefetch(table2, 0, 1);
173 
174  for (; j != k; j++) {
175 
176  if ((table2[(kmer_hash_2 & MASK_BITS_BLOCK) >> 6] & (1ULL << (kmer_hash_2 & 0x3fULL))) == 0) break;
177  kmer_hash_2 = (kmer_hash_2 * 49157) % (1610612741ULL);
178  }
179 
180  if (j != k){
181 
182  if (!multi_threaded){
183 
184  if (popcnt(table2, NB_ELEM_BLOCK * sizeof(uint64_t)) < popcnt(table, NB_ELEM_BLOCK * sizeof(uint64_t))){
185 
186  i = j;
187  table = table2;
188  kmer_hash = kmer_hash_2;
189  }
190 
191  __builtin_prefetch(table, 1, 1);
192 
193  for (; i != k; i++) {
194 
195  //__sync_fetch_and_or(table + ((kmer_hash & MASK_BITS_BLOCK) >> 6), 1ULL << (kmer_hash & 0x3fULL));
196  table[(kmer_hash & MASK_BITS_BLOCK) >> 6] |= 1ULL << (kmer_hash & 0x3fULL);
197  kmer_hash = (kmer_hash * 49157) % (1610612741ULL);
198  }
199  }
200  else {
201 
202  if (popcnt(table2, NB_ELEM_BLOCK * sizeof(uint64_t)) < popcnt(table, NB_ELEM_BLOCK * sizeof(uint64_t))){
203 
204  int tmp = i;
205  i = j;
206  j = tmp;
207 
208  uint64_t tmp_size_t = kmer_hash;
209  kmer_hash = kmer_hash_2;
210  kmer_hash_2 = tmp_size_t;
211 
212  uint64_t* tmp_ptr = table;
213  table = table2;
214  table2 = tmp_ptr;
215  }
216 
217  __builtin_prefetch(table, 1, 1);
218 
219  for (; i != k; i++) {
220 
221  __sync_fetch_and_or(table + ((kmer_hash & MASK_BITS_BLOCK) >> 6), 1ULL << (kmer_hash & 0x3fULL));
222  //table[(kmer_hash & MASK_BITS_BLOCK) >> 6] |= 1ULL << (kmer_hash & 0x3fULL);
223  kmer_hash = (kmer_hash * 49157) % (1610612741ULL);
224  }
225 
226  __builtin_prefetch(table2, 0, 1);
227 
228  for (; j != k; j++) {
229 
230  if ((table2[(kmer_hash_2 & MASK_BITS_BLOCK) >> 6] & (1ULL << (kmer_hash_2 & 0x3fULL))) == 0) break;
231  kmer_hash_2 = (kmer_hash_2 * 49157) % (1610612741ULL);
232  }
233  }
234 
235  return j != k;
236  }
237  }
238 
239  return false;
240  }
241 
242  inline void insert(uint64_t kmer_hash, const uint64_t min_hash){
243 
244  search_and_insert(kmer_hash, min_hash, false);
245  }
246 
247  bool WriteBloomFilter(FILE *fp) {
248 
249  if (fwrite(&size_table_, sizeof(size_table_), 1, fp) != 1) return false;
250  if (fwrite(&blocks_, sizeof(blocks_), 1, fp) != 1) return false;
251  if (fwrite(&k_, sizeof(k_), 1, fp) != 1) return false;
252 
253  if (fwrite(table_, sizeof(uint64_t), NB_ELEM_BLOCK * blocks_, fp) != (NB_ELEM_BLOCK * blocks_)) return false;
254 
255  return true;
256  }
257 
258  bool ReadBloomFilter(FILE *fp) {
259 
260  clear();
261 
262  if (fread(&size_table_, sizeof(size_table_), 1, fp) != 1) return false;
263  if (fread(&blocks_, sizeof(blocks_), 1, fp) != 1) return false;
264  if (fread(&k_, sizeof(k_), 1, fp) != 1) return false;
265 
266  init_table();
267 
268  if (fread(table_, sizeof(uint64_t), NB_ELEM_BLOCK * blocks_, fp) != (NB_ELEM_BLOCK * blocks_)) return false;
269 
270  return true;
271  }
272 
273  void clear() {
274 
275  if (table_ != NULL){
276 
277  free(table_);
278  table_ = NULL;
279  }
280 
281  size_table_ = 0;
282  blocks_ = 0;
283  k_ = 0;
284  }
285 
286  void get(BlockedBloomFilter& bf) {
287 
288  clear();
289 
290  table_ = bf.table_;
291  size_table_ = bf.size_table_;
292  blocks_ = bf.blocks_;
293  k_ = bf.k_;
294  fast_div_ = bf.fast_div_;
295 
296  bf.table_ = NULL;
297  }
298 
299  inline uint64_t getNbBlocks() const { return blocks_; }
300 
301  inline const uint64_t* getTable_ptr() const { return table_; }
302 
303  private:
304 
305  void init_table(){
306 
307  fast_div_ = libdivide::divider<uint64_t>(blocks_);
308 
309  posix_memalign((void**)&table_, 64, NB_ELEM_BLOCK * blocks_* sizeof(table_[0]));
310  memset(table_, 0, NB_ELEM_BLOCK * blocks_ * sizeof(table_[0]));
311  }
312 
313  inline double fpp(size_t bits, int k) const {
314 
315  return pow(1-exp(-((double)k)/((double)bits)),(double)k);
316  }
317 };
318 
319 #endif // BFG_BLOCKEDBLOOMFILTER_HPP
Definition: BlockedBloomFilter.hpp:33