Loading...
Searching...
No Matches
bloom.h
Go to the documentation of this file.
1/*
2 * Copyright (C) 2014 Freie Universität Berlin
3 *
4 * This file is subject to the terms and conditions of the GNU Lesser
5 * General Public License v2.1. See the file LICENSE in the top level
6 * directory for more details.
7 */
8
9/*
10 * bloom.c
11 *
12 * Bloom filters
13 *
14 * HISTORY
15 * {x, y, z}
16 * A Bloom filter is a probibalistic : : :
17 * data structure with several interesting /|\ /|\ /|\
18 * properties, such as low memory usage, / | X | X | \
19 * asymmetric query confidence, and a very / |/ \|/ \| \
20 * speedy O(k) membership test. / | | \ \
21 * / /| /|\ |\ \
22 * Because a Bloom filter can . . . . . . . . .
23 * accept any input that can be 00000000001000101010101010100010000000000
24 * hashed effectively (such as " " "
25 * strings), that membership test \ | /
26 * tends to draw a crowd. TNSTAAFL, but \ | /
27 * as caveats go, the Bloom filters' are \ | /
28 * more interesting than incapacitating. \|/
29 * :
30 * Most notably, it can tell you with certainty {w}
31 * that an item 'i' is *not* a member of set 's',
32 * but it can only tell you with some finite
33 * probability whether an item 'i' *is* a member
34 * of set 's'.
35 *
36 * Still, along with the intriguing possibility of using bitwise AND and OR
37 * to compute the logical union and intersection of two filters, the cheap
38 * cost of adding elements to the filter set, and the low memory requirements,
39 * the Bloom filter is a good choice for many applications.
40 *
41 * NOTES
42 *
43 * Let's look more closely at the probability values.
44 *
45 * Assume that a hash function selects each array position with equal
46 * probability. If m is the number of bits in the array, and k is the number
47 * of hash functions, then the probability that a certain bit is not set
48 * to 1 by a certain hash function during the insertion of an element is
49 *
50 * 1-(1/m).
51 *
52 * The probability that it is not set to 1 by any of the hash functions is
53 *
54 * (1-(1/m))^k.
55 *
56 * If we have inserted n elements, the probability that a certain bit is
57 * set 0 is
58 *
59 * (1-(1/m))^kn,
60 *
61 * Meaning that the probability said bit is set to 1 is therefore
62 *
63 * 1-([1-(1/m)]^kn).
64 *
65 * Now test membership of an element that is not in the set. Each of the k
66 * array positions computed by the hash functions is 1 with a probability
67 * as above. The probability of all of them being 1, which would cause the
68 * algorithm to erroneously claim that the element is in the set, is often
69 * given as
70 *
71 * (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k.
72 *
73 * This is not strictly correct as it assumes independence for the
74 * probabilities of each bit being set. However, assuming it is a close
75 * approximation we have that the probability of false positives decreases
76 * as m (the number of bits in the array) increases, and increases as n
77 * (the number of inserted elements) increases. For a given m and n, the
78 * value of k (the number of hash functions) that minimizes the probability
79 * is
80 *
81 * (m/n)ln(2) ~~ 0.7(m/n),
82 *
83 * which gives the false positive probability of
84 *
85 * 2^-k ~~ 0.6185^(m/n).
86 *
87 * The required number of bits m, given n and a desired false positive
88 * probability p (and assuming the optimal value of k is used) can be
89 * computed by substituting the optimal value of k in the probability
90 * expression above:
91 *
92 * p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)),
93 *
94 * which simplifies to
95 *
96 * ln(p) = -(m/n) * (ln2)^2.
97 *
98 * This results in the equation
99 *
100 * m = -((n*ln(p)) / ((ln(2))^2))
101 *
102 * The classic filter uses
103 *
104 * 1.44*log2(1/eta)
105 *
106 * bits of space per inserted key, where eta is the false positive rate of
107 * the Bloom filter.
108 *
109 */
110
123#ifndef BLOOM_H
124#define BLOOM_H
125
126#include <stdlib.h>
127#include <stdbool.h>
128#include <stdint.h>
129
130#ifdef __cplusplus
131extern "C" {
132#endif
133
137typedef uint32_t (*hashfp_t)(const uint8_t *, size_t len);
138
142typedef struct {
144 size_t m;
146 size_t k;
148 uint8_t *a;
151} bloom_t;
152
166void bloom_init(bloom_t *bloom, size_t size, uint8_t *bitfield, hashfp_t *hashes, int hashes_numof);
167
175void bloom_del(bloom_t *bloom);
176
189void bloom_add(bloom_t *bloom, const uint8_t *buf, size_t len);
190
229bool bloom_check(bloom_t *bloom, const uint8_t *buf, size_t len);
230
231#ifdef __cplusplus
232}
233#endif
234
236#endif /* BLOOM_H */
uint32_t(* hashfp_t)(const uint8_t *, size_t len)
hash function to use in thee filter
Definition bloom.h:137
void bloom_init(bloom_t *bloom, size_t size, uint8_t *bitfield, hashfp_t *hashes, int hashes_numof)
Initialize a Bloom Filter.
void bloom_del(bloom_t *bloom)
Delete a Bloom filter.
bool bloom_check(bloom_t *bloom, const uint8_t *buf, size_t len)
Determine if a string is in the Bloom filter.
void bloom_add(bloom_t *bloom, const uint8_t *buf, size_t len)
Add a string to a Bloom filter.
bloom_t bloom filter object
Definition bloom.h:142
size_t m
number of bits in the bloom array
Definition bloom.h:144
size_t k
number of hash functions
Definition bloom.h:146
hashfp_t * hash
the hash functions
Definition bloom.h:150
uint8_t * a
the bloom array
Definition bloom.h:148