Apa itu Filter Bloom?





Halo! Pada artikel ini, saya akan mencoba menjelaskan apa itu filter Bloom, menjelaskan tujuannya, dan menunjukkan skenario yang dapat digunakannya. Saya juga menerapkan filter Bloom dengan Python dari awal untuk membuatnya lebih mudah memahami bagian dalamnya.





Tujuan filter Bloom 

— , — , ( , O , — O(1)



). , , , . , — : 100% , , 100% , , ( ). , Python, , !





. , .





, . , ( , ). uplink 100 /. IP-, . , , 100 /, (O(log(n))



), , IP-, , IP- . , , IP- .





— . , , , : , ​​ ; (, ) , .





, , , , .





.





?

, . , 100 IP-. , IP- , — 100 , — IP. IP- , «1», — «0».





4- IP- , .





IP-?

, 100 IP. IPv4- 32 , , 4 294 967 296 (2^32) ( , , , )! IP- , , . , . IP- . - -.





-

- — , . , - IP-, , . - , , .





- , ( IP), . , .





… - . . , 100 IP-. - 100 2^32 IP- 100 - ? , . . - , IP- , , 4 294 967 296 (2^32) IP-, . , -, — , , . , - 192.168.1.1 192.168.1.2, , , , ( , ).





. IP- , , .





, : 100 IP-. IP- -, - , . , , IP- . — ?





, IP- 178.23.12.63 112.64.90.12 . IP , — . , IP- , , IP- . , ?





, , — , . 0, . , 1, - , . , , , , , IP .





, . — . (, , - , ), . , ( 1, ) (1-e^(m / n))



, m — , , n .





— -. , IP- -, .. 1. k



-, (1-e^(mk/n))^k



, , - (n/m)*ln(2)



( ).





-. IP- , - , IP 112.64.90.12 , 1.





, Python ! 50 .





BloomFilter



( ). ( ) , . bitarray



, . , - , -, .





import math
from bitarray import bitarray

class BloomFilter(object):

    def __init__(self, size, number_expected_elements=100000):
        self.size = size
        self.number_expected_elements = number_expected_elements

        self.bloom_filter = bitarray(self.size)
        self.bloom_filter.setall(0)

        self.number_hash_functions = round((self.size / self.number_expected_elements) * math.log(2))
      
      



- . ( ) DJB2. , .





def _hash_djb2(self, s):
        hash = 5381
        for x in s:
            hash = ((hash << 5) + hash) + ord(x)
        return hash % self.size
      
      



-, K -? . , -, -. -. - , -. , ?





def _hash(self, item, K):
        return self._hash_djb2(str(K) + item)
      
      



. -, , , 1 ( True) .





def add_to_filter(self, item):
        for i in range(self.number_hash_functions):
            self.bloom_filter[self._hash(item, i)] = 1
      
      



, , . -. - 0, , . , 1, , .





 def check_is_not_in_filter(self, item):
        for i in range(self.number_hash_functions):
            if self.bloom_filter[self._hash(item, i)] == 0:
                return True
        return False
      
      



! . !





, , . 1 , 100 000. «192.168.1.1» IP-.





bloom_filter = BloomFilter(1000000, 100000)
base_ip = "192.168.1."
bloom_filter.add_to_filter(base_ip + str(1))
      
      



, i 1 100 000 , IP 192.168.1.i ( IP-, i>254, 192.168.289, ). , , ; , , .





for i in range(1, 100000):
    if not bloom_filter.check_is_not_in_filter(base_ip + str(i)):
        print(base_ip+str(i))
      
      



192.168.1.1





! , 100 000 IP- , , — IP-. !





:





import math
from bitarray import bitarray


class BloomFilter(object):

    def __init__(self, size, number_expected_elements=100000):
        self.size = size
        self.number_expected_elements = number_expected_elements

        self.bloom_filter = bitarray(self.size)
        self.bloom_filter.setall(0)

        self.number_hash_functions = round((self.size / self.number_expected_elements) * math.log(2))


    def _hash_djb2(self, s):
        hash = 5381
        for x in s:
            hash = ((hash << 5) + hash) + ord(x)
        return hash % self.size


    def _hash(self, item, K):
        return self._hash_djb2(str(K) + item)


    def add_to_filter(self, item):
        for i in range(self.number_hash_functions):
            self.bloom_filter[self._hash(item, i)] = 1


    def check_is_not_in_filter(self, item):
        for i in range(self.number_hash_functions):
            if self.bloom_filter[self._hash(item, i)] == 0:
                return True
        return False


bloom_filter = BloomFilter(1000000, 100000)
base_ip = "192.168.1."
bloom_filter.add_to_filter(base_ip + str(1))

for i in range(1, 100000):
    if not bloom_filter.check_is_not_in_filter(base_ip + str(i)):
        print(base_ip+str(i))
      
      



, . , , . !






«Data Engineer».









- «ML Spark». ML Spark, production












All Articles