diff --git a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py
index 4e9ad1790..eca420b87 100644
--- a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py
+++ b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py
@@ -21,45 +21,45 @@
 with AvailabilityChecking(['simhash-py'], OP_NAME):
     import simhash
 
+    def local_num_differing_bits(hash_a, hash_b):
+        """
+        Local implementation of calculating the number of different bits
+        between two integers.
 
-def local_num_differing_bits(hash_a, hash_b):
-    """
-    Local implementation of calculating the number of different bits between
-    two integers.
-
-    :param hash_a: integer hash value a
-    :param hash_b: integer hash value b
-    :return: number of different bits between input hashes.
-    """
-    cnt = 0
-    n = hash_a ^ hash_b
-    while n != 0:
-        cnt += 1
-        n = n & (n - 1)
-    return cnt
-
-
-def num_differing_bits_selector():
-    """
-    Select a num_differing_bits method according to the Python version
-    installed.
-
-    When Python >= 3.9, the original simhash library cannot be compiled
-    correctly due to some changes in cython. After fixing this
-    incompatibility, RecursionError occurs sometimes when calling
-    simhash.num_differing_bits. So we use our implementation when Python
-    >= 3.9. Otherwise, we use implementation of simhash.
-
-    :return: an available num_differing_bits function.
-    """
-    import platform
-    a, b, _ = platform.python_version().split('.')
-    if a == '3' and int(b) >= 9:
-        # for >= 3.9, use local implementation
-        return local_num_differing_bits
-    else:
-        # for < 3.9, use simhash version
-        return simhash.num_differing_bits
+        :param hash_a: integer hash value a
+        :param hash_b: integer hash value b
+        :return: number of different bits between input hashes.
+        """
+        cnt = 0
+        n = hash_a ^ hash_b
+        while n != 0:
+            cnt += 1
+            n = n & (n - 1)
+        return cnt
+
+    def num_differing_bits_selector():
+        """
+        Select a num_differing_bits method according to the Python version
+        installed.
+
+        When Python >= 3.9, the original simhash library cannot be compiled
+        correctly due to some changes in cython. After fixing this
+        incompatibility, RecursionError occurs sometimes when calling
+        simhash.num_differing_bits. So we use our implementation when Python
+        >= 3.9. Otherwise, we use implementation of simhash.
+
+        :return: an available num_differing_bits function.
+        """
+        import platform
+        a, b, _ = platform.python_version().split('.')
+        if a == '3' and int(b) >= 9:
+            # for >= 3.9, use local implementation
+            return local_num_differing_bits
+        else:
+            # for < 3.9, use simhash version
+            return simhash.num_differing_bits
+
+    num_differing_bits = num_differing_bits_selector()
 
 
 @OPERATORS.register_module(OP_NAME)
@@ -114,8 +114,6 @@ def __init__(self,
         self.num_blocks = num_blocks
         self.hamming_distance = hamming_distance
 
-        self.num_differing_bits = num_differing_bits_selector()
-
     def compute_hash(self, sample):
         """
         Compute simhash values for the sample.
@@ -189,7 +187,7 @@ def process(self, dataset, show_num=0):
         dist = Counter()
         for x, y in matches:
             graph[x][y] = graph[y][x] = True
-            num_diff = self.num_differing_bits(x, y)
+            num_diff = num_differing_bits(x, y)
             dist[num_diff] += 1
         logger.info(f'Hash diff distribution: {dist}')
 