1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
| class SimhashIndex(object):
def __init__(self, objs, f=64, k=2, log=None): """ `objs` is a list of (obj_id, simhash) obj_id is a string, simhash is an instance of Simhash `f` is the same with the one for Simhash `k` is the tolerance """ self.k = k self.f = f count = len(objs)
if log is None: self.log = logging.getLogger("simhash") else: self.log = log
self.log.info('Initializing %s data.', count)
self.bucket = collections.defaultdict(set)
for i, q in enumerate(objs): if i % 10000 == 0 or i == count - 1: self.log.info('%s/%s', i + 1, count)
self.add(*q)
def get_near_dups(self, simhash): """ `simhash` is an instance of Simhash return a list of obj_id, which is in type of str """ assert simhash.f == self.f
ans = set()
for key in self.get_keys(simhash): dups = self.bucket[key] self.log.debug('key:%s', key) if len(dups) > 200: self.log.warning('Big bucket found. key:%s, len:%s', key, len(dups))
for dup in dups: sim2, obj_id = dup.split(',', 1) sim2 = Simhash(long(sim2, 16), self.f)
d = simhash.distance(sim2) if d <= self.k: ans.add(obj_id) return list(ans)
def add(self, obj_id, simhash): """ `obj_id` is a string `simhash` is an instance of Simhash """ assert simhash.f == self.f
for key in self.get_keys(simhash): v = '%x,%s' % (simhash.value, obj_id) self.bucket[key].add(v)
def delete(self, obj_id, simhash): """ `obj_id` is a string `simhash` is an instance of Simhash """ assert simhash.f == self.f
for key in self.get_keys(simhash): v = '%x,%s' % (simhash.value, obj_id) if v in self.bucket[key]: self.bucket[key].remove(v)
@property def offsets(self): """ You may optimize this method according to <http://www.wwwconference.org/www2007/papers/paper215.pdf> """ return [self.f // (self.k + 1) * i for i in range(self.k + 1)]
def get_keys(self, simhash): for i, offset in enumerate(self.offsets): if i == (len(self.offsets) - 1): m = 2 ** (self.f - offset) - 1 else: m = 2 ** (self.offsets[i + 1] - offset) - 1 c = simhash.value >> offset & m yield '%x:%x' % (c, i)
def bucket_size(self): return len(self.bucket)
|