
NLP Series

Jaccard Similarity

def jaccard_sim(s1, s2):
    a = set(s1.split())
    print(len(a))
    b = set(s2.split())
    print(len(b))
    c = a.intersection(b)
    print(len(c))
    print(c)
    return float(len(c)) / (len(a) + len(b) - len(c))

s1 = "Natural language processing is a promising research area"
s2 = "More and more researchers are working on natural language processing nowadays"
print(jaccard_sim(s1, s2))
8
11
2
{'processing', 'language'}
0.11764705882352941
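Note that "Natural" is missing from the intersection above because the comparison is case-sensitive. A small illustrative variant (not part of the original snippet) that lowercases both strings before tokenizing:

def jaccard_sim_lower(s1, s2):
    # lowercase first so 'Natural' and 'natural' count as the same token
    a = set(s1.lower().split())
    b = set(s2.lower().split())
    c = a & b
    return len(c) / len(a | b)

print(jaccard_sim_lower(s1, s2))  # 3 shared tokens out of 15 -> 0.2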

SimHash-based similar-text detection

SimHash

Reference: A Python Implementation of simhash algorithm

  • Pick a hash size, e.g. 32
    • V = [0] * 32
  • Turn the text into features (shingles)
    • howareyouiamfinethanks
    • ['how', 'owa', 'war', 'are', 'rey', 'eyo', 'you', 'oui', 'uia', 'iam', 'amf', 'mfi', 'fin', 'ine', 'net', 'eth', 'tha', 'han', 'ank', 'nks']
  • Hash every feature into 32 bits
  • For every bit position i of every hash, add 1 to V[i] if that bit is 1, otherwise subtract 1
  • Finally, set V[i] to 1 if V[i] > 0 and to 0 otherwise; the resulting V is the simhash we want (a minimal sketch of these steps is shown right after this list)
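A minimal sketch of the steps above, assuming a 32-bit fingerprint and the low 32 bits of MD5 as the per-feature hash; the helper names shingles and simhash32 are only for this illustration:

import hashlib

def shingles(text, width=3):
    # 'howareyouiamfinethanks' -> ['how', 'owa', 'war', ...]
    return [text[i:i + width] for i in range(max(len(text) - width + 1, 1))]

def simhash32(text):
    hashsize = 32
    v = [0] * hashsize                    # step 1: V = [0] * 32
    for feat in shingles(text):           # step 2: text -> features
        h = int(hashlib.md5(feat.encode('utf-8')).hexdigest(), 16)  # step 3: hash each feature
        for i in range(hashsize):         # step 4: +1 / -1 for each bit position
            v[i] += 1 if h & (1 << i) else -1
    fingerprint = 0
    for i in range(hashsize):             # step 5: bit i is 1 iff V[i] > 0
        if v[i] > 0:
            fingerprint |= 1 << i
    return fingerprint

print(hex(simhash32('howareyouiamfinethanks')))

The full implementation from the referenced library follows.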
# Created by 1e0n in 2013
from __future__ import division, unicode_literals

import re
import sys
import hashlib
import logging
import numbers
import collections
from itertools import groupby

if sys.version_info[0] >= 3:
    basestring = str
    unicode = str
    long = int
else:
    range = xrange


def _hashfunc(x):  # the hash function used (MD5 digest as an integer)
    return int(hashlib.md5(x).hexdigest(), 16)


class Simhash(object):

    def __init__(
        self, value, f=64, reg=r'[\w\u4e00-\u9fcc]+', hashfunc=None, log=None
    ):
        """
        `f` is the dimension of the fingerprints

        `reg` is meaningful only when `value` is basestring and describes
        what is considered to be a letter inside parsed string. A regexp
        object can also be specified (one attempt to handle any letters
        is to specify reg=re.compile(r'\w', re.UNICODE))

        `hashfunc` accepts a utf-8 encoded string and returns an unsigned
        integer of at least `f` bits.
        """

        self.f = f
        self.reg = reg
        self.value = None

        if hashfunc is None:
            self.hashfunc = _hashfunc
        else:
            self.hashfunc = hashfunc

        if log is None:
            self.log = logging.getLogger("simhash")
        else:
            self.log = log

        if isinstance(value, Simhash):
            self.value = value.value
        elif isinstance(value, basestring):
            # print("build by text")
            self.build_by_text(unicode(value))
        elif isinstance(value, collections.Iterable):
            self.build_by_features(value)
        elif isinstance(value, numbers.Integral):
            self.value = value
        else:
            raise Exception('Bad parameter with type {}'.format(type(value)))

    def __eq__(self, other):
        """
        Compare two simhashes by their value.

        :param Simhash other: The Simhash object to compare to
        """
        return self.value == other.value

    def _slide(self, content, width=4):
        return [content[i:i + width] for i in range(max(len(content) - width + 1, 1))]

    def _tokenize(self, content):
        content = content.lower()
        content = ''.join(re.findall(self.reg, content))
        ans = self._slide(content)
        return ans

    def build_by_text(self, content):
        features = self._tokenize(content)
        features = {k: sum(1 for _ in g) for k, g in groupby(sorted(features))}
        return self.build_by_features(features)

    def build_by_features(self, features):
        """
        `features` might be a list of unweighted tokens (a weight of 1
        will be assumed), a list of (token, weight) tuples or
        a token -> weight dict.
        """
        v = [0] * self.f  # initialize V = [0, 0, 0, ...]
        masks = [1 << i for i in range(self.f)]  # [1, 10, 100, 1000, ...] in binary
        if isinstance(features, dict):
            features = features.items()
        for f in features:
            if isinstance(f, basestring):
                h = self.hashfunc(f.encode('utf-8'))  # hash the feature to an integer
                w = 1
            else:
                assert isinstance(f, collections.Iterable)
                h = self.hashfunc(f[0].encode('utf-8'))
                w = f[1]
            for i in range(self.f):
                v[i] += w if h & masks[i] else -w
        ans = 0
        for i in range(self.f):  # assemble the fingerprint
            if v[i] > 0:  # if v[i] > 0, set bit i to 1
                ans |= masks[i]
        self.value = ans

    def distance(self, another):
        assert self.f == another.f
        x = (self.value ^ another.value) & ((1 << self.f) - 1)  # XOR
        ans = 0
        while x:
            ans += 1
            x &= x - 1
        return ans
The distance method counts the differing bits with the x &= x - 1 trick: each iteration clears the lowest set bit, so the loop runs once per set bit. For example, 4 (0b100) has a single set bit:

x = 4
ans = 0
while x:
    ans += 1
    x &= x - 1
ans
1
Clearing the lowest set bit of 7 (0b111) gives 6 (0b110):

x = 7
x &= x - 1
x
6
And & is a bitwise AND, so 3 & 4 is 0 because 0b011 and 0b100 share no set bits:

3 & 4
0
def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i+width] for i in range(max(len(s) - width + 1, 1))]

print(hex(Simhash(get_features("How are you? I am fine. Thanks. ")).value))
print(hex(Simhash(get_features("How are u? I am fine. Thanks. ")).value))
print(hex(Simhash(get_features("How r you? I am fine. Thanks. ")).value))
0x4d4da690b5a57e47
0x69deac90b5a15eeb
0x4f08a4f4b5a13a4b
print(Simhash('aa').distance(Simhash('bb')))
print(Simhash('aa').distance(Simhash('aa')))
31
0
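The same check on the three sentences fingerprinted above should give distances well below the roughly 32 bits expected between two unrelated 64-bit fingerprints (exact values depend on the MD5-based hashes):

a = Simhash(get_features("How are you? I am fine. Thanks. "))
b = Simhash(get_features("How are u? I am fine. Thanks. "))
c = Simhash(get_features("How r you? I am fine. Thanks. "))
print(a.distance(b), a.distance(c), b.distance(c))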

Simhash Index

class SimhashIndex(object):

    def __init__(self, objs, f=64, k=2, log=None):
        """
        `objs` is a list of (obj_id, simhash)
        obj_id is a string, simhash is an instance of Simhash
        `f` is the same as the one used for Simhash
        `k` is the tolerance
        """
        self.k = k
        self.f = f
        count = len(objs)

        if log is None:
            self.log = logging.getLogger("simhash")
        else:
            self.log = log

        self.log.info('Initializing %s data.', count)

        self.bucket = collections.defaultdict(set)

        for i, q in enumerate(objs):
            if i % 10000 == 0 or i == count - 1:
                self.log.info('%s/%s', i + 1, count)

            self.add(*q)

    def get_near_dups(self, simhash):
        """
        `simhash` is an instance of Simhash
        return a list of obj_id values (strings)
        """
        assert simhash.f == self.f

        ans = set()

        for key in self.get_keys(simhash):
            dups = self.bucket[key]
            self.log.debug('key:%s', key)
            if len(dups) > 200:
                self.log.warning('Big bucket found. key:%s, len:%s', key, len(dups))

            for dup in dups:
                sim2, obj_id = dup.split(',', 1)
                sim2 = Simhash(long(sim2, 16), self.f)

                d = simhash.distance(sim2)
                if d <= self.k:
                    ans.add(obj_id)
        return list(ans)

    def add(self, obj_id, simhash):
        """
        `obj_id` is a string
        `simhash` is an instance of Simhash
        """
        assert simhash.f == self.f

        for key in self.get_keys(simhash):
            v = '%x,%s' % (simhash.value, obj_id)
            self.bucket[key].add(v)

    def delete(self, obj_id, simhash):
        """
        `obj_id` is a string
        `simhash` is an instance of Simhash
        """
        assert simhash.f == self.f

        for key in self.get_keys(simhash):
            v = '%x,%s' % (simhash.value, obj_id)
            if v in self.bucket[key]:
                self.bucket[key].remove(v)

    @property
    def offsets(self):
        """
        You may optimize this method according to <http://www.wwwconference.org/www2007/papers/paper215.pdf>
        """
        return [self.f // (self.k + 1) * i for i in range(self.k + 1)]

    def get_keys(self, simhash):
        for i, offset in enumerate(self.offsets):
            if i == (len(self.offsets) - 1):
                m = 2 ** (self.f - offset) - 1
            else:
                m = 2 ** (self.offsets[i + 1] - offset) - 1
            c = simhash.value >> offset & m
            yield '%x:%x' % (c, i)

    def bucket_size(self):
        return len(self.bucket)
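The index relies on the pigeonhole principle: offsets cuts the f-bit fingerprint into k + 1 blocks, so two fingerprints that differ in at most k bits must agree on at least one block, and get_keys turns each block (together with its position) into a bucket key. A quick sketch of that guarantee, reusing the classes defined above (the flipped bit positions are arbitrary):

# flip k = 3 bits of a fingerprint; with k + 1 = 4 blocks of 16 bits each,
# at least one block stays untouched and yields a shared bucket key
a = Simhash(get_features(u'How are you? I am fine. Thanks.'))
b = Simhash(a.value ^ ((1 << 0) | (1 << 20) | (1 << 40)), f=64)  # bits 0, 20, 40 flipped
index = SimhashIndex([], k=3)
keys_a = set(index.get_keys(a))
keys_b = set(index.get_keys(b))
print(a.distance(b))         # 3
print(len(keys_a & keys_b))  # 1: the block covering bits 48-63 is identical

Below, the index is exercised end-to-end on three short documents.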
data = {
    1: u'How are you? I am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar Thanks.',
    3: u'This is a simhash test',
}

objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)

print(index.bucket_size())
s1 = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thanks'))
print(index.get_near_dups(s1))

index.add('4', s1)
print(index.get_near_dups(s1))
8
['1', '2']
['1', '4', '2']