
NLP Series

Jaccard Similarity

def jaccard_sim(s1, s2):
    a = set(s1.split())
    print(len(a))
    b = set(s2.split())
    print(len(b))
    c = a.intersection(b)
    print(len(c))
    print(c)
    return float(len(c)) / (len(a) + len(b) - len(c))

s1 = "Natural language processing is a promising research area"
s2 = "More and more researchers are working on natural language processing nowadays"
print(jaccard_sim(s1, s2))
8
11
2
{'processing', 'language'}
0.11764705882352941
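Note that "Natural" is missing from the intersection above because the comparison is case-sensitive. A small illustrative variant (not part of the original snippet) that lowercases both strings before tokenizing:

def jaccard_sim_lower(s1, s2):
    # lowercase first so 'Natural' and 'natural' count as the same token
    a = set(s1.lower().split())
    b = set(s2.lower().split())
    c = a & b
    return len(c) / len(a | b)

print(jaccard_sim_lower(s1, s2))  # 3 shared tokens out of 15 -> 0.2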

SimHash-based similar-text detection

SimHash

Reference: A Python Implementation of simhash algorithm

  • Pick a hash size, e.g. 32
    • V = [0] * 32
  • Turn the text into features (shingles)
    • howareyouiamfinethanks
    • ['how', 'owa', 'war', 'are', 'rey', 'eyo', 'you', 'oui', 'uia', 'iam', 'amf', 'mfi', 'fin', 'ine', 'net', 'eth', 'tha', 'han', 'ank', 'nks']
  • Hash every feature into 32 bits
  • For every bit position i of every hash, add 1 to V[i] if that bit is 1, otherwise subtract 1
  • Finally, set V[i] to 1 if V[i] > 0 and to 0 otherwise; the resulting V is the simhash we want (a minimal sketch of these steps is shown right after this list)
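A minimal sketch of the steps above, assuming a 32-bit fingerprint and the low 32 bits of MD5 as the per-feature hash; the helper names shingles and simhash32 are only for this illustration:

import hashlib

def shingles(text, width=3):
    # 'howareyouiamfinethanks' -> ['how', 'owa', 'war', ...]
    return [text[i:i + width] for i in range(max(len(text) - width + 1, 1))]

def simhash32(text):
    hashsize = 32
    v = [0] * hashsize                    # step 1: V = [0] * 32
    for feat in shingles(text):           # step 2: text -> features
        h = int(hashlib.md5(feat.encode('utf-8')).hexdigest(), 16)  # step 3: hash each feature
        for i in range(hashsize):         # step 4: +1 / -1 for each bit position
            v[i] += 1 if h & (1 << i) else -1
    fingerprint = 0
    for i in range(hashsize):             # step 5: bit i is 1 iff V[i] > 0
        if v[i] > 0:
            fingerprint |= 1 << i
    return fingerprint

print(hex(simhash32('howareyouiamfinethanks')))

The full implementation from the referenced library follows.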
# Created by 1e0n in 2013
from __future__ import division, unicode_literals

import re
import sys
import hashlib
import logging
import numbers
import collections
from itertools import groupby

if sys.version_info[0] >= 3:
    basestring = str
    unicode = str
    long = int
else:
    range = xrange


def _hashfunc(x):  # the hash function used (MD5 digest as an integer)
    return int(hashlib.md5(x).hexdigest(), 16)


class Simhash(object):

    def __init__(
        self, value, f=64, reg=r'[\w\u4e00-\u9fcc]+', hashfunc=None, log=None
    ):
        """
        `f` is the dimension of the fingerprints

        `reg` is meaningful only when `value` is basestring and describes
        what is considered to be a letter inside parsed string. A regexp
        object can also be specified (one attempt to handle any letters
        is to specify reg=re.compile(r'\w', re.UNICODE))

        `hashfunc` accepts a utf-8 encoded string and returns an unsigned
        integer of at least `f` bits.
        """

        self.f = f
        self.reg = reg
        self.value = None

        if hashfunc is None:
            self.hashfunc = _hashfunc
        else:
            self.hashfunc = hashfunc

        if log is None:
            self.log = logging.getLogger("simhash")
        else:
            self.log = log

        if isinstance(value, Simhash):
            self.value = value.value
        elif isinstance(value, basestring):
            # print("build by text")
            self.build_by_text(unicode(value))
        elif isinstance(value, collections.Iterable):
            self.build_by_features(value)
        elif isinstance(value, numbers.Integral):
            self.value = value
        else:
            raise Exception('Bad parameter with type {}'.format(type(value)))

    def __eq__(self, other):
        """
        Compare two simhashes by their value.

        :param Simhash other: The Simhash object to compare to
        """
        return self.value == other.value

    def _slide(self, content, width=4):
        return [content[i:i + width] for i in range(max(len(content) - width + 1, 1))]

    def _tokenize(self, content):
        content = content.lower()
        content = ''.join(re.findall(self.reg, content))
        ans = self._slide(content)
        return ans

    def build_by_text(self, content):
        features = self._tokenize(content)
        features = {k: sum(1 for _ in g) for k, g in groupby(sorted(features))}
        return self.build_by_features(features)

    def build_by_features(self, features):
        """
        `features` might be a list of unweighted tokens (a weight of 1
        will be assumed), a list of (token, weight) tuples or
        a token -> weight dict.
        """
        v = [0] * self.f  # initialize V = [0, 0, 0, ...]
        masks = [1 << i for i in range(self.f)]  # [1, 10, 100, 1000, ...] in binary
        if isinstance(features, dict):
            features = features.items()
        for f in features:
            if isinstance(f, basestring):
                h = self.hashfunc(f.encode('utf-8'))  # hash the feature to an integer
                w = 1
            else:
                assert isinstance(f, collections.Iterable)
                h = self.hashfunc(f[0].encode('utf-8'))
                w = f[1]
            for i in range(self.f):
                v[i] += w if h & masks[i] else -w
        ans = 0
        for i in range(self.f):  # assemble the fingerprint
            if v[i] > 0:  # if v[i] > 0, set bit i to 1
                ans |= masks[i]
        self.value = ans

    def distance(self, another):
        assert self.f == another.f
        x = (self.value ^ another.value) & ((1 << self.f) - 1)  # XOR
        ans = 0
        while x:
            ans += 1
            x &= x - 1
        return ans
The distance method counts the differing bits with the x &= x - 1 trick: each iteration clears the lowest set bit, so the loop runs once per set bit. For example, 4 (0b100) has a single set bit:

x = 4
ans = 0
while x:
    ans += 1
    x &= x - 1
ans
1
Clearing the lowest set bit of 7 (0b111) gives 6 (0b110):

x = 7
x &= x - 1
x
6
And & is a bitwise AND, so 3 & 4 is 0 because 0b011 and 0b100 share no set bits:

3 & 4
0
def get_features(s):
    width = 3
    s = s.lower()
    s = re.sub(r'[^\w]+', '', s)
    return [s[i:i+width] for i in range(max(len(s) - width + 1, 1))]

print(hex(Simhash(get_features("How are you? I am fine. Thanks. ")).value))
print(hex(Simhash(get_features("How are u? I am fine. Thanks. ")).value))
print(hex(Simhash(get_features("How r you? I am fine. Thanks. ")).value))
0x4d4da690b5a57e47
0x69deac90b5a15eeb
0x4f08a4f4b5a13a4b
print(Simhash('aa').distance(Simhash('bb')))
print(Simhash('aa').distance(Simhash('aa')))
31
0
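The same check on the three sentences fingerprinted above should give distances well below the roughly 32 bits expected between two unrelated 64-bit fingerprints (exact values depend on the MD5-based hashes):

a = Simhash(get_features("How are you? I am fine. Thanks. "))
b = Simhash(get_features("How are u? I am fine. Thanks. "))
c = Simhash(get_features("How r you? I am fine. Thanks. "))
print(a.distance(b), a.distance(c), b.distance(c))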

Simhash Index

class SimhashIndex(object):

    def __init__(self, objs, f=64, k=2, log=None):
        """
        `objs` is a list of (obj_id, simhash)
        obj_id is a string, simhash is an instance of Simhash
        `f` is the same as the one used for Simhash
        `k` is the tolerance
        """
        self.k = k
        self.f = f
        count = len(objs)

        if log is None:
            self.log = logging.getLogger("simhash")
        else:
            self.log = log

        self.log.info('Initializing %s data.', count)

        self.bucket = collections.defaultdict(set)

        for i, q in enumerate(objs):
            if i % 10000 == 0 or i == count - 1:
                self.log.info('%s/%s', i + 1, count)

            self.add(*q)

    def get_near_dups(self, simhash):
        """
        `simhash` is an instance of Simhash
        return a list of obj_id values (strings)
        """
        assert simhash.f == self.f

        ans = set()

        for key in self.get_keys(simhash):
            dups = self.bucket[key]
            self.log.debug('key:%s', key)
            if len(dups) > 200:
                self.log.warning('Big bucket found. key:%s, len:%s', key, len(dups))

            for dup in dups:
                sim2, obj_id = dup.split(',', 1)
                sim2 = Simhash(long(sim2, 16), self.f)

                d = simhash.distance(sim2)
                if d <= self.k:
                    ans.add(obj_id)
        return list(ans)

    def add(self, obj_id, simhash):
        """
        `obj_id` is a string
        `simhash` is an instance of Simhash
        """
        assert simhash.f == self.f

        for key in self.get_keys(simhash):
            v = '%x,%s' % (simhash.value, obj_id)
            self.bucket[key].add(v)

    def delete(self, obj_id, simhash):
        """
        `obj_id` is a string
        `simhash` is an instance of Simhash
        """
        assert simhash.f == self.f

        for key in self.get_keys(simhash):
            v = '%x,%s' % (simhash.value, obj_id)
            if v in self.bucket[key]:
                self.bucket[key].remove(v)

    @property
    def offsets(self):
        """
        You may optimize this method according to <http://www.wwwconference.org/www2007/papers/paper215.pdf>
        """
        return [self.f // (self.k + 1) * i for i in range(self.k + 1)]

    def get_keys(self, simhash):
        for i, offset in enumerate(self.offsets):
            if i == (len(self.offsets) - 1):
                m = 2 ** (self.f - offset) - 1
            else:
                m = 2 ** (self.offsets[i + 1] - offset) - 1
            c = simhash.value >> offset & m
            yield '%x:%x' % (c, i)

    def bucket_size(self):
        return len(self.bucket)
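The index relies on the pigeonhole principle: offsets cuts the f-bit fingerprint into k + 1 blocks, so two fingerprints that differ in at most k bits must agree on at least one block, and get_keys turns each block (together with its position) into a bucket key. A quick sketch of that guarantee, reusing the classes defined above (the flipped bit positions are arbitrary):

# flip k = 3 bits of a fingerprint; with k + 1 = 4 blocks of 16 bits each,
# at least one block stays untouched and yields a shared bucket key
a = Simhash(get_features(u'How are you? I am fine. Thanks.'))
b = Simhash(a.value ^ ((1 << 0) | (1 << 20) | (1 << 40)), f=64)  # bits 0, 20, 40 flipped
index = SimhashIndex([], k=3)
keys_a = set(index.get_keys(a))
keys_b = set(index.get_keys(b))
print(a.distance(b))         # 3
print(len(keys_a & keys_b))  # 1: the block covering bits 48-63 is identical

Below, the index is exercised end-to-end on three short documents.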
data = {
    1: u'How are you? I am fine. blar blar blar blar blar Thanks.',
    2: u'How are you i am fine. blar blar blar blar blar Thanks.',
    3: u'This is a simhash test',
}

objs = [(str(k), Simhash(get_features(v))) for k, v in data.items()]
index = SimhashIndex(objs, k=3)

print(index.bucket_size())
s1 = Simhash(get_features(u'How are you i am fine. blar blar blar blar blar thanks'))
print(index.get_near_dups(s1))

index.add('4', s1)
print(index.get_near_dups(s1))
8
['1', '2']
['1', '4', '2']