Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
async def test__H_mongo(self):
    """
    Check _H output consistent bytes length given
    the same concatenated hash value size.

    FIX(review): the tail of this method was a corrupted splice of
    test_insert_mongo that referenced an undefined name ``m1`` and
    used generators whose dimensions did not match the LSH index
    (guaranteed NameError/ValueError at runtime). The spliced content
    duplicated tests that exist elsewhere in this class, so it has
    been removed and the method restored to match its docstring.
    """
    mg = WeightedMinHashGenerator(100, sample_size=128)
    for l in range(2, mg.sample_size + 1, 16):
        m = mg.minhash(np.random.randint(1, 99999999, 100))
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   num_perm=128) as lsh:
            await lsh.insert("m", m)
            # Fetch the key sets of every hash table concurrently.
            fs = (ht.keys() for ht in lsh.hashtables)
            hashtables = await asyncio.gather(*fs)
            # All tables must report key collections of the same size.
            sizes = [len(H) for H in hashtables]
            self.assertTrue(all(sizes[0] == s for s in sizes))
async def test_insert_mongo(self):
    """Insert two weighted MinHashes and check they appear in every table."""
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=4) as lsh:
        gen = WeightedMinHashGenerator(10, 4)
        first_hash = gen.minhash(np.random.uniform(1, 10, 10))
        second_hash = gen.minhash(np.random.uniform(1, 10, 10))
        await lsh.insert("a", first_hash)
        await lsh.insert("b", second_hash)
        # Each hash table must be non-empty and contain both keys
        # somewhere among its buckets.
        for table in lsh.hashtables:
            self.assertTrue(await table.size() >= 1)
            bucket_members = []
            for bucket_key in await table.keys():
                bucket_members.extend(await table.get(bucket_key))
            self.assertTrue("a" in bucket_members)
            self.assertTrue("b" in bucket_members)
        self.assertTrue(await lsh.has_key("a"))
        self.assertTrue(await lsh.has_key("b"))
        # Every bucket recorded for "a" must actually contain "a".
        for idx, bucket_key in enumerate(await lsh.keys.get("a")):
            self.assertTrue("a" in await lsh.hashtables[idx].get(bucket_key))
async def test__H_mongo(self):
    """
    _H must produce byte keys of a consistent length when the
    concatenated hash value size stays the same.
    """
    gen = WeightedMinHashGenerator(100, sample_size=128)
    # The loop variable itself is unused; we only repeat the check
    # several times with fresh random data.
    for _ in range(2, gen.sample_size + 1, 16):
        wmh = gen.minhash(np.random.randint(1, 99999999, 100))
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   num_perm=128) as lsh:
            await lsh.insert("m", wmh)
            key_futures = (table.keys() for table in lsh.hashtables)
            all_keys = await asyncio.gather(*key_futures)
            lengths = [len(keys) for keys in all_keys]
            self.assertTrue(all(lengths[0] == sz for sz in lengths))
async def test_insertion_session_mongo(self):
    """Batch-insert many MinHashes through an insertion session."""
    def chunked(iterable, size):
        # Yield successive fixed-size tuples until the iterator runs dry.
        iterator = iter(iterable)
        return iter(lambda: tuple(islice(iterator, size)), ())

    random_words = chunked(
        (random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
    seq = frozenset(chain(
        (''.join(letters) for letters in random_words),
        ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd', 'yow', 'ppi', 'eer')))
    hashes = [MinHash(16) for _ in range(len(seq))]
    for word, mh in zip(seq, hashes):
        for character in word:
            mh.update(character.encode('utf-8'))
    data = list(zip(seq, hashes))
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=16) as lsh:
        async with lsh.insertion_session(batch_size=1000) as session:
            pending = (session.insert(key, mh, check_duplication=False)
                       for key, mh in data)
            await asyncio.gather(*pending)
        for table in lsh.hashtables:
            self.assertTrue(await table.size() >= 1)
            members = []
            for bucket_key in await table.keys():
                members.extend(await table.get(bucket_key))
            self.assertTrue('aahhb' in members)
            self.assertTrue('kld' in members)
        self.assertTrue(await lsh.has_key('aahhb'))
        self.assertTrue(await lsh.has_key('kld'))
        for idx, bucket_key in enumerate(await lsh.keys.get('aahh')):
            self.assertTrue('aahh' in await lsh.hashtables[idx].get(bucket_key))
async def test_init_mongo(self):
    """Check that the b/r parameters respond to precision/recall weights."""
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.8) as lsh:
        self.assertTrue(await lsh.is_empty())
        b_plain, r_plain = lsh.b, lsh.r
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.8,
                               weights=(0.2, 0.8)) as lsh:
        b_weighted, r_weighted = lsh.b, lsh.r
        # Weighting recall higher should grow b and shrink r.
        self.assertTrue(b_plain < b_weighted)
        self.assertTrue(r_plain > r_weighted)
async def test_init_mongo(self):
    """Check that the b/r parameters respond to precision/recall weights.

    NOTE(review): this duplicates an earlier ``test_init_mongo``
    definition in the same class; Python keeps only the last one, so
    one of the two should be removed or renamed.
    """
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.8) as lsh:
        self.assertTrue(await lsh.is_empty())
        b_default, r_default = lsh.b, lsh.r
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.8,
                               weights=(0.2, 0.8)) as lsh:
        b_recall, r_recall = lsh.b, lsh.r
        # More weight on recall should increase b and decrease r.
        self.assertTrue(b_default < b_recall)
        self.assertTrue(r_default > r_recall)
from it and then creates a MinHash object from every
remaining character in the domain.
If a domain starts with www., the www. prefix will be stripped
from the domain before the MinHash is calculated.
Args:
domain: string with a full domain, eg. www.google.com
Returns:
A minhash (instance of datasketch.minhash.MinHash)
"""
# NOTE(review): this is the tail of a function whose def line is outside
# this excerpt; code left byte-identical, comments only.
# Drop the last dot-separated label (the TLD) and re-join the rest,
# e.g. 'www.google.com' -> 'www.google'.
domain_items = domain.split('.')
domain_part = '.'.join(domain_items[:-1])
minhash = MinHash(similarity.DEFAULT_PERMUTATIONS)
# Feed the remaining characters one at a time into the MinHash.
# NOTE(review): the docstring mentions stripping a leading 'www.', but
# no such stripping is visible here — presumably done by the caller;
# confirm.
for char in domain_part:
    minhash.update(char.encode('utf8'))
return minhash
def print_stats(
        f, show=None, skip_unique=False, max_int_value=5, duration_limit=None,
        print_duplicates=False, print_urls=False, limit=None):
    # NOTE(review): the function body continues beyond this excerpt;
    # code left byte-identical, comments only.
    # Frequency counters accumulated over every item read from *f*.
    stats = Counter()
    if not skip_unique:
        # LSH index used to detect near-duplicate items; only built when
        # uniqueness analysis is requested.
        lsh = MinHashLSH(threshold=0.9, num_perm=128)
        too_common = get_too_common_shingles(f, limit=1000)
    urls = {}
    min_timestamp = max_timestamp = None
    for i, item in enumerate(item_reader(f, limit=limit)):
        if print_urls:
            print(item['url'])
        # Count both the full content type and its major type
        # (the part before the '/').
        content_type = item.get('content_type', 'missing')
        stats.update([
            'content_type: ' + content_type,
            'content_type[0]: ' + content_type.split('/')[0]])
        # Track the first and latest timestamps seen so far.
        if min_timestamp is None:
            min_timestamp = item['timestamp']
        max_timestamp = item['timestamp']
        # Stop early once the observed span (timestamps presumably in
        # milliseconds — confirm) exceeds duration_limit seconds.
        if duration_limit and \
                (max_timestamp - min_timestamp) / 1000 > duration_limit:
            break
# Build lookup tables from the altname rows: each row presumably is
# (altname_id, series_id, altname_text, series_name) — confirm upstream.
# Replaced dict([(k, v) for ...]) with direct dict comprehensions
# (same result, no throwaway list; ruff C404).
altnid_sid_dict = {tmp[0]: tmp[1] for tmp in altn}
altnid_name_dict = {tmp[0]: tmp[2] for tmp in altn}
sid_sname_dict = {tmp[1]: tmp[3] for tmp in altn}
# Invert the first mapping: series id -> list of its altname ids.
sid_altnid_dict = {}
for nid, sid in altnid_sid_dict.items():
    # setdefault returns the (possibly new) list, so one call suffices.
    sid_altnid_dict.setdefault(sid, []).append(nid)
print("Have %s altnames for %s series" % (len(altnid_sid_dict), len(sid_altnid_dict)))
# NOTE(review): the loop at the bottom continues beyond this excerpt;
# code left byte-identical, comments only.
perms = 512    # number of MinHash permutations per signature
gram_sz = 3    # shingle size passed to minhash_str
minhashes = {}
lsh = MinHashLSH(threshold=SIMILARITY_RATIO, num_perm=perms)
print("Building lsh minhash data structure")
with ProcessPoolExecutor(max_workers=8) as ex:
    print("Submitting jobs")
    # One minhash job per altname, skipping names shorter than 5 chars.
    futures = [(key, ex.submit(minhash_str, content, perms, gram_sz))
               for
               key, content
               in
               altnid_name_dict.items()
               if
               len(content) >= 5
               ]
    print("Consuming futures")
    # Drain the futures with a progress bar; .result() blocks until done.
    for key, future in tqdm.tqdm(futures):
        minhash = future.result()