Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
async def test__H_mongo(self):
    """
    Check _H output consistent bytes length given
    the same concatenated hash value size.

    A weighted MinHash is inserted into an AsyncMinHashLSH configured with
    ``self._storage_config_mongo``; the length of every key stored in every
    hashtable is then compared against the first one.
    """
    mg = WeightedMinHashGenerator(100, sample_size=128)
    # The loop index itself is unused: every pass repeats the same check
    # with a new random vector and a newly constructed AsyncMinHashLSH.
    for _ in range(2, mg.sample_size + 1, 16):
        m = mg.minhash(np.random.randint(1, 99999999, 100))
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   num_perm=128) as lsh:
            await lsh.insert("m", m)
            # Request the key listing of every hashtable concurrently.
            fs = (ht.keys() for ht in lsh.hashtables)
            keys_per_table = await asyncio.gather(*fs)
            # Measure each key H itself. Taking len() of a table's key
            # listing would only count how many keys the table holds and
            # say nothing about the keys' byte length.
            # NOTE(review): assumes keys support len() (e.g. bytes) - confirm.
            sizes = [len(H) for keys in keys_per_table for H in keys]
            self.assertTrue(all(sizes[0] == s for s in sizes))
m2 = mg.minhash(np.random.uniform(1, 10, 10))
await lsh.insert("a", m1)
await lsh.insert("b", m2)
for t in lsh.hashtables:
self.assertTrue(await t.size() >= 1)
items = []
for H in await t.keys():
items.extend(await t.get(H))
self.assertTrue("a" in items)
self.assertTrue("b" in items)
self.assertTrue(await lsh.has_key("a"))
self.assertTrue(await lsh.has_key("b"))
for i, H in enumerate(await lsh.keys.get("a")):
self.assertTrue("a" in await lsh.hashtables[i].get(H))
mg = WeightedMinHashGenerator(10, 5)
m3 = mg.minhash(np.random.uniform(1, 10, 10))
with self.assertRaises(ValueError):
await lsh.insert("c", m3)
async def test_insert_mongo(self):
    """
    Insert two weighted MinHashes under keys "a" and "b" and verify storage.

    Checks, for an AsyncMinHashLSH configured with
    ``self._storage_config_mongo``:

    * every hashtable is non-empty and holds both "a" and "b";
    * ``has_key`` reports both keys;
    * each hash value recorded for "a" in ``lsh.keys`` maps back to "a" in
      the hashtable with the same position.
    """
    async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                               threshold=0.5, num_perm=4) as lsh:
        # NOTE(review): the generator's second argument (4) presumably has
        # to equal num_perm=4 for insert() to accept the hashes - confirm.
        mg = WeightedMinHashGenerator(10, 4)
        m1 = mg.minhash(np.random.uniform(1, 10, 10))
        m2 = mg.minhash(np.random.uniform(1, 10, 10))
        await lsh.insert("a", m1)
        await lsh.insert("b", m2)
        for t in lsh.hashtables:
            self.assertGreaterEqual(await t.size(), 1)
            # Collect everything stored in this table, across all its keys.
            items = []
            for H in await t.keys():
                items.extend(await t.get(H))
            self.assertIn("a", items)
            self.assertIn("b", items)
        self.assertTrue(await lsh.has_key("a"))
        self.assertTrue(await lsh.has_key("b"))
        # lsh.keys.get("a") yields one hash value per hashtable position i.
        for i, H in enumerate(await lsh.keys.get("a")):
            self.assertIn("a", await lsh.hashtables[i].get(H))
def eg2():
    """Print weighted MinHash Jaccard estimates and an LSH query result.

    Builds weighted MinHashes for the module-level vectors ``v1``, ``v2`` and
    ``v3``, prints the estimated Jaccard similarity of the first against the
    other two, then indexes the latter two in a ``MinHashLSH`` (threshold 0.1,
    5 permutations) and prints the result of querying with the first.
    """
    # Second argument is the sample size; it equals num_perm used below.
    # NOTE(review): first argument (10) is presumably the vector dimension.
    generator = WeightedMinHashGenerator(10, 5)
    query_hash = generator.minhash(v1)
    second_hash = generator.minhash(v2)
    third_hash = generator.minhash(v3)

    print("Estimated Jaccard m1, m2", query_hash.jaccard(second_hash))
    print("Estimated Jaccard m1, m3", query_hash.jaccard(third_hash))

    # Create LSH index and register the two candidate hashes by name.
    index = MinHashLSH(threshold=0.1, num_perm=5)
    for key, candidate in (("m2", second_hash), ("m3", third_hash)):
        index.insert(key, candidate)

    neighbours = index.query(query_hash)
    print("Approximate neighbours with weighted Jaccard similarity > 0.1", neighbours)