Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def fit(self, X):
    """Build a MinHash LSH Forest index over the rows of ``X``.

    Every row of ``X`` is iterated element by element; each element is
    converted to ``str``, UTF-8 encoded and fed into a fresh ``MinHash``
    with ``self._n_perm`` permutations. The resulting signature is stored
    in the forest under the row's position, rendered as a string.

    The forest is kept on ``self._index`` and is indexed once after all
    rows have been added.
    """
    forest = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
    self._index = forest

    for row_id, row in enumerate(X):
        signature = MinHash(num_perm=self._n_perm)
        for element in row:
            signature.update(str(element).encode('utf8'))
        # Keys are the stringified row positions.
        forest.add(str(row_id), signature)

    # Added keys only become searchable once the forest has been indexed.
    forest.index()
def benchmark_lshforest(num_perm, l, k, index_data, query_data):
    """Build an LSH Forest from ``index_data`` and time top-k queries.

    Params:
        num_perm: Number of permutations; also used as the key into the
            ``minhashes`` mapping of both data objects.
        l: Passed through as the ``l`` argument of ``MinHashLSHForest``.
        k: Passed as the second argument of ``forest.query``.
        index_data: Object exposing ``keys``, ``minhashes[num_perm]`` and
            ``sets`` (indexable by the keys returned from a query).
        query_data: Object exposing ``sets`` and ``minhashes[num_perm]``.

    Returns:
        A ``(times, results)`` tuple. ``times`` holds the wall-clock
        duration in seconds of each ``forest.query`` call. ``results``
        holds, per query, a list of ``[key, score]`` pairs sorted by
        ``score`` in descending order, where ``score`` is the value
        returned by ``_compute_jaccard`` for the query set and the
        indexed set under ``key``.
    """
    print("Building LSH Forest index")
    forest = MinHashLSHForest(num_perm=num_perm, l=l)
    for key, minhash in zip(index_data.keys, index_data.minhashes[num_perm]):
        forest.add(key, minhash)
    forest.index()

    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # high-resolution timer intended for measuring short durations.
        start = time.perf_counter()
        result = forest.query(minhash, k)
        duration = time.perf_counter() - start
        times.append(duration)
        # Scoring happens outside the timed region so that only the
        # forest lookup itself is measured.
        scored = [[key, _compute_jaccard(qs, index_data.sets[key])]
                  for key in result]
        results.append(sorted(scored, key=lambda x: x[1], reverse=True))
    return times, results
def clustering(self, data_tag):
    """
    Seed the clusters for ``data_tag`` with randomly chosen medoids,
    register those medoids in a new min hash forest, then hand over to
    ``cluster_points``.

    Params:
        :data_tag: Whether it's source or target data.
    """
    # A fresh min hash forest for fast nearest-neighbour lookups.
    self.forest = MinHashLSHForest(num_perm=self.num_perm)

    # Pick random data point positions to act as the initial medoids.
    points = self.data_points[data_tag]
    n_clusters = self.num_clusters[data_tag]
    medoid_ids = random.sample(range(len(points)), n_clusters)

    for cluster_id, point_id in enumerate(medoid_ids):
        cluster = self.ClusterClass(points[point_id])
        self.clusters[data_tag].append(cluster)
        # Store the medoid's min hash in the forest under its cluster index.
        self.forest.add(cluster_id, cluster.medoid.min_hash)
    # Index once, after every medoid has been added.
    self.forest.index()

    # Find a cluster for each data point.
    self.cluster_points(data_tag)