Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
else:
addrs.append(instr.addr)
addrs.append(instr.offset)
addrs.sort()
addrs_dict = {}
for i in range(len(addrs)):
addrs_dict[addrs[i]] = i
internals_sorted = internals[:]
internals_sorted.sort()
calleds_dict = {}
for i in range(len(internals_sorted)):
calleds_dict[internals_sorted[i]] = str(i)
flowhash = datasketch.MinHash(num_perm=32)
for instr in self.bb_insns:
if isinstance(instr, CallInsn):
if instr.is_api:
#flow.append(hex(instr.offset)+" API:" + instr.fcn_name)
flowhash.update("API:" + instr.fcn_name)
else:
#flow.append(hex(instr.offset)+" OUT:" + calleds_dict[instr.addr])
flowhash.update("OUT:" + calleds_dict[instr.addr])
self.targets[instr.addr] = "OUT:" + calleds_dict[instr.addr]
else:
if instr.jumpout:
#flow.append(hex(instr.offset)+" OUT:" + calleds_dict[instr.addr])
flowhash.update("OUT:" + calleds_dict[instr.addr])
self.targets[instr.addr] = "OUT:" + calleds_dict[instr.addr]
else:
def get_min_hash(text, too_common, num_perm=128):
    """Build a MinHash sketch from the shingle hashes of *text*.

    Digests listed in *too_common* are skipped so that ubiquitous
    shingles do not dominate the sketch.

    @args:
        text: content to shingle and hash
        too_common: collection of digests to exclude
        num_perm: number of MinHash permutations
    @returns:
        datasketch.MinHash over the retained shingle digests
    """
    sketch = MinHash(num_perm=num_perm)
    for shingle in shingle_hashes(text):
        digest = shingle.digest()
        # Skip shingles that are too common to be discriminative.
        if digest in too_common:
            continue
        sketch.update(digest)
    return sketch
@author: LLL
"""
from datasketch import MinHash, MinHashLSH
import numpy as np

# Three token lists whose pairwise Jaccard similarity we want to estimate.
data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'datasets']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']
data3 = ['minhash', 'is', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']
# Create MinHash objects, one per document.
m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
m3 = MinHash(num_perm=128)
# BUG FIX: the original updated m1 in all three loops, leaving m2 and m3
# empty (their hashvalues stayed at the initialized maxima). Each sketch
# must be fed its own document's tokens.
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))
print((m1.hashvalues))
print((m2.hashvalues))
print((m3.hashvalues))
print(np.shape(m1.hashvalues))
# Create an MinHashLSH index optimized for Jaccard threshold 0.5,
# that accepts MinHash objects with 128 permutations functions
lsh = MinHashLSH(threshold=0.5, num_perm=128)
def get_hashbands(window):
    '''
    Given a string of content, return a generator of hashbands
    from that string.
    @args:
    str window: a window of text content from a file
    @returns:
    generator: a generator of strings, where each string has the
    format a.b.c in which individual hashes are concatenated
    with periods into a 'hashband'
    '''
    mh = MinHash(num_perm=config['n_permutations'], seed=1)
    # Deduplicate trigrams before hashing so repeats don't re-update.
    for gram in set(ngrams(' '.join(window), 3)):
        mh.update(''.join(gram).encode('utf8'))
    band_size = config['hashband_length']
    vals = list(mh.hashvalues)
    # Emit only complete bands; a trailing partial band is dropped,
    # matching the accumulate-and-reset behavior of the original.
    for start in range(0, len(vals) - band_size + 1, band_size):
        band = vals[start:start + band_size]
        yield '.'.join(str(v) for v in band)
def _generate_minhash(self, streams):
    """Fold every string in *streams* into one MinHash sketch.

    Uses the instance's configured permutation count and seed.
    """
    sketch = MinHash(num_perm=self.num_perm, seed=self.seed)
    for chunk in streams:
        sketch.update(chunk.encode('utf-8'))
    return sketch
def construct_lsh(obj_dict):
    """Index every document in *obj_dict* into two MinHashLSH tables.

    Args:
        obj_dict: mapping of key -> iterable of string tokens.

    Returns:
        (lsh_0, lsh_5, keys, ms): an LSH index at threshold 0, one at
        threshold 0.6, the list of keys in iteration order, and the
        parallel list of MinHash objects.
    """
    lsh_0 = MinHashLSH(threshold=0, num_perm=128, params=None)
    lsh_5 = MinHashLSH(threshold=0.6, num_perm=128, params=None)
    # forest = MinHashLSHForest(num_perm=128)
    # BUG FIX: in Python 3 dict.keys()/dict.values() return views, which
    # are not indexable, so keys[i]/values[i] raised TypeError.
    # Materialize them as lists (order is consistent between the two).
    keys = list(obj_dict.keys())
    values = list(obj_dict.values())
    ms = []
    for key, tokens in zip(keys, values):
        mh = MinHash(num_perm=128)
        for tok in tokens:
            mh.update(tok.encode('utf8'))
        ms.append(mh)
        lsh_0.insert(key, mh)
        lsh_5.insert(key, mh)
    return lsh_0, lsh_5, keys, ms
for expr in stmts[i].expressions:
if isinstance(expr, pyvex.expr.Get):
# registers abstraction
regs[expr.offset] = regs.get(expr.offset, len(regs))
expr.offset = regs[expr.offset]
#order addresses
addrs = {}
ips.sort()
for i in range(len(ips)):
addrs[ips[i]] = i
#self.vex_code = ""
#self.shingled_code = ""
vexhash = datasketch.MinHash(num_perm=64)
shingled = {}
last = ""
for c in range(len(irsbs)):
irsb = irsbs[c]
if type(irsb) == type(""):
ngram = last + irsb
#self.vex_code += "+++ Instr #%d +++\n%s\n" % (c, irsb)
shingled[ngram] = shingled.get(ngram, 0) +1
last = irsb
continue
stmts = irsb.statements
ins = ""
def minhash_str(in_str, perms, gram_sz):
    """Return a MinHash built from the *gram_sz*-grams of *in_str*.

    @args:
        in_str: string to sketch
        perms: number of MinHash permutations
        gram_sz: n-gram size used for shingling
    @returns:
        datasketch.MinHash over the string's n-grams
    """
    sketch = MinHash(num_perm=perms)
    for gram in ngrams(in_str, gram_sz):
        joined = "".join(gram)
        sketch.update(joined.encode('utf-8'))
    return sketch
@author: LLL
"""
from datasketch import MinHash, MinHashLSH
import numpy as np

# Three token lists whose pairwise Jaccard similarity we want to estimate.
data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'datasets']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']
data3 = ['minhash', 'is', 'probability', 'data', 'structure', 'for',
         'estimating', 'the', 'similarity', 'between', 'documents']
# Create MinHash objects, one per document.
m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
m3 = MinHash(num_perm=128)
# BUG FIX: the original updated m1 in all three loops, leaving m2 and m3
# empty (their hashvalues stayed at the initialized maxima). Each sketch
# must be fed its own document's tokens.
for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))
print((m1.hashvalues))
print((m2.hashvalues))
print((m3.hashvalues))
print(np.shape(m1.hashvalues))
# Create an MinHashLSH index optimized for Jaccard threshold 0.5,
# that accepts MinHash objects with 128 permutations functions
lsh = MinHashLSH(threshold=0.5, num_perm=128)