How to use the datasketch.MinHash function in datasketch

To help you get started, we've selected a few datasketch examples based on popular ways the library is used in public projects.

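All of the snippets below follow the same basic pattern, so here is a minimal, self-contained sketch of it first (the token sets are invented for illustration): create a MinHash, feed it byte-encoded set elements with update(), and compare signatures with jaccard().

from datasketch import MinHash

# Two example token sets (illustrative data only).
m1, m2 = MinHash(num_perm=128), MinHash(num_perm=128)
for token in ['cats', 'dogs', 'birds']:
    m1.update(token.encode('utf8'))  # update() takes bytes, not str
for token in ['cats', 'dogs', 'fish']:
    m2.update(token.encode('utf8'))

# Estimated Jaccard similarity of the two sets (true value: 2/4 = 0.5).
print(m1.jaccard(m2))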

github Carbonara-Project / Guanciale / guanciale / matching.py (view on GitHub)
else:
                    addrs.append(instr.addr)
                    addrs.append(instr.offset)

        addrs.sort()
        addrs_dict = {}
        for i, addr in enumerate(addrs):
            addrs_dict[addr] = i

        internals_sorted = sorted(internals)
        calleds_dict = {}
        for i, called in enumerate(internals_sorted):
            calleds_dict[called] = str(i)

        flowhash = datasketch.MinHash(num_perm=32)

        for instr in self.bb_insns:
            if isinstance(instr, CallInsn):
                if instr.is_api:
                    # API calls are hashed by name; MinHash.update() needs bytes
                    flowhash.update(("API:" + instr.fcn_name).encode('utf-8'))
                else:
                    # internal calls are hashed by their normalized target index
                    flowhash.update(("OUT:" + calleds_dict[instr.addr]).encode('utf-8'))
                    self.targets[instr.addr] = "OUT:" + calleds_dict[instr.addr]
            else:
                if instr.jumpout:
                    flowhash.update(("OUT:" + calleds_dict[instr.addr]).encode('utf-8'))
                    self.targets[instr.addr] = "OUT:" + calleds_dict[instr.addr]
                else:
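The flowhash built above condenses a function's call flow into a 32-permutation signature, which makes function-to-function comparison cheap. A minimal sketch of that comparison step, with invented call-flow tokens standing in for the real ones:

import datasketch

# Hypothetical call-flow token sets for two functions.
sig_a = datasketch.MinHash(num_perm=32)
sig_b = datasketch.MinHash(num_perm=32)
for token in ['API:CreateFileA', 'OUT:0', 'OUT:1']:
    sig_a.update(token.encode('utf-8'))
for token in ['API:CreateFileA', 'OUT:0', 'OUT:2']:
    sig_b.update(token.encode('utf-8'))

# Estimated Jaccard similarity (true value here: 2/4 = 0.5).
print(sig_a.jaccard(sig_b))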
github TeamHG-Memex / MaybeDont / maybedont / utils.py (view on GitHub)
from datasketch import MinHash


def get_min_hash(text, too_common, num_perm=128):
    min_hash = MinHash(num_perm=num_perm)
    for shingle_h in shingle_hashes(text):  # shingle_hashes() is a helper in the same module
        digest = shingle_h.digest()
        if digest not in too_common:  # skip shingles shared by too many documents
            min_hash.update(digest)
    return min_hash
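A hedged usage sketch: too_common is a precomputed set of digests to ignore (empty here for illustration), and shingle_hashes must be in scope from the same module.

m1 = get_min_hash('the quick brown fox', too_common=set())
m2 = get_min_hash('the quick brown cat', too_common=set())
print(m1.jaccard(m2))  # estimated similarity of the two shingle sets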
github cangyeone / GeophysicsResearch / WaveReconize / fgpoint / mysimhash_t.py (view on GitHub)
@author: LLL
"""

from datasketch import MinHash, MinHashLSH

data1 = ['minhash', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'datasets']
data2 = ['minhash', 'is', 'a', 'probability', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'documents']
data3 = ['minhash', 'is', 'probability', 'data', 'structure', 'for',
        'estimating', 'the', 'similarity', 'between', 'documents']

# Create MinHash objects
m1 = MinHash(num_perm=128)
m2 = MinHash(num_perm=128)
m3 = MinHash(num_perm=128)

for d in data1:
    m1.update(d.encode('utf8'))
for d in data2:
    m2.update(d.encode('utf8'))
for d in data3:
    m3.update(d.encode('utf8'))
print(m1.hashvalues)
print(m2.hashvalues)
print(m3.hashvalues)

import numpy as np
print(np.shape(m1.hashvalues))  # (128,): one minimum per permutation
# Create a MinHashLSH index optimized for Jaccard threshold 0.5
# that accepts MinHash objects built with 128 permutation functions
lsh = MinHashLSH(threshold=0.5, num_perm=128)
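The snippet stops right after building the index; a minimal continuation (sketch only) inserts the signatures and retrieves approximate matches:

lsh.insert('m2', m2)
lsh.insert('m3', m3)
# Keys of indexed MinHashes whose estimated Jaccard with m1 exceeds ~0.5
print(lsh.query(m1))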
github YaleDHLab / intertext / intertext / tasks.py (view on GitHub)
def get_hashbands(window):
  '''
  Given a string of content, return a generator of hashbands
  from that string.
  @args:
    str window: a window of text content from a file
  @returns:
    generator: a generator of strings, where each string has the
      format a.b.c in which individual hashes are concatenated
      with periods into a 'hashband'
  '''
  minhash = MinHash(num_perm=config['n_permutations'], seed=1)
  for ngram in set(ngrams(' '.join(window), 3)):
    minhash.update(''.join(ngram).encode('utf8'))
  hashband_vals = []
  for i in minhash.hashvalues:
    hashband_vals.append(i)
    if len(hashband_vals) == config['hashband_length']:
      hashband = '.'.join([str(j) for j in hashband_vals])
      hashband_vals = []
      yield hashband
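A hedged usage sketch, assuming a module-level config like the one the function reads (the values here are invented) and nltk's ngrams in scope:

config = {'n_permutations': 256, 'hashband_length': 4}

for hashband in get_hashbands('a window of text content'):
    print(hashband)  # strings like '1234.5678.9012.3456'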
github kyunooh / tiny-elephant / tiny_elephant / in_memory_cluster.py (view on GitHub)
def _generate_minhash(self, streams):
        minhash = MinHash(num_perm=self.num_perm, seed=self.seed)

        for stream in streams:
            minhash.update(stream.encode('utf-8'))

        return minhash
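One detail worth copying from this snippet: MinHash signatures are only comparable when they share both num_perm and seed, which is why the class threads self.num_perm and self.seed through every call. A minimal illustration (values are arbitrary):

from datasketch import MinHash

a = MinHash(num_perm=128, seed=42)
b = MinHash(num_perm=128, seed=42)  # same seed and num_perm, so comparable
for token in ['click:home', 'click:cart']:
    a.update(token.encode('utf-8'))
    b.update(token.encode('utf-8'))
print(a.jaccard(b))  # 1.0: identical sets under identical permutations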
github clhchtcjj / BiNE / model / lsh.py (view on GitHub)
def construct_lsh(obj_dict):
    # threshold=0 indexes every pair; threshold=0.6 keeps only close matches
    lsh_0 = MinHashLSH(threshold=0, num_perm=128)
    lsh_5 = MinHashLSH(threshold=0.6, num_perm=128)
    # forest = MinHashLSHForest(num_perm=128)
    keys = list(obj_dict.keys())  # list() so keys stay indexable in Python 3
    ms = []
    for key in keys:
        temp = MinHash(num_perm=128)
        for d in obj_dict[key]:
            temp.update(d.encode('utf8'))
        ms.append(temp)
        lsh_0.insert(key, temp)
        lsh_5.insert(key, temp)
    return lsh_0, lsh_5, keys, ms
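A hedged usage sketch (obj_dict maps ids to token lists; the data is invented):

lsh_0, lsh_5, keys, ms = construct_lsh({
    'u1': ['a', 'b', 'c'],
    'u2': ['a', 'b', 'd'],
})
print(lsh_5.query(ms[0]))  # ids estimated above the 0.6 threshold; includes 'u1' itself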
github Carbonara-Project / Guanciale / guanciale / matching.py (view on GitHub)
for expr in stmts[i].expressions:
                    if isinstance(expr, pyvex.expr.Get):
                        # registers abstraction
                        regs[expr.offset] = regs.get(expr.offset, len(regs))
                        expr.offset = regs[expr.offset]

        # order addresses
        addrs = {}
        ips.sort()
        for i, ip in enumerate(ips):
            addrs[ip] = i

        #self.vex_code = ""
        #self.shingled_code = ""

        vexhash = datasketch.MinHash(num_perm=64)
        shingled = {}
        last = ""

        for c, irsb in enumerate(irsbs):
            if isinstance(irsb, str):
                # count 2-gram shingles over consecutive stringified instructions
                ngram = last + irsb
                #self.vex_code += "+++ Instr #%d +++\n%s\n" % (c, irsb)
                shingled[ngram] = shingled.get(ngram, 0) + 1
                last = irsb
                continue

            stmts = irsb.statements
            ins = ""
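The fragment ends before anything is fed into vexhash. Purely as a sketch (not necessarily the project's actual continuation), the collected shingles could be hashed like this; note that MinHash models sets rather than multisets, so the counts in shingled would be ignored by this scheme:

# Sketch only: feed the distinct shingles into the 64-permutation MinHash.
for ngram in shingled:
    vexhash.update(ngram.encode('utf-8'))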
github fake-name / wlnupdates / util / db_organize.py (view on GitHub)
def minhash_str(in_str, perms, gram_sz):
	minhash = MinHash(num_perm=perms)
	for d in ngrams(in_str, gram_sz):  # character n-grams of the raw string
		minhash.update("".join(d).encode('utf-8'))
	return minhash
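A hedged usage sketch comparing two similar titles (parameter values are illustrative):

a = minhash_str('A Wandering Witchhunter', perms=128, gram_sz=3)
b = minhash_str('The Wandering Witchhunter', perms=128, gram_sz=3)
print(a.jaccard(b))  # high estimate: the trigram sets overlap heavily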