How to use editdistance - 10 common examples

To help you get started, we’ve selected a few editdistance examples based on popular ways the library is used in public projects.


github stanfordnlp / cocoa / mutualfriends / core / lexicon.py
:param agent: Agent id whose span is being entity linked
        :param uuid: uuid of scenario containing KB for given agent
        :return:
        """
        # Use heuristic scoring system
        #print 'span:', span
        if not self.learned_lex:
            entity_scores = []
            for c in candidates:
                #print 'c:', c
                # Clean up punctuation
                c_s = re.sub("-", " ", c[0])
                span_tokens = span.split()
                entity_tokens = c_s.split()

                ed = editdistance.eval(span, c[0])
                # Filter false positives
                if c[1] not in kb_entity_types:
                    #print 'false type'
                    continue

                def is_stopwords():
                    if span == c[0]:
                        return False
                    if len(span_tokens) == 1 and span in self.stop_words:
                        return True
                    if span_tokens[0] in ('and', 'or', 'to', 'from', 'of', 'in', 'at'):
                        return True
                    all_stop = True
                    for x in span_tokens:
                        if x not in self.stop_words:
                            all_stop = False
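
Distilled from the snippet above, here is a minimal, self-contained sketch of the same idea: rank candidate strings against a span by Levenshtein distance after light punctuation cleanup. The function name and sample data are illustrative, not part of the cocoa project.

import re
import editdistance

def rank_candidates(span, candidates):
    # Sort candidate strings by edit distance to the span, closest first.
    scored = []
    for cand in candidates:
        cleaned = re.sub("-", " ", cand)  # same punctuation cleanup as above
        scored.append((editdistance.eval(span, cleaned), cand))
    return [cand for _, cand in sorted(scored)]

print(rank_candidates("new york", ["new-york", "newark", "york"]))
# ['new-york', 'newark', 'york']
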
github espnet / espnet / espnet / nets / pytorch_backend / e2e_asr_mulenc.py
for i, y_hat in enumerate(y_hats):
                y_true = ys_pad[i]

                seq_hat = [self.char_list[int(idx)] for idx in y_hat if int(idx) != -1]
                seq_true = [self.char_list[int(idx)] for idx in y_true if int(idx) != -1]
                seq_hat_text = "".join(seq_hat).replace(self.recog_args.space, ' ')
                seq_hat_text = seq_hat_text.replace(self.recog_args.blank, '')
                seq_true_text = "".join(seq_true).replace(self.recog_args.space, ' ')

                hyp_words = seq_hat_text.split()
                ref_words = seq_true_text.split()
                word_eds.append(editdistance.eval(hyp_words, ref_words))
                word_ref_lens.append(len(ref_words))
                hyp_chars = seq_hat_text.replace(' ', '')
                ref_chars = seq_true_text.replace(' ', '')
                char_eds.append(editdistance.eval(hyp_chars, ref_chars))
                char_ref_lens.append(len(ref_chars))

            wer = 0.0 if not self.report_wer else float(sum(word_eds)) / sum(word_ref_lens)
            cer = 0.0 if not self.report_cer else float(sum(char_eds)) / sum(char_ref_lens)

        alpha = self.mtlalpha
        if alpha == 0:
            self.loss = self.loss_att
            loss_att_data = float(self.loss_att)
            loss_ctc_data_list = [None] * (self.num_encs + 1)
        elif alpha == 1:
            self.loss = torch.sum(torch.cat(
                [(item * self.weights_ctc_train[i]).unsqueeze(0) for i, item in enumerate(self.loss_ctc_list)]))
            loss_att_data = None
            loss_ctc_data_list = [float(self.loss)] + [float(item) for item in self.loss_ctc_list]
        else:
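
The error-rate arithmetic above reduces to a short, self-contained sketch once the hypothesis and reference are plain strings (espnet first builds them from token ids):

import editdistance

def wer_cer(hyp_text, ref_text):
    # Word error rate: edit distance over word lists, normalized by reference length.
    hyp_words, ref_words = hyp_text.split(), ref_text.split()
    wer = editdistance.eval(hyp_words, ref_words) / len(ref_words)
    # Character error rate: edit distance over characters, spaces removed.
    hyp_chars, ref_chars = hyp_text.replace(' ', ''), ref_text.replace(' ', '')
    cer = editdistance.eval(hyp_chars, ref_chars) / len(ref_chars)
    return wer, cer

print(wer_cer("the cat sat", "the cat sat down"))  # (0.25, 0.3076...)
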
github ChrisCummins / clgen / clgen / _fetch.py
subprocess.check_output(
                    f'find "{escp_topdir}" -type f -name {esc_basename}',
                    shell=True, universal_newlines=True)\
                    .split('\n')
                if x]

            # Select which file to inline:
            if len(candidates) == 1:
                # If there's exactly one match, then we're done:
                file_to_inline = candidates[0]
            elif len(candidates) > 1:
                # We have multiple candidates to inline, so we'll compare the
                # full paths (relative to the top directory) to select the one
                # whose name is the closest match:
                rel_matches = [match[len(topdir) + 1:] for match in candidates]
                distances = [editdistance.eval(include, path) for path in rel_matches]
                min_distance = min(distances)
                file_to_inline = candidates[distances.index(min_distance)]
                log.debug(f"Inferred include '{file_to_inline}' from '{line}' with distance {min_distance}")
            else:
                # We didn't find anything suitable:
                file_to_inline = None

            # Process the inline file:
            if file_to_inline in stack:
                # We've already inlined this file, so ignore it:
                outlines.append(clgen.format_as_comment(
                    lang, f'[FETCH] ignored_include({line})'))
            elif file_to_inline:
                # Inline the file by recursively expanding its contents:
                outlines.append(clgen.format_as_comment(
                    lang, f'[FETCH] begin_include({line})'))
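
The selection logic above in isolation: among several files that match an #include by basename, pick the one whose path relative to the project root has the smallest edit distance to the include string. The paths below are illustrative.

import editdistance

def closest_path(include, candidates, topdir):
    # Compare paths relative to topdir, then pick the smallest edit distance.
    rel_matches = [c[len(topdir) + 1:] for c in candidates]
    distances = [editdistance.eval(include, path) for path in rel_matches]
    return candidates[distances.index(min(distances))]

print(closest_path("util/math.h",
                   ["/src/util/math.h", "/src/test/math.h"], "/src"))
# /src/util/math.h
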
github iamlemec / patents / firm_cluster.py
def dmetr(name1, name2):
        max_len = max(len(name1), len(name2))
        max_dist = int(ceil(max_len*(1.0-thresh)))
        ldist = levenshtein(name1, name2)
        return (1.0 - float(ldist)/max_len) if (ldist != -1 and max_len != 0) else 0.0
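
The same normalized similarity can be written directly against editdistance.eval (the snippet's levenshtein helper and its -1 sentinel are project-specific):

import editdistance

def name_similarity(name1, name2):
    # 1.0 for identical strings, 0.0 for completely different ones.
    max_len = max(len(name1), len(name2))
    if max_len == 0:
        return 0.0
    return 1.0 - editdistance.eval(name1, name2) / max_len

print(name_similarity("acme corp", "acme corp."))  # 0.9
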
github zhourunlai / learning / Keras / image_ocr.py
def show_edit_distance(self, num):
        num_left = num
        mean_norm_ed = 0.0
        mean_ed = 0.0
        while num_left > 0:
            word_batch = next(self.text_img_gen)[0]
            num_proc = min(word_batch['the_input'].shape[0], num_left)
            decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
            for j in range(0, num_proc):
                edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
                mean_ed += float(edit_dist)
                mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
            num_left -= num_proc
        mean_norm_ed = mean_norm_ed / num
        mean_ed = mean_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, mean_ed, mean_norm_ed))
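
With the Keras batch plumbing stripped out, the core of the loop above averages raw and length-normalized edit distances over (prediction, truth) pairs; the sample pairs below are illustrative:

import editdistance

def mean_edit_distances(pairs):
    mean_ed = sum(editdistance.eval(p, t) for p, t in pairs) / len(pairs)
    mean_norm_ed = sum(editdistance.eval(p, t) / len(t) for p, t in pairs) / len(pairs)
    return mean_ed, mean_norm_ed

print(mean_edit_distances([("hello", "hallo"), ("world", "word")]))  # (1.0, 0.225)
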
github jdidion / atropos / paper / scripts / summarize_art_alignments.py
except:
                break
        r1 = next(p).rstrip()
        r2 = next(p).rstrip()
        total_reads += 1
        total_ref_bp += len(r1)
        
        if r1 == r2:
            continue
        
        if len(r1) != len(r2):
            num_adapters += 1
            l1 = len(r1)
            n = min(len(r2) - l1, len(adapter))
            total_adapter_bp += n
            total_adapter_edit_dist += editdistance.eval(r2[l1:(l1+n)], adapter[0:n])
            r2 = r2[0:l1]
        
        if r1 != r2:
            num_reads_mismatch += 1
            total_edit_dist += editdistance.eval(r1,r2)
    
    w.writerow((i,
        num_adapters, num_reads_mismatch, total_reads,
        total_edit_dist, total_ref_bp,
        total_adapter_edit_dist, total_adapter_bp
    ))
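
The adapter check above as a standalone helper: when read r2 extends past reference r1, compare the overhang against the expected adapter prefix. The adapter sequence below is a common Illumina adapter, used purely for illustration.

import editdistance

ADAPTER = "AGATCGGAAGAGC"  # illustrative adapter sequence

def adapter_edit_distance(r1, r2):
    # Edit distance between r2's tail beyond len(r1) and the adapter prefix.
    l1 = len(r1)
    n = min(len(r2) - l1, len(ADAPTER))
    return editdistance.eval(r2[l1:l1 + n], ADAPTER[:n])

print(adapter_edit_distance("ACGT", "ACGTAGATCGGA"))  # 0
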
github chrismattmann / tika-similarity / edit-value-similarity.py
file2_metadata = metadata_dict[file2]

                intersect_features = set(file1_metadata.keys()) & set(file2_metadata.keys())

                intersect_features = [feature for feature in intersect_features if feature not in na_metadata ]

                file_edit_distance = 0.0
                for feature in intersect_features:

                    file1_feature_value = stringify(file1_metadata[feature])
                    file2_feature_value = stringify(file2_metadata[feature])

                    if len(file1_feature_value) == 0 and len(file2_feature_value) == 0:
                        feature_distance = 0.0
                    else:
                        feature_distance = float(editdistance.eval(file1_feature_value, file2_feature_value))/(len(file1_feature_value) if len(file1_feature_value) > len(file2_feature_value) else len(file2_feature_value))

                    file_edit_distance += feature_distance

                if allKeys:
                    file1_only_features = set(file1_metadata.keys()) - set(intersect_features)
                    file1_only_features = [feature for feature in file1_only_features if feature not in na_metadata]

                    file2_only_features = set(file2_metadata.keys()) - set(intersect_features)
                    file2_only_features = [feature for feature in file2_only_features if feature not in na_metadata]

                    file_edit_distance += len(file1_only_features) + len(file2_only_features)       # each disjunct feature in (A-B) and (B-A) adds a distance of 1: a value compared against nothing is a full-length edit, which normalizes to 1
                    file_edit_distance /= float(len(intersect_features) + len(file1_only_features) + len(file2_only_features))

                else:
                    file_edit_distance /= float(len(intersect_features))    #average edit distance
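
The per-feature distance above as a standalone helper: edit distance between two stringified metadata values, normalized by the longer value's length.

import editdistance

def feature_distance(value1, value2):
    # Normalized edit distance in [0, 1]; empty-vs-empty counts as identical.
    if len(value1) == 0 and len(value2) == 0:
        return 0.0
    return editdistance.eval(value1, value2) / max(len(value1), len(value2))

print(feature_distance("image/jpeg", "image/png"))  # 0.2
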
github jsn5 / fakeElonDetector / twitterbot.py
api = tweepy.API(auth,wait_on_rate_limit=True)


c = tweepy.Cursor(api.search,q='to:'+'elonmusk',since_id='1017463575919693826',include_entities=True).items()


while True:
	try:
		tweet = c.next()
		text = tweet.text
		name = tweet.user.name
		screen_name = tweet.user.screen_name
		tweet_id = tweet.id
		print(tweet_id)
		if screen_name != 'elonmusk':
			name_dist = editdistance.eval('Elon Musk', name)
			screen_dist = editdistance.eval('elonmusk',screen_name)
			if name_dist <= 2 or screen_dist <= 4:
				print("fake found")
				api.update_status("*❗️❗️beep boop❗️❗️* Fake Elon Musk detected❎, report as spam ❎ @elonmusk @{}".format(screen_name), in_reply_to_status_id=tweet_id)
				api.report_spam(screen_name=screen_name)
	except tweepy.TweepError:
		print("limit reached")
		time.sleep(60*5)
		continue
	except StopIteration:
		print("end of result")
		time.sleep(10)
		c = tweepy.Cursor(api.search,q='to:'+'elonmusk',since_id=tweet_id,include_entities=True).items()
		continue
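
Isolated from the Tweepy plumbing, the impersonation test above flags a handle as suspicious when its display name or screen name sits within a small edit distance of the real account's (the thresholds are the snippet's own):

import editdistance

def looks_like_impersonator(name, screen_name):
    if screen_name == 'elonmusk':
        return False  # the real account
    name_dist = editdistance.eval('Elon Musk', name)
    screen_dist = editdistance.eval('elonmusk', screen_name)
    return name_dist <= 2 or screen_dist <= 4

print(looks_like_impersonator('Elon Musk.', 'elonmusk_2'))  # True
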
github pannous / tensorflow-ocr / train.py
def show_edit_distance(self, num):
        num_left = num
        mean_norm_ed = 0.0
        mean_ed = 0.0
        while num_left > 0:
            word_batch = next(self.text_img_gen)[0]
            num_proc = min(word_batch['the_input'].shape[0], num_left)
            decoded_res = decode_batch(self.test_func, word_batch['the_input'][0:num_proc])
            for j in range(num_proc):
                edit_dist = editdistance.eval(decoded_res[j], word_batch['source_str'][j])
                mean_ed += float(edit_dist)
                mean_norm_ed += float(edit_dist) / len(word_batch['source_str'][j])
            num_left -= num_proc
        mean_norm_ed = mean_norm_ed / num
        mean_ed = mean_ed / num
        print('\nOut of %d samples:  Mean edit distance: %.3f Mean normalized edit distance: %0.3f'
              % (num, mean_ed, mean_norm_ed))
github modernmt / DataCollection / docaligner / ratio.py
def levenshtein_avg(weights, seq1, seq2):
    norm = .5 * (len(seq1) + len(seq2))
    return 1 - (editdistance.eval(seq1, seq2) / norm)
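
A quick usage note: weights is unused in the excerpt, so None can be passed for it. Scores reach 1.0 for identical sequences, and since editdistance.eval accepts any sequence of hashables, token lists work as well as strings:

# assuming levenshtein_avg as defined above is in scope
print(levenshtein_avg(None, "kitten", "sitting"))     # ~0.538
print(levenshtein_avg(None, ["a", "b"], ["a", "c"]))  # 0.5
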

editdistance

Fast implementation of the edit distance (Levenshtein distance), MIT-licensed.