How to use the fuzzywuzzy.fuzz.token_sort_ratio function in fuzzywuzzy

To help you get started, we’ve selected a few fuzzywuzzy examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github sunlightlabs / datacommons / dcapi / reconcile / tests.py View on Github external
def test_service_metadata(self):
        self.maxDiff = None
        response = self.client.get('/api/1.0/refine/reconcile', {'callback': 'jsonp123'})

        self.assertEqual(200, response.status_code)
        self.assertEqual(100,
            fuzz.token_sort_ratio(
                'jsonp123({"name": "Influence Explorer Reconciliation3", "identifierSpace": "http://staging.influenceexplorer.com/ns/entities", "schemaspace": "http://staging.influenceexplorer.com/ns/entity.object.id", "view": { "url": "http://staging.influenceexplorer.com/entity/{{id}}" }, "preview": { "url": "http://staging.influenceexplorer.com/entity/{{id}}", "width": 430, "height": 300 }, "defaultTypes": []})',
                response.content
            )
github opentargets / library-beam / modules / BioentityTagger.py View on Github external
pref_name) in automation.iter(text_to_tag.lower()):
            start_index = end_index - len(match) + 1
            end_index += 1

            if (start_index == 0 or text_to_tag[start_index - 1] in BioEntityTagger.separators_all) and \
                    (end_index == len(text_to_tag) or text_to_tag[end_index] in BioEntityTagger.separators_all):
                for j in range(len(category_list)):
                    category = category_list[j]
                    reference_db = reference_db_list[j]
                    entity_id = entity_id_list[j]
                    if isinstance(entity_id, list):
                        entity_id = entity_id[0]
                    if category.endswith('-TOKEN'):
                        pre, post = original_value.split(match)[:2]
                        potential_match = text_to_tag[start_index:end_index + len(post)]
                        score = fuzz.token_sort_ratio(original_value, potential_match)
                        if score > 90:
                            tag = MatchedTag(match, start_index, end_index, category.replace('-TOKEN', ''),
                                             reference_db,
                                             entity_id, original_value, pref_name)
                            matches.append(tag.__dict__)
                    else:
                        tag = MatchedTag(match, start_index, end_index, category, reference_db, entity_id,
                                         original_value, pref_name)
                        matches.append(tag.__dict__)
            else:
                pass

        grouped_matches = BioEntityTagger.group_matches_by_category_and_reference(matches)
        filtered_matches = []
        for group, matches_in_group in grouped_matches.items():
            non_nested_matches = BioEntityTagger.remove_nested_matches(matches_in_group)
github ettorerizza / aat_reconcile / reconcile.py View on Github external
else:
            url = api_base_url + urllib.quote(query) + '&logop=and&notes='
        app.logger.debug("AAT url is " + url)
        resp = requests.get(url)
        results = ET.fromstring(resp.content)
    except getopt.GetoptError as e:
        app.logger.warning(e)
        return out

    for child in results.iter('Preferred_Parent'):
        match = False
        try:
            name = re.sub(r'\[.+?\]', '', child.text.split(',')[0]).strip()
            # the termid is NOT the ID ! We have to find it in the first prefered parent
            id = re.search(r"\[(.+?)\]", child.text.split(',')[0]).group(1)
            score = fuzz.token_sort_ratio(query, name)
        except AttributeError:
            pass
        if score > 95:
            match = True
        app.logger.debug("Label is " + name + " Score is " +
                         str(score) + " URI is " + id)
        resource = {
            "id": id,
            "name": name,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)

    # Sort this list containing prefterms by score
github vered1986 / Chirps / source / generate_instances / get_corefering_predicates.py View on Github external
def pair_aligned_propositions(propositions, pronouns):
    """
    Align predicates with the same arguments in different sentences
    :param propositions: the (sent, pred, arg1, arg2) tuples
    :return: a list of aligned_prop
    """
    predicate_alignments = []

    candidates = get_candidate_pairs(propositions, pronouns)

    for (tweet_id1, sent1, sf_pred1, pred1, s0_a0, s0_a1, tweet_id2, sent2, sf_pred2, pred2, s1_a0, s1_a1) in candidates:

        # Same tweet
        if fuzz.token_sort_ratio(sent1, sent2) >= 95:
            continue

        # Same predicates?
        if is_eq_preds(pred1, pred2):
            continue

        # Same arguments?
        is_eq_a0_a0, is_eq_a1_a1, is_eq_a0_a1, is_eq_a1_a0 = \
            is_eq_arg(s0_a0, s1_a0), is_eq_arg(s0_a1, s1_a1), is_eq_arg(s0_a0, s1_a1), is_eq_arg(s0_a1, s1_a0)

        # Are arguments aligned?
        is_aligned_a0_a0 = is_eq_a0_a0 or is_aligned_arg(s0_a0, s1_a0)
        is_aligned_a1_a1 = is_eq_a1_a1 or is_aligned_arg(s0_a1, s1_a1)
        is_aligned_a0_a1 = is_eq_a0_a1 or is_aligned_arg(s0_a0, s1_a1)
        is_aligned_a1_a0 = is_eq_a1_a0 or is_aligned_arg(s0_a1, s1_a0)
github lfd / PaStA / pypasta / PatchEvaluation.py View on Github external
def preevaluate_filenames(thresholds, right_files, left_file):
    """Pair *left_file* with the right-side files that are plausible renames.

    A right file is a candidate when its token-sort similarity to
    *left_file* (scaled to 0.0-1.0) reaches ``thresholds.filename``.

    Returns a ``(left_file, candidates)`` tuple where ``candidates`` is a set.
    """
    # We won't enter preevaluate_filenames, if tf >= 1.0
    candidates = {
        candidate
        for candidate in right_files
        if fuzz.token_sort_ratio(left_file, candidate) / 100 >= thresholds.filename
    }
    return left_file, candidates
github castorini / BuboQA / entity_linking / cross_linking.py View on Github external
# relation correction
    C_pruned = []
    for mid in set(C):
        if mid in index_reach.keys():  # PROBLEM: don't know why this may not exist??
            count_mid = C.count(mid)  # count number of times mid appeared in C
            C_pruned.append((mid, count_mid))

    C_tfidf_pruned = []
    for mid, count_mid in C_pruned:
        if mid in index_names.keys():
            cand_ent_name = pick_best_name(question[2], index_names[mid])
            if args.sim == "custom":
                tfidf = calc_tf_idf(query_text, cand_ent_name, count_mid, num_entities_fbsubset, index_ent)
                simple_match =  fuzz.ratio(cand_ent_name, question) / 100.0
                token_sort_ratio = fuzz.token_sort_ratio(cand_ent_name, question) / 100.0
                score = tfidf * 0.01 + simple_match + token_sort_ratio
            elif args.sim == "fuzzy":
                score = fuzzy_match_score(cand_ent_name, query_text)
            else:
                score = calc_tf_idf(query_text, cand_ent_name, count_mid, num_entities_fbsubset, index_ent)
            C_tfidf_pruned.append((mid, cand_ent_name, score))
    # print("C_tfidf_pruned[:10]: {}".format(C_tfidf_pruned[:10]))

    if len(C_tfidf_pruned) == 0:
        #print("WARNING: C_tfidf_pruned is empty.")
        notfound_c_lineids.append(lineid)
        notfound_c += 1
        continue

    C_tfidf_pruned.sort(key=lambda t: -t[2])
    cand_mids = C_tfidf_pruned[:HITS_TOP_ENTITIES]
github cmharlow / geonames-reconcile / reconcile.py View on Github external
alt = alternate[0]
        else:
            alt = ''
        geonames_id = item.get('geonameId')
        geonames_uri = make_uri(geonames_id)
        lat = item.get('lat')
        lng = item.get('lng')
        #Way to cheat + get name + coordinates into results:
        name_coords = name + ' | ' + lat + ', ' + lng
        #Avoid returning duplicates:
        if geonames_id in unique_geonames_ids:
            continue
        else:
            unique_geonames_ids.append(geonames_id)
        score_1 = fuzz.token_sort_ratio(query, name)
        score_2 = fuzz.token_sort_ratio(query, alt)
        score = max(score_1, score_2)
        if query == text.normalize(name, PY3):
            match = True
        elif query == text.normalize(alt, PY3):
            match = True
        resource = {
            "id": geonames_uri,
            "name": name_coords,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
    #Refine only will handle top three matches.
github lfd / PaStA / pypasta / PatchEvaluation.py View on Github external
def compare_hunks(left, right):
        """Return a 0-100 token-sort similarity score between two hunks."""
        # Short-circuit exact equality (e.g. both hunks only remove empty
        # newlines). This check is _required_: fuzzywuzzy currently contains
        # a bug that misevaluates equivalent inputs, see
        # https://github.com/seatgeek/fuzzywuzzy/issues/196
        return 100 if left == right else fuzz.token_sort_ratio(left, right)
github cmharlow / geonames-reconcile / reconcile.py View on Github external
if (len(alternate) > 0):
            alt = alternate[0]
        else:
            alt = ''
        geonames_id = item.get('geonameId')
        geonames_uri = make_uri(geonames_id)
        lat = item.get('lat')
        lng = item.get('lng')
        #Way to cheat + get name + coordinates into results:
        name_coords = name + ' | ' + lat + ', ' + lng
        #Avoid returning duplicates:
        if geonames_id in unique_geonames_ids:
            continue
        else:
            unique_geonames_ids.append(geonames_id)
        score_1 = fuzz.token_sort_ratio(query, name)
        score_2 = fuzz.token_sort_ratio(query, alt)
        score = max(score_1, score_2)
        if query == text.normalize(name, PY3):
            match = True
        elif query == text.normalize(alt, PY3):
            match = True
        resource = {
            "id": geonames_uri,
            "name": name_coords,
            "score": score,
            "match": match,
            "type": query_type_meta
        }
        out.append(resource)
    #Sort this list by score
    sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
github ricosr / retrieval_chatbot / fuzzy_match.py View on Github external
def fuzzy_for_domains(utterance, context_ls):
    """Score *utterance* against each context pair and normalize the scores.

    Each entry of *context_ls* is an indexable pair whose two parts are
    concatenated; the score is the mean of four fuzzywuzzy ratios
    (ratio, partial_ratio, token_sort_ratio, token_set_ratio).

    Returns the list of mean ratios passed through ``normalization``.
    """
    scorers = (fuzz.ratio, fuzz.partial_ratio,
               fuzz.token_sort_ratio, fuzz.token_set_ratio)
    mean_ratios = []
    for context in context_ls:
        # The two context parts are compared as one concatenated string.
        combined = context[0] + context[1]
        total = sum(scorer(utterance, combined) for scorer in scorers)
        mean_ratios.append(total / 4)
    return normalization(mean_ratios)