How to use the reynir.TOK.WORD function in reynir

To help you get started, we’ve selected a few reynir examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github mideind / Greynir / nertokenizer.py View on Github external
# For all uppercase phrases (words, entities, persons),
                        # maintain a map of last names to full names
                        # NOTE(review): fragment — `w`, `token`, `db`, `lastnames`,
                        # `upper` and `weak` are bound earlier, outside this excerpt.
                        parts = w.split()
                        # The final whitespace-separated part is treated as the surname
                        lastname = parts[-1]
                        # Clinton -> Hillary [Rodham] Clinton
                        if lastname[0].isupper():
                            # Look for Icelandic patronyms/matronyms
                            # (m is presumably a list of BÍN meaning entries — TODO confirm)
                            _, m = db.lookup_word(lastname, False)
                            if m and any(mm.fl in {"föð", "móð"} for mm in m):
                                # We don't store Icelandic patronyms/matronyms
                                # as surnames
                                pass
                            else:
                                # Remember this surname so a later bare occurrence
                                # can be linked back to the full-name token
                                lastnames[lastname] = token

                    if token.kind == TOK.WORD and upper and w not in Abbreviations.DICT:
                        if " " in w:
                            # w may be a person name with more than one embedded word
                            # parts is assigned in the if statement above
                            cnt = len(parts)
                        elif not token.val or ("-" in token.val[0].stofn):
                            # No BÍN meaning for this token, or the meanings
                            # were constructed by concatenation (indicated by a hyphen
                            # in the stem)
                            weak = False  # Accept single-word entity references
                        # elist is a list of Entity instances
                        elist = query_entities(w)
                    else:
                        # Not an uppercase non-abbreviation word: no entity candidates
                        elist = []

                    if elist:
                        # This word might be a candidate to start an entity reference
github mideind / Greynir / postagger.py View on Github external
def ifd_taglist_word(txt, mlist):
            """ Return a list of (IFD tag, probability) candidates for the
                word `txt`, given its BÍN meaning list `mlist`.
                NOTE(review): truncated excerpt — the final return statement
                falls outside this view. """
            if not mlist:
                # No BÍN meanings available for this word
                if txt[0].isupper():
                    # Óþekkt sérnafn? (Unknown proper name?)
                    # !!! The probabilities below are a rough guess
                    # Guess noun-proper tags across the four Icelandic cases
                    return [
                        ("nxen-s", 0.6),
                        ("nxeo-s", 0.1),
                        ("nxeþ-s", 0.1),
                        ("nxee-s", 0.2),
                    ]
                # Erlent orð? (Foreign word?) — single catch-all tag
                return [("e", 1.0)]
            # Derive the set of candidate IFD tags from the BÍN meanings
            s = set(ifd_tag(TOK.WORD, txt, m) for m in mlist)
            ltxt = txt.lower()
            if ltxt in Prepositions.PP:
                # Prepositions can also be tagged by the case they govern
                for case in Prepositions.PP[ltxt]:
                    if case in self.CASE_TO_TAG:
                        s.add(self.CASE_TO_TAG[case])
            if ltxt in self._CONJ_REF:
                # For referential conjunctions,
                # add 'ct' as a possibility (it does not come directly from a BÍN mark)
                s.add("ct")
            # Add a +1 bias to the counts so that no lemma/tag pairs have zero frequency
            prob = self.lemma_count(txt) + len(s)
            d = self.lemma_tags(txt)
            # It is possible for the probabilities of the tags in set s
            # not to add up to 1.0. This can happen if the tokenizer has
            # eliminated certain BÍN meanings due to updated settings
            # in Pref.conf.
github mideind / Greynir / postagger.py View on Github external
def gen_tokens():
            """ Generate a Greynir token sequence from a tagging result.
                NOTE(review): truncated excerpt — the membership test at the
                end is cut off mid-set-literal; `toklist` and `TreeUtility`
                are bound outside this view. """
            ix = 0
            for t in toklist:
                if not t.txt:
                    # Skip tokens with no surface text
                    continue
                # The code below should correspond to TreeUtility._describe_token()
                d = dict(x=t.txt)
                if t.kind == TOK.WORD:
                    # set d["m"] to the meaning
                    pass
                else:
                    # Non-word tokens carry their kind explicitly
                    d["k"] = t.kind
                if t.val is not None and t.kind not in {
                    TOK.WORD,
                    TOK.ENTITY,
                    TOK.PUNCTUATION,
                }:
                    # For tokens except words, entities and punctuation, include the val field
                    if t.kind == TOK.PERSON:
                        # Persons get a chosen full name plus gender
                        d["v"], d["g"] = TreeUtility.choose_full_name(
                            t.val, case=None, gender=None
                        )
                    else:
                        d["v"] = t.val
                if t.kind in {
                    TOK.WORD,
                    TOK.ENTITY,
                    TOK.PERSON,
                    TOK.NUMBER,
                    TOK.YEAR,
github mideind / Greynir / postagger.py View on Github external
def tag_stream(sentence_stream: Iterable[Iterable[Dict[str, Any]]]) -> Iterator[str]:
            """ Yield an IFD tag string for each taggable token in the
                given stream of sentences, padding every sentence with
                n-1 empty strings on both sides (for n-gram context).
                Punctuation tokens and tokens that produce an empty tag
                are skipped; tag counts per lemma are accumulated in
                self.lemma_cnt as a side effect. """
            # Pre-built boundary padding, emitted before and after each sentence
            padding = [""] * (n - 1)
            for sentence in sentence_stream:
                if not sentence:
                    # Ignore empty sentences entirely
                    continue
                yield from padding
                for token in sentence:
                    # Punctuation is not tagged
                    if token.get("k", TOK.WORD) == TOK.PUNCTUATION:
                        continue
                    # canonicalize_token() mutates the token dict in place
                    canonicalize_token(token)
                    ifd = str(IFD_Tagset(token))
                    if ifd:
                        # Record the tag frequency for this token text
                        self.lemma_cnt[token["x"]][ifd] += 1
                        yield ifd
                yield from padding