How to use the reynir.binparser.BIN_Token class in reynir

To help you get started, we’ve selected a few reynir examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github mideind / Greynir / nn / nnclient.py View on Github external
def _normalize_text(cls, text):
        """ Preprocess text and normalize it for the parsing network.

        The input is split into paragraphs on newlines; each paragraph is
        tokenized and reduced to the text of the tokens that the parser
        understands, joined with single spaces.

        Returns a list of normalized paragraph strings, one per input line.
        """
        pgs = text.split("\n")
        # Keep only tokens the BIN parser understands. Fix: the original
        # wrapped the tokenizer in a redundant list() before iterating it.
        normalized_pgs = [
            [
                tok.txt
                for tok in bintokenizer.tokenize(pg)
                if BIN_Token.is_understood(tok)
            ]
            for pg in pgs
        ]
        # tok.txt can be falsy for some token kinds; drop those when joining
        return [
            " ".join(tok for tok in npg if tok) for npg in normalized_pgs
        ]
github mideind / Greynir / nn / utils.py View on Github external
def split_text(text: str) -> List[List[str]]:
    """ Segments contiguous (Icelandic) text into paragraphs and sentences
        and returns a list of lists
    """
    token_stream = bintokenizer.tokenize(prep_text_for_tokenizer(text))
    result = []  # type: List[List[str]]
    for paragraph in tokenizer.paragraphs(token_stream):
        sentences = []  # type: List[str]
        for _, sent_tokens in paragraph:
            # Drop tokens the BIN parser does not understand before normalizing
            understood = [t for t in sent_tokens if BIN_Token.is_understood(t)]
            sentences.append(tokenizer.normalized_text_from_tokens(understood))
        result.append(sentences)
    return result
github mideind / Greynir / nn / nnclient.py View on Github external
def _normalize_sentence(cls, single_sentence):
        """ Preprocess text and normalize for parsing network """
        # Tokenize the sentence and keep the text of understood tokens only
        understood = (
            tok
            for tok in bintokenizer.tokenize(single_sentence)
            if BIN_Token.is_understood(tok)
        )
        return [tok.txt for tok in understood]
github mideind / Greynir / tree.py View on Github external
# NOTE(review): truncated excerpt of a terminal/token matching routine from
# tree.py; the enclosing def and its tail are missing from this scrape.
# m.beyging is presumably the BÍN inflection descriptor string — TODO confirm.
for c in self._CASES:
                    if c != case_override:
                        # Token is inflected in a case other than the override: no match
                        if c.upper() in m.beyging:
                            return False
            elif self.case.upper() not in m.beyging:
                # Required case not present in the inflection descriptor
                return False
        # Check number match
        if self.number is not None:
            if self.number.upper() not in m.beyging:
                return False

        if self.is_verb:
            # The following code is parallel to BIN_Token.verb_matches()
            for v in self.varlist:
                # Lookup variant to see if it is one of the required ones for verbs
                rq = BIN_Token._VERB_FORMS.get(v)
                if rq and rq not in m.beyging:
                    # If this is required variant that is not found in the form we have,
                    # return False
                    return False
            # Reverse check: these verb forms must not be present in the token
            # unless the terminal explicitly asks for them
            for v in ["sagnb", "lhþt", "bh"]:
                if BIN_Token.VARIANT[v] in m.beyging and v not in self.variants:
                    return False
            # NOTE(review): "bh" looks like imperative mood and "ST" a clipped
            # ("stýfður") form in BÍN notation — confirm against binparser
            if "bh" in self.variants and "ST" in m.beyging:
                return False
            if self.varlist[0] not in "012":
                # No need for argument check: we're done, unless...
                if "lhþt" in self.variants:
                    # Special check for lhþt: may specify a case without it being an argument case
                    if any(
                        c in self.variants and BIN_Token.VARIANT[c] not in m.beyging
                        for c in BIN_Token.CASES
github mideind / Greynir / tree.py View on Github external
# NOTE(review): truncated excerpt of a verb-matching routine from tree.py,
# parallel to BIN_Token.verb_matches(); the enclosing def is not visible here.
if rq and rq not in m.beyging:
                    # If this is required variant that is not found in the form we have,
                    # return False
                    return False
            # These verb forms must not appear in the token unless the terminal
            # explicitly requests them
            for v in ["sagnb", "lhþt", "bh"]:
                if BIN_Token.VARIANT[v] in m.beyging and v not in self.variants:
                    return False
            if "bh" in self.variants and "ST" in m.beyging:
                return False
            if self.varlist[0] not in "012":
                # No need for argument check: we're done, unless...
                if "lhþt" in self.variants:
                    # Special check for lhþt: may specify a case without it being an argument case
                    if any(
                        c in self.variants and BIN_Token.VARIANT[c] not in m.beyging
                        for c in BIN_Token.CASES
                    ):
                        # Terminal specified a non-argument case but the token doesn't have it:
                        # no match
                        return False
                return True
            # First variant is the verb's argument count (0, 1 or 2)
            nargs = int(self.varlist[0])
            if m.stofn in VerbObjects.VERBS[nargs]:
                # The verb (by lemma, m.stofn) is known to take nargs arguments
                if nargs == 0 or len(self.varlist) < 2:
                    # No arguments: we're done
                    return True
                # Each known argument-case pattern must match the terminal's
                # remaining variants position by position
                for argspec in VerbObjects.VERBS[nargs][m.stofn]:
                    if all(self.varlist[1 + ix] == c for ix, c in enumerate(argspec)):
                        # This verb takes arguments that match the terminal
                        return True
                return False
            # NOTE(review): excerpt is cut off here; the loop body is missing
            for i in range(0, nargs):
github mideind / Greynir / nn / utils.py View on Github external
and returns:
            dictionary of sentence indices to sentences
            dictionary of paragraph index to constituent sentence indices"""
    # NOTE(review): the def line and docstring opening are missing from this
    # excerpt; judging from the tail of the docstring above, this function
    # indexes paragraphs and sentences of a text for the parsing network.
    text = prep_text_for_tokenizer(text)
    tok_stream = bintokenizer.tokenize(text)

    pgs = tokenizer.paragraphs(tok_stream)
    pg_idx_to_sent_idx = dict()  # type: Dict[int, List[int]]
    sent_idx_to_sent = dict()  # type: Dict[int, str]
    curr_sent_idx = 0
    curr_pg_idx = 0

    for pg in pgs:
        sent_idxs = []
        for _, sent in pg:
            # Keep only tokens the BIN parser understands, then normalize
            curr_sent = list(filter(BIN_Token.is_understood, sent))  # type: List[Tok]
            curr_sent_text = tokenizer.normalized_text_from_tokens(curr_sent)
            # Assign a global (text-wide) index to every sentence
            sent_idxs.append(curr_sent_idx)
            sent_idx_to_sent[curr_sent_idx] = curr_sent_text
            curr_sent_idx += 1
        # Map each paragraph to the indices of its constituent sentences
        pg_idx_to_sent_idx[curr_pg_idx] = sent_idxs
        curr_pg_idx += 1
    return pg_idx_to_sent_idx, sent_idx_to_sent