How to use the flair.data.Token class in flair

To help you get started, we've selected a few flair examples based on popular ways the Token class is used in public projects.

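Before the examples, here is a minimal sketch of the class in use; it assumes the older flair API that the snippets on this page reflect, so verify method names against your installed version.

from flair.data import Sentence, Token

# build a sentence token by token instead of letting flair tokenize a raw string
sentence = Sentence()
for word in ["Berlin", "is", "nice", "."]:
    sentence.add_token(Token(word))

print(sentence.to_tokenized_string())  # Berlin is nice .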

From flairNLP/flair, tests/test_visual.py:
def mock_ner_span(text, tag, start, end):
    span = Span([])
    span.tag = tag
    span.start_pos = start
    span.end_pos = end
    span.tokens = [Token(text[start:end])]
    return span
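The mock above could be exercised like this (illustrative values; Token exposes the wrapped string via .text):

span = mock_ner_span("George Washington went to Washington.", "PER", 0, 17)
assert span.tag == "PER"
assert span.tokens[0].text == "George Washington"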
From flairNLP/flair, tests/test_data.py:
def test_get_head():
    token1 = Token("I", 0)
    token2 = Token("love", 1, 0)
    token3 = Token("Berlin", 2, 1)

    sentence: Sentence = Sentence()
    sentence.add_token(token1)
    sentence.add_token(token2)
    sentence.add_token(token3)

    assert token2 == token3.get_head()
    assert token1 == token2.get_head()
    assert token1.get_head() is None  # the root token has no head
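Spelled out with keywords, and assuming the older flair signature Token(text, idx, head_id, ...), the same tokens read:

token1 = Token("I", idx=0)                 # no head: get_head() returns None
token2 = Token("love", idx=1, head_id=0)   # head is the token with idx 0
token3 = Token("Berlin", idx=2, head_id=1)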
From flairNLP/flair, tests/test_data.py:
def test_sentence_add_token():
    token1: Token = Token("Munich")
    token2: Token = Token("and")
    token3: Token = Token("Berlin")
    token4: Token = Token("are")
    token5: Token = Token("nice")

    sentence: Sentence = Sentence()

    sentence.add_token(token1)
    sentence.add_token(token2)
    sentence.add_token(token3)
    sentence.add_token(token4)
    sentence.add_token(token5)

    sentence.add_token("cities")
    sentence.add_token(Token("."))

    assert "Munich and Berlin are nice cities ." == sentence.to_tokenized_string()
From flairNLP/flair, flair/datasets.py:
):
                        line = file.readline()
                        continue

                    if self.__line_completes_sentence(line):
                        if len(sentence) > 0:
                            sentence.infer_space_after()
                            if self.tag_to_bioes is not None:
                                sentence.convert_tag_scheme(
                                    tag_type=self.tag_to_bioes, target_scheme="iobes"
                                )
                            return sentence

                    else:
                        fields: List[str] = re.split(r"\s+", line)
                        token = Token(fields[self.text_column])
                        for column in self.column_name_map:
                            if len(fields) > column:
                                if column != self.text_column:
                                    token.add_tag(
                                        self.column_name_map[column], fields[column]
                                    )

                        if not line.isspace():
                            sentence.add_token(token)

                    line = file.readline()
        return sentence
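A minimal sketch of the same idea, parsing one whitespace-separated CoNLL-style line into a tagged Token (add_tag is the older flair API used in the excerpt):

import re
from flair.data import Token

fields = re.split(r"\s+", "Berlin NNP B-LOC")
token = Token(fields[0])         # column 0 holds the word form
token.add_tag("pos", fields[1])  # remaining columns are tag layers
token.add_tag("ner", fields[2])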
From dcavar/Flair-JSON-NLP, flairjsonnlp/__init__.py:
def get_sentences(text, lang, use_ontonotes, fast, use_embeddings, char_embeddings, bpe_size, expressions, pos, sentiment) -> List[Sentence]:
        """Process text using Flair and return the output from Flair"""

        if lang not in ('en', 'multi', 'de', 'nl', 'fr'):
            raise TypeError(
                f'{lang} is not supported! Try multi. See https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_2_TAGGING.md')

        # tokenize sentences
        sentences = []
        for s in segment(text):
            sentence = Sentence()
            sentences.append(sentence)
            for t in s:
                sentence.add_token(Token(t.value, start_position=t.offset, whitespace_after=t.space_after))

        # run models
        for model in get_models(lang=lang, use_ontonotes=use_ontonotes, fast=fast, expressions=expressions, pos=pos, sentiment=sentiment):
            model.predict(sentences)

        # load embedding models
        if use_embeddings or char_embeddings or bpe_size > 0:
            get_embeddings([e.strip() for e in use_embeddings.split(',')], char_embeddings, lang, bpe_size).embed(sentences)

        return sentences
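The tokenization loop above preserves character offsets and spacing from the upstream segmenter. A self-contained sketch with hand-written values in place of the segment(text) output:

from flair.data import Sentence, Token

sentence = Sentence()
for value, offset, space_after in [("Hello", 0, True), ("world", 6, False), ("!", 11, True)]:
    sentence.add_token(Token(value, start_position=offset, whitespace_after=space_after))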
From flairNLP/flair, flair/data.py:
def tokenizer(text: str) -> List[Token]:
    """
    Tokenizer using tiny_tokenizer, a third-party library which supports
    multiple Japanese tokenizers such as MeCab, KyTea and SudachiPy.
    """
    tokens: List[Token] = []
    words: List[str] = []

    # split the text into sentences, then each sentence into words
    sentences = sentence_tokenizer.tokenize(text)
    for sentence in sentences:
        tiny_tokenizer_tokens = word_tokenizer.tokenize(sentence)
        words.extend(list(map(str, tiny_tokenizer_tokens)))

    # determine offsets for whitespace_after field
    index = text.index
    current_offset = 0
    previous_word_offset = -1
    previous_token = None
    for word in words:
        try:
            word_offset = index(word, current_offset)
            start_position = word_offset
        except ValueError:  # str.index raises ValueError if the word is not found
            word_offset = previous_word_offset + 1
            start_position = (
                current_offset + 1 if current_offset > 0 else current_offset
            )

        token = Token(
            text=word, start_position=start_position, whitespace_after=True
        )
        tokens.append(token)

        # if this word starts right after the previous one, there was no space
        if (previous_token is not None) and word_offset - 1 == previous_word_offset:
            previous_token.whitespace_after = False

        current_offset = word_offset + len(word)
        previous_word_offset = current_offset - 1
        previous_token = token

    return tokens
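As a hedged usage sketch: the surrounding flair code builds sentence_tokenizer and word_tokenizer from the tiny_tokenizer package (since renamed konoha); the setup below is illustrative and assumes a MeCab backend is installed.

from konoha import SentenceTokenizer, WordTokenizer  # successor of tiny_tokenizer

sentence_tokenizer = SentenceTokenizer()
word_tokenizer = WordTokenizer("MeCab")  # backend choice is an assumption

tokens = tokenizer("私はベルリンが大好きです。")
print([t.text for t in tokens])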
From fishjh2/merge_label, mg_lb/data_loading/fl_embeds.py:
def sent_to_flair(sent):
    """
    Convert a tokenized sentence (list of words) to a Flair sentence object
    """
    sentence = Sentence()

    for w in sent:
        sentence.add_token(Token(w))

    # infer the whitespace_after field once, after all tokens are added
    sentence.infer_space_after()

    return sentence
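Hypothetical usage of the helper above:

sentence = sent_to_flair(["I", "love", "Berlin"])
print(sentence.to_tokenized_string())  # I love Berlin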
From flairNLP/flair, flair/data.py:
    index = text.index
    current_offset = 0
    previous_word_offset = -1
    previous_token = None
    for word in words:
        try:
            word_offset = index(word, current_offset)
            start_position = word_offset
        except ValueError:  # str.index raises ValueError if the word is not found
            word_offset = previous_word_offset + 1
            start_position = (
                current_offset + 1 if current_offset > 0 else current_offset
            )

        if word:
            token = Token(
                text=word, start_position=start_position, whitespace_after=True
            )
            tokens.append(token)

        if (previous_token is not None) and word_offset - 1 == previous_word_offset:
            previous_token.whitespace_after = False

        current_offset = word_offset + len(word)
        previous_word_offset = current_offset - 1
        previous_token = token

    return tokens
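A worked trace of the offset logic above, on text = "I love Berlin.":

# "I"      -> word_offset 0;  current_offset -> 1,  previous_word_offset -> 0
# "love"   -> word_offset 2;  2 - 1 != 0, so "I" keeps whitespace_after=True
# "Berlin" -> word_offset 7;  current_offset -> 13, previous_word_offset -> 12
# "."      -> word_offset 13; 13 - 1 == 12, so "Berlin".whitespace_after = False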