How to use the flair.data_fetcher.NLPTaskDataFetcher class in flair

To help you get started, we’ve selected a few flair examples based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github flairNLP / flair / tests / test_text_regressor.py View on Github external
def init(tasks_base_path) -> Tuple[TaggedCorpus, TextRegressor, ModelTrainer]:
    """Build the corpus, model and trainer used for text regression.

    :param tasks_base_path: base path handed to the data fetcher when loading the REGRESSION task
    :return: the loaded corpus, a TextRegressor on GloVe-based document embeddings, and a
        ModelTrainer wired to that model and corpus
    """
    regression_corpus = NLPTaskDataFetcher.load_corpus(
        NLPTask.REGRESSION, tasks_base_path
    )

    # document-level RNN embeddings stacked on top of a single GloVe word embedding
    word_embeddings: WordEmbeddings = WordEmbeddings("glove")
    rnn_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [word_embeddings], 128, 1, False, 64, False, False
    )

    regressor = TextRegressor(rnn_embeddings)

    return regression_corpus, regressor, ModelTrainer(regressor, regression_corpus)
github flairNLP / flair / tests / test_sequence_labeler_trainer.py View on Github external
def test_training(tasks_base_path):
    """Train a 'ner' sequence tagger on the FASHION task data for at most ten epochs."""
    fashion_corpus = NLPTaskDataFetcher.fetch_data(NLPTask.FASHION, tasks_base_path)

    # ingredients of the tagger: the corpus' 'ner' tag dictionary and GloVe word embeddings
    ner_tags = fashion_corpus.make_tag_dictionary('ner')
    glove = WordEmbeddings('glove')

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=glove,
        tag_dictionary=ner_tags,
        tag_type='ner',
        use_crf=False,
    )

    # set up the trainer in test mode and write its output to ./results
    trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(
        tagger, fashion_corpus, test_mode=True
    )
    trainer.train('./results', learning_rate=0.1, mini_batch_size=2, max_epochs=10)

    # clean up results directory
github flairNLP / flair / tests / test_text_classifier_trainer.py View on Github external
def test_text_classifier_single_label(tasks_base_path):
    """Train a text classifier on the IMDB task data for two epochs and check one prediction."""
    imdb_corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, tasks_base_path)
    labels = imdb_corpus.make_label_dictionary()

    # document-level LSTM embeddings on top of a single GloVe word embedding
    word_embeddings: WordEmbeddings = WordEmbeddings('en-glove')
    lstm_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [word_embeddings], 128, 1, False, 64, False, False
    )

    classifier = TextClassifier(lstm_embeddings, labels, False)

    TextClassifierTrainer(classifier, imdb_corpus, labels, False).train(
        './results', max_epochs=2
    )

    probe = Sentence("Berlin is a really nice city.")

    # every predicted label must carry a value and a score inside [0, 1]
    for predicted in classifier.predict(probe):
        for label in predicted.labels:
            assert label.value is not None
            assert 0.0 <= label.score <= 1.0
github undertheseanlp / ner / egs / vlsp2016_flair / train_wordvector.py View on Github external
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, MemoryEmbeddings, CharacterEmbeddings
from typing import List
import torch

# Step 1: fetch the column corpus from "data1" (column 0 -> 'text', column 1 -> 'ner')
columns = {0: 'text', 1: 'ner'}
corpus: TaggedCorpus = NLPTaskDataFetcher.fetch_column_corpus(
    "data1",
    columns,
    train_file="train.txt",
    test_file="test.txt",
    dev_file="dev.txt",
)
print(corpus)

# Step 2: choose the tag type to predict
tag_type = 'ner'

# Step 3: build the tag dictionary for that tag type from the corpus and show its items
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [

    # WordEmbeddings('glove'),
github flairNLP / flair / flair / data_fetcher.py View on Github external
return NLPTaskDataFetcher.load_ud_corpus(data_folder)

        # for text classifiers, we use our own special format
        if task in [
            NLPTask.IMDB.value,
            NLPTask.AG_NEWS.value,
            NLPTask.TREC_6.value,
            NLPTask.TREC_50.value,
            NLPTask.REGRESSION.value,
        ]:
            use_tokenizer: bool = False if task in [
                NLPTask.TREC_6.value,
                NLPTask.TREC_50.value,
            ] else True

            return NLPTaskDataFetcher.load_classification_corpus(
                data_folder, use_tokenizer=use_tokenizer
            )

        # NER corpus for Basque
        if task == NLPTask.NER_BASQUE.value:
            columns = {0: "text", 1: "ner"}
            return NLPTaskDataFetcher.load_column_corpus(
                data_folder, columns, tag_to_biloes="ner"
            )

        if task.startswith("wassa"):
            return NLPTaskDataFetcher.load_classification_corpus(
                data_folder, use_tokenizer=True
            )
github flairNLP / flair / flair / data_fetcher.py View on Github external
if (
            task == NLPTask.CONLL_03.value
            or task == NLPTask.ONTONER.value
            or task == NLPTask.FASHION.value
        ):
            columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}

            return NLPTaskDataFetcher.load_column_corpus(
                data_folder, columns, tag_to_biloes="ner"
            )

        # the CoNLL 03 task for German has an additional lemma column
        if task == NLPTask.CONLL_03_GERMAN.value:
            columns = {0: "text", 1: "lemma", 2: "pos", 3: "np", 4: "ner"}

            return NLPTaskDataFetcher.load_column_corpus(
                data_folder, columns, tag_to_biloes="ner"
            )

        # the CoNLL 03 task for Dutch has no NP column
        if task == NLPTask.CONLL_03_DUTCH.value or task.startswith("wikiner"):
            columns = {0: "text", 1: "pos", 2: "ner"}

            return NLPTaskDataFetcher.load_column_corpus(
                data_folder, columns, tag_to_biloes="ner"
            )

        # the CoNLL 03 task for Spanish only has two columns
        if task == NLPTask.CONLL_03_SPANISH.value or task == NLPTask.WNUT_17.value:
            columns = {0: "text", 1: "ner"}

            return NLPTaskDataFetcher.load_column_corpus(
github flairNLP / flair / flair / data_fetcher.py View on Github external
# read in test file if exists, otherwise sample 10% of train data as test dataset
        if test_file is not None:
            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_column_data(
                test_file, column_format
            )
        else:
            sentences_test: List[Sentence] = [
                sentences_train[i]
                for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
            ]
            sentences_train = [x for x in sentences_train if x not in sentences_test]

        # read in dev file if exists, otherwise sample 10% of train data as dev dataset
        if dev_file is not None:
            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_column_data(
                dev_file, column_format
            )
        else:
            sentences_dev: List[Sentence] = [
                sentences_train[i]
                for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
            ]
            sentences_train = [x for x in sentences_train if x not in sentences_dev]

        if tag_to_biloes is not None:
            # convert tag scheme to iobes
            for sentence in sentences_train + sentences_test + sentences_dev:
                sentence.convert_tag_scheme(
                    tag_type=tag_to_biloes, target_scheme="iobes"
                )
github flairNLP / flair / flair / data_fetcher.py View on Github external
log.info("Test: {}".format(test_file))

        # get train and test data
        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_column_data(
            train_file, column_format
        )

        # read in test file if exists, otherwise sample 10% of train data as test dataset
        if test_file is not None:
            sentences_test: List[Sentence] = NLPTaskDataFetcher.read_column_data(
                test_file, column_format
            )
        else:
            sentences_test: List[Sentence] = [
                sentences_train[i]
                for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
            ]
            sentences_train = [x for x in sentences_train if x not in sentences_test]

        # read in dev file if exists, otherwise sample 10% of train data as dev dataset
        if dev_file is not None:
            sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_column_data(
                dev_file, column_format
            )
        else:
            sentences_dev: List[Sentence] = [
                sentences_train[i]
                for i in NLPTaskDataFetcher.__sample(len(sentences_train), 0.1)
            ]
            sentences_train = [x for x in sentences_train if x not in sentences_dev]

        if tag_to_biloes is not None:
github flairNLP / flair / flair / data_fetcher.py View on Github external
def load_corpus(task: Union[NLPTask, str], base_path: [str, Path] = None) -> Corpus:
        """
        Helper function to fetch a Corpus for a specific NLPTask. For this to work you need to first download
        and put into the appropriate folder structure the corresponding NLP task data. The tutorials on
        https://github.com/zalandoresearch/flair give more info on how to do this. Alternatively, you can use this
        code to create your own data fetchers.
        :param task: specification of the NLPTask you wish to get
        :param base_path: path to data folder containing tasks sub folders
        :return: a Corpus consisting of train, dev and test data
        """

        # first, try to fetch dataset online
        if type(task) is NLPTask:
            NLPTaskDataFetcher.download_dataset(task)

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"

        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # get string value if enum is passed
        task = task.value if type(task) is NLPTask else task

        data_folder = base_path / task.lower()

        # the CoNLL 2000 task on chunking has three columns: text, pos and np (chunk)
        if task == NLPTask.CONLL_2000.value:
            columns = {0: "text", 1: "pos", 2: "np"}
github flairNLP / flair / flair / data_fetcher.py View on Github external
test_file = file
                if "dev" in file_name:
                    dev_file = file
                if "testa" in file_name:
                    dev_file = file
                if "testb" in file_name:
                    test_file = file

        log.info("Reading data from {}".format(data_folder))
        log.info("Train: {}".format(train_file))
        log.info("Dev: {}".format(dev_file))
        log.info("Test: {}".format(test_file))

        sentences_train: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(train_file)
        sentences_test: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(test_file)
        sentences_dev: List[Sentence] = NLPTaskDataFetcher.read_conll_ud(dev_file)

        return Corpus(
            sentences_train, sentences_dev, sentences_test, name=data_folder.name
        )