How to use the flair.data_fetcher.NLPTask enum in flair

To help you get started, we've selected a few flair examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github flairNLP / flair / flair / data_fetcher.py View on Github external
data_folder, columns, tag_to_biloes="ner"
            )

        # WSD tasks may be put into this column format
        if task == NLPTask.WSD.value:
            # column index -> annotation layer for the semcor/semeval TSV files
            columns = {0: "text", 1: "lemma", 2: "pos", 3: "sense"}
            return NLPTaskDataFetcher.load_column_corpus(
                data_folder,
                columns,
                train_file="semcor.tsv",
                test_file="semeval2015.tsv",
            )

        # the UD corpora follow the CoNLL-U format, for which we have a special reader
        # (OntoNotes, CoNLL-12 and Penn are also distributed in that format here)
        if task.startswith("ud_") or task in [
            NLPTask.ONTONOTES.value,
            NLPTask.CONLL_12.value,
            NLPTask.PENN.value,
        ]:
            return NLPTaskDataFetcher.load_ud_corpus(data_folder)

        # for text classifiers, we use our own special format
        if task in [
            NLPTask.IMDB.value,
            NLPTask.AG_NEWS.value,
            NLPTask.TREC_6.value,
            NLPTask.TREC_50.value,
            NLPTask.REGRESSION.value,
        ]:
            # tokenization is switched off for the TREC tasks — presumably
            # the TREC data is already tokenized; TODO confirm upstream
            use_tokenizer: bool = False if task in [
                NLPTask.TREC_6.value,
                NLPTask.TREC_50.value,
github flairNLP / flair / flair / data_fetcher.py View on Github external
# NOTE(review): excerpt from download_dataset — each branch fetches the
# dev/test/train CoNLL-U splits of one UD treebank into the dataset cache.
if task == NLPTask.UD_KOREAN:
            # UD Korean-Kaist treebank
            cached_path(
                f"{ud_path}UD_Korean-Kaist/master/ko_kaist-ud-dev.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_Korean-Kaist/master/ko_kaist-ud-test.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_Korean-Kaist/master/ko_kaist-ud-train.conllu",
                Path("datasets") / task.value,
            )

        if task == NLPTask.UD_BASQUE:
            # UD Basque-BDT treebank
            cached_path(
                f"{ud_path}UD_Basque-BDT/master/eu_bdt-ud-dev.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_Basque-BDT/master/eu_bdt-ud-test.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_Basque-BDT/master/eu_bdt-ud-train.conllu",
                Path("datasets") / task.value,
            )

        if task.value.startswith("wassa"):

            # task values look like "wassa_<emotion>"; strip the 6-char
            # "wassa_" prefix to get the emotion name — TODO confirm prefix
            emotion = task.value[6:]
github flairNLP / flair / flair / data_fetcher.py View on Github external
)
        # WikiNER corpora: map each task to the two-letter language code that
        # appears in the upstream file names (aij-wikiner-<lc>-wp3.*)
        if task.value.startswith("wikiner"):
            lc = ""
            if task == NLPTask.WIKINER_ENGLISH:
                lc = "en"
            if task == NLPTask.WIKINER_GERMAN:
                lc = "de"
            if task == NLPTask.WIKINER_DUTCH:
                lc = "nl"
            if task == NLPTask.WIKINER_FRENCH:
                lc = "fr"
            if task == NLPTask.WIKINER_ITALIAN:
                lc = "it"
            if task == NLPTask.WIKINER_SPANISH:
                lc = "es"
            if task == NLPTask.WIKINER_PORTUGUESE:
                lc = "pt"
            if task == NLPTask.WIKINER_POLISH:
                lc = "pl"
            if task == NLPTask.WIKINER_RUSSIAN:
                lc = "ru"

            # only download and unpack if the extracted .train file is not
            # already present in the cache
            data_file = (
                Path(flair.cache_root)
                / "datasets"
                / task.value
                / f"aij-wikiner-{lc}-wp3.train"
            )
            if not data_file.is_file():

                cached_path(
                    f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2",
github flairNLP / flair / flair / data_fetcher.py View on Github external
# NOTE(review): excerpt from download_dataset — each branch fetches the
# dev/test/train CoNLL-U splits of one UD treebank into the dataset cache.
if task == NLPTask.UD_GERMAN:
            # UD German-GSD treebank
            cached_path(
                f"{ud_path}UD_German-GSD/master/de_gsd-ud-dev.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_German-GSD/master/de_gsd-ud-test.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_German-GSD/master/de_gsd-ud-train.conllu",
                Path("datasets") / task.value,
            )

        if task == NLPTask.UD_DUTCH:
            # UD Dutch-Alpino treebank
            cached_path(
                f"{ud_path}UD_Dutch-Alpino/master/nl_alpino-ud-dev.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_Dutch-Alpino/master/nl_alpino-ud-test.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_Dutch-Alpino/master/nl_alpino-ud-train.conllu",
                Path("datasets") / task.value,
            )

        # --- UD Romance
        if task == NLPTask.UD_FRENCH:
            cached_path(
github flairNLP / flair / flair / data_fetcher.py View on Github external
NLPTaskDataFetcher.download_dataset(task)

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"

        # accept plain string paths for convenience; normalize to pathlib.Path
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # get string value if enum is passed
        task = task.value if type(task) is NLPTask else task

        # each task's data lives in a lowercase sub-folder of base_path
        data_folder = base_path / task.lower()

        # the CoNLL 2000 task on chunking has three columns: text, pos and np (chunk)
        if task == NLPTask.CONLL_2000.value:
            columns = {0: "text", 1: "pos", 2: "np"}

            return NLPTaskDataFetcher.load_column_corpus(
                data_folder, columns, tag_to_biloes="np"
            )

        # many NER tasks follow the CoNLL 03 format with four columns: text, pos, np and ner tag
        if (
            task == NLPTask.CONLL_03.value
            or task == NLPTask.ONTONER.value
            or task == NLPTask.FASHION.value
        ):
            columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}

            return NLPTaskDataFetcher.load_column_corpus(
                data_folder, columns, tag_to_biloes="ner"
github flairNLP / flair / textc.py View on Github external
# Example script: train a flair text classifier on the AG_NEWS corpus.
# (Excerpt is truncated after the document-embedding setup.)
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer


# 1. get the corpus (downsampled to 10% for a quick run)
corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.AG_NEWS, 'agnews/').downsample(0.1)

# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()

# 3. make a list of word embeddings
word_embeddings = [WordEmbeddings('glove'),

                   # contextual FlairEmbeddings: slower, but stronger results
                    FlairEmbeddings('news-forward'),
                    FlairEmbeddings('news-backward'),
                   ]

# 4. init document embedding by passing list of word embeddings
document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(word_embeddings,
                                                                     hidden_size=512,
                                                                     reproject_words=True,
github flairNLP / flair / flair / data_fetcher.py View on Github external
# NOTE(review): excerpt from download_dataset — each branch fetches the
# dev/test/train CoNLL-U splits of one UD treebank into the dataset cache.
if task == NLPTask.UD_CROATIAN:
            # UD Croatian-SET treebank
            cached_path(
                f"{ud_path}UD_Croatian-SET/master/hr_set-ud-dev.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_Croatian-SET/master/hr_set-ud-test.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_Croatian-SET/master/hr_set-ud-train.conllu",
                Path("datasets") / task.value,
            )

        if task == NLPTask.UD_SERBIAN:
            # UD Serbian-SET treebank
            cached_path(
                f"{ud_path}UD_Serbian-SET/master/sr_set-ud-dev.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_Serbian-SET/master/sr_set-ud-test.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_Serbian-SET/master/sr_set-ud-train.conllu",
                Path("datasets") / task.value,
            )

        if task == NLPTask.UD_BULGARIAN:
            cached_path(
                f"{ud_path}UD_Bulgarian-BTB/master/bg_btb-ud-dev.conllu",
github flairNLP / flair / flair / data_fetcher.py View on Github external
# NOTE(review): excerpt from download_dataset — each branch fetches the
# dev/test/train CoNLL-U splits of one UD treebank into the dataset cache.
if task == NLPTask.UD_SERBIAN:
            # UD Serbian-SET treebank
            cached_path(
                f"{ud_path}UD_Serbian-SET/master/sr_set-ud-dev.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_Serbian-SET/master/sr_set-ud-test.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_Serbian-SET/master/sr_set-ud-train.conllu",
                Path("datasets") / task.value,
            )

        if task == NLPTask.UD_BULGARIAN:
            # UD Bulgarian-BTB treebank
            cached_path(
                f"{ud_path}UD_Bulgarian-BTB/master/bg_btb-ud-dev.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_Bulgarian-BTB/master/bg_btb-ud-test.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_Bulgarian-BTB/master/bg_btb-ud-train.conllu",
                Path("datasets") / task.value,
            )

        # --- UD Asian
        if task == NLPTask.UD_ARABIC:
            cached_path(
github flairNLP / flair / flair / data_fetcher.py View on Github external
# --- UD Germanic
        # NOTE(review): excerpt from download_dataset — each branch fetches the
        # dev/test/train CoNLL-U splits of one UD treebank into the cache.
        if task == NLPTask.UD_ENGLISH:
            # UD English-EWT treebank
            cached_path(
                f"{ud_path}UD_English-EWT/master/en_ewt-ud-dev.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_English-EWT/master/en_ewt-ud-test.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_English-EWT/master/en_ewt-ud-train.conllu",
                Path("datasets") / task.value,
            )

        if task == NLPTask.UD_GERMAN:
            # UD German-GSD treebank
            cached_path(
                f"{ud_path}UD_German-GSD/master/de_gsd-ud-dev.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_German-GSD/master/de_gsd-ud-test.conllu",
                Path("datasets") / task.value,
            )
            cached_path(
                f"{ud_path}UD_German-GSD/master/de_gsd-ud-train.conllu",
                Path("datasets") / task.value,
            )

        if task == NLPTask.UD_DUTCH:
            cached_path(
                f"{ud_path}UD_Dutch-Alpino/master/nl_alpino-ud-dev.conllu",