return NLPTaskDataFetcher.load_column_corpus(
    data_folder, columns, tag_to_biloes="ner"
)
# WSD tasks may be put into this column format
if task == NLPTask.WSD.value:
columns = {0: "text", 1: "lemma", 2: "pos", 3: "sense"}
return NLPTaskDataFetcher.load_column_corpus(
data_folder,
columns,
train_file="semcor.tsv",
test_file="semeval2015.tsv",
)
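# Illustrative rows in the four-column WSD layout above (tab-separated;
# the sense keys here are placeholders, not real SemCor annotations):
#
#   The     the     DET     -
#   horse   horse   NOUN    horse%1:05:00::
#   raced   race    VERB    race%2:38:00::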
# the UD corpora follow the CoNLL-U format, for which we have a special reader
if task.startswith("ud_") or task in [
NLPTask.ONTONOTES.value,
NLPTask.CONLL_12.value,
NLPTask.PENN.value,
]:
return NLPTaskDataFetcher.load_ud_corpus(data_folder)
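# For reference, CoNLL-U files have ten tab-separated columns per token
# (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC), e.g.:
#
#   1   They   they   PRON   PRP   Case=Nom    2   nsubj   _   _
#   2   buy    buy    VERB   VBP   Tense=Pres  0   root    _   _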
# for text classifiers, we use our own special format
if task in [
NLPTask.IMDB.value,
NLPTask.AG_NEWS.value,
NLPTask.TREC_6.value,
NLPTask.TREC_50.value,
NLPTask.REGRESSION.value,
]:
use_tokenizer: bool = False if task in [
    NLPTask.TREC_6.value,
    NLPTask.TREC_50.value,
] else True
return NLPTaskDataFetcher.load_classification_corpus(
    data_folder, use_tokenizer=use_tokenizer
)
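# flair's classification reader expects FastText-style files: one example
# per line, one or more __label__ prefixes, then the text (lines below are
# illustrative, not actual AG_NEWS rows):
#
#   __label__World  Oil prices rebound after early losses
#   __label__Sports Sox sweep series behind strong pitching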
if task == NLPTask.UD_KOREAN:
cached_path(
f"{ud_path}UD_Korean-Kaist/master/ko_kaist-ud-dev.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_Korean-Kaist/master/ko_kaist-ud-test.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_Korean-Kaist/master/ko_kaist-ud-train.conllu",
Path("datasets") / task.value,
)
if task == NLPTask.UD_BASQUE:
cached_path(
f"{ud_path}UD_Basque-BDT/master/eu_bdt-ud-dev.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_Basque-BDT/master/eu_bdt-ud-test.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_Basque-BDT/master/eu_bdt-ud-train.conllu",
Path("datasets") / task.value,
)
if task.value.startswith("wassa"):
    # strip the "wassa_" prefix to get the emotion name (e.g. "anger")
    emotion = task.value[6:]
if task.value.startswith("wikiner"):
lc = ""
if task == NLPTask.WIKINER_ENGLISH:
lc = "en"
if task == NLPTask.WIKINER_GERMAN:
lc = "de"
if task == NLPTask.WIKINER_DUTCH:
lc = "nl"
if task == NLPTask.WIKINER_FRENCH:
lc = "fr"
if task == NLPTask.WIKINER_ITALIAN:
lc = "it"
if task == NLPTask.WIKINER_SPANISH:
lc = "es"
if task == NLPTask.WIKINER_PORTUGUESE:
lc = "pt"
if task == NLPTask.WIKINER_POLISH:
lc = "pl"
if task == NLPTask.WIKINER_RUSSIAN:
lc = "ru"
data_file = (
Path(flair.cache_root)
/ "datasets"
/ task.value
/ f"aij-wikiner-{lc}-wp3.train"
)
if not data_file.is_file():
    cached_path(
        f"{wikiner_path}aij-wikiner-{lc}-wp3.bz2",
        Path("datasets") / task.value,
    )
if task == NLPTask.UD_GERMAN:
cached_path(
f"{ud_path}UD_German-GSD/master/de_gsd-ud-dev.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_German-GSD/master/de_gsd-ud-test.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_German-GSD/master/de_gsd-ud-train.conllu",
Path("datasets") / task.value,
)
if task == NLPTask.UD_DUTCH:
cached_path(
f"{ud_path}UD_Dutch-Alpino/master/nl_alpino-ud-dev.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_Dutch-Alpino/master/nl_alpino-ud-test.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_Dutch-Alpino/master/nl_alpino-ud-train.conllu",
Path("datasets") / task.value,
)
# --- UD Romance
if task == NLPTask.UD_FRENCH:
    cached_path(
        f"{ud_path}UD_French-GSD/master/fr_gsd-ud-dev.conllu",
        Path("datasets") / task.value,
    )
    cached_path(
        f"{ud_path}UD_French-GSD/master/fr_gsd-ud-test.conllu",
        Path("datasets") / task.value,
    )
    cached_path(
        f"{ud_path}UD_French-GSD/master/fr_gsd-ud-train.conllu",
        Path("datasets") / task.value,
    )
# download the dataset first, then load it from the local cache
NLPTaskDataFetcher.download_dataset(task)
# default dataset folder is the cache root
if not base_path:
base_path = Path(flair.cache_root) / "datasets"
if isinstance(base_path, str):
    base_path: Path = Path(base_path)
# get string value if enum is passed
task = task.value if type(task) is NLPTask else task
data_folder = base_path / task.lower()
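# Usage sketch (illustrative): pass your own base_path, or omit it to fall
# back to the flair cache root:
#
#   corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_2000,
#                                           base_path="/data/flair_datasets")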
# the CoNLL 2000 task on chunking has three columns: text, pos and np (chunk)
if task == NLPTask.CONLL_2000.value:
columns = {0: "text", 1: "pos", 2: "np"}
return NLPTaskDataFetcher.load_column_corpus(
data_folder, columns, tag_to_biloes="np"
)
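# A CoNLL-2000 row carries token, POS tag and BIO chunk tag, e.g. the
# corpus opening:
#
#   Confidence NN B-NP
#   in         IN B-PP
#   the        DT B-NP
#   pound      NN I-NP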
# many NER tasks follow the CoNLL 03 format with four columns: text, pos, np and ner tag
if (
task == NLPTask.CONLL_03.value
or task == NLPTask.ONTONER.value
or task == NLPTask.FASHION.value
):
columns = {0: "text", 1: "pos", 2: "np", 3: "ner"}
return NLPTaskDataFetcher.load_column_corpus(
    data_folder, columns, tag_to_biloes="ner"
)
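# A CoNLL-03 row carries token, POS tag, chunk tag and NER tag, e.g.:
#
#   U.N.     NNP I-NP I-ORG
#   official NN  I-NP O
#   Ekeus    NNP I-NP I-PER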
from flair.data import TaggedCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentLSTMEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
# 1. get the corpus
corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.AG_NEWS, 'agnews/').downsample(0.1)
# 2. create the label dictionary
label_dict = corpus.make_label_dictionary()
# 3. make a list of word embeddings
word_embeddings = [WordEmbeddings('glove'),
                   # Flair embeddings give state-of-the-art results but are
                   # slower to train; remove them for a quick first run
                   FlairEmbeddings('news-forward'),
                   FlairEmbeddings('news-backward'),
                   ]
# 4. init document embedding by passing list of word embeddings
document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(word_embeddings,
                                                                     hidden_size=512,
                                                                     reproject_words=True,
                                                                     reproject_words_dimension=256,
                                                                     )
# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)
# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)
# 7. start the training
trainer.train('resources/taggers/ag_news', max_epochs=10)
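# 8. illustrative follow-up: load the trained model and classify a new
# sentence; older flair releases expose TextClassifier.load_from_file,
# newer ones TextClassifier.load
from flair.data import Sentence

classifier = TextClassifier.load_from_file('resources/taggers/ag_news/final-model.pt')
sentence = Sentence('France wins the World Cup final')
classifier.predict(sentence)
print(sentence.labels)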
if task == NLPTask.UD_CROATIAN:
cached_path(
f"{ud_path}UD_Croatian-SET/master/hr_set-ud-dev.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_Croatian-SET/master/hr_set-ud-test.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_Croatian-SET/master/hr_set-ud-train.conllu",
Path("datasets") / task.value,
)
if task == NLPTask.UD_SERBIAN:
cached_path(
f"{ud_path}UD_Serbian-SET/master/sr_set-ud-dev.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_Serbian-SET/master/sr_set-ud-test.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_Serbian-SET/master/sr_set-ud-train.conllu",
Path("datasets") / task.value,
)
if task == NLPTask.UD_BULGARIAN:
cached_path(
f"{ud_path}UD_Bulgarian-BTB/master/bg_btb-ud-dev.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_Bulgarian-BTB/master/bg_btb-ud-test.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_Bulgarian-BTB/master/bg_btb-ud-train.conllu",
Path("datasets") / task.value,
)
# --- UD Asian
if task == NLPTask.UD_ARABIC:
    cached_path(
        f"{ud_path}UD_Arabic-PADT/master/ar_padt-ud-dev.conllu",
        Path("datasets") / task.value,
    )
    cached_path(
        f"{ud_path}UD_Arabic-PADT/master/ar_padt-ud-test.conllu",
        Path("datasets") / task.value,
    )
    cached_path(
        f"{ud_path}UD_Arabic-PADT/master/ar_padt-ud-train.conllu",
        Path("datasets") / task.value,
    )
# --- UD Germanic
if task == NLPTask.UD_ENGLISH:
cached_path(
f"{ud_path}UD_English-EWT/master/en_ewt-ud-dev.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_English-EWT/master/en_ewt-ud-test.conllu",
Path("datasets") / task.value,
)
cached_path(
f"{ud_path}UD_English-EWT/master/en_ewt-ud-train.conllu",
Path("datasets") / task.value,
)
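# Minimal end-to-end usage sketch: load_corpus first calls download_dataset
# (as above) and then reads the cached CoNLL-U files with load_ud_corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask

corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH)
print(corpus)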