How to use the flair.datasets.ColumnCorpus class in flair

To help you get started, we’ve selected a few flair examples based on popular ways ColumnCorpus is used in public projects.
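Before the examples: ColumnCorpus reads CoNLL-style column files, one token per line with a blank line between sentences. A minimal sketch of the typical call (the folder and file names here are hypothetical, not from the examples below):

from flair.datasets import ColumnCorpus

# column 0 holds the token text, column 1 the NER tag (the layout is up to your files)
corpus = ColumnCorpus(
    "path/to/data_folder",                # hypothetical folder with column files
    column_format={0: "text", 1: "ner"},
    train_file="train.txt",
    dev_file="dev.txt",
    test_file="test.txt",
)
print(len(corpus.train), len(corpus.dev), len(corpus.test))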


github flairNLP / flair / tests / test_data_fetchers.py
def test_load_sequence_labeling_data(tasks_base_path):
    # get training, test and dev data
    corpus = flair.datasets.ColumnCorpus(
        tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
    )

    assert len(corpus.train) == 6
    assert len(corpus.dev) == 1
    assert len(corpus.test) == 1
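Here column_format={0: "text", 2: "ner"} tells the reader that column 0 of each line holds the token and column 2 the NER tag; column 1 is simply skipped. A self-contained sketch of the same idea with a toy corpus (file contents and the temp folder are illustrative):

import tempfile
from pathlib import Path

from flair.datasets import ColumnCorpus

# Three columns per line: token, POS (ignored below), NER tag.
# Sentences are separated by blank lines.
toy = "Alex NNP B-PER\nvisited VBD O\nParis NNP B-LOC\n\n"

folder = Path(tempfile.mkdtemp())
for name in ("train.txt", "dev.txt", "test.txt"):
    (folder / name).write_text(toy, encoding="utf-8")

corpus = ColumnCorpus(
    folder,
    column_format={0: "text", 2: "ner"},  # column 1 (POS) is not read
    train_file="train.txt",
    dev_file="dev.txt",
    test_file="test.txt",
)
print(corpus.train[0])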
github flairNLP / flair / tests / test_model_integration.py
def test_train_resume_sequence_tagging_training(results_base_path, tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
    )
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)

    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")

    model: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )
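The excerpt stops after constructing the tagger; the "resume" part of the test depends on the trainer writing a checkpoint during training. A hedged sketch of that step (the output path and hyperparameters are arbitrary, and checkpoint handling has shifted between flair versions):

from flair.trainers import ModelTrainer

# `model` and `corpus` come from the test above.
# Train briefly and keep a checkpoint so training can be resumed later.
trainer = ModelTrainer(model, corpus)
trainer.train(
    "resources/taggers/resume_example",  # hypothetical output folder
    learning_rate=0.1,
    mini_batch_size=2,
    max_epochs=2,
    checkpoint=True,  # writes checkpoint.pt next to the model files
)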
github flairNLP / flair / tests / test_model_integration.py
def test_train_load_use_tagger_multicorpus(results_base_path, tasks_base_path):
    corpus_1 = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
    )
    corpus_2 = flair.datasets.GERMEVAL(base_path=tasks_base_path)

    corpus = MultiCorpus([corpus_1, corpus_2])
    tag_dictionary = corpus.make_tag_dictionary("ner")

    embeddings = WordEmbeddings("turian")

    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
    )
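This test builds the same MultiCorpus setup; the "load, use" part it goes on to exercise looks roughly like the following (the model path is hypothetical):

from flair.data import Sentence
from flair.models import SequenceTagger

# Load a trained model back from disk and tag a new sentence.
tagger = SequenceTagger.load("resources/taggers/example/final-model.pt")
sentence = Sentence("New York fashion week starts today .")
tagger.predict(sentence)
print(sentence.to_tagged_string())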
github flairNLP / flair / flair / datasets.py
        if dev_file is not None:
            dev = ColumnDataset(
                dev_file,
                column_format,
                tag_to_bioes,
                encoding=encoding,
                comment_symbol=comment_symbol,
                in_memory=in_memory,
                document_separator_token=document_separator_token,
            )
        else:
            train_length = len(train)
            dev_size: int = round(train_length / 10)
            splits = random_split(train, [train_length - dev_size, dev_size])
            train = splits[0]
            dev = splits[1]

        super(ColumnCorpus, self).__init__(train, dev, test, name=data_folder.name)
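When no dev file is available, ColumnCorpus carves a development set out of the training data: roughly 10% of the training sentences are split off with torch's random_split. The same pattern in isolation:

from torch.utils.data import random_split

# Stand-in for a parsed training dataset; any indexable sequence works here.
train = [f"sentence {i}" for i in range(100)]

# Reserve ~10% of the training data as a development split, as above.
dev_size = round(len(train) / 10)
train_split, dev_split = random_split(train, [len(train) - dev_size, dev_size])
print(len(train_split), len(dev_split))  # 90 10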
github flairNLP / flair / flair / datasets.py
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        _download_wikiner("pl", dataset_name)

        super(WIKINER_POLISH, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory
        )


class WIKINER_RUSSIAN(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = False,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 2: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
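Every WIKINER_* class follows this template: fix the column layout, derive a cache folder from the lowercased class name, download the data on first use, and hand everything to ColumnCorpus. Using one is therefore a single call (the first run downloads into flair's cache, typically ~/.flair/datasets/<dataset_name>):

import flair.datasets

# First call downloads WikiNER Polish into the cache; later calls reuse it.
corpus = flair.datasets.WIKINER_POLISH()
print(corpus)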
github flairNLP / flair / flair / datasets.py
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        _download_wikiner("ru", dataset_name)

        super(WIKINER_RUSSIAN, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory
        )


class WNUT_17(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
github flairNLP / flair / flair / datasets.py
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        _download_wikiner("nl", dataset_name)

        super(WIKINER_DUTCH, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory
        )


class WIKINER_FRENCH(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = False,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 2: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
github flairNLP / flair / flair / datasets.py
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
        if not base_path:
            base_path = Path(flair.cache_root) / "datasets"
        data_folder = base_path / dataset_name

        # download data if necessary
        _download_wikiner("en", dataset_name)

        super(WIKINER_ENGLISH, self).__init__(
            data_folder, columns, tag_to_bioes=tag_to_bioes, in_memory=in_memory
        )


class WIKINER_GERMAN(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = False,
    ):
        if type(base_path) == str:
            base_path: Path = Path(base_path)

        # column format
        columns = {0: "text", 1: "pos", 2: "ner"}

        # this dataset name
        dataset_name = self.__class__.__name__.lower()

        # default dataset folder is the cache root
github flairNLP / flair / flair / datasets.py
        # download data if necessary
        conll_02_path = "https://www.clips.uantwerpen.be/conll2002/ner/data/"
        cached_path(f"{conll_02_path}esp.testa", Path("datasets") / dataset_name)
        cached_path(f"{conll_02_path}esp.testb", Path("datasets") / dataset_name)
        cached_path(f"{conll_02_path}esp.train", Path("datasets") / dataset_name)

        super(CONLL_03_SPANISH, self).__init__(
            data_folder,
            columns,
            tag_to_bioes=tag_to_bioes,
            encoding="latin-1",
            in_memory=in_memory,
        )


class CONLL_2000(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "np",
        in_memory: bool = True,
    ):
        """
        Initialize the CoNLL-2000 corpus for English chunking.
        The first time you call this constructor it will automatically download the dataset.
        :param base_path: Default is None, meaning that the corpus gets auto-downloaded and loaded. You can override this
        to point to a different folder, but typically this should not be necessary.
        :param tag_to_bioes: 'np' by default, should not be changed, but you can set 'pos' instead to predict POS tags
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        """
        if type(base_path) == str:
            base_path: Path = Path(base_path)
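As the docstring says, instantiating the class is all that is needed; the dataset downloads itself on first use. A short usage sketch:

import flair.datasets

# First call downloads CoNLL-2000; tag_to_bioes="np" converts chunk tags to BIOES.
corpus = flair.datasets.CONLL_2000()
chunk_dictionary = corpus.make_tag_dictionary("np")
print(corpus)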
github flairNLP / flair / flair / datasets.py
            image = Image(imageURL=image_info["url"])
            for caption in image_info["descriptions"]:
                # append Sentence-Image data point
                self.data_points.append(
                    DataPair(Sentence(preprocessor(caption), use_tokenizer=True), image)
                )
                self.split.append(int(image_info["split"]))

    def __len__(self):
        return len(self.data_points)

    def __getitem__(self, index: int = 0) -> DataPair:
        return self.data_points[index]


class CONLL_03(ColumnCorpus):
    def __init__(
        self,
        base_path: Union[str, Path] = None,
        tag_to_bioes: str = "ner",
        in_memory: bool = True,
        document_as_sequence: bool = False,
    ):
        """
        Initialize the CoNLL-03 corpus. This is only possible if you've manually downloaded it to your machine.
        Obtain the corpus from https://www.clips.uantwerpen.be/conll2003/ner/ and put it into some folder. Then point
        the base_path parameter in the constructor to this folder.
        :param base_path: Path to the CoNLL-03 corpus on your machine
        :param tag_to_bioes: NER by default, need not be changed, but you could also select 'pos' or 'np' to predict
        POS tags or chunks respectively
        :param in_memory: If True, keeps dataset in memory giving speedups in training.
        :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object
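Since CoNLL-03 cannot be downloaded automatically, base_path has to point at data you obtained yourself. A sketch, assuming flair's usual convention of looking for the files in a conll_03 subfolder of base_path:

import flair.datasets

# base_path/conll_03 is expected to contain the manually obtained
# eng.train / eng.testa / eng.testb files.
corpus = flair.datasets.CONLL_03(base_path="resources/tasks")
tag_dictionary = corpus.make_tag_dictionary("ner")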