How to use the annif.corpus.DocumentFile class in annif

To help you get started, we’ve selected a few annif examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github NatLibFi / Annif / tests / test_backend_vw_ensemble.py View on Github external
def test_vw_ensemble_train_and_learn(app, datadir, tmpdir):
    """Train the vw_ensemble backend on a tiny TSV corpus and check its files.

    After training, the test expects 'vw-train.txt', 'subject-freq.json' and
    'vw-model' to exist in ``datadir`` and to be non-empty.
    """
    vw_ensemble_type = annif.backend.get_backend("vw_ensemble")
    vw_ensemble = vw_ensemble_type(
        backend_id='vw_ensemble',
        config_params={'sources': 'dummy-en'},
        datadir=str(datadir))

    # Three tab-separated rows (a text, then a URI); the last row has no
    # trailing newline.
    tmpfile = tmpdir.join('document.tsv')
    tmpfile.write("dummy\thttp://example.org/dummy\n" +
                  "another\thttp://example.org/dummy\n" +
                  "none\thttp://example.org/none")
    document_corpus = annif.corpus.DocumentFile(str(tmpfile))
    project = annif.project.get_project('dummy-en')

    # Training is run inside the application context.
    with app.app_context():
        vw_ensemble.train(document_corpus, project)
    # Each expected output file must exist and be non-empty.
    assert datadir.join('vw-train.txt').exists()
    assert datadir.join('vw-train.txt').size() > 0
    assert datadir.join('subject-freq.json').exists()
    assert datadir.join('subject-freq.json').size() > 0
    assert datadir.join('vw-model').exists()
    assert datadir.join('vw-model').size() > 0

    # test online learning
    modelfile = datadir.join('vw-model')
    freqfile = datadir.join('subject-freq.json')

    # Record the model size before any further learning step.
    old_size = modelfile.size()
    # NOTE(review): the excerpt ends here; freqfile and old_size are
    # presumably used by later assertions that are not shown - confirm
    # against the full test.
github NatLibFi / Annif / tests / test_corpus.py View on Github external
def test_combinedcorpus(tmpdir):
    """Two corpora over the same 3-row file combine into 6 documents."""
    tsv_path = tmpdir.join('documents.tsv')
    tsv_path.write("""Läntinen\t
        Oulunlinnan\t
        Harald Hirmuinen\t""")

    # Both parts read the same file, so each contributes the same rows.
    parts = [annif.corpus.DocumentFile(str(tsv_path)) for _ in range(2)]
    merged = annif.corpus.CombinedCorpus(parts)

    document_count = sum(1 for _ in merged.documents)
    assert document_count == 6
github NatLibFi / Annif / tests / test_corpus.py View on Github external
def test_combinedcorpus(tmpdir):
    """Two corpora over the same 3-row file combine into 6 documents."""
    tsv_path = tmpdir.join('documents.tsv')
    tsv_path.write("""Läntinen\t
        Oulunlinnan\t
        Harald Hirmuinen\t""")

    # Both parts read the same file, so each contributes the same rows.
    parts = [annif.corpus.DocumentFile(str(tsv_path)) for _ in range(2)]
    merged = annif.corpus.CombinedCorpus(parts)

    document_count = sum(1 for _ in merged.documents)
    assert document_count == 6
github NatLibFi / Annif / tests / test_corpus.py View on Github external
def test_docfile_is_empty(tmpdir):
    """A DocumentFile over a freshly ensured 'empty.tsv' reports is_empty()."""
    blank_path = str(tmpdir.ensure('empty.tsv'))
    corpus = annif.corpus.DocumentFile(blank_path)
    reported_empty = corpus.is_empty()
    assert reported_empty
github NatLibFi / Annif / tests / test_corpus.py View on Github external
def test_docfile_plain(tmpdir):
    """A three-row TSV file is read back as three documents."""
    tsv_path = tmpdir.join('documents.tsv')
    tsv_path.write("""Läntinen\t
        Oulunlinnan\t
        Harald Hirmuinen\t""")

    corpus = annif.corpus.DocumentFile(str(tsv_path))
    parsed = list(corpus.documents)
    assert len(parsed) == 3
github NatLibFi / Annif / annif / cli.py View on Github external
def open_doc_path(path):
    """Open a single path and return it as a document corpus.

    A directory is wrapped in a DocumentDirectory (with
    require_subjects=True); any other path is handed to DocumentFile.
    """
    if not os.path.isdir(path):
        return annif.corpus.DocumentFile(path)
    return annif.corpus.DocumentDirectory(path, require_subjects=True)