Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_clear_project_nonexistent_data(testdatadir, caplog):
    """Running 'clear' on a project without model data logs one message."""
    # Enable propagation on the Annif logger (presumably so that the caplog
    # fixture sees its records).
    annif.logger.propagate = True
    result = runner.invoke(annif.cli.cli, ['clear', 'dummy-fi'])
    # NOTE(review): this comparison is evaluated but its value is discarded,
    # so the exit status is not actually verified -- confirm which assertion
    # was intended here.
    result.exit_code != 0
    records = caplog.records
    assert len(records) == 1
    expected_msg = 'No model data to remove for project dummy-fi.'
    assert expected_msg == records[0].message
def test_docfile_plain_invalid_lines(tmpdir, caplog):
    """Lines without a tab are skipped with a logged message when reading
    a plain document file."""
    # Enable propagation on the Annif logger (presumably so that the caplog
    # fixture sees its records).
    annif.logger.propagate = True
    docfile = tmpdir.join('documents_invalid.tsv')
    docfile.write("""Läntinen\t
Oulunlinnan\t
A line with no tabs
Harald Hirmuinen\t""")
    corpus = annif.corpus.DocumentFile(str(docfile))
    parsed_documents = list(corpus.documents)
    assert len(parsed_documents) == 3
    assert len(caplog.records) == 2
    expected_msg = 'Skipping invalid line (missing tab):'
    # every captured record must be a "skipping" message
    assert all(expected_msg in record.message for record in caplog.records)
def atomic_save(obj, dirname, filename, method=None):
    """Save the given object into the given directory with the given
    filename, using a temporary file and renaming the temporary file to the
    final name.

    The object is first written to a uniquely named temporary file inside
    dirname and only then moved to its final name, so a partially written
    file never appears under the final name. Any additional files that the
    save step creates using the temporary name as a prefix are moved the
    same way.

    obj -- object to save; must have a .save(path) method unless method is
        given
    dirname -- existing directory that receives the file(s)
    filename -- final file name within dirname
    method -- optional callable invoked as method(obj, path) in place of
        obj.save(path)

    Exceptions raised by the save step propagate to the caller after the
    temporary files have been removed on a best-effort basis."""

    prefix, suffix = os.path.splitext(filename)
    tempfd, tempfilename = tempfile.mkstemp(
        prefix=prefix, suffix=suffix, dir=dirname)
    # only the reserved name is needed; the save step opens the file itself
    os.close(tempfd)
    # Escape the name so that glob metacharacters occurring in dirname or
    # filename (e.g. '[') are matched literally.
    temp_pattern = glob.escape(tempfilename) + '*'
    logger.debug('saving %s to temporary file %s', str(obj), tempfilename)
    try:
        if method is not None:
            method(obj, tempfilename)
        else:
            obj.save(tempfilename)
    except BaseException:
        # Do not leave temporary files behind when saving fails.
        for leftover in glob.glob(temp_pattern):
            try:
                os.remove(leftover)
            except OSError:
                logger.debug('could not remove temporary file %s', leftover)
        raise
    target = os.path.join(dirname, filename)
    for fn in glob.glob(temp_pattern):
        newname = fn.replace(tempfilename, target)
        logger.debug('renaming temporary file %s to %s', fn, newname)
        # os.replace overwrites an existing target on every platform,
        # whereas os.rename fails on Windows when the target exists.
        os.replace(fn, newname)
"""Vocabulary management functionality for Annif"""
import os.path
import annif
import annif.corpus
import annif.util
from annif.datadir import DatadirMixin
from annif.exception import NotInitializedException
logger = annif.logger
class AnnifVocabulary(DatadirMixin):
    """A subject vocabulary that may be shared between several Annif
    projects."""

    # class-level default: no subject index until one has been created
    _subjects = None

    def __init__(self, vocab_id, datadir):
        """Set up the data directory mixin for this vocabulary and remember
        the vocabulary id."""
        DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
        self.vocab_id = vocab_id

    def _create_subject_index(self, subject_corpus):
        """Build a subject index from the given subject corpus, keep it on
        this instance and save it under the name 'subjects' in the data
        directory."""
        index = annif.corpus.SubjectIndex(subject_corpus)
        self._subjects = index
        annif.util.atomic_save(index, self.datadir, 'subjects')
"""Clases for supporting document corpora"""
import glob
import os.path
import re
import gzip
import annif.util
from .types import DocumentCorpus
from .subject import SubjectSet
logger = annif.logger
class DocumentDirectory(DocumentCorpus):
    """A directory of files as a full text document corpus"""

    def __init__(self, path, require_subjects=False):
        # path: directory whose '*.txt' files are enumerated by __iter__
        # require_subjects: stored as given; its effect is described in the
        # __iter__ docstring
        self.path = path
        self.require_subjects = require_subjects

    def __iter__(self):
        """Iterate through the directory, yielding tuples of (docfile,
        subjectfile) containing file paths. If there is no key file and
        require_subjects is False, the subjectfile will be returned as None."""
        # Visit the .txt files of the directory in sorted path order.
        for filename in sorted(glob.glob(os.path.join(self.path, '*.txt'))):
            # Same path with the trailing '.txt' swapped for '.tsv'.
            tsvfilename = re.sub(r'\.txt$', '.tsv', filename)
            # NOTE(review): the rest of the loop body (the part that yields
            # the tuples described in the docstring) is not visible in this
            # excerpt.
try:
return _analyzers[analyzer](param)
except KeyError:
raise ValueError("No such analyzer {}".format(analyzer))
# Analyzers that are always registered
register_analyzer(simple.SimpleAnalyzer)
register_analyzer(snowball.SnowballAnalyzer)

# Optional analyzers
try:
    # Registered only when the voikko submodule can be imported; an
    # ImportError raised anywhere in this try body is logged at debug level
    # and otherwise ignored.
    from . import voikko
    register_analyzer(voikko.VoikkoAnalyzer)
except ImportError:
    annif.logger.debug("voikko not available, not enabling voikko analyzer")
raise ValueError("No such backend type {}".format(backend_id))
# Backends that are always registered
register_backend(dummy.DummyBackend)
register_backend(ensemble.EnsembleBackend)
register_backend(http.HTTPBackend)
register_backend(tfidf.TFIDFBackend)
register_backend(doc2vec.Doc2VecBackend)
register_backend(pav.PAVBackend)

# Optional backends
# Each one is registered only when its submodule can be imported; an
# ImportError raised inside the try body is logged at debug level and
# otherwise ignored.
try:
    from . import fasttext
    register_backend(fasttext.FastTextBackend)
except ImportError:
    annif.logger.debug("fastText not available, not enabling fasttext backend")

try:
    from . import vw_multi
    register_backend(vw_multi.VWMultiBackend)
except ImportError:
    annif.logger.debug(
        "vowpalwabbit not available, not enabling vw_multi backend")
# Backends that are always registered
register_backend(tfidf.TFIDFBackend)
register_backend(doc2vec.Doc2VecBackend)
register_backend(pav.PAVBackend)

# Optional backends
# Each one is registered only when its submodule can be imported; an
# ImportError raised inside the try body is logged at debug level and
# otherwise ignored.
try:
    from . import fasttext
    register_backend(fasttext.FastTextBackend)
except ImportError:
    annif.logger.debug("fastText not available, not enabling fasttext backend")

try:
    from . import vw_multi
    register_backend(vw_multi.VWMultiBackend)
except ImportError:
    annif.logger.debug(
        "vowpalwabbit not available, not enabling vw_multi backend")
def info(self, message):
    """Write the given message to the log at info level, prefixed with the
    id of this backend."""
    text = "Backend {}: {}".format(self.backend_id, message)
    logger.info(text)
method parameter is given) into the given directory with the given
filename, using a temporary file and renaming the temporary file to the
final name."""
prefix, suffix = os.path.splitext(filename)
tempfd, tempfilename = tempfile.mkstemp(
prefix=prefix, suffix=suffix, dir=dirname)
os.close(tempfd)
logger.debug('saving %s to temporary file %s', str(obj), tempfilename)
if method is not None:
method(obj, tempfilename)
else:
obj.save(tempfilename)
for fn in glob.glob(tempfilename + '*'):
newname = fn.replace(tempfilename, os.path.join(dirname, filename))
logger.debug('renaming temporary file %s to %s', fn, newname)
os.rename(fn, newname)