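These snippets appear to be excerpts from the Annif project's test suite and backend modules. A sketch of the imports they rely on (module paths assumed, not shown in the original excerpts):

import joblib
import pytest
from sklearn.feature_extraction.text import TfidfVectorizer

import annif.backend
import annif.corpus
import annif.project
import annif.util
from annif.exception import NotSupportedException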

# Test: learn() raises NotSupportedException for a project whose backend
# does not support incremental learning.
def test_project_learn_not_supported(app, tmpdir):
    tmpdir.join('doc1.txt').write('doc1')
    tmpdir.join('doc1.tsv').write('\tkey1')
    tmpdir.join('doc2.txt').write('doc2')
    tmpdir.join('doc2.tsv').write('\tkey2')
    docdir = annif.corpus.DocumentDirectory(str(tmpdir))

    with app.app_context():
        project = annif.project.get_project('tfidf-fi')
        with pytest.raises(NotSupportedException):
            project.learn(docdir)

# Test: training the Omikuji backend with an empty corpus is rejected.
def test_omikuji_train_nodocuments(datadir, project, empty_corpus):
    omikuji_type = annif.backend.get_backend('omikuji')
    omikuji = omikuji_type(
        backend_id='omikuji',
        config_params={},
        project=project)

    with pytest.raises(NotSupportedException):
        omikuji.train(empty_corpus)
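
The empty_corpus fixture is not shown on this page; a plausible sketch, assuming it follows the same DocumentFile-over-an-empty-TSV pattern that the fastText test below uses inline:

@pytest.fixture
def empty_corpus(tmpdir):
    # Hypothetical fixture: an empty TSV yields a corpus with no documents,
    # so corpus.is_empty() is True in the train() guards further down.
    empty_file = tmpdir.ensure('empty.tsv')
    return annif.corpus.DocumentFile(str(empty_file))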

# Test: the error message names the backend that rejected the empty corpus.
def test_maui_train_nodocuments(maui, project, empty_corpus):
    with pytest.raises(NotSupportedException) as excinfo:
        maui.train(empty_corpus)
    assert 'training backend maui with no documents' in str(excinfo.value)

# Test: the fastText backend likewise refuses to train on an empty corpus.
def test_fasttext_train_nodocuments(tmpdir, datadir, project):
    fasttext_type = annif.backend.get_backend('fasttext')
    fasttext = fasttext_type(
        backend_id='fasttext',
        config_params={
            'limit': 50,
            'dim': 100,
            'lr': 0.25,
            'epoch': 20,
            'loss': 'hs'},
        datadir=str(datadir))

    empty_file = tmpdir.ensure('empty.tsv')
    empty_document_corpus = annif.corpus.DocumentFile(str(empty_file))

    with pytest.raises(NotSupportedException) as excinfo:
        fasttext.train(empty_document_corpus, project)
    assert 'training backend fasttext with no documents' in str(excinfo.value)

# Omikuji backend: guard against an empty corpus, then vectorize the
# document texts and build the model.
def train(self, corpus):
    if corpus.is_empty():
        raise NotSupportedException(
            'Cannot train omikuji project with no documents')
    inputs = (doc.text for doc in corpus.documents)  # lazy stream of document texts
    params = {'min_df': int(self.params['min_df']),
              'tokenizer': self.project.analyzer.tokenize_words}
    veccorpus = self.create_vectorizer(inputs, params)
    self._create_train_file(veccorpus, corpus)
    self._create_model()
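
A note on the parameters above: min_df is a standard vectorizer setting that drops terms occurring in fewer than min_df training documents, and the project's analyzer supplies word tokenization. This assumes create_vectorizer (not shown here) builds a scikit-learn-style vectorizer, as the TF-IDF backend further down does explicitly.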

# Likely the Maui backend (HTTP-based): training uploads the vocabulary and
# the training documents to the Maui server, then polls until training ends.
def train(self, corpus):
    if corpus.is_empty():
        raise NotSupportedException('training backend {} with no documents'
                                    .format(self.backend_id))
    self._initialize_tagger()
    self._upload_vocabulary()
    self._create_train_file(corpus)
    self._upload_train_file()
    self._wait_for_train()

# Likely the fastText backend: the same empty-corpus guard, then a train file
# and the model itself.
def train(self, corpus, project):
    if corpus.is_empty():
        raise NotSupportedException('training backend {} with no documents'
                                    .format(self.backend_id))
    self._create_train_file(corpus, project)
    self._create_model()

def learn(self, corpus):
    """further train the project using documents from a metadata source"""
    corpus.set_subject_index(self.subjects)
    if isinstance(self.backend,
                  annif.backend.backend.AnnifLearningBackend):
        self.backend.learn(corpus, project=self)
    else:
        raise NotSupportedException("Learning not supported by backend",
                                    project_id=self.project_id)
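
For contrast with test_project_learn_not_supported at the top of the page, a minimal sketch of the supported path, assuming a hypothetical project id 'learning-fi' whose backend inherits from AnnifLearningBackend (the tests call get_project() inside a Flask app context):

docdir = annif.corpus.DocumentDirectory('/path/to/docs')  # hypothetical path
with app.app_context():  # 'app' as in the test above
    project = annif.project.get_project('learning-fi')  # hypothetical project id
    project.learn(docdir)  # dispatches to the backend's learn()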

# TF-IDF backend: training fits a TfidfVectorizer over per-subject text and
# saves it atomically before building the similarity index.
def train(self, corpus, project):
    if corpus.is_empty():
        raise NotSupportedException(
            'Cannot train tfidf project with no documents')
    self.info('transforming subject corpus')
    subjects = self._generate_subjects_from_documents(corpus, project)
    self.info('creating vectorizer')
    self._vectorizer = TfidfVectorizer()
    veccorpus = self._vectorizer.fit_transform(subjects)
    annif.util.atomic_save(
        self._vectorizer,
        self.datadir,
        self.VECTORIZER_FILE,
        method=joblib.dump)
    self._create_index(veccorpus)
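
annif.util.atomic_save receives the object, the data directory, the target filename and a save callable (joblib.dump here); as the name suggests, it presumably writes to a temporary file and renames it into place, so an interrupted training run does not leave a truncated vectorizer on disk.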

# PAV (pool adjacent violators) ensemble backend: trains one PAV model per
# source project listed in the 'sources' parameter.
def train(self, corpus, project):
    if corpus.is_empty():
        raise NotSupportedException('training backend {} with no documents'
                                    .format(self.backend_id))
    self.info('creating PAV models')
    sources = annif.util.parse_sources(self.params['sources'])
    min_docs = int(self.params['min-docs'])
    for source_project_id, _ in sources:
        self._create_pav_model(source_project_id, min_docs, corpus)
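
The sources parameter is parsed by annif.util.parse_sources into (project_id, weight) pairs, as the unpacking in the loop shows; only the project id is used here, with the weight discarded. A plausible configuration would be something like sources=tfidf-fi,fasttext-fi together with min-docs=10 (the project ids and value are illustrative only).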