Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_train_charlm_load_use_classifier(results_base_path, tasks_base_path):
    """Train a small text classifier on the IMDB sample corpus and sanity-check
    that predictions carry a label value and a probability-like score.

    Parameters
    ----------
    results_base_path : path where training artifacts are written
    tasks_base_path : path containing the "imdb" classification corpus
    """
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_dict = corpus.make_label_dictionary()

    embedding: TokenEmbeddings = FlairEmbeddings("news-forward-fast")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [embedding], 128, 1, False, 64, False, False
    )

    model: TextClassifier = TextClassifier(document_embeddings, label_dict, False)

    trainer = ModelTrainer(model, corpus)
    # shuffle=False keeps the tiny run deterministic across test invocations
    trainer.train(results_base_path, max_epochs=2, shuffle=False)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            # scores are expected to be probabilities in [0, 1]
            assert 0.0 <= l.score <= 1.0
            # isinstance is the idiomatic type check; `type(x) is float`
            # would reject float subclasses for no benefit
            assert isinstance(l.score, float)
def test_loading_not_existing_char_lm_embedding():
    """Requesting an unknown FlairEmbeddings model id must raise ValueError."""
    with pytest.raises(ValueError):
        FlairEmbeddings("other")
resources_path / "corpora/lorem_ipsum",
dictionary,
language_model.is_forward_lm,
character_level=True,
)
# train the language model
trainer: LanguageModelTrainer = LanguageModelTrainer(
language_model, corpus, test_mode=True
)
trainer.train(
results_base_path, sequence_length=10, mini_batch_size=10, max_epochs=2
)
# use the character LM as embeddings to embed the example sentence 'I love Berlin'
char_lm_embeddings: TokenEmbeddings = FlairEmbeddings(
str(results_base_path / "best-lm.pt")
)
sentence = Sentence("I love Berlin")
char_lm_embeddings.embed(sentence)
text, likelihood = language_model.generate_text(number_of_characters=100)
assert text is not None
assert len(text) >= 100
# clean up results directory
shutil.rmtree(results_base_path, ignore_errors=True)
def init_document_embeddings():
    """Build a two-sentence demo Sentence plus word-level and character-LM
    embeddings, returned as a (sentence, glove, charlm) triple for tests."""
    sentence: Sentence = Sentence(
        "I love Berlin. Berlin is a great place to live."
    )
    glove: TokenEmbeddings = WordEmbeddings("turian")
    charlm: TokenEmbeddings = FlairEmbeddings("news-forward-fast")
    return sentence, glove, charlm
def test_compute_perplexity():
    """A grammatical English sentence should score lower LM perplexity than
    gibberish under the forward character language model."""
    from flair.embeddings import FlairEmbeddings

    language_model = FlairEmbeddings("news-forward-fast").lm

    grammatical = "The company made a profit"
    ungrammatical = "Nook negh qapla!"

    perplexity_grammatical = language_model.calculate_perplexity(grammatical)
    perplexity_ungrammatical = language_model.calculate_perplexity(ungrammatical)

    print(f'"{grammatical}" - perplexity is {perplexity_grammatical}')
    print(f'"{ungrammatical}" - perplexity is {perplexity_ungrammatical}')

    assert perplexity_grammatical < perplexity_ungrammatical

    # NOTE(review): the backward model is loaded but unused in the visible
    # snippet — presumably the original test continued; kept for parity.
    language_model = FlairEmbeddings("news-backward-fast").lm
def post_init(self):
    """Assemble the pooled document embedding from the configured word
    embedding and the forward/backward flair embedding pair.

    Reads self.word_embedding, self.flair_embeddings (two model names)
    and self.pooling_strategy; stores the result on self._flair.
    """
    from flair.embeddings import DocumentPoolEmbeddings, FlairEmbeddings, WordEmbeddings

    stacked = [
        WordEmbeddings(self.word_embedding),
        FlairEmbeddings(self.flair_embeddings[0]),
        FlairEmbeddings(self.flair_embeddings[1]),
    ]
    self._flair = DocumentPoolEmbeddings(stacked, pooling=self.pooling_strategy)
return content.decode("utf-8")
# Build the embedding stack from CLI arguments of the form "<type>:<path>",
# e.g. "word:glove" or "flair:news-forward".
embeddingList = []
for arg in sys.argv[1:]:
    # Split only on the FIRST colon so paths that themselves contain ":"
    # (e.g. Windows drive letters like "C:\models\lm.pt") stay intact.
    embeddingType, path = arg.split(":", 1)
    # The branches are mutually exclusive, so use elif (and avoid shadowing
    # the builtin `type`, which the original code did).
    if embeddingType == "word":
        embeddingList.append(WordEmbeddings(path))
    elif embeddingType == "char":
        embeddingList.append(CharacterEmbeddings(path))
    elif embeddingType == "bytepair":
        embeddingList.append(BytePairEmbeddings(path))
    elif embeddingType == "flair":
        embeddingList.append(FlairEmbeddings(path))
    elif embeddingType == "bert":
        embeddingList.append(BertEmbeddings(path))
    elif embeddingType == "elmo":
        embeddingList.append(ELMoEmbeddings(path))

# Stack multiple embeddings; a single embedding is used directly.
if len(embeddingList) > 1:
    embeddings = StackedEmbeddings(embeddings=embeddingList)
else:
    embeddings = embeddingList[0]
stdbuffer = sys.stdin.buffer
print("Script is ready")
while True:
line = decodeString(stdbuffer)
if line.strip() == "exit":
sys.exit(0)
sentenceTaggingRequests = json.loads(line)
word_tokens_namespace: str = "tokens",
):
""" Flair Embeddings. This is used to produce Named Entity Recognition. Note: This only
works if your tokens are produced by splitting based on white space
Parameters
----------
embedding_type
datasets_manager
device
word_tokens_namespace
"""
super(FlairEmbedder, self).__init__()
self.allowed_type = ["en", "news"]
assert embedding_type in self.allowed_type
self.embedder_forward = FlairEmbeddings(f"{embedding_type}-forward")
self.embedder_backward = FlairEmbeddings(f"{embedding_type}-backward")
self.embedder_name = f"FlairEmbedder-{embedding_type}"
self.datasets_manager = datasets_manager
self.device = torch.device(device) if isinstance(device, str) else device
self.word_tokens_namespace = word_tokens_namespace
def get_embeddings(embeddings: List[str], character: bool, lang: str, bpe_size: int) -> StackedEmbeddings:
    """Construct and return a stacked embedding model.

    Any non-empty name containing 'forward' or 'backward' is loaded as a
    FlairEmbeddings model; every other non-empty name as WordEmbeddings.
    Character embeddings are appended when `character` is set, and byte-pair
    embeddings for `lang` when `bpe_size` is positive.
    """
    stack = []
    for name in embeddings:
        if name == '':
            continue  # skip empty entries (e.g. from splitting a config string)
        if 'forward' in name or 'backward' in name:
            stack.append(FlairEmbeddings(name))
        else:
            stack.append(WordEmbeddings(name))
    if character:
        stack.append(CharacterEmbeddings())
    if bpe_size > 0:
        stack.append(BytePairEmbeddings(language=lang, dim=bpe_size))
    return StackedEmbeddings(embeddings=stack)
"""
from flair.embeddings import FlairEmbeddings
from flair.embeddings import WordEmbeddings
from flair.embeddings import StackedEmbeddings
embeddings = []
if word_embeddings:
fasttext_embedding = WordEmbeddings('da')
embeddings.append(fasttext_embedding)
if direction == 'bi' or direction == 'fwd':
fwd_weight_path = download_model('flair.fwd', cache_dir,
verbose=verbose,
process_func=_unzip_process_func)
embeddings.append(FlairEmbeddings(fwd_weight_path))
if direction == 'bi' or direction == 'bwd':
bwd_weight_path = download_model('flair.bwd', cache_dir,
verbose=verbose,
process_func=_unzip_process_func)
embeddings.append(FlairEmbeddings(bwd_weight_path))
if len(embeddings) == 1:
return embeddings[0]
return StackedEmbeddings(embeddings=embeddings)