Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
if freqs_loc is not None:
with msg.loading("Counting frequencies..."):
probs, _ = read_freqs(freqs_loc)
msg.good("Counted frequencies")
else:
probs, _ = ({}, DEFAULT_OOV_PROB) # noqa: F841
if clusters_loc:
with msg.loading("Reading clusters..."):
clusters = read_clusters(clusters_loc)
msg.good("Read clusters")
else:
clusters = {}
lex_attrs = []
sorted_probs = sorted(probs.items(), key=lambda item: item[1], reverse=True)
if len(sorted_probs):
for i, (word, prob) in tqdm(enumerate(sorted_probs)):
attrs = {"orth": word, "id": i, "prob": prob}
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
if word in clusters:
attrs["cluster"] = int(clusters[word][::-1], 2)
else:
attrs["cluster"] = 0
lex_attrs.append(attrs)
if iter_since_best >= n_early_stopping:
msg.text(
"Early stopping, best iteration "
"is: {}".format(i - iter_since_best)
)
msg.text(
"Best score = {}; Final iteration "
"score = {}".format(best_score, current_score)
)
break
finally:
with nlp.use_params(optimizer.averages):
final_model_path = output_path / "model-final"
nlp.to_disk(final_model_path)
msg.good("Saved model to output directory", final_model_path)
with msg.loading("Creating best model..."):
best_model_path = _collate_best_model(meta, output_path, nlp.pipe_names)
msg.good("Created best model", best_model_path)
losses = {}
data = tqdm.tqdm(train_data, leave=False)
for batch in spacy.util.minibatch(data, size=batch_size):
texts, annots = zip(*batch)
nlp.update(texts, annots, drop=0.2, losses=losses)
with nlp.use_params(optimizer.averages):
sc = nlp.evaluate(eval_data)
if sc.ents_f > best_acc:
best_acc = sc.ents_f
if output:
best_model = nlp.to_bytes()
acc = (f"{sc.ents_p:.3f}", f"{sc.ents_r:.3f}", f"{sc.ents_f:.3f}")
msg.row((i + 1, f"{losses['ner']:.2f}", *acc), widths=row_widths)
msg.text(f"Best F-Score: {best_acc:.3f}")
if output and best_model:
with msg.loading("Saving model..."):
nlp.from_bytes(best_model).to_disk(output)
msg.good("Saved model", output)
def train_model(
model, train_path, eval_path, n_iter=10, output=None, tok2vec=None,
):
"""
Train a model from Prodigy annotations and optionally save out the best
model to disk.
"""
spacy.util.fix_random_seed(0)
with msg.loading(f"Loading '{model}'..."):
if model.startswith("blank:"):
nlp = spacy.blank(model.replace("blank:", ""))
else:
nlp = spacy.load(model)
msg.good(f"Loaded model '{model}'")
train_data, labels = format_data(srsly.read_jsonl(train_path))
eval_data, _ = format_data(srsly.read_jsonl(eval_path))
textcat = nlp.create_pipe("textcat")
for label in labels:
textcat.add_label(label)
nlp.add_pipe(textcat)
cfg = {"textcat": {"exclusive_classes": True}}
optimizer = nlp.begin_training(component_cfg=cfg)
batch_size = spacy.util.compounding(1.0, 16.0, 1.001)
best_acc = 0
best_model = None
losses = {}
data = tqdm.tqdm(train_data, leave=False)
for batch in spacy.util.minibatch(data, size=batch_size):
texts, annots = zip(*batch)
nlp.update(texts, annots, drop=0.2, losses=losses)
with nlp.use_params(optimizer.averages):
sc = nlp.evaluate(eval_data)
if sc.textcat_score > best_acc:
best_acc = sc.textcat_score
if output:
best_model = nlp.to_bytes()
acc = f"{sc.textcat_score:.3f}"
msg.row((i + 1, f"{losses['textcat']:.2f}", acc), widths=row_widths)
msg.text(f"Best F-Score: {best_acc:.3f}")
if output and best_model:
with msg.loading("Saving model..."):
nlp.from_bytes(best_model).to_disk(output)
msg.good("Saved model", output)
)
model = PolicyValueModel(args=args, predictions=predictions, name="agent")
init_inputs = initial_state.to_inputs()
model.compile(
optimizer=model.optimizer, loss="binary_crossentropy", metrics=["accuracy"]
)
model.build(initial_state.to_input_shapes())
model.predict(init_inputs)
model.predict_next(init_inputs)
opt = f"{model_path}.optimizer"
mod = f"{model_path}.h5"
if os.path.exists(mod):
if is_main and args.verbose:
with msg.loading(f"Loading model: {mod}..."):
_load_model(model, mod, opt)
msg.good(f"Loaded model: {mod}")
else:
_load_model(model, mod, opt)
# If we're doing transfer, reset optimizer steps
if is_main and args.init_model_from is not None:
msg.info("reset optimizer steps to 0 for transfer model")
model.optimizer.iterations.assign(0)
elif required:
print_error(
ValueError("Model Not Found"),
f"Cannot find model: {mod}",
print_error=False,
)
elif is_main:
def evaluate_model(model, eval_path):
"""
Evaluate a trained model on Prodigy annotations and print the accuracy.
"""
with msg.loading(f"Loading model '{model}'..."):
nlp = spacy.load(model)
data, _ = format_data(srsly.read_jsonl(eval_path))
sc = nlp.evaluate(data)
result = [("F-Score", f"{sc.textcat_score:.3f}")]
msg.table(result)
"The -f and -c arguments are deprecated, and not compatible "
"with the -j argument, which should specify the same "
"information. Either merge the frequencies and clusters data "
"into the JSONL-formatted file (recommended), or use only the "
"-f and -c files, without the other lexical attributes.",
)
jsonl_loc = ensure_path(jsonl_loc)
lex_attrs = srsly.read_jsonl(jsonl_loc)
else:
clusters_loc = ensure_path(clusters_loc)
freqs_loc = ensure_path(freqs_loc)
if freqs_loc is not None and not freqs_loc.exists():
msg.fail("Can't find words frequencies file", freqs_loc, exits=1)
lex_attrs = read_attrs_from_deprecated(freqs_loc, clusters_loc)
with msg.loading("Creating model..."):
nlp = create_model(lang, lex_attrs, name=model_name)
msg.good("Successfully created model")
if vectors_loc is not None:
add_vectors(nlp, vectors_loc, prune_vectors, vectors_name)
vec_added = len(nlp.vocab.vectors)
lex_added = len(nlp.vocab)
msg.good(
"Sucessfully compiled vocab",
"{} entries, {} vectors".format(lex_added, vec_added),
)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
return nlp
def evaluate_model(model, eval_path):
"""
Evaluate a trained model on Prodigy annotations and print the accuracy.
"""
with msg.loading(f"Loading model '{model}'..."):
nlp = spacy.load(model)
data, _ = format_data(srsly.read_jsonl(eval_path))
sc = nlp.evaluate(data)
result = [
("Precision", f"{sc.ents_p:.3f}"),
("Recall", f"{sc.ents_r:.3f}"),
("F-Score", f"{sc.ents_f:.3f}"),
]
msg.table(result)