def test_default(self, spacy_doc):
    result = list(extract.noun_chunks(spacy_doc))
    assert all(isinstance(span, Span) for span in result)
for bigram in bigrams:
    assert isinstance(bigram, Span)
    assert len(bigram) == 2

trigrams = list(
    extract.ngrams(doc, 3, filter_stops=True, filter_punct=True, min_freq=2)
)[:10]
for trigram in trigrams:
    assert isinstance(trigram, Span)
    assert len(trigram) == 3

nes = list(
    extract.entities(doc, drop_determiners=False, exclude_types="numeric")
)[:10]
for ne in nes:
    assert isinstance(ne, Span)
    assert ne.label_
    assert ne.label_ != "QUANTITY"

pos_regex_matches = list(
    extract.pos_regex_matches(doc, constants.POS_REGEX_PATTERNS["en"]["NP"])
)[:10]
for match in pos_regex_matches:
    assert isinstance(match, Span)

stmts = list(extract.semistructured_statements(doc, "I", cue="be"))[:10]
for stmt in stmts:
    assert isinstance(stmt, list)
    assert isinstance(stmt[0], compat.unicode_)
    assert len(stmt) == 3

kts = textacy.ke.textrank(doc, topn=10)
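
# A minimal, self-contained sketch of building the `doc` used in the calls above
# (the sample text and the "en_core_web_sm" model name are assumptions, and the
# model must be installed; the extract calls mirror those in the snippet):
import textacy
from textacy import extract

text = "Natural language processing (NLP) turns raw text into structured data."
doc = textacy.make_spacy_doc(text, lang="en_core_web_sm")
bigrams = list(extract.ngrams(doc, 2, filter_stops=True, filter_punct=True))[:10]
entities = list(extract.entities(doc, drop_determiners=False))[:10]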
"""Initialise the pipeline component.
"""
self._has_entities, self._is_entity, self._entity_desc, self._entities, self.canonical = attrs
# Set up the KeywordProcessor
self.keyword_processor = KeywordProcessor(case_sensitive=case_sensitive)
self.keyword_processor.add_keywords_from_list(keywords_list)
self.keyword_processor.add_keywords_from_dict(keywords_dict)
if keywords_file:
    self.keyword_processor.add_keyword_from_file(keywords_file)
self.label = label
# Register attribute on the Doc and Span
Doc.set_extension(self._has_entities, getter=self.has_entities, force=True)
Doc.set_extension(self._entities, getter=self.iter_entities, force=True)
Span.set_extension(self._has_entities, getter=self.has_entities, force=True)
Span.set_extension(self._entities, getter=self.iter_entities, force=True)
# Register attribute on the Token.
Token.set_extension(self._is_entity, default=False, force=True)
Token.set_extension(self._entity_desc, getter=self.get_entity_desc, force=True)
Token.set_extension(self.canonical, default=None, force=True)
def do() -> None:
    Token.set_extension('censored', default=None)
    Token.set_extension('is_profane', getter=SpacyProfanityFilterComponent.token_is_profane)
    Token.set_extension('original_profane_word', default=None)
    Span.set_extension('is_profane', getter=SpacyProfanityFilterComponent.tokens_are_profane)
    Doc.set_extension('is_profane', getter=SpacyProfanityFilterComponent.tokens_are_profane)
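
# Once extensions like the ones registered above exist, values are read and written
# through spaCy's underscore namespace. A minimal sketch (the blank pipeline, text,
# and replacement string are illustrative assumptions):
import spacy
from spacy.tokens import Token

Token.set_extension('censored', default=None, force=True)
nlp = spacy.blank('en')
doc = nlp('some text to check')
doc[0]._.censored = '****'
print(doc[0]._.censored)  # -> ****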
        cui,
        self.cdb.cui2pretty_name.get(cui, ''),
        self.cdb.cui2tui.get(cui, ''),
        self.cdb.tui2name.get(self.cdb.cui2tui.get(cui, ''), ''),
        float(acc))
elif self.LBL_STYLE == 'ent':
    lbl = "{} - {:.2}".format(
        self.cdb.tui2name.get(self.cdb.cui2tui.get(cui, ''), ''),
        float(acc))
elif self.LBL_STYLE == 'none':
    lbl = ""
else:
    lbl = cui

lbl = doc.vocab.strings.add(lbl)
ent = Span(doc, tkns[0].i, tkns[-1].i + 1, label=lbl)
if self.ACC_ALWAYS:
    acc = self._calc_acc(cui, doc, tkns, name)
ent._.acc = acc
ent._.cui = cui
ent._.tui = self.cdb.cui2tui.get(cui, 'None')
ent._.id = self.ent_id
self.ent_id += 1
doc._.ents.append(ent)

# Increase the cui_count_ext counter if this cui was not already added
if cui not in self._cuis:
    if cui in self.cdb.cui_count_ext:
        self.cdb.cui_count_ext[cui] += 1
def find_matches_for(
    self, filtered: List[Tuple[Span, Span]], doc: Doc
) -> List[Tuple[Span, Set[Span]]]:
    rules = {}
    all_occurences: Dict[Span, Set[Span]] = defaultdict(set)
    already_seen_long: Set[str] = set()
    already_seen_short: Set[str] = set()
    for (long_candidate, short_candidate) in filtered:
        short, long = find_abbreviation(long_candidate, short_candidate)
        # We need the long and short form definitions to be unique, because we need
        # to store them so we can look them up later. This is a bit of a
        # pathological case, as it would mean an abbreviation had been
        # defined twice in a document. There's not much we can do about this,
        # but at least the case which is discarded will be picked up below by
        # the global matcher, so things are likely to work out ok most of the time.
        new_long = long.string not in already_seen_long if long else False
        new_short = short.string not in already_seen_short
        if long is not None and new_long and new_short:
            already_seen_long.add(long.string)
            already_seen_short.add(short.string)
            all_occurences[long].add(short)
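
# A hedged usage sketch for an abbreviation detector like the one above, following
# the spaCy v2-era API implied by the fragment (the "en_core_sci_sm" model and the
# sample sentence are assumptions; the model must be installed):
import spacy
from scispacy.abbreviation import AbbreviationDetector

nlp = spacy.load("en_core_sci_sm")
nlp.add_pipe(AbbreviationDetector(nlp), last=True)
doc = nlp("Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron disease.")
for short_form in doc._.abbreviations:
    print(short_form, "->", short_form._.long_form)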
:return:
'''
allowed_pos = [NOUN, ADJ, PUNCT, PROPN]
allowed_dep = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl", "dobj", "attr", "oprd", "pobj", "conj",
               "compound", "amod", "punct", "meta", "npadvmod", "nmod"]  # add "prep" to extend for "of" and "in"
extended_tokens = [i for i in tok.subtree if (i.dep_ in allowed_dep and i in tok.children) or (i == tok)]
allowed_continous_tokens = []
# break off the extended tokens if something not allowed sits between the selected tokens in the subtree
curr_pos = extended_tokens[0].i - 1
for ex_t in extended_tokens:
    if ex_t.i == curr_pos + 1:
        curr_pos = ex_t.i
        allowed_continous_tokens.append(ex_t)
    else:
        break
span = Span(self.doc, allowed_continous_tokens[0].i, allowed_continous_tokens[-1].i + 1)
return span
in the doc that match the lexicon and overlays the appropriate label as 'feature_is_label_from_lexicon'
over all tokens in the span.
:param doc:
:return:
"""
logging.debug("Called Lexicon Component")
matcher = PhraseMatcher(self.nlp.vocab, max_length=10)
for label in self.lexicon:
    Token.set_extension('feature_is_' + label + '_from_lexicon', default=False, force=True)
    patterns = [self.nlp.make_doc(term) for term in self.lexicon[label]]
    logging.debug(patterns)
    matcher.add(label, None, *patterns)
matches = matcher(doc)
for match_id, start, end in matches:
    span = Span(doc, start, end)
    logging.debug(span)
    if span is not None:
        logging.debug('Lexicon term matched: %s Label: %s' % (span.text, self.nlp.vocab.strings[match_id]))
        for token in span:
            token._.set('feature_is_' + self.nlp.vocab.strings[match_id] + '_from_lexicon', True)
return doc
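
# A standalone sketch of the PhraseMatcher pattern used above, with the spaCy v2-style
# matcher.add(label, None, *patterns) call from the component (the lexicon terms,
# label, and sample text are illustrative assumptions):
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')
matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp.make_doc(term) for term in ['aspirin', 'ibuprofen']]
matcher.add('DRUG', None, *patterns)
doc = nlp('The patient was given aspirin.')
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)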
def __call__(self, doc):
    nlp = self.nlp
    with doc.retokenize() as retokenizer:
        # match frequency indicators and label them
        matches = self.frequency_matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=nlp.vocab.strings['frequency_indicator'])
            for token in span:
                token._.feature_is_frequency_indicator = True
            if len(span) > 1:
                retokenizer.merge(span)
            doc.ents = list(doc.ents) + [span]
    return doc
def add_cat(self, spacy_cat):
    self.nlp.add_pipe(spacy_cat, name='cat', last=True)

    # Add the custom fields needed for this use case
    Doc.set_extension('ents', default=None, force=True)
    Span.set_extension('acc', default=-1, force=True)
    Span.set_extension('cui', default=-1, force=True)
    Span.set_extension('tui', default=-1, force=True)
    Span.set_extension('id', default=0, force=True)
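
# With the extensions registered in add_cat above, downstream code can read the
# per-entity metadata from the underscore namespace. A minimal sketch (assumes the
# doc was produced by a pipeline containing the 'cat' component):
def print_cat_entities(doc):
    # doc._.ents defaults to None, so guard before iterating
    for ent in doc._.ents or []:
        print(ent.text, ent._.cui, ent._.tui, ent._.acc)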