# For all uppercase phrases (words, entities, persons),
# maintain a map of last names to full names
parts = w.split()
lastname = parts[-1]
# Clinton -> Hillary [Rodham] Clinton
if lastname[0].isupper():
    # Look for Icelandic patronyms/matronyms
    _, m = db.lookup_word(lastname, False)
    if m and any(mm.fl in {"föð", "móð"} for mm in m):
        # We don't store Icelandic patronyms/matronyms
        # as surnames
        pass
    else:
        lastnames[lastname] = token
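
# A minimal, self-contained sketch of the last-name map maintained above,
# with a hypothetical is_patronym() helper standing in for the
# db.lookup_word() check against the "föð"/"móð" (patronym/matronym)
# categories; these names are assumptions, not the original code:

def is_patronym(name):
    # Crude stand-in for the BÍN category lookup
    return name.endswith(("son", "dóttir"))

def collect_lastnames(full_names):
    """Map last names to full names, e.g. 'Clinton' -> 'Hillary Rodham Clinton'."""
    lastnames = {}
    for full in full_names:
        last = full.split()[-1]
        if last[0].isupper() and not is_patronym(last):
            lastnames[last] = full
    return lastnames

# collect_lastnames(["Hillary Rodham Clinton", "Jón Jónsson"])
# -> {"Clinton": "Hillary Rodham Clinton"}  (the patronym "Jónsson" is skipped)
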
if token.kind == TOK.WORD and upper and w not in Abbreviations.DICT:
    if " " in w:
        # w may be a person name with more than one embedded word
        # parts is assigned in the if statement above
        cnt = len(parts)
    elif not token.val or ("-" in token.val[0].stofn):
        # No BÍN meaning for this token, or the meanings
        # were constructed by concatenation (indicated by a hyphen
        # in the stem)
        weak = False  # Accept single-word entity references
    # elist is a list of Entity instances
    elist = query_entities(w)
else:
    elist = []

if elist:
    # This word might be a candidate to start an entity reference
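
# The snippet above is cut off in the source; the gist is that an
# uppercase word that is not a known abbreviation triggers an entity
# lookup, and a non-empty result makes it a candidate to start an
# entity reference. A rough self-contained sketch of that gating logic
# (entity_candidates and query_db are illustrative stand-ins, not the
# original API):

def entity_candidates(word, abbreviations, query_db):
    """Return entity matches if word can start an entity reference."""
    if not word or not word[0].isupper() or word in abbreviations:
        return []
    # A non-empty result marks the word as a candidate
    return query_db(word)

# entity_candidates("Clinton", set(), lambda w: ["Hillary Rodham Clinton"])
# -> ["Hillary Rodham Clinton"]
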
def ifd_taglist_word(self, txt, mlist):
    if not mlist:
        if txt[0].isupper():
            # Unknown proper noun?
            # !!! The probabilities below are a rough guess
            return [
                ("nxen-s", 0.6),
                ("nxeo-s", 0.1),
                ("nxeþ-s", 0.1),
                ("nxee-s", 0.2),
            ]
        # Foreign word?
        return [("e", 1.0)]
    s = {ifd_tag(TOK.WORD, txt, m) for m in mlist}
    ltxt = txt.lower()
    if ltxt in Prepositions.PP:
        for case in Prepositions.PP[ltxt]:
            if case in self.CASE_TO_TAG:
                s.add(self.CASE_TO_TAG[case])
    if ltxt in self._CONJ_REF:
        # For referential conjunctions, add 'ct' as a possibility
        # (it does not come directly from a BÍN mark)
        s.add("ct")
    # Add a +1 bias to the counts so that no lemma/tag pair has zero frequency
    prob = self.lemma_count(txt) + len(s)
    d = self.lemma_tags(txt)
    # It is possible for the probabilities of the tags in set s
    # not to add up to 1.0. This can happen if the tokenizer has
    # eliminated certain BÍN meanings due to updated settings
    # in Pref.conf.
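
# The "+1 bias" above is add-one (Laplace) smoothing: each candidate tag
# gets one extra count, so unseen lemma/tag pairs still receive a small
# non-zero probability. A self-contained sketch with made-up counts
# (smoothed_tag_probs and the counts below are illustrative, not the
# original code):

def smoothed_tag_probs(tag_counts, candidate_tags):
    """Return add-one smoothed P(tag | lemma) over the candidate tags."""
    total = sum(tag_counts.get(t, 0) for t in candidate_tags) + len(candidate_tags)
    return {t: (tag_counts.get(t, 0) + 1) / total for t in candidate_tags}

# smoothed_tag_probs({"nken": 7, "nkeo": 2}, {"nken", "nkeo", "nkeþ"})
# -> {"nken": 8/12, "nkeo": 3/12, "nkeþ": 1/12}
# The three probabilities sum to 1.0 even though "nkeþ" was never seen.
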
def gen_tokens():
    """ Generate a Greynir token sequence from a tagging result """
    ix = 0
    for t in toklist:
        if not t.txt:
            continue
        # The code below should correspond to TreeUtility._describe_token()
        d = dict(x=t.txt)
        if t.kind == TOK.WORD:
            # set d["m"] to the meaning
            pass
        else:
            d["k"] = t.kind
        if t.val is not None and t.kind not in {
            TOK.WORD,
            TOK.ENTITY,
            TOK.PUNCTUATION,
        }:
            # For tokens other than words, entities and punctuation,
            # include the val field
            if t.kind == TOK.PERSON:
                d["v"], d["g"] = TreeUtility.choose_full_name(
                    t.val, case=None, gender=None
                )
            else:
                d["v"] = t.val
        if t.kind in {
            TOK.WORD,
            TOK.ENTITY,
            TOK.PERSON,
            TOK.NUMBER,
            TOK.YEAR,
        }:
            # The snippet is cut off here in the source; presumably the
            # IFD tag for the token is attached via the running index ix
            # (this completion is an assumption, not the original code)
            d["t"] = tags[ix]
            ix += 1
        yield d
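
# For reference, a minimal illustration of the token dict that gen_tokens()
# yields, with field names taken from the code above; the concrete values
# are made up for illustration:
example_person_token = {
    "x": "Hillary",                 # token text
    "k": TOK.PERSON,                # token kind
    "v": "Hillary Rodham Clinton",  # chosen full name
    "g": "kvk",                     # gender (kvk = feminine)
    "t": "nven-s",                  # IFD tag (illustrative value)
}
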
def tag_stream(sentence_stream: Iterable[Iterable[Dict[str, Any]]]) -> Iterator[str]:
    """ Generator for tag stream from a token stream """
    for sent in sentence_stream:
        if not sent:
            continue
        # For each sentence, start and end with empty strings
        for _ in range(n - 1):
            yield ""
        for t in sent:
            tag = None
            # Skip punctuation
            if t.get("k", TOK.WORD) != TOK.PUNCTUATION:
                canonicalize_token(t)
                tag = str(IFD_Tagset(t))
                if tag:
                    self.lemma_cnt[t["x"]][tag] += 1
            if tag:
                yield tag
        for _ in range(n - 1):
            yield ""