def gen_tokens():
    """ Generate a Greynir token sequence from a tagging result """
    ix = 0
    for t in toklist:
        if not t.txt:
            continue
        # The code below should correspond to TreeUtility._describe_token()
        d = dict(x=t.txt)
        if t.kind == TOK.WORD:
            # Set d["m"] to the meaning
            pass
        else:
            d["k"] = t.kind
        if t.val is not None and t.kind not in {
            TOK.WORD,
            TOK.ENTITY,
            TOK.PUNCTUATION,
        }:
            # For tokens other than words, entities and punctuation,
            # include the val field
            if t.kind == TOK.PERSON:
                d["v"], d["g"] = TreeUtility.choose_full_name(
                    t.val, case=None, gender=None
                )
            else:
                d["v"] = t.val
        if t.kind in {
            TOK.WORD,
            TOK.ENTITY,
            TOK.PERSON,
            TOK.NUMBER,
            TOK.YEAR,
            TOK.ORDINAL,
            TOK.PERCENT,
        }:
            d["i"] = tags[ix]
            ix += 1
        if t.kind == TOK.WORD and " " in d["x"]:
            # Some kind of phrase: split it
            xlist = d["x"].split()
            for x in xlist:
                d["x"] = x
                if x == "og":
                    # Probably an intermediate word, as in
                    # "fjármála- og efnahagsráðherra"
                    yield dict(x="og", i="c")
                else:
                    yield d.copy()
        else:
            # Single-word token: yield the dict as-is
            yield d
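The phrase-splitting rule at the end is the subtle part: every component of a multi-word WORD token keeps the tag assigned to the whole phrase, while the connective "og" gets the fixed conjunction tag "c". A standalone sketch of just that rule (the tag string "nkee" is a made-up placeholder, not a claim about the real tagger's output):

def split_phrase(d):
    """ Split a multi-word token dict, tagging "og" as a conjunction """
    for x in d["x"].split():
        if x == "og":
            yield dict(x="og", i="c")
        else:
            yield dict(x=x, i=d["i"])

print(list(split_phrase({"x": "fjármála- og efnahagsráðherra", "i": "nkee"})))
# -> [{'x': 'fjármála-', 'i': 'nkee'}, {'x': 'og', 'i': 'c'},
#     {'x': 'efnahagsráðherra', 'i': 'nkee'}]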
def recognize_entities(
    token_stream: Iterator[Tok], enclosing_session=None, token_ctor=TOK
) -> Iterator[Tok]:
    """ Parse a stream of tokens looking for (capitalized) entity names.
        The algorithm implements N-token lookahead where N is the
        length of the longest entity name having a particular initial word.
        Adds a named entity recognition layer on top of the
        reynir.bintokenizer.tokenize() function.
    """
    # Token queue
    tq = []  # type: List[Tok]
    # Phrases we're considering. Note that an entry of None
    # indicates that the accumulated phrase so far is a complete
    # and valid known entity name.
    state = defaultdict(list)  # type: Dict[Union[str, None], List[Tuple[List[str], Entity]]]
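The lookahead idea can be sketched independently of the entity database and the Tok machinery: keep a mapping from an initial word to the known multi-word names starting with it, buffer as many tokens as the longest candidate requires, and greedily emit the longest match. A minimal sketch under those assumptions (match_names and the names dict are illustrative, not part of the library):

from typing import Dict, Iterator, List

def match_names(words: Iterator[str], names: Dict[str, List[List[str]]]) -> Iterator[str]:
    """ Greedily join the longest known name starting at each word """
    buf = list(words)
    i = 0
    while i < len(buf):
        best = 0
        for cand in names.get(buf[i], []):
            n = len(cand)
            if buf[i : i + n] == cand and n > best:
                best = n  # longest known name wins
        if best:
            yield " ".join(buf[i : i + best])
            i += best
        else:
            yield buf[i]
            i += 1

names = {"New": [["New", "York"], ["New", "York", "Times"]]}
print(list(match_names(iter("I read the New York Times today".split()), names)))
# -> ['I', 'read', 'the', 'New York Times', 'today']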
def _parse(toklist):
    """ Parse a token list as a query """
    bp = Query._parser
    assert bp is not None
    num_sent = 0
    num_parsed_sent = 0
    rdc = Reducer(bp.grammar)
    trees = dict()
    sent = []  # type: List[Tok]
    for t in toklist:
        if t[0] == TOK.S_BEGIN:
            sent = []
        elif t[0] == TOK.S_END:
            slen = len(sent)
            if not slen:
                continue
            num_sent += 1
            # Parse the accumulated sentence
            num = 0
            try:
                forest = bp.go(sent)
                if forest is not None:
                    num = Fast_Parser.num_combinations(forest)
                    if num > 1:
                        # Reduce the resulting forest
                        forest = rdc.go(forest)
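The framing loop follows a common pattern: TOK.S_BEGIN resets an accumulator, TOK.S_END hands the accumulated sentence off for parsing, and (in the part of the function not shown in this snippet) every other token is appended to the accumulator. The pattern in isolation, with plain tuples standing in for Tok objects (split_sentences is illustrative):

def split_sentences(toklist, s_begin="S_BEGIN", s_end="S_END"):
    """ Yield one token list per delimited sentence, skipping empty ones """
    sent = []
    for t in toklist:
        if t[0] == s_begin:
            sent = []
        elif t[0] == s_end:
            if sent:
                yield sent
        else:
            sent.append(t)

stream = [("S_BEGIN",), ("WORD", "Hæ"), ("S_END",), ("S_BEGIN",), ("S_END",)]
print(list(split_sentences(stream)))  # -> [[('WORD', 'Hæ')]]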
# Add 'ct' as a possibility (it does not come directly from a BÍN mark)
s.add("ct")
# Add a +1 bias to the counts so that no lemma/tag pair has zero frequency
prob = self.lemma_count(txt) + len(s)
d = self.lemma_tags(txt)
# It is possible for the probabilities of the tags in set s
# not to add up to 1.0. This can happen if the tokenizer has
# eliminated certain BÍN meanings due to updated settings
# in Pref.conf.
return [(tag, (d.get(tag, 0) + 1) / prob) for tag in s]
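A worked example of the add-one smoothing: suppose a lemma has been seen 10 times with tag counts {"nken": 7, "nkeo": 3}, and the candidate set s contains those two tags plus the unseen "ct" (the tag strings are placeholders):

s = {"nken", "nkeo", "ct"}
counts = {"nken": 7, "nkeo": 3}
prob = 10 + len(s)  # lemma_count + |s| = 13
taglist = [(tag, (counts.get(tag, 0) + 1) / prob) for tag in sorted(s)]
print(taglist)
# -> ct: 1/13, nken: 8/13, nkeo: 4/13 (approx. 0.077, 0.615, 0.308)
# Here every counted tag is inside s, so the probabilities sum to exactly 1.0;
# if some BÍN meanings had been eliminated, the sum would fall short of 1.0.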
if token.kind == TOK.WORD:
    taglist = ifd_taglist_word(token.txt, token.val)
elif token.kind == TOK.ENTITY:
    taglist = ifd_taglist_entity(token.txt)
elif token.kind == TOK.PERSON:
    taglist = ifd_taglist_person(token.txt, token.val)
elif token.kind == TOK.NUMBER:
    taglist = [("tfkfn", 1.0)]  # !!!
elif token.kind == TOK.YEAR:
    taglist = [("ta", 1.0)]
elif token.kind == TOK.PERCENT:
    taglist = [("tp", 1.0)]
elif token.kind == TOK.ORDINAL:
    taglist = [("lxexsf", 1.0)]
# elif token.kind == TOK.CURRENCY:
#     taglist = None
# elif token.kind == TOK.AMOUNT:
#     taglist = None
# elif token.kind == TOK.DATE:
#     taglist = None
elif token.kind == TOK.PUNCTUATION:
    taglist = None  # assumed body: the original snippet is truncated at this branch
def create_name_register(tokens, session, all_names=False) -> RegisterType:
    """ Assemble a dictionary of person and entity names
        occurring in the token list """
    register = {}  # type: RegisterType
    for t in tokens:
        if t.kind == TOK.PERSON:
            gn = t.val
            for pn in gn:
                add_name_to_register(pn.name, register, session, all_names=all_names)
        elif t.kind == TOK.ENTITY:
            add_entity_to_register(t.txt, register, session, all_names=all_names)
    return register
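For a PERSON token, t.val holds a list of name candidates, each with a .name attribute; for an ENTITY token the raw text is used as the key. A stand-in illustration of the token shapes this function expects (ExTok and PName are simplified assumptions, not the library's types):

from collections import namedtuple

ExTok = namedtuple("ExTok", ["kind", "txt", "val"])
PName = namedtuple("PName", ["name", "gender", "case"])

toks = [
    ExTok(TOK.PERSON, "Jón Jónsson", [PName("Jón Jónsson", "kk", "nf")]),
    ExTok(TOK.ENTITY, "Harpa", None),
]
# create_name_register(toks, session) would look up "Jón Jónsson" as a
# person and "Harpa" as an entity, returning a dict keyed by those names.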