# Example 1: parse Icelandic text and dump the highest-scoring parse tree
# for each sentence. The imports below are assumptions based on the Greynir
# source tree; exact module paths may differ.
from collections import OrderedDict

from tokenizer import tokenize
from fastparser import Fast_Parser, ParseForestDumper
from incparser import IncrementalParser
from treeutil import TreeUtility

# Icelandic test sentences. In English: "I am told that Geysir has stopped
# erupting." / "Geysir has stopped erupting." / "Geysir is an old geyser." /
# "The company Apple-búðin sells Apple Mac computers." / "The company Origo
# sells IBM computers." / "Icelanders founded the shipping company
# Eimskipafélag Íslands hf."
text = """
Mér er sagt að Geysir sé hættur að gjósa.
Geysir er hættur að gjósa.
Geysir er gamall goshver.
Fyrirtækið Apple-búðin selur Apple Mac tölvur.
Fyrirtækið Origo selur IBM tölvur.
Íslendingar stofnuðu skipafélagið Eimskipafélag Íslands hf.
"""
toklist = tokenize(text)
fp = Fast_Parser(verbose=False)
ip = IncrementalParser(fp, toklist, verbose=False)
# Dict of parse trees in string dump format,
# stored by sentence index (1-based)
trees = OrderedDict()
num_sent = 0
for p in ip.paragraphs():
    for sent in p.sentences():
        num_sent += 1
        num_tokens = len(sent)
        assert sent.parse(), "Sentence does not parse: " + sent.text
        # Obtain a text representation of the parse tree
        token_dicts = TreeUtility.dump_tokens(sent.tokens, sent.tree)
        # Create a verbose text representation of
        # the highest scoring parse tree
        tree = ParseForestDumper.dump_forest(sent.tree, token_dicts=token_dicts)
        # Add information about the sentence tree's score
        # and the number of tokens, and store the dump by sentence index
        # (this assignment is an assumed completion; the original snippet
        # is truncated here)
        trees[num_sent] = "\n".join(
            ["C{0}".format(sent.score), "L{0}".format(num_tokens), tree]
        )
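
After the loop, trees maps each 1-based sentence index to a text dump of that
sentence's highest-scoring parse. A minimal usage sketch (the print format is
illustrative, not part of any API):

for ix, tree_dump in trees.items():
    print("Sentence {0} of {1}:".format(ix, num_sent))
    print(tree_dump)
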
# Example 2: a low-level utility that runs the incremental parser over a
# token list and applies a transformation function (xform) to each sentence.
def _process_toklist(parser, session, toklist, xform):
    """ Low-level utility function to parse token lists and return
        the result of a transformation function (xform) for each sentence """
    pgs = []  # Paragraph list, containing sentences, containing tokens
    ip = IncrementalParser(parser, toklist, verbose=True)
    for p in ip.paragraphs():
        pgs.append([])
        for sent in p.sentences():
            if sent.parse():
                # Parsed successfully
                pgs[-1].append(xform(sent.tokens, sent.tree, None))
            else:
                # Error in parse
                pgs[-1].append(xform(sent.tokens, None, sent.err_index))
    stats = dict(
        num_tokens=ip.num_tokens,
        num_sentences=ip.num_sentences,
        num_parsed=ip.num_parsed,
        ambiguity=ip.ambiguity,
        num_combinations=ip.num_combinations,
    )
    # Assumed completion: the original snippet is truncated after the stats
    # dict, which is presumably returned alongside the transformed paragraphs
    return pgs, stats
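
The xform callback receives (tokens, tree, err_index), where tree is None and
err_index is set when the sentence fails to parse. Below is a hypothetical
transform, sketched under the assumption that tokens carry a txt attribute as
in the Greynir tokenizer:

def xform_text_only(tokens, tree, err_index):
    # Hypothetical transform: keep only the token texts, and flag the
    # error position if the sentence did not parse
    result = {"tokens": [t.txt for t in tokens], "parsed": tree is not None}
    if tree is None:
        result["err_index"] = err_index
    return result

# pgs, stats = _process_toklist(parser, session, toklist, xform_text_only)
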
# Example 3: a method that parses fetched article content within a database
# session. Shown as an excerpt: SessionContext, Fetcher, OrderedDict,
# defaultdict and the typing names are imported elsewhere in the source file.
def _parse(self, enclosing_session=None, verbose=False):
    """ Parse the article content to yield parse trees and an annotated token list """
    with SessionContext(enclosing_session) as session:
        # Convert the content soup to a token iterable (generator)
        toklist = Fetcher.tokenize_html(self._url, self._html, session)
        bp = self.get_parser()
        ip = IncrementalParser(bp, toklist, verbose=verbose)
        # List of paragraphs containing a list of sentences containing
        # token lists for sentences in string dump format
        # (1-based paragraph and sentence indices)
        pgs = []  # type: List[List[Dict[str, Any]]]
        # Dict of parse trees in string dump format,
        # stored by sentence index (1-based)
        trees = OrderedDict()
        # Word stem dictionary, indexed by (stem, cat)
        words = defaultdict(int)  # type: Dict[Tuple[str, str], int]
        num_sent = 0
        for p in ip.paragraphs():
            pgs.append([])
            # ... the per-sentence loop is truncated in the original
            # snippet; it follows the same pattern as Example 1 above,
            # accumulating token dicts into pgs, tree dumps into trees,
            # and (stem, category) counts into words
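
As a usage sketch (hypothetical post-processing, not from the source), the
words dictionary can be reduced to the most frequent stems once parsing
finishes:

from heapq import nlargest

# Find the ten most frequent (stem, category) pairs by count
top_words = nlargest(10, words.items(), key=lambda kv: kv[1])
for (stem, cat), cnt in top_words:
    print("{0} ({1}): {2}".format(stem, cat, cnt))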