def tokenize_and_merge_possible_mw_tokens(text, flat_tree):
    mw_tokens = list(bintokenizer.tokenize(text))  # multi-word tokens
    mw_tokens = [tok.txt.split(" ") for tok in mw_tokens if tok.txt is not None]
    sw_tokens = [tok for toks in mw_tokens for tok in toks]  # single-word tokens

    parse_tokens = list(flat_tree.split(" "))
    # Terminals in the flattened parse tree start with a lowercase letter
    parse_terminals = filter(lambda x: x[1][0].islower(), enumerate(parse_tokens))

    leaf_idx_to_parse_idx = {
        leaf_idx: ptok_idx
        for (leaf_idx, (ptok_idx, ptok)) in enumerate(parse_terminals)
    }

    offset = 0
    merge_list = []
    for mw_token in mw_tokens:
        sw_count = len(mw_token)
        idxed_mw_token = [(idx + offset, token) for (idx, token) in enumerate(mw_token)]

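# Illustrative sketch (not part of the original module): how the
# leaf_idx_to_parse_idx mapping above behaves for a made-up flattened tree
# string, where nonterminals are uppercase and terminals lowercase, mirroring
# the islower() check. The tree string below is a hypothetical example.
_example_tree = "P NP no so NP no /NP /NP /P"
_terminals = filter(lambda x: x[1][0].islower(), enumerate(_example_tree.split(" ")))
_mapping = {leaf_idx: ptok_idx for (leaf_idx, (ptok_idx, _)) in enumerate(_terminals)}
# _mapping == {0: 2, 1: 3, 2: 5}: the 1st, 2nd and 3rd leaves sit at
# positions 2, 3 and 5 of the flattened tree.
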
def _normalize_text(cls, text):
    """ Preprocess text and normalize for parsing network """
    pgs = text.split("\n")
    normalized_pgs = [
        [
            tok.txt
            for tok in list(bintokenizer.tokenize(pg))
            if BIN_Token.is_understood(tok)
        ]
        for pg in pgs
    ]
    return [
        " ".join(tok for tok in npg if tok) for npg in normalized_pgs
    ]

def index_text(text: str) -> Tuple[Dict[int, List[int]], Dict[int, str]]:
    """ Segments contiguous (Icelandic) text into paragraphs and sentences
        and returns a tuple of:
            dictionary of paragraph indices to constituent sentence indices
            dictionary of sentence indices to sentence text """
    text = prep_text_for_tokenizer(text)
    tok_stream = bintokenizer.tokenize(text)
    pgs = tokenizer.paragraphs(tok_stream)

    pg_idx_to_sent_idx = dict()  # type: Dict[int, List[int]]
    sent_idx_to_sent = dict()  # type: Dict[int, str]
    curr_sent_idx = 0
    curr_pg_idx = 0

    for pg in pgs:
        sent_idxs = []
        for _, sent in pg:
            curr_sent = list(filter(BIN_Token.is_understood, sent))  # type: List[Tok]
            curr_sent_text = tokenizer.normalized_text_from_tokens(curr_sent)
            sent_idxs.append(curr_sent_idx)
            sent_idx_to_sent[curr_sent_idx] = curr_sent_text
            curr_sent_idx += 1
        pg_idx_to_sent_idx[curr_pg_idx] = sent_idxs
        curr_pg_idx += 1
    return pg_idx_to_sent_idx, sent_idx_to_sent

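# Hypothetical usage of index_text above. The helpers it relies on
# (prep_text_for_tokenizer, bintokenizer, tokenizer, BIN_Token) are assumed to
# be imported by the surrounding module; the sample text and outputs are
# illustrative only:
#
#   pg_map, sent_map = index_text("Fyrsta setning. Önnur setning.\n\nÞriðja setning.")
#   # pg_map:   e.g. {0: [0, 1], 1: [2]}  - paragraph index -> sentence indices
#   # sent_map: e.g. {0: "...", 1: "...", 2: "..."}  - sentence index -> normalized text
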
def _normalize_sentence(cls, single_sentence):
    """ Preprocess text and normalize for parsing network """
    return [
        tok.txt
        for tok in bintokenizer.tokenize(single_sentence)
        if BIN_Token.is_understood(tok)
    ]

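# Minimal runnable sketch of the normalization idiom used by the two
# classmethods above. The import paths are assumptions (reynir's bintokenizer
# module and BIN_Token class); the sample sentence is arbitrary Icelandic text.
from reynir import bintokenizer
from reynir.binparser import BIN_Token

toks = [
    tok.txt
    for tok in bintokenizer.tokenize("Hún sá fallegan hund í garðinum.")
    if BIN_Token.is_understood(tok)
]
normalized = " ".join(t for t in toks if t)
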
        w = san
    elif t.kind == TOK.PERSON:
        cat = "person_" + t.val[0].gender
    elif t.kind == TOK.ENTITY:
        cat = "entity"
    return (w, cat)

# Parse arg string into word/cat tuples
wds = _str2words(warg)
# Try to tokenize each item that doesn't have a category
nwds = []
for w, c in wds:
    if c is None or c == CAT_UNKNOWN:
        # Try to tokenize
        tokens = list(filter(lambda x: x.kind in _VALID_TOKENS, tokenize(w)))
        for t in tokens:
            nwds.append(cat4token(t))
    else:
        nwds.append((w, c))
# Filter all words not in allowed category and restrict no. of words
words = list(filter(lambda x: x[1] in _VALID_WCATS, nwds))
words = words[:_MAX_NUM_WORDS]

# Generate date labels
now = datetime.utcnow()
delta = date_to - date_from
with changedlocale(category="LC_TIME"):
    # Group by week if period longer than 3 months
    label_date_strings = []  # type: List[Union[str, Tuple[str, str]]]
    if delta.days >= _SHOW_WEEKS_CUTOFF:

def split_text(text: str) -> List[List[str]]:
    """ Segments contiguous (Icelandic) text into paragraphs and sentences
        and returns a list of lists
    """
    text = prep_text_for_tokenizer(text)
    tok_stream = bintokenizer.tokenize(text)
    pgs = tokenizer.paragraphs(tok_stream)

    data = []  # type: List[List[str]]
    for pg in pgs:
        pg_data = []  # type: List[str]
        for _, sentence in pg:
            sentence = list(filter(BIN_Token.is_understood, sentence))
            sentence_text = tokenizer.normalized_text_from_tokens(sentence)
            pg_data.append(sentence_text)
        data.append(pg_data)
    return data
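
# Hypothetical usage of split_text above (same assumed module context as the
# other snippets: prep_text_for_tokenizer, bintokenizer, tokenizer, BIN_Token):
#
#   pgs = split_text("Fyrsta málsgrein. Hún á tvær setningar.\n\nÖnnur málsgrein.")
#   # -> a list of paragraphs, each a list of normalized sentence strings,
#   #    e.g. [["Fyrsta málsgrein.", "Hún á tvær setningar."], ["Önnur málsgrein."]]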