Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_casting():
    """ Check that nominative word forms are cast to the accusative, dative
        and genitive cases, and that empty or unrecognised input is
        returned unchanged """
    from reynir.bindb import BIN_Db

    db = BIN_Db()
    # Each row: nominative input, then the expected accusative,
    # dative and genitive forms, in that order
    expectations = (
        ("", "", "", ""),
        ("xxx", "xxx", "xxx", "xxx"),
        ("maðurinn", "manninn", "manninum", "mannsins"),
        ("mennirnir", "mennina", "mönnunum", "mannanna"),
    )
    for nominative, accusative, dative, genitive in expectations:
        assert db.cast_to_accusative(nominative) == accusative
        assert db.cast_to_dative(nominative) == dative
        assert db.cast_to_genitive(nominative) == genitive
# NOTE(review): this exit call looks like the tail of a guard whose
# beginning is outside the visible range - confirm against the full file
sys.exit(1)
# Open the SQLite database at db_path; check_same_thread=False disables
# sqlite3's check that the connection is only used from its creating thread
db_conn = sqlite3.connect(db_path, check_same_thread=False)
# Deliver each result row as a dict keyed by column name
db_conn.row_factory = lambda c, r: dict(zip([col[0] for col in c.description], r))
# Fetch every distinct value of the "nafn" column in the "ornefni" table
q = "SELECT DISTINCT nafn FROM ornefni;"
res = db_conn.cursor().execute(q)
matches = [row["nafn"] for row in res]
# Counters: names found directly via db.meanings(), names found via
# db.lookup_word(), and failures (num_fail is not updated in the visible
# part; presumably it is incremented further down - verify)
num_bin = 0
num_comb = 0
num_fail = 0
with BIN_Db.get_db() as db:
for m in matches:
w = m.strip()
# Skip names containing a space or a hyphen
# NOTE(review): the two "-" tests are identical as written; one of them
# may have been meant to test a different dash character - confirm
if " " in w or "-" in w or "-" in w:
continue
# Direct BÍN lookup
meanings = db.meanings(w)
if meanings:
num_bin += 1
continue
# Lookup using BÍN and combinator
_, meanings = db.lookup_word(w, auto_uppercase=True)
if meanings:
num_comb += 1
continue
def process(self, session, processor, **kwargs):
""" Process a tree for an entire article """
# NOTE(review): only the setup part of this method is visible here; the
# traversal described below happens past the end of the visible range
# For each sentence in turn, do a depth-first traversal,
# visiting each parent node after visiting its children
# Initialize the running state that we keep between sentences
# Every handler below is optional: it is None when no processor is given
# or when the processor lacks an attribute of that name
article_begin = getattr(processor, "article_begin", None) if processor else None
article_end = getattr(processor, "article_end", None) if processor else None
sentence = getattr(processor, "sentence", None) if processor else None
# If visit(state, node) returns False for a node, do not visit child nodes
visit = getattr(processor, "visit", None) if processor else None
# If no handler exists for a nonterminal, call default() instead
default = getattr(processor, "default", None) if processor else None
with BIN_Db.get_db() as bin_db:
# The state dict bundles the session, the processor, the BÍN database
# handle, this object's url and authority, the optional sentence, visit
# and default handlers (under underscore-prefixed keys) and an index
# that starts at zero
state = {
"session": session,
"processor": processor,
"bin_db": bin_db,
"url": self.url,
"authority": self.authority,
"_sentence": sentence,
"_visit": visit,
"_default": default,
"index": 0,
}
# Add state parameters passed via keyword arguments, if any
# (a keyword argument with the same name as a key above overwrites it)
state.update(kwargs)
# Call the article_begin(state) function, if it exists
def top_authors(days=_TOP_AUTHORS_PERIOD, session=None):
    """ Return up to ten authors with their parse percentage.

        The query covers the period from `days` days ago until now (UTC),
        requires at least 10 articles per author and only its first 20 rows
        are considered. Each result is a dict with the keys "name", "gender"
        and "perc" (the percentage rounded to two decimals). Authors whose
        looked-up gender is "hk" are left out. """
    end = datetime.utcnow()
    start = end - timedelta(days=days)
    candidates = BestAuthorsQuery.period(
        start, end, enclosing_session=session, min_articles=10
    )[:20]
    ranked = []
    with BIN_Db.get_db() as bindb:
        for row in candidates:
            author = row[0]
            gender = bindb.lookup_name_gender(author)
            if gender != "hk":
                # Gender "hk" marks unnamed authors (e.g. "Ritstjórn Vísis"),
                # so only the other genders are kept
                ranked.append(
                    {"name": author, "gender": gender, "perc": round(float(row[4]), 2)}
                )
    return ranked[:10]
] # Go through up to 2 * N records
)
def is_better_title(new_title, old_title):
    """ Return True if new_title should replace old_title.

        When the old title has reached _MAX_TITLE_LENGTH, any shorter title
        wins. Otherwise the new title wins only if it is longer than the old
        one while itself staying below _MAX_TITLE_LENGTH. """
    new_len, old_len = len(new_title), len(old_title)
    if old_len >= _MAX_TITLE_LENGTH:
        # The current title is too long: prefer anything shorter
        return new_len < old_len
    # The current title is acceptable: longer is better, but never
    # accept a candidate that is itself too long
    return old_len < new_len < _MAX_TITLE_LENGTH
# Fill toplist from the rows in q (q, toplist and limit are defined outside
# the visible range). Each entry maps a name to a tuple of
# (space-corrected title, article URL, id, looked-up gender).
with BIN_Db.get_db() as bindb:
for p in q:
# Insert the name into the list if it's not already there,
# or if is_better_title() prefers the new title over the stored one
if p.name not in toplist or is_better_title(
p.title, toplist[p.name][0]
):
toplist[p.name] = (
correct_spaces(p.title),
p.article_url,
p.id,
bindb.lookup_name_gender(p.name),
)
if len(toplist) >= limit:
# We now have as many names as we initially wanted: terminate the loop
break
def tagset(self, word, at_sentence_start=False):
    """ Return the n-gram tagger's result for the first token of `word`.

        `word` is joined with spaces and indexed, so it is presumably a
        sequence of strings (verify against callers). If the first token is
        a word without meanings attached, it is looked up in BÍN first; a
        failed lookup falls back to the token text with no meanings. """
    first = list(parse_tokens(" ".join(word)))[0]
    # Initial value taken from the input; replaced below if a lookup is made
    form = word[0]
    needs_lookup = first.kind == TOK.WORD and first.val is None
    if not needs_lookup:
        return self._ngram_tagger.tag_single_token(first)
    try:
        with BIN_Db.get_db() as db:
            form, meanings = db.lookup_word(first.txt, at_sentence_start)
    except Exception:
        # Best effort: tag the raw token text without any BÍN meanings
        form, meanings = first.txt, []
    return self._ngram_tagger.tag_single_token(TOK.Word(form, meanings))
# NOTE(review): the enclosing function header is outside the visible range;
# country and q come from there
# Get country code
cc = isocode_for_country_name(country)
if not cc:
# Without a country code nothing can be answered: log it and give up
logging.warning("No CC for country {0}".format(country))
return False
# Find capital city, given the country code
capital = capital_for_cc(cc)
if not capital:
return False
# Use the Icelandic name for the city
ice_cname = icelandic_city_name(capital["name_ascii"])
# Look up genitive country name for voice description
bres = BIN_Db().lookup_genitive(country, cat="no")
# Use the first match's word form; fall back to the name as given
country_gen = bres[0].ordmynd if bres else country
answer = ice_cname
response = dict(answer=answer)
# The voice answer is a sentence built from the genitive country name and
# the city name
voice = "Höfuðborg {0} er {1}".format(country_gen, answer)
q.set_answer(response, answer, voice)
q.set_key("Höfuðborg {0}".format(country_gen))
# Store the city name as the subject in the query context
q.set_context(dict(subject=ice_cname))
return True
def lookup_best_word(word):
""" Look up word in BÍN, pick right one acc. to a criterion. """
# NOTE(review): the visible part ends at the "else:" below; the selection
# among several candidates is outside the visible range
# NOTE(review): other call sites call BIN_Db.get_db() on the class itself,
# whereas here an instance is constructed first - confirm this is intended
with BIN_Db().get_db() as db:
def nouns_only(bin_meaning):
# Accept only meanings whose ordfl is "kk", "kvk" or "hk"
return bin_meaning.ordfl in ("kk", "kvk", "hk")
res = list(filter(nouns_only, db.lookup_nominative(word)))
if not res:
# Try with uppercase first char
# (str.capitalize() also lowercases the remaining characters)
capw = word.capitalize()
res = list(filter(nouns_only, db.lookup_nominative(capw)))
if not res:
# No noun meaning was found
return None
# OK, we have one or more matching nouns
if len(res) == 1:
# Exactly one candidate: use it
m = res[0]
else:
def nom2dat(w):
    """ Look up the dative form of a noun in BÍN.

        Returns an empty string when `w` is empty or None. Otherwise returns
        the result of cast_to_dative() for `w`, with candidate meanings
        reordered so that the rarer declension variants are tried last. """
    if not w:
        return ""

    def sort_by_preference(m_list):
        """ Discourage rarer declension forms, i.e. ÞGF2 and ÞGF3 """
        # The key is False for meanings without "2"/"3" in their beyging
        # field, and False sorts before True; sorted() is stable, so the
        # original relative order is kept within each group
        return sorted(m_list, key=lambda m: "2" in m.beyging or "3" in m.beyging)

    # Call get_db() on the class rather than on a freshly constructed
    # BIN_Db instance, which would be created only to be thrown away
    with BIN_Db.get_db() as db:
        return db.cast_to_dative(w, meaning_filter_func=sort_by_preference)
def QGeoSubject(node, params, result):
    """ Set result.subject from the capitalized place name in result._text,
        using the stem of its first nominative match in BÍN when one exists
        and the capitalized name itself otherwise """
    placename = capitalize_placename(result._text)
    hits = BIN_Db().lookup_nominative(placename)
    if hits:
        # Found in BÍN: use the stem of the first match
        result.subject = hits[0].stofn
    else:
        # Not found: keep the capitalized name as it is
        result.subject = placename