Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
unit = l.NAMES[item['unit']]
except KeyError:
try:
entity = item['entity']
except KeyError:
print(('Could not find %s, provide "derived" and'
' "entity"' % item['unit']))
return
if entity == 'unknown':
derived = [{
'base': l.NAMES[i['base']].entity.name,
'power': i['power']
} for i in item['dimensions']]
entity = c.Entity(name='unknown', dimensions=derived)
elif entity in l.ENTITIES:
entity = l.ENTITIES[entity]
else:
print(('Could not find %s, provide "derived" and'
' "entity"' % item['unit']))
return
unit = c.Unit(
name=item['unit'],
dimensions=item['dimensions'],
entity=entity)
try:
span = next(
re.finditer(re.escape(item['surface']),
test['req'])).span()
except StopIteration:
print('Surface mismatch for "%s"' % test['req'])
return
uncert = None
def disambiguate_unit(unit_surface, text, lang="en_US"):
    """
    Pick the unit meant by a surface string that several units may share.

    When the classifier is enabled (``clf.USE_CLF``) the decision is delegated
    to ``clf.disambiguate_unit``. Otherwise the candidates are looked up in
    the loaded unit tables (symbols, surfaces, then their lower-cased
    variants) and, if more than one candidate is found, narrowed down with
    ``no_clf.disambiguate_no_classifier``.

    :param unit_surface: surface form of the unit as found in the text
    :param text: text the unit was extracted from
    :param lang: language code used to select the unit tables
    :returns (str) unit name of the resolved unit, or "unk" if none was found
    """
    if clf.USE_CLF:
        return clf.disambiguate_unit(unit_surface, text, lang).name

    # First non-empty lookup wins: exact symbol, exact surface, then the
    # case-insensitive tables.
    candidates = (
        load.units(lang).symbols[unit_surface]
        or load.units(lang).surfaces[unit_surface]
        or load.units(lang).surfaces_lower[unit_surface.lower()]
        or load.units(lang).symbols_lower[unit_surface.lower()]
    )

    candidate_count = len(candidates)
    if candidate_count > 1:
        chosen = no_clf.disambiguate_no_classifier(candidates, text, lang)
    elif candidate_count == 1:
        chosen = next(iter(candidates))
    else:
        # Nothing matched; the empty collection is falsy and maps to "unk".
        chosen = candidates

    return chosen.name if chosen else "unk"
def disambiguate_no_classifier(entities, text, lang="en_US"):
    """
    Disambiguate the entity or unit without a classifier.

    Each candidate is scored against the training set returned by
    ``load.training_set(lang)``: for every training entry whose ``"unit"``
    equals the candidate's name, the space-separated words of the entry's
    ``"text"`` are tested for membership in ``text``. The candidate with the
    highest ratio of hits to accumulated ``len(entry["text"])`` wins; ties on
    the ratio are broken by the higher absolute hit count.

    :param entities: iterable of candidate entities or units (each must
        expose a ``name`` attribute)
    :param text: the text the ambiguous surface was found in
    :param lang: language code used to select the training set
    :return: the single entity or unit that has been chosen, or ``None`` if
        no candidate had any matching word
    """
    word_sets = load.training_set(lang)
    max_entity, max_count, max_relative = None, 0, 0
    for entity in entities:
        count = 0
        total = 0
        for word_set in word_sets:
            if word_set["unit"] == entity.name:
                # NOTE(review): ``total`` accumulates len() of the raw text
                # value while ``count`` counts words -- kept as-is because
                # changing the denominator would alter the ranking.
                total += len(word_set["text"])
                for word in word_set["text"].split(" "):
                    # ``in`` is a substring test when ``text`` is a string.
                    count += 1 if word in text else 0

        try:
            relative = count / total
        except ZeroDivisionError:
            # No training entry exists for this candidate.
            relative = 0

        if relative > max_relative or (
            relative == max_relative and count > max_count
        ):
            max_entity, max_count, max_relative = entity, count, relative

    # The winner was previously computed but never handed back, so callers
    # always received None.
    return max_entity
def disambiguate_unit(unit, text, lang="en_US"):
"""
Resolve ambiguity between units with same names, symbols or abbreviations.
"""
# NOTE(review): this excerpt has lost its indentation and ends before the
# sorted scores are used or anything is returned -- confirm against the
# complete function before relying on these notes.
# Candidate lookup: the first truthy result among the exact symbol table,
# the exact surface table and their lower-cased counterparts is used.
new_unit = (
load.units(lang).symbols.get(unit)
or load.units(lang).surfaces.get(unit)
or load.units(lang).surfaces_lower.get(unit.lower())
or load.units(lang).symbols_lower.get(unit.lower())
)
# No table knows this surface: report it together with the source text.
if not new_unit:
raise KeyError('Could not find unit "%s" from "%s"' % (unit, text))
# More than one candidate: score the cleaned text with the classifier.
if len(new_unit) > 1:
transformed = classifier(lang).tfidf_model.transform([clean_text(text, lang)])
# Only one document was transformed, so only the first row of scores is kept.
scores = classifier(lang).classifier.predict_proba(transformed).tolist()[0]
# Pair each score with the classifier's target name at the same position.
scores = zip(scores, classifier(lang).target_names)
# Filter for possible names
names = [i.name for i in new_unit]
scores = [i for i in scores if i[1] in names]
# Sort by rank
# (highest score first)
scores = sorted(scores, key=lambda x: x[0], reverse=True)
def build_quantity(orig_text, text, item, values, unit, surface, span, uncert):
"""
Build a Quantity object out of extracted information.
"""
# NOTE(review): this excerpt has lost its indentation, is cut off inside the
# final cls.Quantity(...) call, and appears to contain lines from another
# excerpt in the middle (see the note further down). Treat the comments
# below as a reading aid, not as a description of the complete function.
# TODO rerun if change occurred
# Re parse unit if a change occurred
dimension_change = True
# Extract "absolute " ...
# If the text immediately before the span is "absolute " and the unit is
# dimensionless, the unit is replaced by kelvin and the surface/span are
# widened to include that prefix.
_absolute = "absolute "
if (
unit.name == "dimensionless"
and _absolute == orig_text[span[0] - len(_absolute) : span[0]]
):
# NOTE(review): `lang` is not a parameter of the signature shown above --
# presumably a module-level name or a parameter dropped from this excerpt;
# confirm.
unit = load.units(lang).names["kelvin"]
unit.original_dimensions = unit.dimensions
surface = _absolute + surface
span = (span[0] - len(_absolute), span[1])
dimension_change = True
# Usually "$3T" does not stand for "dollar tesla"
# this holds as well for "3k miles"
# TODO use classifier to decide if 3K is 3 thousand or 3 Kelvin
if unit.entity.dimensions:
if (
len(unit.entity.dimensions) > 1
and unit.entity.dimensions[0]["base"] == "currency"
and unit.original_dimensions[1]["surface"] in reg.suffixes(lang).keys()
):
suffix = unit.original_dimensions[1]["surface"]
# Only apply if at least last value is suffixed by k, M, etc
# NOTE(review): the lines from here down to json.dump(...) use names that are
# not defined in this function (surfaces, classes, v, topn, min_similarity,
# training_set, name) and look like part of a different excerpt that builds
# and writes 'similars.json' -- confirm and separate.
if isinstance(unit, classes.Unit):
surfaces.update(unit.surfaces)
surfaces.update(unit.symbols)
for surface in surfaces:
neighbours = v.most_similar(
v.query(surface), topn=topn, min_similarity=min_similarity)
training_set.append({
'unit':
name,
'text':
' '.join(neighbour[0] for neighbour in neighbours)
})
print('Done')
with open(
os.path.join(load.TOPDIR, 'similars.json'), 'w',
encoding='utf8') as file:
json.dump(training_set, file, sort_keys=True, indent=4)
# NOTE(review): the opening of the condition that the next two lines belong
# to is missing from this excerpt.
and len(unit.original_dimensions) > 1
and unit.original_dimensions[-1]["base"] == "count"
):
# Drop the trailing "count" dimension and trim five characters from the end
# of the surface and span -- presumably the length of a trailing word such as
# "times" (see the debug message); confirm.
unit.original_dimensions = unit.original_dimensions[:-1]
dimension_change = True
surface = surface[:-5]
span = (span[0], span[1] - 5)
_LOGGER.debug('\tCorrect for "time"')
# Re-derive the unit from its (possibly modified) dimensions; with no
# dimensions left the dimensionless unit is used.
if dimension_change:
if unit.original_dimensions:
unit = parser.get_unit_from_dimensions(
unit.original_dimensions, orig_text, lang
)
else:
unit = load.units(lang).names["dimensionless"]
# Discard irrelevant txt2float extractions, cardinal numbers, codes etc.
if (
surface.lower() in ["a", "an", "one"]
or re.search(r"1st|2nd|3rd|[04-9]th", surface)
or re.search(r"\d+[A-Z]+\d+", surface)
or re.search(r"\ba second\b", surface, re.IGNORECASE)
):
_LOGGER.debug('\tMeaningless quantity ("%s"), discard', surface)
return
# One Quantity object is created per extracted value.
objs = []
for value in values:
# NOTE(review): the excerpt ends inside this call; the remaining arguments
# and the return statement are not visible here.
obj = cls.Quantity(
value=value,
unit=unit,
def get_entity_from_dimensions(dimensions, text, lang="en_US"):
    """
    Work out which entity a unit measures (e.g. "volume" for "m^3") from the
    unit's dimensions.

    Every dimension's base unit is mapped to the name of its entity, the
    resulting list is sorted by base and turned into a lookup key, and that
    key is resolved with ``dis.disambiguate_entity``. If nothing is found, a
    placeholder entity named "unknown" carrying the derived dimensions is
    returned instead.

    :param dimensions: iterable of dicts with "base" and "power" keys
    :param text: text the unit was extracted from
    :param lang: language code used to select the unit tables
    :return: the resolved entity, or an "unknown" entity as fallback
    """
    new_derived = []
    for dimension in dimensions:
        base_entity = load.units(lang).names[dimension["base"]].entity
        new_derived.append(
            {"base": base_entity.name, "power": dimension["power"]}
        )

    def _base_of(entry):
        return entry["base"]

    # The key is built from a sorted copy; the fallback entity below keeps
    # the dimensions in their original order.
    key = load.get_key_from_dimensions(sorted(new_derived, key=_base_of))

    ent = dis.disambiguate_entity(key, text, lang)
    if ent is not None:
        return ent

    _LOGGER.debug("\tCould not find entity for: %s", key)
    return cls.Entity(name="unknown", dimensions=new_derived)