Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
@staticmethod
def load_same_stroke(path, sep='\t'):
    """
    Load the table of similar-shape (look-alike) characters.

    Every non-comment line of the file holds two or more entries separated
    by ``sep``. Each entry on a line is mapped to all the other entries on
    that line. An entry that shows up on several lines accumulates the
    entries of all of those lines.

    :param path: path of the UTF-8 encoded table file
    :param sep: separator between the entries of one line
    :return: dict mapping each entry to the set of its look-alikes;
        an empty dict when ``path`` does not exist
    """
    result = dict()
    if not os.path.exists(path):
        logger.warning("file not exists:" + path)
        return result
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Lines starting with '#' are comments.
            if line.startswith('#'):
                continue
            parts = line.split(sep)
            # A group needs at least two entries to be meaningful; this
            # also skips blank lines.
            if len(parts) < 2:
                continue
            for i, c in enumerate(parts):
                others = parts[:i] + parts[i + 1:]
                # Merge instead of assigning: the same character may be
                # listed on several lines and must keep every group.
                result.setdefault(c, set()).update(others)
    return result
@staticmethod
def load_same_pinyin(path, sep='\t'):
    """
    Load the table of homophones (characters with the same pinyin).

    Every non-comment line is expected to hold, separated by ``sep``:
    the key character, a string of characters with the same pronunciation
    and tone, and a string of characters with the same pronunciation but a
    different tone. The third column may be missing or empty.

    :param path: path of the UTF-8 encoded table file
    :param sep: separator between the columns of one line
    :return: dict mapping each key character to the set of characters from
        both columns; an empty dict when ``path`` does not exist
    """
    result = dict()
    if not os.path.exists(path):
        logger.warning("file not exists:" + path)
        return result
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Lines starting with '#' are comments.
            if line.startswith('#'):
                continue
            parts = line.split(sep)
            # Need the key and at least the same-tone column; this also
            # skips blank lines.
            if len(parts) < 2:
                continue
            key_char = parts[0]
            same_pron_same_tone = set(parts[1])
            # strip() above removes a trailing separator, so a row whose
            # different-tone column is empty arrives here with two parts.
            same_pron_diff_tone = set(parts[2]) if len(parts) > 2 else set()
            value = same_pron_same_tone | same_pron_diff_tone
            # The key must be exactly one character and have candidates.
            if len(key_char) != 1 or not value:
                continue
            result[key_char] = value
    return result
# 取疑似错字信息
for i in self._get_maybe_error_index(sent_scores):
token = sentence[i]
# pass filter word
if self.is_filter_token(token):
continue
# pass in stop word dict
if token in self.stopwords:
continue
maybe_err = [token, i + start_idx, i + start_idx + 1,
ErrorType.char] # token, begin_idx, end_idx, error_type
self._add_maybe_error_item(maybe_err, maybe_errors)
except IndexError as ie:
logger.warn("index error, sentence:" + sentence + str(ie))
except Exception as e:
logger.warn("detect error, sentence:" + sentence + str(e))
return sorted(maybe_errors, key=lambda k: k[1], reverse=False)