Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# if no orthography profile is specified, simply return
# Unicode grapheme clusters, regex pattern "\X"
if self.orthography_profile == None:
return self.grapheme_clusters(string)
parses = []
for word in string.split():
parse = getParse(self.root, word)
# case where the parsing fails
if len(parse) == 0:
# replace characters in string but not in orthography profile with
parse = " "+self.find_missing_characters(self.characters(word))
# write problematic stuff to standard error
log.debug("The string '{0}' does not parse given the specified orthography profile {1}.\n".format(word, self.orthography_profile))
parses.append(parse)
# remove the outer word boundaries
result = "".join(parses).replace("##", "#")
result = result.rstrip("#")
result = result.lstrip("#")
return result.strip()
tokens = line.split("\t")
grapheme = tokens[0].strip()
# check for duplicates in the orthography profile (fail if dups)
if not grapheme in self.op_graphemes:
self.op_graphemes[grapheme] = 1
else:
raise Exception("You have a duplicate in your orthography profile.")
if len(tokens) == 1:
continue
for i, token in enumerate(tokens):
token = token.strip()
self.mappings[grapheme, self.column_labels[i].lower()] = token
log.debug('%s %s' % (grapheme, self.column_labels[i].lower()))
# print the tree structure if debug mode is on
if log.get_logger().getEffectiveLevel() <= logging.INFO:
log.debug("A graphical representation of your orthography profile in a tree ('*' denotes sentinels):\n")
printTree(self.root, "")
print()
def tsv2triple(wordlist, outfile=None):
    """
    Convert a wordlist to a list of (ID, COLUMN, VALUE) triples.

    Parameters
    ----------
    wordlist
        Object exposing a ``header`` iterable of column names, iteration
        over its keys, and lookup via ``wordlist[key, column]``.
    outfile : optional
        If truthy, the triples are additionally written to this target as
        tab-separated lines through ``util.write_text_file`` with NFC
        normalization. List values are joined with single spaces, and
        triples whose value equals ``'-'`` are left out of the file.

    Returns
    -------
    list of tuple
        All triples, grouped by column in header order. The returned
        values are the raw entries (lists are not joined, ``'-'`` entries
        are kept), regardless of whether a file was written.

    Notes
    -----
    The basic values of which the triples consist are:
    * ID (the ID in the TSV file)
    * COLUMN (the column in the TSV file, upper-cased)
    * VALUE (the entry in the TSV file)
    """
    tstore = []
    for head in wordlist.header:
        log.debug('tsv2triple: ' + head)
        # The upper-cased column label is the same for every key.
        column = head.upper()
        tstore.extend((key, column, wordlist[key, head]) for key in wordlist)

    if outfile:
        # Collect the lines and join once instead of growing a string with
        # ``+=``, which can degrade to quadratic time on large wordlists.
        lines = []
        for key, column, value in tstore:
            if isinstance(value, list):
                value = ' '.join(text_type(x) for x in value)
            if value != '-':
                lines.append('{0}\t{1}\t{2}\n'.format(key, column, value))
        util.write_text_file(outfile, ''.join(lines), normalize='NFC')
    return tstore
"""
# if no orthography profile was initiated, this method can't be called
# if self.orthography_profile == None:
# raise Exception("This function requires that an orthography profile is specified.")
# if no orthography profile rules file has been specified, simply return the string
if self.orthography_profile_rules == None:
return string
result = unicodedata.normalize("NFD", string)
for i in range(0, len(self.op_rules)):
match = self.op_rules[i].search(result)
if match:
result = re.sub(self.op_rules[i], self.op_replacements[i], result)
log.debug("Input/output:"+"\t"+string+"\t"+result)
log.debug("Pattern/replacement:"+"\t"+self.op_rules[i].pattern+"\t"+self.op_replacements[i])
# this is in case someone introduces a non-NFD ordered sequence of characters
# in the orthography profile
result = unicodedata.normalize("NFD", result)
return result
"""
# if no orthography profile was initiated, this method can't be called
# if self.orthography_profile == None:
# raise Exception("This function requires that an orthography profile is specified.")
# if no orthography profile rules file has been specified, simply return the string
if self.orthography_profile_rules == None:
return string
result = unicodedata.normalize("NFD", string)
for i in range(0, len(self.op_rules)):
match = self.op_rules[i].search(result)
if match:
result = re.sub(self.op_rules[i], self.op_replacements[i], result)
log.debug("Input/output:"+"\t"+string+"\t"+result)
log.debug("Pattern/replacement:"+"\t"+self.op_rules[i].pattern+"\t"+self.op_replacements[i])
# this is in case someone introduces a non-NFD ordered sequence of characters
# in the orthography profile
result = unicodedata.normalize("NFD", result)
return result
stmts += ["line[{0}] ".format(idx) + value]
log.debug("calculated what should be excluded")
# get the data
out = {}
for key, line in self._data.items():
log.debug(key)
if rows:
if eval(" and ".join(stmts)):
out[key] = [line[i] for i in indices]
else:
out[key] = [line[i] for i in indices]
log.debug("passing data to wl2qlc")
return wl2qlc(header, out, **keywords)
# output dst-format (phylip)
if fileformat == 'dst':
# check for distances as keyword
if 'distances' not in self._meta:
self._meta['distances'] = wl2dst(self, **keywords)
out = matrix2dst(self._meta['distances'], self.taxa,
stamp=keywords['stamp'], taxlen=keywords.get('taxlen', 0))
return _write_file(keywords['filename'], out, fileformat)
# output tre-format (newick)
if fileformat in ['tre', 'nwk']: # ,'cluster','groups']:
if 'tree' not in self._meta:
# check for distances
def _normalize(self):
    """
    Bring every cell of ``self.matrix`` into Unicode NFD form, in place.

    Cells that are already in NFD are left untouched; each cell that is
    not gets a debug log entry naming its content and position and is
    then overwritten with its normalized form.
    """
    for row_idx, row in enumerate(self.matrix):
        for col_idx, cell in enumerate(row):
            nfd_cell = unicodedata.normalize("NFD", cell)
            if nfd_cell == cell:
                # Already normalized, nothing to report or replace.
                continue
            position = "["+str(row_idx)+","+str(col_idx)+"]"
            log.debug("Cell at <"+cell+"> "+position+" not in Unicode NFD. Normalizing.")
            row[col_idx] = nfd_cell
if not grapheme in self.op_graphemes:
self.op_graphemes[grapheme] = 1
else:
raise Exception("You have a duplicate in your orthography profile.")
if len(tokens) == 1:
continue
for i, token in enumerate(tokens):
token = token.strip()
self.mappings[grapheme, self.column_labels[i].lower()] = token
log.debug('%s %s' % (grapheme, self.column_labels[i].lower()))
# print the tree structure if debug mode is on
if log.get_logger().getEffectiveLevel() <= logging.INFO:
log.debug("A graphical representation of your orthography profile in a tree ('*' denotes sentinels):\n")
printTree(self.root, "")
print()
# then check for alms consisting only of gaps
cols = misc.transpose(alm_clone)
idxs = []
for i, col in enumerate(cols):
if set(col) == set('-'):
idxs += [i]
for idx in idxs[::-1]:
for i, alm in enumerate(alm_clone):
del alm_clone[i][idx]
if alignment != alm_clone:
lgtxt = 'Modified the alignment:\n'
for i in range(len(alignment)):
lgtxt += '[!] ' + ' '.join(alignment[i]) + '->'
lgtxt += ' '.join(alm_clone[i]) + '\n'
log.debug(lgtxt)
return alm_clone
else:
return alignment