Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def onekgreek_tei_xml_to_text_capitains():
    """Use MyCapitains program to convert TEI to plaintext.

    Collects the First1KGreek TEI XML files found under the CLTK data
    directory (skipping any path containing ``__cts__``), makes sure the
    ``greek_text_first1kgreek_plaintext`` output directory exists, derives a
    ``.txt`` file name for each XML file and opens the XML with
    ``CapitainsCtsText``.

    :raises FileNotFoundError: If no XML file matches the corpus glob.
    """
    # NOTE(review): ``file`` is not read anywhere in this block as shown;
    # kept in case later code relies on it -- confirm and remove if unused.
    file = os.path.expanduser(
        get_cltk_data_dir() + '/greek/text/greek_text_first1kgreek/data/tlg0627/tlg021/tlg0627.tlg021.1st1K-grc1.xml')
    xml_dir = os.path.normpath(get_cltk_data_dir() + '/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not xml_paths:
        msg = '1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.'
        logger.error(msg)
        raise FileNotFoundError(msg)
    xml_paths = [path for path in xml_paths if '__cts__' not in path]
    # new dir
    new_dir = os.path.normpath(get_cltk_data_dir() + '/greek/text/greek_text_first1kgreek_plaintext/')
    os.makedirs(new_dir, exist_ok=True)
    for xml_path in xml_paths:
        xml_name = os.path.basename(xml_path)
        # Drop only the literal ``.xml`` suffix. ``str.rstrip('.xml')`` would
        # strip any trailing run of the characters '.', 'x', 'm' and 'l',
        # which can also eat the end of the file stem.
        if xml_name.endswith('.xml'):
            xml_name = xml_name[:-len('.xml')]
        xml_name += '.txt'
        # NOTE(review): ``plain_text`` and ``text`` are not consumed within
        # this block as shown -- confirm where the extracted text is written.
        plain_text = ''
        with open(xml_path) as file_open:
            text = CapitainsCtsText(resource=file_open)
def _get_corpus_properties(self, corpus_name):
"""Check whether a corpus is available for import.
:type corpus_name: str
:param corpus_name: Name of available corpus.
:rtype : str
"""
try:
# corpora = LANGUAGE_CORPORA[self.language]
corpora = self.all_corpora
except NameError as name_error:
msg = 'Corpus not available for language ' \
'"%s": %s' % (self.language, name_error)
logger.error(msg)
raise CorpusImportError(msg)
for corpus_properties in corpora:
if corpus_properties['name'] == corpus_name:
return corpus_properties
msg = 'Corpus "%s" not available for the ' \
'"%s" language.' % (corpus_name, self.language)
logger.error(msg)
raise CorpusImportError(msg)
# Build the concordance for ``text`` and write it to a text file.
# NOTE(review): ``text`` and ``name`` are defined outside the lines shown
# here -- presumably parameters of the enclosing function; confirm.
list_of_lists = build_concordance(text) # type: List[List[str]]
# Output directory is ``user_data`` under the CLTK data dir; create it if absent.
user_data_rel = get_cltk_data_dir() + '/user_data' # type: str
user_data = os.path.expanduser(user_data_rel) # type: str
if not os.path.isdir(user_data):
os.makedirs(user_data)
file_path = os.path.join(user_data, 'concordance_' + name + '.txt') # type: str
# Flatten every line of every word list into one newline-terminated string.
concordance_output = '' # type: str
for word_list in list_of_lists:
for line in word_list:
concordance_output += line + '\n'
# A failed write is logged; the handler below does not re-raise it.
try:
with open(file_path, 'w') as open_file:
open_file.write(concordance_output)
logger.info("Wrote concordance to '%s'.", file_path)
except IOError as io_error:
logger.error("Failed to write concordance to '%s'. Error: %s", file_path, io_error)
# Clone the corpus repository from the CLTK GitHub organisation, or pull the
# latest changes when a checkout (detected by its README.md) already exists.
git_uri = urljoin('https://github.com/cltk/', corpus_name + '.git')
#self._download_corpus(corpus_type, corpus_name, path)
type_dir_rel = os.path.join(CLTK_DATA_DIR, self.language, corpus_type)
type_dir = os.path.expanduser(type_dir_rel)
target_dir = os.path.join(type_dir, corpus_name)
target_file = os.path.join(type_dir, corpus_name, 'README.md')
# check if corpus already present
# if not, clone
if not os.path.isfile(target_file):
    if not os.path.isdir(type_dir):
        os.makedirs(type_dir)
    try:
        logger.info("Cloning '%s' from '%s'", corpus_name, git_uri)
        # depth=1: shallow clone, only the latest revision is fetched.
        Repo.clone_from(git_uri, target_dir, depth=1)
    except Exception as e:
        # Pass the values as separate logging arguments. Wrapping them in a
        # single tuple supplies one argument for two ``%s`` placeholders, so
        # the record fails to format and the clone error is never reported.
        logger.error("Git clone of '%s' failed: '%s'", git_uri, e)
# if corpus is present, pull latest
else:
    try:
        repo = Repo(target_dir)
        assert not repo.bare  # or: assert repo.exists()
        o = repo.remotes.origin
        logger.info("Pulling latest '%s' from '%s'.", corpus_name, git_uri)
        o.pull()
    except Exception as e:
        # Best effort: a failed pull is logged and the existing checkout kept.
        logger.error("Git pull of '%s' failed: '%s'", git_uri, e)
elif location == 'local':
logger.info("Importing from local path: '%s'", local_path)
if corpus_name in ('phi5', 'phi7', 'tlg'):
if corpus_name == 'phi5':
# normalize path for checking dir
if local_path.endswith('/'):
tlgu_path_rel = get_cltk_data_dir() + '/greek/software/greek_software_tlgu'
tlgu_path = os.path.expanduser(tlgu_path_rel)
if not self.testing:
print('Do you want to install TLGU?')
print('To continue, press Return. To exit, Control-C.')
input()
else:
print('Automated or test build, skipping keyboard input confirmation for installation of TLGU.')
try:
command = 'cd {0} && make install'.format(tlgu_path)
print('Going to run command:', command)
p_out = subprocess.call(command, shell=True)
if p_out == 0:
logger.info('TLGU installed.')
else:
logger.error('TLGU install without sudo failed.')
except Exception as exc:
logger.error('TLGU install failed: %s', exc)
else: # for Linux needing root access to '/usr/local/bin'
if not self.testing:
print('Could not install without root access. Do you want to install TLGU with sudo?')
command = 'cd {0} && sudo make install'.format(tlgu_path)
print('Going to run command:', command)
print('To continue, press Return. To exit, Control-C.')
input()
p_out = subprocess.call(command, shell=True)
else:
command = 'cd {0} && sudo make install'.format(tlgu_path)
p_out = subprocess.call(command, shell=True)
if p_out == 0:
logger.info('TLGU installed.')
else:
def make(self):
    """Build program.

    Runs ``make`` inside ``~/cltk_data/multilingual/software/lapos`` with
    its standard output discarded, then reports success or failure both on
    stdout and through the module logger.
    """
    #! for linux install Clan
    fp = os.path.expanduser('~/cltk_data/multilingual/software/lapos')
    try:
        # Argument list plus ``cwd`` instead of the shell string
        # ``cd <path> && make``: the path is never parsed by a shell, so a
        # home directory containing spaces or metacharacters cannot break it.
        p_out = subprocess.call(['make'], cwd=fp, stdout=subprocess.DEVNULL)
    except OSError as os_error:
        # Missing directory or missing ``make`` binary. The shell form
        # surfaced these as a non-zero exit status, so map them to the
        # failure branch below instead of letting the exception escape.
        logger.error('Could not run make in %s: %s', fp, os_error)
        p_out = 1
    if p_out == 0:
        print('Lapos built successfully.')
        logger.info('Lapos build successfully.')
    else:
        print('Lapos did not build successfully.')
        logger.error('Lapos did not build successfully.')
"""
try:
with open(path, 'rb') as opened_pickle:
try:
return pickle.load(opened_pickle)
except Exception as pickle_error:
logger.error(pickle_error)
raise
except FileNotFoundError as fnf_error:
logger.error(fnf_error)
raise
except IOError as io_err:
logger.error(io_err)
raise
except EOFError as eof_error:
logger.error(eof_error)
raise
except pickle.UnpicklingError as unp_error:
logger.error(unp_error)
raise
if corpus in ['tlg', 'phi5', 'phi7']:
orig_path = os.path.join(orig_path, corpus)
if corpus in ['tlg', 'phi7']:
if 'phi7' and latin is True:
latin = True
target_path = os.path.join(target_path, 'latin', 'text', corpus)
else:
latin = None
target_path = os.path.join(target_path, 'greek', 'text', corpus)
else:
target_path = os.path.join(target_path, 'latin', 'text', corpus)
latin = True
try:
corpus_files = os.listdir(orig_path)
except Exception as exception:
logger.error("Failed to find TLG files: %s", exception)
raise
# make a list of files to be converted
txts = []
[txts.append(x) for x in corpus_files if x.endswith('TXT')] # pylint: disable=W0106
# loop through list and convert one at a time
for txt in txts:
orig_txt_path = os.path.join(orig_path, txt)
if markup is None:
target_txt_dir = os.path.join(target_path, 'plaintext')
else:
target_txt_dir = os.path.join(target_path, str(markup))
if not os.path.isdir(target_txt_dir):
os.makedirs(target_txt_dir)
target_txt_path = os.path.join(target_txt_dir, txt)
try:
self.convert(orig_txt_path, target_txt_path, markup=None,
:rtype : str
"""
try:
# corpora = LANGUAGE_CORPORA[self.language]
corpora = self.all_corpora
except NameError as name_error:
msg = 'Corpus not available for language ' \
'"%s": %s' % (self.language, name_error)
logger.error(msg)
raise CorpusImportError(msg)
for corpus_properties in corpora:
if corpus_properties['name'] == corpus_name:
return corpus_properties
msg = 'Corpus "%s" not available for the ' \
'"%s" language.' % (corpus_name, self.language)
logger.error(msg)
raise CorpusImportError(msg)
def _check_import_source():
    """Make sure the TLGU source is available, importing it when it is not.

    Looks for ``tlgu.h`` inside ``greek/software/greek_software_tlgu`` under
    the CLTK data directory. When that header is missing, the
    ``greek_software_tlgu`` corpus is imported through
    ``CorpusImporter('greek')``; any failure is logged and re-raised.
    """
    header_path = os.path.expanduser(
        get_cltk_data_dir() + '/greek/software/greek_software_tlgu/tlgu.h')
    # Header already on disk: nothing to import.
    if os.path.isfile(header_path):
        return
    try:
        CorpusImporter('greek').import_corpus('greek_software_tlgu')
    except Exception as exc:
        logger.error('Failed to import TLGU: %s', exc)
        raise