with open(args.input) as lines:
    # first line: the column names
    line = next(lines).strip()
    names = line.split(",")
    # second line: the first data row, used to sniff which columns are numeric
    line = next(lines).strip()
    values = line.split(",")
    numerics = []
    for value in values:
        try:
            float(value)
            numerics.append(True)
        except ValueError:
            numerics.append(False)
    names_numeric = [name for name, numeric in zip(names, numerics) if numeric]
    print(names_numeric)
    # row_count (the total number of data rows) is determined earlier in the full script
    output = vaex.file.other.Hdf5MemoryMapped.create(args.output, row_count, names_numeric)
    Ncols = len(names)
    cols = [output.columns[name] if numeric else None for name, numeric in zip(names, numerics)]

    def copy(line, row_index):
        values = line.strip().split(",")
        for column_index in range(Ncols):
            if numerics[column_index]:
                value = float(values[column_index])
                cols[column_index][row_index] = value

    # the first data row was already consumed for sniffing, so copy it first
    row = 0
    copy(line, row)
    row += 1
    progressbar = vaex.utils.progressbar(title="exporting") if args.progress else None
    for line in lines:
        copy(line, row)
        row += 1
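
The same conversion is available through vaex's public API; a minimal sketch (input.csv and output.hdf5 are hypothetical paths):

import vaex

df = vaex.from_csv("input.csv")   # hypothetical input; reads the csv into a DataFrame
df.export_hdf5("output.hdf5")     # writes a memory-mappable hdf5 file
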
if shuffle:
    # reserve an extra int64 column that will hold the shuffled row order
    column_names.append(random_index_name)
    data_types.append(np.int64().dtype)
    data_shapes.append((N,))
    ucds.append(None)
    units.append(None)
else:
    random_index_name = None
# TODO: all expressions can have missing values.. how to support that?
null_values = {key: dataset.columns[key].fill_value
               for key in dataset.get_column_names()
               if dataset.is_masked(key) and dataset.dtype(key).kind != "f"}
vaex.file.colfits.empty(path, N, column_names, data_types, data_shapes, ucds, units, null_values=null_values)
if shuffle:
    # the random-index column is filled via random_index_column below, not exported as a regular column
    del column_names[-1]
    del data_types[-1]
    del data_shapes[-1]
dataset_output = vaex.file.other.FitsBinTable(path, write=True)
_export(dataset_input=dataset, dataset_output=dataset_output, path=path, random_index_column=random_index_name,
        column_names=column_names, selection=selection, shuffle=shuffle,
        progress=progress, sort=sort, ascending=ascending)
dataset_output.close_files()
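
At the user level this corresponds to exporting a DataFrame to FITS; a minimal sketch, assuming the vaex-astro package is installed and a hypothetical output path:

import vaex

df = vaex.example()              # small built-in example dataset
df.export_fits("output.fits")    # hypothetical output path
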
# sort to get predictable behaviour (useful for testing)
filenames.extend(list(sorted(glob.glob(path))))
ds = None
if len(filenames) == 0:
    raise IOError('Could not open file: {}, it does not exist'.format(path))
filename_hdf5 = _convert_name(filenames, shuffle=shuffle)
filename_hdf5_noshuffle = _convert_name(filenames, shuffle=False)
if len(filenames) == 1:
    path = filenames[0]
    naked_path = path
    # strip query parameters, e.g. ?anon=true on s3 paths
    if '?' in naked_path:
        naked_path = naked_path[:naked_path.index('?')]
    ext = os.path.splitext(naked_path)[1]
    if os.path.exists(filename_hdf5) and convert:  # also check mtime?
        # a converted hdf5 file already exists, open that instead
        ds = vaex.file.open(filename_hdf5)
    else:
        if ext == '.csv' or naked_path.endswith(".csv.bz2"):  # special support for csv.. should probably approach it a different way
            ds = from_csv(path, copy_index=copy_index, **kwargs)
        else:
            ds = vaex.file.open(path, *args, **kwargs)
        if convert and ds:
            ds.export_hdf5(filename_hdf5, shuffle=shuffle)
            ds = vaex.file.open(filename_hdf5)  # arguments were meant for pandas?
    if ds is None:
        if os.path.exists(path):
            raise IOError('Could not open file: {}, did you install vaex-hdf5? Is the format supported?'.format(path))
        else:
            raise IOError('Could not open file: {}, it does not exist'.format(path))
elif len(filenames) > 1:
    # multiple files matched (e.g. a glob pattern): open them all and concatenate
    ds = open_many(filenames)
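
In user code this logic is reached through vaex.open; a minimal sketch (data.csv is a hypothetical file):

import vaex

# convert=True converts the csv to hdf5 on the first open and reuses the hdf5 file afterwards
df = vaex.open("data.csv", convert=True)
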
def samp_table_load_votable(self, url=None, table_id=None, name=None):
    filenames = []
    if table_id is not None:
        filename = table_id
        # strip the file: prefix to get a plain filesystem path
        if filename.startswith("file:/"):
            filename = filename[5:]
        basename, ext = os.path.splitext(filename)
        if os.path.exists(filename):
            filenames.append(filename)
        # also offer sibling files in formats we can read directly
        for other_ext in [".hdf5", ".fits"]:
            filename = basename + other_ext
            print(filename)
            if os.path.exists(filename) and filename not in filenames:
                filenames.append(filename)
        filenames = list(filter(vaex.file.can_open, filenames))
    options = []
    for filename in filenames:
        options.append(filename + " | read directly from file (faster)")
    options.append(url + " | load as VOTable (slower)")
    # options.append("link to existing opened dataset")
    for dataset in self.dataset_selector.datasets:
        options.append("link to existing open dataset: " + dataset.name)
    index = choose(self, "SAMP: load table", "Choose how to load table", options)
    if index is not None:
        if index < len(filenames):
            print("open file", filenames[index])
            self.load_file(filenames[index], table_id)
        elif index == len(filenames):
            print("load votable", url)
            self.load_votable(url, table_id)
        else:
            # the remaining options refer to the already open datasets listed above
            ...
parser_file = subparsers.add_parser('file', help='use a file as source (e.g. .hdf5, .fits, .vot (VO table), .asc (ascii))')
parser_file.add_argument("input", help="input source or file, when prefixed with @ it is assumed to be a text file with a file list (one file per line)")
parser_file.add_argument("output", help="output file (ends in .hdf5)")
parser_file.add_argument("columns", help="list of columns to export (or all when empty)", nargs="*")
args = parser.parse_args(argv[1:])
verbosity = ["ERROR", "WARNING", "INFO", "DEBUG"]
logging.getLogger("vaex").setLevel(verbosity[min(3, args.verbose)])
dataset = None
if args.task == "soneira":
    if vaex.utils.check_memory_usage(4 * 8 * 2**args.max_level, vaex.utils.confirm_on_console):
        if not args.quiet:
            print("generating soneira peebles dataset...")
        dataset = vaex.file.other.SoneiraPeebles(args.dimension, 2, args.max_level, args.lambdas)
    else:
        return 1
if args.task == "tap":
    dataset = vaex.dataset.DatasetTap(args.tap_url, args.table_name)
    if not args.quiet:
        print("exporting from {tap_url} table name {table_name} to {output}".format(tap_url=args.tap_url, table_name=args.table_name, output=args.output))
if args.task == "csv":
    # dataset = vaex.dataset.DatasetTap(args.tap_url, args.table_name)
    if not args.quiet:
        print("exporting from {input} to {output}".format(input=args.input, output=args.output))
if args.task == "file":
    if args.input[0] == "@":
        # a @-prefixed input is a text file with one path per line
        inputs = [line.strip() for line in open(args.input[1:])]
        dataset = vaex.open_many(inputs)
    else:
        dataset = vaex.open(args.input)
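
If the surrounding code is wrapped in a main(argv) entry point (as the argv[1:] parsing above suggests), driving it would look roughly like this; the entry-point name and paths are hypothetical:

# hypothetical invocation: export a single file to hdf5
main(["vaex-export", "file", "input.fits", "output.hdf5"])
# a @-prefixed input is read as a file list, one path per line
main(["vaex-export", "file", "@filelist.txt", "output.hdf5"])
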
def __init__(self, file, byte_offset, length, dtype, write=False, path=None):
    self.path = path or file.name
    self.file = file
    self.tls = threading.local()
    # keep a record of all duplicate file handles so we can close them
    self.file_handles = []
    self.tls.file = vaex.file.dup(file)
    self.file_handles.append(self.tls.file)
    self.native = False
    # if hasattr(self.file, 'fileno') and osname
    # fcntl.fcntl(self.file.fileno(), F_NOCACHE, 1)
    # self.native = True
    # libc.fcntl(self.file.fileno(), fcntl.F_NOCACHE, 1)
    # libc.fcntl(c_int(self.file.fileno()), c_int(fcntl.F_NOCACHE), c_int(1))
    self.byte_offset = byte_offset
    self.length = length
    self.dtype = np.dtype(dtype)
    self.shape = (length,)
    self.write = write
if self.native:
    # page-aligned pread for performance; ar_bytes/ar_ptr refer to a buffer of
    # N * itemsize + padding bytes allocated earlier
    offset_optimal = offset & ~page_mask
    padding = offset - offset_optimal
    bytes_read = libc.pread(ctypes.c_int32(self.file.fileno()), ar_ptr,
                            ctypes.c_uint64(N * itemsize + padding), ctypes.c_uint64(offset_optimal))
    if (bytes_read - padding) != N * itemsize:
        raise IOError('read error: expected %d bytes, read %d, padding: %d' % (N * itemsize, bytes_read, padding))
    ar = np.frombuffer(ar_bytes, self.dtype, offset=padding, count=N)
else:
    byte_length = N * itemsize
    offset = self.byte_offset + start * itemsize
    # quick and safe way to get the thread-local file handle
    file = getattr(self.tls, 'file', None)
    if file is None:
        with cache_lock:
            file = getattr(self.tls, 'file', None)
            if file is None:
                file = self.tls.file = vaex.file.dup(self.file)
                self.file_handles.append(file)
    # this is the fast path that avoids a memory copy and gets a view on the underlying data
    # (cache.py:CachedFile supports this)
    if hasattr(file, '_as_numpy'):
        ar = file._as_numpy(offset, byte_length, self.dtype)
    else:
        # traditional file objects go this slower route; they need a per-thread
        # file object, since the seek position is part of the file object's state
        file.seek(offset)
        data = file.read(byte_length)
        ar = np.frombuffer(data, self.dtype, count=N)
if USE_CACHE:
    with cache_lock:
        cache[key] = ar
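
The same page-aligned trick can be written with os.pread instead of going through ctypes/libc; a minimal sketch (pread_aligned is a hypothetical helper, Unix only):

import mmap
import os
import numpy as np

def pread_aligned(fd, offset, count, dtype):
    """Read `count` items of `dtype` at byte `offset`, issuing a page-aligned pread."""
    itemsize = np.dtype(dtype).itemsize
    page_mask = mmap.PAGESIZE - 1
    offset_aligned = offset & ~page_mask   # round down to a page boundary
    padding = offset - offset_aligned      # extra bytes read before the real data
    data = os.pread(fd, count * itemsize + padding, offset_aligned)
    return np.frombuffer(data, dtype, offset=padding, count=count)
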
def dup(self):
    # self.file may be a factory (callable) that opens a fresh handle, or a plain file object
    if callable(self.file):
        file = self.file
    else:
        file = vaex.file.dup(self.file)
    return CachedFile(file, self.path, self.cache_dir, self.block_size, data_file=self.data_file, mask_file=self.mask_file)
if '?' in naked_path:
    naked_path = naked_path[:naked_path.index('?')]
# only use the first item of each query parameter
options = {key: values[0] for key, values in parse_qs(o.query).items()}
options.update(kwargs)
use_cache = options.pop('cache', 'true') in ['true', 'True', '1']
anon = options.pop('anon', 'false') in ['true', 'True', '1']
s3 = s3fs.S3FileSystem(anon=anon, default_block_size=1,
                       default_fill_cache=False, **options)
if use_cache:
    # open lazily, so CachedFile can reopen the remote file when needed
    fp = lambda: s3.open(naked_path, mode)
    fp = vaex.file.cache.CachedFile(fp, naked_path)
else:
    fp = s3.open(naked_path, mode)
return fp
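
From the caller's side these options arrive as query parameters on the path; a minimal sketch (the bucket and key are hypothetical):

import vaex

# anon=true uses anonymous credentials; cache=true wraps the handle in a local CachedFile
df = vaex.open("s3://my-bucket/data.hdf5?anon=true&cache=true")
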
def register_opener():
    vaex.file.register(ArrowOpener)
    vaex.file.register(ParquetOpener)
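
Openers follow a small duck-typed interface (can_open is used earlier via vaex.file.can_open); a hypothetical sketch of a custom opener registered the same way:

class CsvOpener:
    # hypothetical opener: the can_open/open interface is assumed from the fragments above
    @staticmethod
    def can_open(path, **kwargs):
        return path.endswith(".csv")

    @staticmethod
    def open(path, **kwargs):
        import vaex
        return vaex.from_csv(path, **kwargs)

def register_csv_opener():
    vaex.file.register(CsvOpener)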