def test_load_empty_table_arrow(self, con):
    con.execute("drop table if exists baz;")
    con.execute("create table baz (a int, b float, c text);")
    data = [(1, 1.1, 'a'),
            (2, 2.2, '2'),
            (3, 3.3, '3')]
    df = pd.DataFrame(data, columns=list('abc')).astype({
        'a': 'int32',
        'b': 'float32'
    })
    table = pa.Table.from_pandas(df, preserve_index=False)
    con.load_table("baz", table, method='arrow')
    result = sorted(con.execute("select * from baz"))
    self.check_empty_insert(result, data)
    con.execute("drop table if exists baz;")
def test_fastparquet_read_with_hdfs():
    fs = hdfs_test_client()
    df = tm.makeDataFrame()
    table = pa.Table.from_pandas(df)
    path = '/tmp/testing.parquet'
    with fs.open(path, 'wb') as f:
        pq.write_table(table, f)

    parquet_file = fastparquet.ParquetFile(path, open_with=fs.open)
    result = parquet_file.to_pandas()
    tm.assert_frame_equal(result, df)
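The test above relies on helpers from the pyarrow/pandas test suites; a minimal, hedged sketch of the imports and an hdfs_test_client factory it assumes (host, port and user are placeholders, and the legacy pyarrow.hdfs API is assumed to be available) might look like:

import pandas.util.testing as tm   # provides makeDataFrame and assert_frame_equal
import pyarrow as pa
import pyarrow.parquet as pq
import fastparquet


def hdfs_test_client():
    # Placeholder connection details for a test HDFS cluster; the legacy
    # pyarrow.hdfs.connect API is assumed here.
    return pa.hdfs.connect(host='localhost', port=8020, user='hdfs')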
print('Writing {:d} rows...'.format(num_rows))
if writer is None:
    arrow_table = pa.Table.from_pandas(_make_df(current_row_set))
    writer = pq.ParquetWriter(parquet_file, arrow_table.schema,
                              compression=compression,
                              use_deprecated_int96_timestamps=True)
    writer.write_table(arrow_table)
else:
    arrow_table = pa.Table.from_pandas(_make_df(current_row_set))
    writer.write_table(arrow_table)
# Empty the current row set
current_row_set = []

# Add any left over rows and close the Parquet file
if num_rows:
    print('Writing {:d} rows...'.format(num_rows))
    arrow_table = pa.Table.from_pandas(_make_df(current_row_set))
    writer.write_table(arrow_table)
writer.close()
print('Parquet File Complete')
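The fragment above is the flush step of a row-buffering loop; a self-contained sketch of that overall pattern, with illustrative names (_make_df, BUFFER_ROWS, the column list) that are not from the original, could look like:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

BUFFER_ROWS = 100_000  # illustrative flush threshold


def _make_df(rows):
    # Hypothetical helper: turn the buffered row tuples into a DataFrame.
    return pd.DataFrame(rows, columns=['id', 'ts', 'value'])


def buffered_write(row_iterator, parquet_file, compression='snappy'):
    writer = None
    current_row_set = []

    def flush():
        nonlocal writer
        arrow_table = pa.Table.from_pandas(_make_df(current_row_set))
        if writer is None:
            # Create the writer lazily from the first chunk's schema.
            writer = pq.ParquetWriter(parquet_file, arrow_table.schema,
                                      compression=compression,
                                      use_deprecated_int96_timestamps=True)
        writer.write_table(arrow_table)

    for row in row_iterator:
        current_row_set.append(row)
        if len(current_row_set) >= BUFFER_ROWS:
            flush()
            current_row_set = []

    # Add any leftover rows and close the Parquet file.
    if current_row_set:
        flush()
    if writer is not None:
        writer.close()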
def chunked_write(df_iterator: TextFileReader,
                  parquet_writer: pq.ParquetWriter,
                  date_cols: List[str]):
    """
    Writes a Parquet version of the chunked dataframe input.

    Arrow table creation and Parquet writes take around 25% of the time
    in this function; the CSV read takes around 75%.
    """
    rows_processed = 0
    for df in df_iterator:
        rows_processed += min(BUFFER_SIZE_ROWS, len(df))
        for col_name in date_cols:
            df[col_name] = pd.to_datetime(df[col_name], unit="ms")

        pa_table = pa.Table.from_pandas(df=df, schema=parquet_writer.schema)
        parquet_writer.write_table(pa_table)
        print("Rows processed: {}".format(rows_processed), end="\r", flush=True)
    print()
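chunked_write expects a pandas TextFileReader and an already-open ParquetWriter; a hedged usage sketch (the CSV path, schema, column names and BUFFER_SIZE_ROWS value below are illustrative assumptions, not from the original) could be:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

BUFFER_SIZE_ROWS = 500_000  # chunk size assumed by chunked_write

# Illustrative schema matching the CSV columns; adjust to the real data.
schema = pa.schema([
    ('event_id', pa.int64()),
    ('event_time', pa.timestamp('ms')),
    ('payload', pa.string()),
])

df_iterator = pd.read_csv('events.csv', chunksize=BUFFER_SIZE_ROWS)
with pq.ParquetWriter('events.parquet', schema) as parquet_writer:
    chunked_write(df_iterator, parquet_writer, date_cols=['event_time'])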
def convert(obj, message):
    if isinstance(obj, (awkward.array.base.AwkwardArray, numpy.ndarray)):
        out = toarrow(obj)
        if isinstance(out, pyarrow.Table):
            return out
        else:
            return pyarrow.Table.from_batches(
                [pyarrow.RecordBatch.from_arrays([out], [""])])
    else:
        raise TypeError(message)
def save(file, table):
    """
    Save a pandas DataFrame to a Parquet file.

    :param file: path (or file-like object) to write the Parquet file to
    :param table: pandas DataFrame to serialize
    :return: None
    """
    pq.write_table(pa.Table.from_pandas(table), file)
def _load(self):
    source = pa.memory_map(self.path)
    try:
        # First, try to open the file as an Arrow IPC stream.
        reader = pa.ipc.open_stream(source)
    except pa.lib.ArrowInvalid:
        # If that fails, open it as an IPC (random-access) file.
        reader = pa.ipc.open_file(source)
        # This reader is not iterable, so collect the batches explicitly.
        batches = [reader.get_batch(i) for i in range(reader.num_record_batches)]
    else:
        # A stream reader is iterable, so it can be consumed directly.
        batches = reader
    table = pa.Table.from_batches(batches)
    self._load_table(table)
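The try/except in _load exists because Arrow IPC data has two on-disk layouts, the streaming format and the random-access file format; a small sketch (file names are illustrative) that writes one of each, either of which the method can then open:

import pyarrow as pa

table = pa.table({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})

# Streaming format: no footer, read back with pa.ipc.open_stream().
with pa.OSFile('data.arrows', 'wb') as sink:
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)

# File (random-access) format: has a footer, read back with pa.ipc.open_file().
with pa.OSFile('data.arrow', 'wb') as sink:
    with pa.ipc.new_file(sink, table.schema) as writer:
        writer.write_table(table)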
def df_to_parquet(df, filename, compression='SNAPPY'):
    """df_to_parquet: Converts a Pandas DataFrame into a Parquet file

    Args:
        df (pandas.DataFrame): The Pandas DataFrame to be saved as a Parquet file
        filename (string): The full path to the filename for the Parquet file
        compression (string): Parquet compression codec (default 'SNAPPY')
    """
    # Nullable integer arrays are currently not handled by Arrow
    # See: https://issues.apache.org/jira/browse/ARROW-5379
    # Cast nullable integer arrays to float32 before serializing
    null_int_types = [pd.UInt16Dtype, pd.UInt32Dtype, pd.UInt64Dtype, pd.Int64Dtype]
    for col in df:
        if type(df[col].dtype) in null_int_types:
            df[col] = df[col].astype(np.float32)

    arrow_table = pa.Table.from_pandas(df)
    if compression == 'UNCOMPRESSED':
        compression = None
    pq.write_table(arrow_table, filename, compression=compression, flavor='spark')
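A brief usage sketch for df_to_parquet, showing a nullable Int64 column that triggers the float32 cast (column names and the output path are illustrative):

import pandas as pd

df = pd.DataFrame({
    'user_id': pd.array([1, 2, None], dtype='Int64'),  # nullable integer column
    'score': [0.5, 0.7, 0.9],
})
df_to_parquet(df, '/tmp/users.parquet', compression='UNCOMPRESSED')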
    return statistics.datasets[0]

  # If there are multiple datasets, attempt to find the dataset for the
  # default slice (i.e., slice for all examples) from among the datasets.
  for dataset in statistics.datasets:
    if dataset.name == constants.DEFAULT_SLICE_KEY:
      logging.warning('Multiple datasets found in statistics. Using the '
                      'default slice dataset.')
      return dataset

  # If there are multiple datasets, but the default slice is not found, raise
  # an error.
  raise ValueError('Only statistics proto with one dataset or the default '
                   'slice (i.e., "All Examples" slice) is currently supported.')
@beam.typehints.with_input_types(
    beam.typehints.Tuple[pa.Table, anomalies_pb2.Anomalies])
@beam.typehints.with_output_types(types.BeamSlicedTable)
class _GenerateAnomalyReasonSliceKeys(beam.DoFn):
  """Yields a slice key for each anomaly reason in the Anomalies proto."""

  def process(self, element):
    table, anomalies_proto = element
    for slice_key in slicing_util.generate_slices(
        table, [anomalies_util.anomalies_slicer], anomalies=anomalies_proto):
      yield slice_key, table


@beam.typehints.with_input_types(pa.Table)
@beam.typehints.with_output_types(types.BeamSlicedTable)
class IdentifyAnomalousExamples(beam.PTransform):
  """API for identifying anomalous examples.