if indices is None:
    indices = {}
eltType, name, store = mjolnir.utils.explode_ltr_model_definition(model)
log_query = LtrLoggingQuery(eltType, name, store)

def kafka_handle_response(record):
    # Parse one msearch response and yield a feature row per hit page.
    assert record['status_code'] == 200
    parsed = json.loads(record['text'])
    response = parsed['responses'][0]
    meta = record['meta']
    for hit_page_id, features in extract_ltr_log_feature_values(response, feature_names_accu):
        yield [meta['wikiid'], meta['query'], hit_page_id, features]

rdd = mjolnir.kafka.client.msearch(
    df.groupBy('wikiid', 'query').agg(F.collect_set('hit_page_id').alias('hit_page_ids')),
    client_config=brokers,
    meta_keys=['wikiid', 'query'],
    create_es_query=lambda row: log_query.make_msearch(row, indices),
    handle_response=kafka_handle_response)

result = df.sql_ctx.createDataFrame(rdd, T.StructType([
    df.schema['wikiid'], df.schema['query'], df.schema['hit_page_id'],
    T.StructField('features', VectorUDT(), nullable=False)
]))
# We could have gotten duplicate data from kafka. Clean it up.
return result.drop_duplicates(['wikiid', 'query', 'hit_page_id'])
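
# A minimal, standalone sketch of the final step above: building a DataFrame with a
# VectorUDT 'features' column and dropping rows duplicated on the (wikiid, query,
# hit_page_id) key, as can happen when the same Kafka record is consumed twice.
# Plain PySpark illustration only, not mjolnir code; the sample rows are made up.
from pyspark.sql import SparkSession, types as T
from pyspark.ml.linalg import Vectors, VectorUDT

spark = SparkSession.builder.master('local[1]').getOrCreate()
schema = T.StructType([
    T.StructField('wikiid', T.StringType(), nullable=False),
    T.StructField('query', T.StringType(), nullable=False),
    T.StructField('hit_page_id', T.LongType(), nullable=False),
    T.StructField('features', VectorUDT(), nullable=False),
])
rows = [
    ('enwiki', 'cats', 42, Vectors.dense([0.1, 0.7])),
    ('enwiki', 'cats', 42, Vectors.dense([0.1, 0.7])),  # duplicate delivery
    ('enwiki', 'cats', 43, Vectors.dense([0.3, 0.2])),
]
df = spark.createDataFrame(rows, schema)
deduped = df.drop_duplicates(['wikiid', 'query', 'hit_page_id'])
assert deduped.count() == 2
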
>>> ks.Series(['a', None]).hasnans
True
>>> ks.Series([1.0, 2.0, np.nan]).hasnans
True
>>> ks.Series([1, 2, 3]).hasnans
False
>>> ks.Series([1, 2, 3]).rename("a").to_frame().set_index("a").index.hasnans
False
"""
sdf = self._internal._sdf.select(self._scol)
col = self._scol
# The column contains a null/NaN exactly when the max of the boolean (isNull | isnan) expression is True.
ret = sdf.select(F.max(col.isNull() | F.isnan(col))).collect()[0][0]
return ret
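
# A standalone sketch of the same null/NaN check in plain PySpark, assuming a numeric
# column named 'x'. F.max over a boolean column returns True iff any row is True,
# which is exactly what hasnans needs. Illustration only, not Koalas internals.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master('local[1]').getOrCreate()
sdf = spark.createDataFrame([(1.0,), (float('nan'),), (3.0,)], ['x'])
col = F.col('x')
has_nans = sdf.select(F.max(col.isNull() | F.isnan(col))).collect()[0][0]
print(has_nans)  # True
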
column_set = set(columns)
remaining_columns = [kdf[column] for column in kdf.columns if column not in column_set]

if any(not isinstance(kdf._internal.spark_type_for(column), _get_dummies_acceptable_types)
       for column in columns):
    raise ValueError("get_dummies currently only accept {} values"
                     .format(', '.join([t.typeName() for t in _get_dummies_acceptable_types])))

if prefix is not None and len(columns) != len(prefix):
    raise ValueError(
        "Length of 'prefix' ({}) did not match the length of the columns being encoded ({})."
        .format(len(prefix), len(columns)))

# Collect the distinct values of every encoded column in a single Spark job.
all_values = _reduce_spark_multi(kdf._sdf,
                                 [F.collect_set(kdf._internal.scol_for(column)).alias(column)
                                  for column in columns])

for i, column in enumerate(columns):
    values = sorted(all_values[i])
    if drop_first:
        values = values[1:]

    def column_name(value):
        if prefix is None:
            return str(value)
        else:
            return '{}{}{}'.format(prefix[i], prefix_sep, value)

    # Append one 0/1 indicator column per remaining distinct value.
    for value in values:
        remaining_columns.append((kdf[column].notnull() & (kdf[column] == value))
                                 .astype(dtype)
                                 .rename(column_name(value)))
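
# A minimal sketch of the same one-hot pattern in plain PySpark, assuming a string
# column 'color': collect the distinct values, optionally drop the first, then emit
# one 0/1 indicator column per remaining value. Illustration only; Koalas'
# get_dummies does this through its internal frame representation.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master('local[1]').getOrCreate()
df = spark.createDataFrame([('red',), ('blue',), ('red',), (None,)], ['color'])

values = sorted(df.agg(F.collect_set('color')).collect()[0][0])
drop_first = True
if drop_first:
    values = values[1:]

dummies = df.select('color', *[
    (F.col('color').isNotNull() & (F.col('color') == v)).cast('byte').alias('color_' + v)
    for v in values
])
dummies.show()
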
# Cast continuous columns to float & lookup categorical columns.
train_df = cast_columns(train_df, continuous_cols + ['Sales'])
train_df = lookup_columns(train_df, vocab)
test_df = cast_columns(test_df, continuous_cols)
test_df = lookup_columns(test_df, vocab)
# Split into training & validation.
# Test set is in 2015, use the same period in 2014 from the training set as a validation set.
test_min_date = test_df.agg(F.min(test_df.Date)).collect()[0][0]
test_max_date = test_df.agg(F.max(test_df.Date)).collect()[0][0]
a_year = datetime.timedelta(365)
val_df = train_df.filter((test_min_date - a_year <= train_df.Date) & (train_df.Date < test_max_date - a_year))
train_df = train_df.filter((train_df.Date < test_min_date - a_year) | (train_df.Date >= test_max_date - a_year))
# Determine max Sales number.
max_sales = train_df.agg(F.max(train_df.Sales)).collect()[0][0]
print('===================================')
print('Data frame with transformed columns')
print('===================================')
train_df.show()
print('================')
print('Data frame sizes')
print('================')
train_rows, val_rows, test_rows = train_df.count(), val_df.count(), test_df.count()
print('Training: %d' % train_rows)
print('Validation: %d' % val_rows)
print('Test: %d' % test_rows)
# Save data frames as Parquet files.
train_df.write.parquet('%s/train_df.parquet' % DATA_LOCATION, mode='overwrite')
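
# A standalone sketch of the date-window split above, assuming plain PySpark: the
# validation set is the slice of training data covering the same calendar window as
# the test set, shifted back one year; everything else stays in training.
# Illustration only, with made-up dates.
import datetime
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()
train = spark.createDataFrame(
    [(datetime.date(2014, m, 1), m) for m in range(1, 13)], ['Date', 'Sales'])
test_min_date = datetime.date(2015, 6, 1)
test_max_date = datetime.date(2015, 9, 1)
a_year = datetime.timedelta(365)

val = train.filter((test_min_date - a_year <= train.Date) &
                   (train.Date < test_max_date - a_year))
train_only = train.filter((train.Date < test_min_date - a_year) |
                          (train.Date >= test_max_date - a_year))
print(val.count(), train_only.count())  # rows inside vs. outside the shifted window
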
lambda col_name: (F.col("`{0}`".format(col_name)).alias(fun(col_name))
                  if change_col_name(col_name)
                  else F.col("`{0}`".format(col_name))),
df.columns
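
# A minimal sketch of the conditional-rename pattern above in plain PySpark: build
# one Column expression per existing column, aliasing only the ones a predicate
# selects. Backticks let column names containing dots pass through unharmed.
# change_col_name and fun here are stand-ins for whatever rename rule applies.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master('local[1]').getOrCreate()
df = spark.createDataFrame([(1, 2)], ['keep_me', 'rename.me'])

def change_col_name(c):  # predicate: which columns to rename (assumption)
    return '.' in c

def fun(c):              # the rename rule itself (assumption)
    return c.replace('.', '_')

renamed = df.select([
    F.col("`{0}`".format(c)).alias(fun(c)) if change_col_name(c) else F.col("`{0}`".format(c))
    for c in df.columns
])
print(renamed.columns)  # ['keep_me', 'rename_me']
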
@fn.pandas_udf(StructType([StructField('histos', BinaryType(), True)]), fn.PandasUDFType.GROUPED_MAP)
def reduce_histos(df):
    # Grouped-map pandas UDF: reduce each group's serialized histograms using the
    # module-level processor instance and lz4 compression level.
    global processor_instance, lz4_clevel
    return reduce_histos_raw(df, processor_instance, lz4_clevel)
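
# A standalone sketch of the grouped-map pandas UDF pattern used above, in plain
# PySpark: each group's rows arrive as a pandas DataFrame and the function returns a
# pandas DataFrame matching the declared schema. The reduction here is a toy sum,
# not coffea's histogram merge; all names below are illustrative.
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as fn
from pyspark.sql.types import StructType, StructField, StringType, LongType

spark = SparkSession.builder.master('local[1]').getOrCreate()
df = spark.createDataFrame([('a', 1), ('a', 2), ('b', 5)], ['dataset', 'value'])

schema = StructType([StructField('dataset', StringType(), True),
                     StructField('total', LongType(), True)])

@fn.pandas_udf(schema, fn.PandasUDFType.GROUPED_MAP)
def reduce_group(pdf):
    # One pandas DataFrame per 'dataset' group; return one reduced row.
    return pd.DataFrame({'dataset': [pdf['dataset'].iloc[0]],
                         'total': [pdf['value'].sum()]})

df.groupby('dataset').apply(reduce_group).show()
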
from collections import OrderedDict
from typing import Callable, Any
import numpy as np
from pyspark.sql import functions as F, Column
from pyspark.sql.types import DoubleType, LongType, BooleanType
unary_np_spark_mappings = OrderedDict({
    'abs': F.abs,
    'absolute': F.abs,
    'arccos': F.acos,
    'arccosh': F.pandas_udf(lambda s: np.arccosh(s), DoubleType()),
    'arcsin': F.asin,
    'arcsinh': F.pandas_udf(lambda s: np.arcsinh(s), DoubleType()),
    'arctan': F.atan,
    'arctanh': F.pandas_udf(lambda s: np.arctanh(s), DoubleType()),
    'bitwise_not': F.bitwiseNOT,
    'cbrt': F.cbrt,
    'ceil': F.ceil,
    'conj': lambda _: NotImplemented,       # It requires complex type which Koalas does not support yet
    'conjugate': lambda _: NotImplemented,  # It requires complex type
    'cos': F.cos,
    'cosh': F.pandas_udf(lambda s: np.cosh(s), DoubleType()),
    'deg2rad': F.pandas_udf(lambda s: np.deg2rad(s), DoubleType()),
    'degrees': F.degrees,
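
# A small sketch of how a mapping like the one above gets used, in plain PySpark:
# look up the numpy ufunc's name, then apply either the native Spark function or the
# pandas_udf fallback to a column. Illustration only, not Koalas' dispatch code.
import numpy as np
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import DoubleType

spark = SparkSession.builder.master('local[1]').getOrCreate()
df = spark.createDataFrame([(0.5,), (1.0,)], ['x'])

mappings = {
    'arccos': F.acos,                                                # native Spark function
    'arccosh': F.pandas_udf(lambda s: np.arccosh(s), DoubleType()),  # no native equivalent
}

func = mappings[np.arccos.__name__]  # ufunc __name__ is 'arccos'
df.select(func(df['x']).alias('arccos_x')).show()
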
def _shift(self, periods, fill_value, part_cols=()):
    if not isinstance(periods, int):
        raise ValueError('periods should be an int; however, got [%s]' % type(periods))
    col = self._scol
    # Look `periods` rows away over an index-ordered window; missing lags get fill_value.
    window = Window.partitionBy(*part_cols).orderBy(self._internal.index_scols)\
        .rowsBetween(-periods, -periods)
    lag_col = F.lag(col, periods).over(window)
    col = F.when(lag_col.isNull() | F.isnan(lag_col), fill_value).otherwise(lag_col)
    return self._with_new_scol(col).rename(self.name)
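
# A standalone sketch of the shift-with-fill pattern above in plain PySpark: lag the
# column by `periods` rows over an ordered window and substitute a fill value where
# the lag falls off the edge. Illustration only, not Koalas internals.
from pyspark.sql import SparkSession, functions as F, Window

spark = SparkSession.builder.master('local[1]').getOrCreate()
df = spark.createDataFrame([(0, 10.0), (1, 20.0), (2, 30.0)], ['idx', 'value'])

periods, fill_value = 1, 0.0
window = Window.orderBy('idx')
lag_col = F.lag('value', periods).over(window)
shifted = F.when(lag_col.isNull() | F.isnan(lag_col), fill_value).otherwise(lag_col)
df.select('idx', shifted.alias('value_shifted')).show()
# idx 0 gets the fill value; idx 1 and 2 get the previous row's value.
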
def _nunique(self, dropna=True, approx=False, rsd=0.05):
    colname = self._internal.data_columns[0]
    count_fn = partial(F.approx_count_distinct, rsd=rsd) if approx else F.countDistinct
    if dropna:
        return count_fn(self._scol).alias(colname)
    else:
        # countDistinct ignores nulls, so add 1 when the column contains at least one null.
        return (count_fn(self._scol) +
                F.when(F.count(F.when(self._scol.isNull(), 1)
                               .otherwise(None)) >= 1, 1).otherwise(0)).alias(colname)
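
# A standalone sketch of the nunique logic above in plain PySpark: countDistinct
# ignores nulls, so the dropna=False variant adds 1 if the column contains at least
# one null. Illustration only, not Koalas internals.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master('local[1]').getOrCreate()
df = spark.createDataFrame([(1,), (2,), (2,), (None,)], 'x int')

with_na = (F.countDistinct('x') +
           F.when(F.count(F.when(F.col('x').isNull(), 1).otherwise(None)) >= 1, 1)
            .otherwise(0)).alias('nunique')
df.agg(F.countDistinct('x').alias('nunique_dropna'), with_na).show()
# nunique_dropna = 2, nunique = 3 (the null counts as one extra distinct value)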