Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def core_event_distribution(self, core_events, index_col=None, event_col=None,
                            thresh=None, plotting=True, use_greater=True, **kwargs):
    """Select the rows of users by their share of core events.

    For every value of the index column, the fraction of that user's rows
    whose event is one of ``core_events`` is computed. Users are then kept
    or dropped by comparing that fraction with ``thresh``.

    Side effect: a boolean ``is_core_event`` column is written onto
    ``self._obj`` and stays there after the call.

    Args:
        core_events: A single event name or a list-like of event names.
            Any ``str`` (including ``str`` subclasses) is treated as one
            event name.
        index_col: Forwarded to ``self._init_cols`` through ``locals()``;
            presumably overrides the configured user-id column (confirm
            against ``_init_cols``).
        event_col: Forwarded the same way; presumably overrides the
            configured event column.
        thresh: Value the per-user core-event rate is compared with.
            NOTE(review): the default ``None`` is compared as-is; pass a
            number to get a meaningful filter.
        plotting: When true, ``plot.core_event_dist(rates, thresh,
            **kwargs)`` is called before filtering.
        use_greater: When true keep users with ``rate >= thresh``,
            otherwise keep users with ``rate < thresh``.
        **kwargs: Passed through to ``plot.core_event_dist``.

    Returns:
        The rows of ``self._obj`` belonging to the selected users, with a
        fresh ``RangeIndex``.
    """
    # Must stay the first statement: _init_cols receives exactly the
    # call arguments via locals(), before any other local is created.
    self._init_cols(locals())

    # isinstance (not an exact type comparison) so that str subclasses are
    # wrapped too; a bare string is not list-like and isin() rejects it.
    if isinstance(core_events, str):
        core_events = [core_events]

    self._obj['is_core_event'] = self._obj[self._event_col()].isin(core_events)
    # Mean of a boolean column == share of core events per user.
    rates = self._obj.groupby(self._index_col())['is_core_event'].mean()

    if plotting:
        plot.core_event_dist(rates, thresh, **kwargs)

    if use_greater:
        selected = rates >= thresh
    else:
        selected = rates < thresh
    kept_users = set(rates[selected].index.values)

    in_kept = self._obj[self._index_col()].isin(kept_users)
    return self._obj[in_kept].reset_index(drop=True)
[self._event_col()]
.value_counts()
.loc[top_cluster['index']]
/ clus2.shape[0]).reset_index()
cr1 = (
clus2[
clus2[self._event_col()] == self.retention_config['positive_target_event']
][self._index_col()].nunique()
) / clus2[self._index_col()].nunique()
top_all.columns = [self._event_col(), 'freq', ]
top_cluster.columns = [self._event_col(), 'freq', ]
top_all['hue'] = 'all' if cl2 is None else f'cluster {cl2}'
top_cluster['hue'] = f'cluster {cl1}'
plot.cluster_event_dist(
top_all.append(top_cluster, ignore_index=True, sort=False),
self._event_col(),
cl1,
[
clus[self._index_col()].nunique() / self._obj[self._index_col()].nunique(),
clus2[self._index_col()].nunique() / self._obj[self._index_col()].nunique(),
],
[cr0, cr1],
cl2
)
features = self.extract_features(**kwargs)
if not hasattr(self, 'clusters') or refit_cluster:
clusterer = getattr(clustering, method)
self.clusters, self._metrics = clusterer(features, **kwargs)
self._create_cluster_mapping(features.index.values)
if hasattr(self, 'datatype') and self.datatype == 'features':
target = kwargs.pop('target')
else:
target = self.get_positive_users(**kwargs)
target = features.index.isin(target)
self._metrics['homogen'] = clustering.homogeneity_score(target, self.clusters)
if hasattr(self, '_tsne'):
features.retention._tsne = self._tsne
if plot_type:
func = getattr(plot, plot_type)
res = func(
features,
clustering.aggregate_cl(self.clusters, 7) if method == 'dbscan' else self.clusters,
target,
metrics=self._metrics,
**kwargs
)
if res is not None:
self._tsne = res
return self.clusters
f_cur = self._obj[self._event_col()] == event_order[0]
f_next = self._obj['next_event'] == event_order[1]
s_next = self._obj[f_cur & f_next].copy()
s_cur = self._obj[f_cur & (~f_next)].copy()
s_cur.time_diff[s_cur.time_diff < limit].hist(alpha=0.5, log=True,
bins=bins, label='Others {:.2f}'.format(
(s_cur.time_diff < limit).sum() / f_cur.sum()
))
s_next.time_diff[s_next.time_diff < limit].hist(alpha=0.7, log=True,
bins=bins,
label='Selected event order {:.2f}'.format(
(s_next.time_diff < limit).sum() / f_cur.sum()
))
plot.sns.mpl.pyplot.legend()
plot.sns.mpl.pyplot.show()
(s_cur.next_event.value_counts() / f_cur.sum()).iloc[:topk].plot.bar()
elif sample_size is not None:
features = features.sample(n=sample_size, random_state=0)
if not (hasattr(self, '_tsne') and not refit):
self._tsne = feature_extraction.learn_tsne(features, **kwargs)
if plot_type == 'clusters':
if kwargs.get('cmethod') is not None:
kwargs['method'] = kwargs.pop('cmethod')
old_targs = targets.copy()
targets = self.get_clusters(plot_type=None, **kwargs)
elif plot_type == 'targets':
targets = self._tsne_targets
else:
return self._tsne
if proj_type == '3d':
plot.tsne_3d(
self._obj,
clustering.aggregate_cl(targets, 7) if kwargs.get('method') == 'dbscan' else targets,
old_targs,
**kwargs
)
else:
plot.cluster_tsne(
self._obj,
clustering.aggregate_cl(targets, 7) if kwargs.get('method') == 'dbscan' else targets,
targets,
**kwargs
)
return self._tsne
piv = pd.DataFrame(res.mean(2), index=base.index, columns=base.columns)
stds = pd.DataFrame(res.std(2), index=base.index, columns=base.columns)
if not kwargs.get('reverse'):
for i in self.retention_config['target_event_list']:
piv = piv.append(self._add_accums(piv, i))
if kwargs.get('thr'):
thr = kwargs.pop('thr')
piv = self._process_thr(piv, thr, kwargs.get('max_steps' or 30), **kwargs)
if kwargs.get('sorting'):
piv = self._sort_matrix(piv)
if not kwargs.get('for_diff'):
if kwargs.get('reverse'):
piv.columns = ['n'] + ['n - {}'.format(i - 1) for i in piv.columns[1:]]
if plot_type:
plot.step_matrix(
piv.round(2),
title=kwargs.get('title',
'Step matrix {}'
.format('reversed' if kwargs.get('reverse') else '')), **kwargs)
plot.step_matrix(
stds.round(3),
title=kwargs.get('title',
'Step matrix std'), **kwargs)
if kwargs.get('dt_means') is not None:
means = np.array(self._obj.groupby('event_rank').apply(
lambda x: (x.next_timestamp - x.event_timestamp).dt.total_seconds().mean()
))
piv = pd.concat([piv, pd.DataFrame([means[:kwargs.get('max_steps' or 30)]],
columns=piv.columns, index=['dt_mean'])])
return piv, stds
kwargs['method'] = kwargs.pop('cmethod')
old_targs = targets.copy()
targets = self.get_clusters(plot_type=None, **kwargs)
elif plot_type == 'targets':
targets = self._tsne_targets
else:
return self._tsne
if proj_type == '3d':
plot.tsne_3d(
self._obj,
clustering.aggregate_cl(targets, 7) if kwargs.get('method') == 'dbscan' else targets,
old_targs,
**kwargs
)
else:
plot.cluster_tsne(
self._obj,
clustering.aggregate_cl(targets, 7) if kwargs.get('method') == 'dbscan' else targets,
targets,
**kwargs
)
return self._tsne