Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Data preparation for a regression on 'MPG'. `dataset` is created earlier in
# the file (not visible here) -- presumably a pandas DataFrame; confirm.

# Count missing values per column (the result is only displayed, not stored).
dataset.isna().sum()
# Discard every row that has at least one missing value.
dataset = dataset.dropna()
# Replace the numeric 'Origin' code by three 0.0/1.0 indicator columns:
# 1 -> 'USA', 2 -> 'Europe', 3 -> 'Japan'.
origin = dataset.pop('Origin')
dataset['USA'] = (origin == 1)*1.0
dataset['Europe'] = (origin == 2)*1.0
dataset['Japan'] = (origin == 3)*1.0
# Display the last rows to check the new columns.
dataset.tail()
# Reproducible split: 80% of the rows are sampled for training
# (random_state=0) and the remaining rows form the test set.
train_dataset = dataset.sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)
# Pairwise scatter plots of four training columns, with kernel density
# estimates on the diagonal.
sns.pairplot(
train_dataset[["MPG", "Cylinders", "Displacement", "Weight"]], diag_kind="kde")
# Summary statistics of the training set. The 'MPG' column is removed from
# the statistics, and the table is transposed so that 'mean' and 'std' can be
# looked up as columns (norm() below reads them that way).
train_stats = train_dataset.describe()
train_stats.pop("MPG")
train_stats = train_stats.transpose()
# Display the statistics table.
train_stats
# Split the 'MPG' column off both sets; it is removed from the feature tables
# and kept as the labels.
train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')
def norm(x):
    """Standardize ``x`` with the training-set statistics.

    Subtracts ``train_stats['mean']`` from ``x`` and divides the result by
    ``train_stats['std']``. ``train_stats`` is read from module scope.
    """
    centered = x - train_stats['mean']
    scaled = centered / train_stats['std']
    return scaled
https://seaborn.pydata.org/examples/scatterplot_matrix.html
"""
logger.info("Generating Scatter Plot")
# Get the feature subset
features.append(target)
df = df[features]
# Generate the pair plot
sns.set()
sns_plot = sns.pairplot(df, hue=target)
# Save the plot
write_plot('seaborn', sns_plot, 'scatter_plot', tag, directory)
def plot_pairplot(csv_filename, fig_filename, top=None):
    """Draw a regression pair plot of a CSV table and save it to a file.

    Args:
        csv_filename: Path of the CSV file, read with ``pandas.read_csv``.
        fig_filename: Destination handed to the grid's ``savefig``.
        top: When not ``None``, only the rows selected by ``[:top]`` are
            plotted; otherwise the whole table is used.
    """
    import pandas as pd
    import seaborn as sns

    sns.set(style="ticks", color_codes=True)

    table = pd.read_csv(csv_filename)
    selected = table if top is None else table[:top]

    # Regression fits off the diagonal, kernel density estimates on it.
    grid = sns.pairplot(selected, kind='reg', diag_kind='kde', markers='.')
    grid.savefig(fig_filename)
filename)
# Load the exam records from a gzip-compressed CSV file.
exams = pd.read_csv('exams.csv.gz')
# Select data for students present all three years: keep only the rows whose
# StudentID occurs more than twice in the table.
continuing_students = exams.StudentID.value_counts()
continuing_students = continuing_students[continuing_students > 2].index
exams = exams[exams.StudentID.isin(continuing_students)]
###############################################################
# **Visualization**: Grades at tests in exams depend on socio-economic
# status, year at school, ...
#
# The simplest way to do this is using seaborn's pairplot function.
# The StudentID column is dropped before plotting.
import seaborn as sns
sns.pairplot(exams.drop(columns=['StudentID']))
###############################################################
# A more elaborate plot using density estimation gives better
# understanding of the dense regions: kernel density estimates on the
# lower triangle and on the diagonal, small scatter markers (s=2) on the
# upper triangle.
g = sns.PairGrid(exams.drop(columns=['StudentID']),
diag_sharey=False)
g.map_lower(sns.kdeplot)
g.map_upper(plt.scatter, s=2)
g.map_diag(sns.kdeplot, lw=3)
###############################################################
# **Prediction**: Can we predict test grades in maths from demographics
# (ie, not from other grades)?
# A bit of feature engineering to get a numerical matrix (easily done
features = features.drop(target_col, axis=1)
if features.shape[1] == 0:
return
top_for_interactions = 20
features_imp = SimpleImputer().fit_transform(features)
target = X[target_col]
# FIXME if one class only has NaN for a value we crash! :-/
# TODO univariate plot?
# already on diagonal for pairplot but not for many features
if features.shape[1] <= 5:
# for n_dim <= 5 we do full pairplot plot
# FIXME filling in missing values here b/c of a bug in seaborn
# we really shouldn't be doing this
# https://github.com/mwaskom/seaborn/issues/1699
X_imp = X.fillna(features.median(axis=0))
sns.pairplot(X_imp, vars=features.columns,
hue=target_col)
else:
# univariate plots
show_top = _get_n_top(features, "continuous")
f, p = f_classif(features_imp, target)
top_k = np.argsort(f)[-show_top:][::-1]
# FIXME this will fail if a feature is always
# NaN for a particular class
best_features = features.iloc[:, top_k].copy()
best_features[target_col] = target
df = best_features.melt(target_col)
rows, cols = find_pretty_grid(show_top)
g = sns.FacetGrid(df, col='variable', hue=target_col, col_wrap=cols,
sharey=False, sharex=False, hue_order=hue_order)
g = g.map(sns.kdeplot, "value", shade=True)
def visualize_housing_data(df):
    """Show a pair plot and a correlation heatmap for five housing columns.

    Args:
        df: Table indexable by a list of column names and exposing
            ``.values`` (presumably a pandas DataFrame); it must contain
            the columns 'LSTAT', 'INDUS', 'NOX', 'RM' and 'MEDV'.
    """
    sns.set(style='whitegrid', context='notebook')
    cols = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']

    # seaborn renamed the `size` keyword of pairplot to `height` (v0.9);
    # `height` is the supported spelling for the per-facet size in inches.
    sns.pairplot(df[cols], height=2.5)
    plt.show()

    # np.corrcoef expects one variable per row, hence the transpose.
    correlation_matrix = np.corrcoef(df[cols].values.T)
    sns.set(font_scale=1.5)
    heatmap = sns.heatmap(
        correlation_matrix,
        cbar=True,
        annot=True,
        square=True,
        fmt='.2f',
        annot_kws={'size': 15},
        yticklabels=cols,
        xticklabels=cols,
    )
# Percentage of class-1 cases classified correctly: row 1 of the confusion
# matrix, diagonal entry divided by the row total.
# NOTE(review): this fragment uses Python 2 print statements; under Python 2,
# `/` between integers floors unless `from __future__ import division` is in
# effect, which would make pC either 0 or 100 -- confirm the dtype of
# conf_matrix and the imports at the top of the file.
pC = (conf_matrix[1][1]/(conf_matrix[1][0]+conf_matrix[1][1]))*100
# Print the per-class breakdown of the classification (hC is computed above,
# outside this fragment).
print 'For class 0 man cases:',conf_matrix[0][0],'classified correctly and',conf_matrix[0][1],'missclassified,',hC,'accuracy \n'
print 'For class 1 woman cases:',conf_matrix[1][1],'classified correctly and',conf_matrix[1][0],'missclassified,',pC,'accuracy\n'
# Plot the first two embedded features, colored by the labels in `labels`
# (features_embedded presumably comes from the KPCA + LDA step -- confirm).
embedded_labels = pd.DataFrame({'Feature1': features_embedded[: ,0], 'Feature2': features_embedded[: ,1],'Label': labels})
sns.pairplot(embedded_labels, hue='Label')
#plt.savefig('kpca_trainset_parkinson_healthy.png')
#plt.show()
# Plot the same two embedded features, this time colored by the labels in
# `predicted_labels`.
embedded_predicted_labels = pd.DataFrame({'Feature1': features_embedded[: ,0], 'Feature2': features_embedded[: ,1],'Label': predicted_labels})
sns.pairplot(embedded_predicted_labels, hue='Label')
#plt.savefig('kpca_trainset_parkinson_healthy.png')
plt.show()
# Gather the ten series a..j (defined above, outside this fragment) into one
# table and write it to CSV without the index column.
df = pd.DataFrame({
"a":a,
"b":b,
"c":c,
"d":d,
"e":e,
"f":f,
"g":g,
"h":h,
"i":i,
"j":j})
df.to_csv("csv_files/causal_linear.csv", index=False)
# Plotting: one pair plot per group of variables.
# NOTE(review): plt.title acts on matplotlib's current axes, presumably one
# subplot of the pair plot, which would explain the manual x/y offsets used
# to push the text above the grid -- confirm; a figure-level title may be
# what is intended.
sns.pairplot(data=pd.DataFrame({"a":a, "b":b}))
plt.title("Data of Fig. 1, subplot (i)", fontsize=20, y=1.08, x=-0.2)
sns.pairplot(data=pd.DataFrame({"c": c, "d":d}))
plt.title("Data of Fig. 1, subplot(ii)", fontsize=20, y=1.08, x=-0.2)
# NOTE(review): sns.pairplot builds its own figure, so this plt.figure()
# presumably just leaves an extra empty figure -- confirm.
plt.figure()
sns.pairplot(data=pd.DataFrame({"e":e, "f": f, "g":g}))
plt.title("Data of Fig. 1, subplot (iii)", fontsize=20, y=2.28, x=-0.8)
sns.pairplot(data=pd.DataFrame({"h":h, "i": i, "j":j}))
plt.title("Data of Fig. 1, subplot (iv)", fontsize=20, y=2.28, x=-0.8)
self.figure.clf()
legend_labels = self.df[column_name].unique()
legend_title = column_name
df = prepare_data(self.df[self.available_columns])
if str(self.df[column_name].dtype) == "object":
# Update hue column for categorical data
column_name += "_code"
pub.sendMessage("LOG_MESSAGE", log_message="\nReady to plot...")
try:
# Produce pairplot using seaborn
pair_plot = sns.pairplot(
df,
hue=column_name,
palette="deep",
size=1.2,
diag_kind="kde",
diag_kws=dict(shade=True),
plot_kws=dict(s=10),
)
# Get the number of rows and columns from the seaborn pairplot
pp_rows = len(pair_plot.axes)
pp_cols = len(pair_plot.axes[0])
# Update axes to the corresponding number of subplots from pairplot
self.axes = self.figure.subplots(pp_rows, pp_cols)
df_labels = pd.DataFrame(labels, columns=[legend_name])
df = pd.concat([df_labels, df], axis=1)
names, counts = np.unique(labels, return_counts=True)
if counts.min() < 2:
diag_kind = "hist"
plot_kws = dict(
alpha=alpha,
s=size,
# edgecolor=None, # could add this later
linewidth=0,
marker=marker,
)
with sns.plotting_context(context=context, font_scale=font_scale):
if labels is not None:
pairs = sns.pairplot(
df,
hue=legend_name,
vars=variables,
height=height,
palette=palette,
diag_kind=diag_kind,
plot_kws=plot_kws,
)
else:
pairs = sns.pairplot(
df,
vars=variables,
height=height,
palette=palette,
diag_kind=diag_kind,
plot_kws=plot_kws,