Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
"""
Plot an example ROC graph of an SVM model predictions over the Iris dataset.
Based on sklearn examples (as seen in April 2018):
http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
"""
# Load the Iris data and binarize the three class labels (0, 1, 2) so that
# y holds one indicator column per class.
iris = datasets.load_iris()
X = iris.data
y = label_binarize(iris.target, classes=[0, 1, 2])
# Append 200 * n_features columns of seeded random noise to the features,
# which makes the classification problem harder.
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
# Hold out half of the samples for testing; the split itself uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)
# Fit a one-vs-rest wrapper around a linear-kernel SVC and score the test set
# with the fitted model's decision function.
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)
# Plot the ROC graph of the true test labels against the computed scores.
roc_graph(y_test, y_score)
# Add noisy features to make the problem harder: append 200 * n_features
# columns drawn from a seeded RandomState to X.
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
# Shuffle and split into training and test sets (half of the samples each)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
random_state=0)
# Learn to predict each class against the others (one-vs-rest, linear SVC)
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True,
random_state=random_state))
# Score the test samples with the fitted model's decision function
y_score = classifier.fit(X_train, y_train).decision_function(X_test)
# Plot the ROC graph of the true test labels against those scores
roc_graph(y_test,y_score)
categories : list / NumPy ndarray / Pandas Series
A sequence of categorical measurements
measurements : list / NumPy ndarray / Pandas Series
A sequence of continuous measurements
nan_strategy : string, default = 'replace'
How to handle missing values: can be either 'drop' to remove samples with missing values, or 'replace'
to replace all missing values with the nan_replace_value. Missing values are None and np.nan.
nan_replace_value : any, default = 0.0
The value with which to replace missing values. Only applicable when nan_strategy is set to 'replace'.
"""
if nan_strategy == REPLACE:
categories, measurements = replace_nan_with_value(categories, measurements, nan_replace_value)
elif nan_strategy == DROP:
categories, measurements = remove_incomplete_samples(categories, measurements)
categories = convert(categories, 'array')
measurements = convert(measurements, 'array')
fcat, _ = pd.factorize(categories)
cat_num = np.max(fcat)+1
y_avg_array = np.zeros(cat_num)
n_array = np.zeros(cat_num)
for i in range(0,cat_num):
cat_measures = measurements[np.argwhere(fcat == i).flatten()]
n_array[i] = len(cat_measures)
y_avg_array[i] = np.average(cat_measures)
y_total_avg = np.sum(np.multiply(y_avg_array,n_array))/np.sum(n_array)
numerator = np.sum(np.multiply(n_array,np.power(np.subtract(y_avg_array,y_total_avg),2)))
denominator = np.sum(np.power(np.subtract(measurements,y_total_avg),2))
if numerator == 0:
eta = 0.0
else:
eta = np.sqrt(numerator/denominator)
return eta
----------
y_true : list / NumPy ndarray
The true classes of the predicted data
y_pred : list / NumPy ndarray
The predicted classes
micro : Boolean, default = True
Whether to calculate a Micro ROC graph (not applicable for binary cases)
macro : Boolean, default = True
Whether to calculate a Macro ROC graph (not applicable for binary cases)
kwargs : any key-value pairs
Different options and configurations
"""
all_fpr = list()
all_tpr = list()
y_true = convert(y_true, 'array')
y_pred = convert(y_pred, 'array')
if y_pred.shape != y_true.shape:
raise ValueError('y_true and y_pred must have the same shape')
elif len(y_pred.shape) == 1 or y_pred.shape[1] <= 2:
return binary_roc_graph(y_true, y_pred, **kwargs)
else:
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
n = y_pred.shape[1]
plt.figure()
kwargs['new_figure'] = False
kwargs['show_graphs'] = False
kwargs['return_pr'] = True
for i in range(0,n):
pr = binary_roc_graph(y_true[:,i], y_pred[:,i],
color=colors[i % len(colors)],class_label=i, **kwargs)
all_fpr.append(pr['fpr'])
all_tpr.append(pr['tpr'])
0.8. In the first configuration, the input will be: y_true = [0,1], y_pred = [0.6,0.8]. In the second
configuration, the input will be: y_true = [[1,0],[0,1]], y_pred = [[0.6,0.4],[0.2,0.8]].
Based on sklearn examples (as seen in April 2018):
http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
Parameters
----------
y_true : list / NumPy ndarray
The true classes of the predicted data
y_pred : list / NumPy ndarray
The predicted classes
kwargs : any key-value pairs
Different options and configurations
"""
y_true = convert(y_true, 'array')
y_pred = convert(y_pred, 'array')
if y_pred.shape != y_true.shape:
raise ValueError('y_true and y_pred must have the same shape')
elif len(y_pred.shape) == 1:
y_t = y_true
y_p = y_pred
else:
y_t = [np.argmax(x) for x in y_true]
y_p = [x[1] for x in y_pred]
fpr, tpr, _ = roc_curve(y_t, y_p)
auc_score = auc(fpr,tpr)
color = kwargs.get('color','darkorange')
lw = kwargs.get('lw', 2)
ls = kwargs.get('ls','-')
ms = kwargs.get('ms', 10)
fmt = kwargs.get('fmt','.2f')
plot : Boolean, default = True
If True, plot a heat-map of the correlation matrix
return_results : Boolean, default = False
If True, the function will return a Pandas DataFrame of the computed associations
nan_strategy : string, default = 'replace'
How to handle missing values: can be either 'drop_samples' to remove samples with missing values,
'drop_features' to remove features (columns) with missing values, or 'replace' to replace all missing
values with the nan_replace_value. Missing values are None and np.nan.
nan_replace_value : any, default = 0.0
The value with which to replace missing values. Only applicable when nan_strategy is set to 'replace'.
ax : matplotlib ax, default = None
Matplotlib Axis on which the heat-map will be plotted
kwargs : any key-value pairs
Arguments to be passed to the functions and methods used
"""
dataset = convert(dataset, 'dataframe')
if nan_strategy == REPLACE:
dataset.fillna(nan_replace_value, inplace=True)
elif nan_strategy == DROP_SAMPLES:
dataset.dropna(axis=0, inplace=True)
elif nan_strategy == DROP_FEATURES:
dataset.dropna(axis=1, inplace=True)
columns = dataset.columns
if nominal_columns is None:
nominal_columns = list()
elif nominal_columns == 'all':
nominal_columns = columns
corr = pd.DataFrame(index=columns, columns=columns)
for i in range(0, len(columns)):
for j in range(i, len(columns)):
if i == j:
corr[columns[i]][columns[j]] = 1.0
configuration, the input will be: y_true = [[1,0],[0,1]], y_pred = [[0.6,0.4],[0.2,0.8]].
Based on sklearn examples (as seen in April 2018):
http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
Parameters
----------
y_true : list / NumPy ndarray
The true classes of the predicted data
y_pred : list / NumPy ndarray
The predicted classes
kwargs : any key-value pairs
Different options and configurations
"""
y_true = convert(y_true, 'array')
y_pred = convert(y_pred, 'array')
if y_pred.shape != y_true.shape:
raise ValueError('y_true and y_pred must have the same shape')
elif len(y_pred.shape) == 1:
y_t = y_true
y_p = y_pred
else:
y_t = [np.argmax(x) for x in y_true]
y_p = [x[1] for x in y_pred]
fpr, tpr, _ = roc_curve(y_t, y_p)
auc_score = auc(fpr,tpr)
color = kwargs.get('color','darkorange')
lw = kwargs.get('lw', 2)
ls = kwargs.get('ls','-')
ms = kwargs.get('ms', 10)
fmt = kwargs.get('fmt','.2f')
if 'class_label' in kwargs:
----------
categories : list / NumPy ndarray / Pandas Series
A sequence of categorical measurements
measurements : list / NumPy ndarray / Pandas Series
A sequence of continuous measurements
nan_strategy : string, default = 'replace'
How to handle missing values: can be either 'drop' to remove samples with missing values, or 'replace'
to replace all missing values with the nan_replace_value. Missing values are None and np.nan.
nan_replace_value : any, default = 0.0
The value with which to replace missing values. Only applicable when nan_strategy is set to 'replace'.
"""
if nan_strategy == REPLACE:
categories, measurements = replace_nan_with_value(categories, measurements, nan_replace_value)
elif nan_strategy == DROP:
categories, measurements = remove_incomplete_samples(categories, measurements)
categories = convert(categories, 'array')
measurements = convert(measurements, 'array')
fcat, _ = pd.factorize(categories)
cat_num = np.max(fcat)+1
y_avg_array = np.zeros(cat_num)
n_array = np.zeros(cat_num)
for i in range(0,cat_num):
cat_measures = measurements[np.argwhere(fcat == i).flatten()]
n_array[i] = len(cat_measures)
y_avg_array[i] = np.average(cat_measures)
y_total_avg = np.sum(np.multiply(y_avg_array,n_array))/np.sum(n_array)
numerator = np.sum(np.multiply(n_array,np.power(np.subtract(y_avg_array,y_total_avg),2)))
denominator = np.sum(np.power(np.subtract(measurements,y_total_avg),2))
if numerator == 0:
eta = 0.0
else:
eta = np.sqrt(numerator/denominator)
Parameters
----------
y_true : list / NumPy ndarray
The true classes of the predicted data
y_pred : list / NumPy ndarray
The predicted classes
micro : Boolean, default = True
Whether to calculate a Micro ROC graph (not applicable for binary cases)
macro : Boolean, default = True
Whether to calculate a Macro ROC graph (not applicable for binary cases)
kwargs : any key-value pairs
Different options and configurations
"""
all_fpr = list()
all_tpr = list()
y_true = convert(y_true, 'array')
y_pred = convert(y_pred, 'array')
if y_pred.shape != y_true.shape:
raise ValueError('y_true and y_pred must have the same shape')
elif len(y_pred.shape) == 1 or y_pred.shape[1] <= 2:
return binary_roc_graph(y_true, y_pred, **kwargs)
else:
colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
n = y_pred.shape[1]
plt.figure()
kwargs['new_figure'] = False
kwargs['show_graphs'] = False
kwargs['return_pr'] = True
for i in range(0,n):
pr = binary_roc_graph(y_true[:,i], y_pred[:,i],
color=colors[i % len(colors)],class_label=i, **kwargs)
all_fpr.append(pr['fpr'])
Parameters
----------
x : list / NumPy ndarray / Pandas Series
A sequence of categorical measurements
y : list / NumPy ndarray / Pandas Series
A sequence of categorical measurements
nan_strategy : string, default = 'replace'
How to handle missing values: can be either 'drop' to remove samples with missing values, or 'replace'
to replace all missing values with the nan_replace_value. Missing values are None and np.nan.
nan_replace_value : any, default = 0.0
The value with which to replace missing values. Only applicable when nan_strategy is set to 'replace'.
"""
if nan_strategy == REPLACE:
x, y = replace_nan_with_value(x, y, nan_replace_value)
elif nan_strategy == DROP:
x, y = remove_incomplete_samples(x, y)
s_xy = conditional_entropy(x,y)
x_counter = Counter(x)
total_occurrences = sum(x_counter.values())
p_x = list(map(lambda n: n/total_occurrences, x_counter.values()))
s_x = ss.entropy(p_x)
if s_x == 0:
return 1
else:
return (s_x - s_xy) / s_x