Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def fit(self, X, y=None):
    """Fit the detector to the training samples.

    The labels ``y`` are accepted for API compatibility but are only
    forwarded to ``_set_n_classes``; fitting itself is unsupervised.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.
    y : numpy array of shape (n_samples,), optional (default=None)
        The ground truth of the input samples (labels).
    """
    # Validate the input matrix and record the label cardinality.
    X = check_array(X)
    self._set_n_classes(y)

    # Forward this wrapper's configuration to the underlying PyOD LSCP
    # implementation and fit it on the validated data.
    lscp_params = dict(
        detector_list=self.base_estimators,
        local_region_size=self.local_region_size,
        local_max_features=self.local_max_features,
        n_bins=self.n_bins,
        random_state=self.random_state,
        contamination=self.contamination,
    )
    self.detector_ = PyOD_LSCP(**lscp_params)
    self.detector_.fit(X)

    # Expose the fitted scores and derive threshold_/labels_ from them.
    self.decision_scores_ = self.detector_.decision_scores_
    self._process_decision_scores()
    return self
# Fragment of a heterogeneous detector pool (the opening
# "base_estimators = [" lies above this chunk).
# LOF probed at several neighborhood sizes:
LOF(n_neighbors=5, contamination=contamination),
LOF(n_neighbors=15, contamination=contamination),
LOF(n_neighbors=25, contamination=contamination),
LOF(n_neighbors=35, contamination=contamination),
LOF(n_neighbors=45, contamination=contamination),
# Additional detector families, default settings:
HBOS(contamination=contamination),
PCA(contamination=contamination),
OCSVM(contamination=contamination),
# KNN probed at the same neighborhood sizes as LOF above:
KNN(n_neighbors=5, contamination=contamination),
KNN(n_neighbors=15, contamination=contamination),
KNN(n_neighbors=25, contamination=contamination),
KNN(n_neighbors=35, contamination=contamination),
KNN(n_neighbors=45, contamination=contamination),
# Isolation forests at two ensemble sizes:
IForest(n_estimators=50, contamination=contamination),
IForest(n_estimators=100, contamination=contamination),
# A nested LSCP combiner built from two LOF detectors:
LSCP(detector_list=[LOF(contamination=contamination),
LOF(contamination=contamination)])
]
# Wrap the estimator pool in SUOD with 6 parallel jobs; bps_flag and
# approx_flag_global are SUOD acceleration options — see the SUOD docs
# for their exact semantics.
model = SUOD(base_estimators=base_estimators, n_jobs=6, bps_flag=True,
contamination=contamination, approx_flag_global=True)
model.fit(X_train)  # fit all models with X
model.approximate(X_train)  # conduct model approximation if it is enabled
predicted_labels = model.predict(X_test)  # predict binary labels
predicted_scores = model.decision_function(X_test)  # predict raw scores
predicted_probs = model.predict_proba(X_test)  # predict outlier probabilities
###########################################################################
# Compare simple combination strategies against the ground truth.
evaluate_print('majority vote', y_test, majority_vote(predicted_labels))
evaluate_print('average', y_test, average(predicted_scores))
n_train = 200  # number of training points
n_test = 100  # number of testing points

# Generate synthetic sample data with a fixed seed for reproducibility.
# NOTE(review): `contamination` is defined above this chunk.
X_train, y_train, X_test, y_test = \
generate_data(n_train=n_train,
n_test=n_test,
contamination=contamination,
random_state=42)

# Standardize both splits (fit on train, apply to both).
X_train, X_test = standardizer(X_train, X_test)

# Train LSCP over a pool of LOF detectors with varying neighborhood sizes.
clf_name = 'LSCP'
detector_list = [LOF(n_neighbors=15), LOF(n_neighbors=20),
LOF(n_neighbors=25), LOF(n_neighbors=35)]
clf = LSCP(detector_list, random_state=42)
clf.fit(X_train)

# Get the prediction labels and outlier scores of the training data.
y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
y_train_scores = clf.decision_scores_  # raw outlier scores

# Get the prediction on the test data.
y_test_pred = clf.predict(X_test)  # outlier labels (0 or 1)
y_test_scores = clf.decision_function(X_test)  # outlier scores

# Evaluate and print the results (e.g. ROC / precision@n).
print("\nOn Training Data:")
evaluate_print(clf_name, y_train, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, y_test, y_test_scores)
# Fragment of a heterogeneous detector pool (the opening
# "base_estimators = [" lies above this chunk).
# LOF probed at several neighborhood sizes:
LOF(n_neighbors=5, contamination=contamination),
LOF(n_neighbors=15, contamination=contamination),
LOF(n_neighbors=25, contamination=contamination),
LOF(n_neighbors=35, contamination=contamination),
LOF(n_neighbors=45, contamination=contamination),
# Additional detector families, default settings:
HBOS(contamination=contamination),
PCA(contamination=contamination),
OCSVM(contamination=contamination),
# KNN probed at the same neighborhood sizes as LOF above:
KNN(n_neighbors=5, contamination=contamination),
KNN(n_neighbors=15, contamination=contamination),
KNN(n_neighbors=25, contamination=contamination),
KNN(n_neighbors=35, contamination=contamination),
KNN(n_neighbors=45, contamination=contamination),
# Isolation forests at two ensemble sizes:
IForest(n_estimators=50, contamination=contamination),
IForest(n_estimators=100, contamination=contamination),
# A nested LSCP combiner built from two LOF detectors:
LSCP(detector_list=[LOF(contamination=contamination),
LOF(contamination=contamination)])
]
# number of the parallel jobs
n_jobs = 6
n_estimators = len(base_estimators)
# the algorithms that should be using random projection
rp_clf_list = ['LOF', 'KNN', 'ABOD']
# the algorithms that should NOT use random projection
rp_ng_clf_list = ['IForest', 'PCA', 'HBOS']
# global flag for random projection
rp_flag_global = True
# objective_dim / rp_method configure the random projection — presumably
# the target dimensionality and the projection scheme; confirm against
# the SUOD documentation.
objective_dim = 6
rp_method = 'discrete'
# Fragment of a name -> detector mapping (the opening "classifiers = {"
# lies above this chunk). Commented-out entries are detectors that this
# example deliberately leaves disabled.
'Average KNN': KNN(method='mean',
contamination=outliers_fraction),
# 'Median KNN': KNN(method='median',
#                   contamination=outliers_fraction),
'Local Outlier Factor (LOF)':
LOF(n_neighbors=35, contamination=outliers_fraction),
# 'Local Correlation Integral (LOCI)':
#     LOCI(contamination=outliers_fraction),
'Minimum Covariance Determinant (MCD)': MCD(
contamination=outliers_fraction, random_state=random_state),
'One-class SVM (OCSVM)': OCSVM(contamination=outliers_fraction),
'Principal Component Analysis (PCA)': PCA(
contamination=outliers_fraction, random_state=random_state),
# 'Stochastic Outlier Selection (SOS)': SOS(
#     contamination=outliers_fraction),
'Locally Selective Combination (LSCP)': LSCP(
detector_list, contamination=outliers_fraction,
random_state=random_state),
# 'Connectivity-Based Outlier Factor (COF)':
#     COF(n_neighbors=35, contamination=outliers_fraction),
# 'Subspace Outlier Detection (SOD)':
#     SOD(contamination=outliers_fraction),
}
# Show all detectors. Iterating a dict yields its keys directly, so the
# redundant .keys() call is dropped (behavior is identical).
for i, clf in enumerate(classifiers):
    print('Model', i + 1, clf)

# Fit the models with the generated data and compare model performances,
# re-seeding for reproducibility on each cluster-separation setting.
# NOTE(review): the loop body continues beyond this chunk.
for i, offset in enumerate(clusters_separation):
    np.random.seed(42)
def __init__(self, detector_list, local_region_size=30,
             local_max_features=1.0, n_bins=10,
             random_state=None, contamination=0.1):
    """Initialize the LSCP wrapper.

    Parameters
    ----------
    detector_list : list
        The base detectors to be combined.
    local_region_size : int, optional (default=30)
        Size of the local region used around each point.
    local_max_features : float, optional (default=1.0)
        Upper bound on the fraction of features sampled locally.
    n_bins : int, optional (default=10)
        Number of bins used internally.
    random_state : int, RandomState instance or None, optional (default=None)
        Seed / generator controlling randomness.
    contamination : float, optional (default=0.1)
        Expected proportion of outliers; forwarded to the base class.
    """
    super(LSCP, self).__init__(contamination=contamination)

    # User-supplied configuration.
    self.detector_list = detector_list
    self.local_region_size = local_region_size
    self.local_max_features = local_max_features
    self.n_bins = n_bins
    self.random_state = random_state

    # Derived and fixed internal settings.
    self.n_clf = len(self.detector_list)
    self.local_region_min = 30
    self.local_region_max = 200
    self.local_min_features = 0.5
    self.local_region_iterations = 20
    # Half of the local-region iterations, rounded down.
    self.local_region_threshold = self.local_region_iterations // 2
    self.n_selected = 1
def make_mlo(hub, data, train):
    """Create the machine-learning object (an LSCP detector) for this sequence.

    NOTE(review): pyod's ``LSCP.__init__`` takes ``detector_list`` as a
    required first positional argument, so ``LSCP(contamination=0.001)``
    with no detector list likely raises a TypeError — confirm against the
    ``LSCP`` class actually imported in this file.
    """
    # hub, data and train are accepted by the framework's signature but
    # are not used when constructing the detector here.
    return LSCP(contamination=0.001)
# Fragment of a detector pool (the opening "base_estimators = [" and some
# leading entries lie above this chunk).
KNN(n_neighbors=15, contamination=contamination),
KNN(n_neighbors=25, contamination=contamination),
KNN(n_neighbors=35, contamination=contamination),
KNN(n_neighbors=45, contamination=contamination),
# NOTE(review): the IForest and LSCP entries below are repeated several
# times — presumably to enlarge the pool for this benchmark; confirm the
# duplication is intentional.
IForest(n_estimators=50, contamination=contamination),
IForest(n_estimators=100, contamination=contamination),
IForest(n_estimators=50, contamination=contamination),
IForest(n_estimators=100, contamination=contamination),
IForest(n_estimators=50, contamination=contamination),
IForest(n_estimators=100, contamination=contamination),
IForest(n_estimators=50, contamination=contamination),
IForest(n_estimators=100, contamination=contamination),
LSCP(detector_list=[LOF(contamination=contamination),
LOF(contamination=contamination)]),
LSCP(detector_list=[LOF(contamination=contamination),
LOF(contamination=contamination)]),
LSCP(detector_list=[LOF(contamination=contamination),
LOF(contamination=contamination)]),
LSCP(detector_list=[LOF(contamination=contamination),
LOF(contamination=contamination)]),
LSCP(detector_list=[LOF(contamination=contamination),
LOF(contamination=contamination)]),
]
# Combine the pool with SUOD: 6 parallel jobs; rp_flag_global, bps_flag
# and approx_flag_global are SUOD acceleration options — see the SUOD
# docs for their exact semantics. (A commented-out variant with
# bps_flag=False was removed as dead code.)
model = SUOD(base_estimators=base_estimators, rp_flag_global=True,
n_jobs=6, bps_flag=True, contamination=contamination,
approx_flag_global=True)