Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_get_online_dataset_format(self):
    # Compare the lower-cased ``format`` attribute of the lazily fetched
    # dataset with the value reported by ``_get_online_dataset_format``.
    # Phoneme dataset
    phoneme_id = 77
    phoneme = openml.datasets.get_dataset(phoneme_id, download_data=False)

    local_format = phoneme.format.lower()
    online_format = _get_online_dataset_format(phoneme_id)

    self.assertEqual(
        local_format,
        online_format,
        "The format of the ARFF files is different",
    )
def test_get_dataset_lazy(self):
    # Fetch dataset 1 with download_data=False and check that the
    # metadata-only retrieval check passes before any data is requested.
    anneal = openml.datasets.get_dataset(1, download_data=False)
    self.assertEqual(type(anneal), OpenMLDataset)
    self.assertEqual(anneal.name, 'anneal')
    self._datasets_retrieved_successfully([1], metadata_only=True)

    # Features and qualities must already be populated on the lazy object.
    self.assertGreater(len(anneal.features), 1)
    self.assertGreater(len(anneal.qualities), 4)

    # After an explicit get_data() call the full (non-metadata-only)
    # retrieval check is expected to pass as well.
    anneal.get_data()
    self._datasets_retrieved_successfully([1], metadata_only=False)

    # Issue324 Properly handle private datasets when trying to access them
    openml.config.server = self.production_server
    with self.assertRaises(OpenMLPrivateDatasetError):
        openml.datasets.get_dataset(45, False)
def test_get_dataset_by_name(self):
    # Look the dataset up by its name instead of its numeric id and
    # verify that the name resolves to dataset id 1.
    anneal = openml.datasets.get_dataset('anneal')
    self.assertEqual(type(anneal), OpenMLDataset)
    self.assertEqual(anneal.dataset_id, 1)
    self._datasets_retrieved_successfully([1], metadata_only=False)

    self.assertGreater(len(anneal.features), 1)
    self.assertGreater(len(anneal.qualities), 4)

    # Issue324 Properly handle private datasets when trying to access them
    openml.config.server = self.production_server
    with self.assertRaises(OpenMLPrivateDatasetError):
        openml.datasets.get_dataset(45)
def setUp(self):
    # Point the client at the production server and pre-fetch (without
    # downloading the data files) the datasets stored on the test instance.
    super(OpenMLDatasetTest, self).setUp()
    openml.config.server = self.production_server

    def fetch_lazily(dataset_id):
        # Every fixture below is requested with download_data=False.
        return openml.datasets.get_dataset(dataset_id, download_data=False)

    # Load dataset id 2 - dataset 2 is interesting because it contains
    # missing values, categorical features etc.
    self.dataset = fetch_lazily(2)
    # titanic has missing values, categories, and string
    self.titanic = fetch_lazily(40945)
    # these datasets have some boolean features
    self.pc4 = fetch_lazily(1049)
    self.jm1 = fetch_lazily(1053)
    self.iris = fetch_lazily(61)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Train a scikit-learn model on the data manually.
# Fetch dataset 68 and split it into features and the default target column.
dataset = openml.datasets.get_dataset(68)
X, y = dataset.get_data(target=dataset.default_target_attribute)

# Fit a 1-nearest-neighbour classifier on the full data.
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(X, y)
############################################################################
# You can also ask for meta-data to automatically preprocess the data.
#
# * e.g. categorical features -> do feature encoding
# Fetch dataset 17 together with the per-feature categorical indicator.
dataset = openml.datasets.get_dataset(17)
X, y, categorical = dataset.get_data(
    return_categorical_indicator=True,
    target=dataset.default_target_attribute,
)
print("Categorical features: %s" % categorical)

# One-hot encode the features flagged as categorical, then refit ``clf``.
# NOTE(review): the ``categorical_features`` argument was deprecated in
# scikit-learn 0.20 and removed in 0.22 -- confirm the scikit-learn version
# this example targets before upgrading.
enc = preprocessing.OneHotEncoder(categorical_features=categorical)
X = enc.fit_transform(X)
clf.fit(X, y)
############################################################################
# Runs: Easily explore models
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
# We can run (many) scikit-learn algorithms on (many) OpenML tasks.
# Get a task: fetch OpenML task 403 and bind it to ``task``.
task = openml.tasks.get_task(403)
# ^^^^^^^^^
#
# Try to build the best possible models on several OpenML tasks,
# compare your results with the rest of the class and learn from
# them. Some tasks you could try (or browse openml.org):
#
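# * EEG eye state: data_id:`1471 <https://www.openml.org/d/1471>`_, task_id:`14951 <https://www.openml.org/t/14951>`_
# * Volcanoes on Venus: data_id:`1527 <https://www.openml.org/d/1527>`_, task_id:`10103 <https://www.openml.org/t/10103>`_
# * Walking activity: data_id:`1509 <https://www.openml.org/d/1509>`_, task_id:`9945 <https://www.openml.org/t/9945>`_, 150k instances.
# * Covertype (Satellite): data_id:`150 <https://www.openml.org/d/150>`_, task_id:`218 <https://www.openml.org/t/218>`_, 500k instances.
# * Higgs (Physics): data_id:`23512 <https://www.openml.org/d/23512>`_, task_id:`52950 <https://www.openml.org/t/52950>`_, 100k instances, missing values.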
# Easy benchmarking:
# Add further tasks. Disclaimer: they might take some time
for task_id in [115, ]:
    # Resolve the task and the dataset it is defined on.
    task = openml.tasks.get_task(task_id)
    data = openml.datasets.get_dataset(task.dataset_id)

    # Wrap a 5-nearest-neighbour classifier in a flow and run it on the task.
    clf = neighbors.KNeighborsClassifier(n_neighbors=5)
    flow = openml.flows.sklearn_to_flow(clf)
    run = openml.runs.run_flow_on_task(flow, task, avoid_duplicate_runs=False)

    # Publish the run and print a link built from the returned run id.
    myrun = run.publish()
    print("kNN on %s: http://test.openml.org/r/%d" % (data.name, myrun.run_id))
"""
Get the numbers of data points and features of the datasets.
Args:
default_error_matrix (pandas.core.frame.DataFrame): The default error matrix DataFrame.
Returns:
dataset_sizes (np.ndarray): The dataset sizes; each row is [dataset_index, number_of_data_points, number_of_features].
"""
openml_datasets = openml.datasets.list_datasets()
openml_datasets = pd.DataFrame.from_dict(openml_datasets, orient='index')
dataset_sizes = openml_datasets[['NumberOfInstances', 'NumberOfFeatures']]
dataset_sizes = np.concatenate((np.array([dataset_sizes.index]).T, dataset_sizes.values), axis=1)
indices = default_error_matrix.index.tolist()
for i in set(indices).difference(set(dataset_sizes[:, 0])):
dataset=openml.datasets.get_dataset(i)
data_numeric, data_labels, categorical = dataset.get_data(target=dataset.default_target_attribute,return_categorical_indicator=True)
dataset_sizes = np.concatenate((dataset_sizes, np.array([[i, data_numeric.shape[0], data_numeric.shape[1]]])))
return dataset_sizes
def load_task(task_id):
    """Load an OpenML task as train/test arrays with integer-encoded labels.

    Args:
        task_id: Identifier passed to ``openml.tasks.get_task``.

    Returns:
        A 5-tuple ``(X_train, y_train, X_test, y_test, cat)`` where the
        ``X``/``y`` parts are the task data indexed by the split returned from
        ``task.get_train_test_split_indices()``, the labels are replaced by
        their position in ``np.unique(y_train)``, and ``cat`` is a list with
        ``'categorical'`` or ``'numerical'`` for each entry of the dataset's
        categorical indicator.

    Raises:
        KeyError: If a test label does not occur among the training labels.
    """
    task = openml.tasks.get_task(task_id)
    features, labels = task.get_X_and_y()
    train_idx, test_idx = task.get_train_test_split_indices()

    X_train, X_test = features[train_idx], features[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    # Only the categorical indicator is needed from the dataset; drop the
    # remaining references right away so they do not linger in memory.
    dataset = openml.datasets.get_dataset(task.dataset_id)
    _data, _target, is_categorical = dataset.get_data(
        return_categorical_indicator=True,
        target=task.target_name,
    )
    del _data, _target, dataset

    feature_types = [
        'categorical' if flag else 'numerical' for flag in is_categorical
    ]

    # Encode labels by their position among the unique training labels.
    classes = np.unique(y_train)
    label_to_index = dict(zip(classes, range(len(classes))))

    def encode(values):
        # Plain dict lookup: a label unseen in training raises KeyError.
        return np.array([label_to_index[value] for value in values])

    return X_train, encode(y_train), X_test, encode(y_test), feature_types