df['play'] = df['play'].astype('category')
# meta-information
name = '%s-pandas_testing_dataset' % self._get_sentinel()
description = 'Synthetic dataset created from a Pandas DataFrame'
creator = 'OpenML tester'
collection_date = '01-01-2018'
language = 'English'
licence = 'MIT'
default_target_attribute = 'play'
citation = 'None'
original_data_url = 'http://openml.github.io/openml-python'
paper_url = 'http://openml.github.io/openml-python'
# pass a list to ignore_attribute
ignore_attribute = ['outlook', 'windy']
dataset = openml.datasets.functions.create_dataset(
    name=name,
    description=description,
    creator=creator,
    contributor=None,
    collection_date=collection_date,
    language=language,
    licence=licence,
    default_target_attribute=default_target_attribute,
    row_id_attribute=None,
    ignore_attribute=ignore_attribute,
    citation=citation,
    attributes='auto',
    data=df,
    version_label='test',
    original_data_url=original_data_url,
    paper_url=paper_url
)
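############################################################################
# The created dataset can then be uploaded. This is a hedged sketch: it
# assumes the test-server configuration used elsewhere in these examples and
# a configured API key; after publishing, the new ID is available on the
# dataset object.
dataset.publish()
print('Dataset was published with ID %s.' % dataset.dataset_id)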
openml.config.start_using_configuration_for_example()
# NOTE: We are using dataset 68 from the test server: https://test.openml.org/d/68
dataset = openml.datasets.get_dataset(68)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format='array',
    target=dataset.default_target_attribute
)
clf = neighbors.KNeighborsClassifier(n_neighbors=1)
clf.fit(X, y)
############################################################################
# You can also ask for meta-data to automatically preprocess the data.
#
# * e.g. categorical features -> do feature encoding
dataset = openml.datasets.get_dataset(17)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format='array',
    target=dataset.default_target_attribute
)
print(f"Categorical features: {categorical_indicator}")
transformer = compose.ColumnTransformer(
    [('one_hot_encoder', preprocessing.OneHotEncoder(categories='auto'), categorical_indicator)])
X = transformer.fit_transform(X)
clf.fit(X, y)
############################################################################
# Runs: Easily explore models
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
# We can run (many) scikit-learn algorithms on (many) OpenML tasks.
# Get a task
import openml
from sklearn import ensemble, neighbors
############################################################################
# Train a machine learning model
# ==============================
#
# .. warning:: This example uploads data. For that reason, it connects to the
#    test server at test.openml.org, which keeps the main server from being
#    crowded with example datasets, tasks, runs, and so on.
openml.config.start_using_configuration_for_example()
# NOTE: We are using dataset 20 from the test server: https://test.openml.org/d/20
dataset = openml.datasets.get_dataset(20)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format='array',
    target=dataset.default_target_attribute
)
clf = neighbors.KNeighborsClassifier(n_neighbors=3)
clf.fit(X, y)
############################################################################
# Running a model on a task
# =========================
task = openml.tasks.get_task(119)
clf = ensemble.RandomForestClassifier()
run = openml.runs.run_model_on_task(clf, task)
print(run)
# Regression datasets have a NumberOfClasses value of -1; classification
# datasets specify the number of classes.
if all_datasets[id]['NumberOfClasses'] == -1:
    print(f"Skipping dataset {id}, {all_datasets[id]['name']}. This is a regression dataset.")
    continue
if all_datasets[id]['NumberOfMissingValues'] > 0:
    print(f"Skipping dataset {id}, {all_datasets[id]['name']} due to missing values.")
    continue
if all_datasets[id]['NumberOfInstances'] > NUMBER_OF_INSTANCES_CUTOFF_NUMBER:
    print(f"Skipping dataset {id}, {all_datasets[id]['name']}. It has more than {NUMBER_OF_INSTANCES_CUTOFF_NUMBER} instances.")
    continue
print(f"Trying to download dataset {id}, {all_datasets[id]['name']}")
try:
    dataset = openml.datasets.get_dataset(id)
    X, names = dataset.get_data(return_attribute_names=True)
    metadata = {
        'class_name': dataset.default_target_attribute,
        'source': 'OpenML',
        'dataset_name': dataset.name,
        'dataset_id': id
    }
    class_name_index = names.index(metadata['class_name'])
    # Normalize the data
    # scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
    # scaler.fit(X)
    # x_transformed = scaler.transform(X)
    # x_transformed[:, class_name_index] = X[:, class_name_index]
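############################################################################
# The filtering above is a fragment of a larger download loop. Below is a
# minimal, self-contained sketch of how such a loop could look; the cutoff
# value and the variable names here are illustrative assumptions, not part of
# the original snippet.
import openml

NUMBER_OF_INSTANCES_CUTOFF_NUMBER = 10000  # illustrative cutoff, not from the snippet
all_datasets = openml.datasets.list_datasets()  # dict of meta-data keyed by dataset id
for did, meta in all_datasets.items():
    if meta.get('NumberOfClasses', -1) == -1:
        continue  # regression dataset
    if meta.get('NumberOfMissingValues', 0) > 0:
        continue  # skip datasets with missing values
    if meta.get('NumberOfInstances', 0) > NUMBER_OF_INSTANCES_CUTOFF_NUMBER:
        continue  # skip datasets that are too large
    dataset = openml.datasets.get_dataset(did)
    break  # stop after the first dataset that passes all filters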
import numpy as np
import openml


def load_task(task_id):
    """Download an OpenML task and return its train/test split and feature types."""
    task = openml.tasks.get_task(task_id)
    X, y = task.get_X_and_y()
    train_indices, test_indices = task.get_train_test_split_indices()
    X_train = X[train_indices]
    y_train = y[train_indices]
    X_test = X[test_indices]
    y_test = y[test_indices]
    dataset = openml.datasets.get_dataset(task.dataset_id)
    _, _, cat, _ = dataset.get_data(target=task.target_name)
    del _
    del dataset
    cat = ['categorical' if c else 'numerical' for c in cat]
    # Map the class labels to consecutive integers starting at 0.
    unique = np.unique(y_train)
    mapping = {unique_value: i for i, unique_value in enumerate(unique)}
    y_train = np.array([mapping[value] for value in y_train])
    y_test = np.array([mapping[value] for value in y_test])
    return X_train, y_train, X_test, y_test, cat
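############################################################################
# A small, hedged usage sketch of the helper above. Task 403 is the
# test-server task used elsewhere in these examples, so this assumes the
# example (test-server) configuration is active.
X_train, y_train, X_test, y_test, cat = load_task(403)
print(X_train.shape, X_test.shape, cat[:5])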
#
# * Add the line **cachedir = 'MYDIR'** to the config file, replacing
# 'MYDIR' with the path to the cache directory. By default, OpenML
# will use **~/.openml/cache** as the cache directory.
# * Run the code below, replacing 'YOURDIR' with the path to the cache directory.
# Uncomment and set your OpenML cache directory
# import os
# openml.config.cache_directory = os.path.expanduser('YOURDIR')
############################################################################
# Simple Example
# ^^^^^^^^^^^^^^
# Download the OpenML task for the eeg-eye-state dataset.
task = openml.tasks.get_task(403)
data = openml.datasets.get_dataset(task.dataset_id)
clf = neighbors.KNeighborsClassifier(n_neighbors=5)
run = openml.runs.run_model_on_task(clf, task, avoid_duplicate_runs=False)
# Publish the experiment on OpenML (optional, requires an API key).
# For this tutorial, our configuration publishes to the test server
# so as not to crowd the main server with runs created by examples.
myrun = run.publish()
print("kNN on %s: http://test.openml.org/r/%d" % (data.name, myrun.run_id))
############################################################################
openml.config.stop_using_configuration_for_example()
def _get_rest_api_type_alias(oml_object: 'OpenMLBase') -> str:
""" Return the alias of the openml entity as it is defined for the REST API. """
rest_api_mapping = [
(openml.datasets.OpenMLDataset, 'data'),
(openml.flows.OpenMLFlow, 'flow'),
(openml.tasks.OpenMLTask, 'task'),
(openml.runs.OpenMLRun, 'run'),
((openml.study.OpenMLStudy, openml.study.OpenMLBenchmarkSuite), 'study')
] # type: List[Tuple[Union[Type, Tuple], str]]
_, api_type_alias = [(python_type, api_alias)
for (python_type, api_alias) in rest_api_mapping
if isinstance(oml_object, python_type)][0]
return api_type_alias
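############################################################################
# A hedged illustration of the helper above: an ``OpenMLDataset`` maps to the
# REST alias 'data'. Dataset 68 is the test-server dataset used earlier, so
# this assumes the example (test-server) configuration is active.
dataset = openml.datasets.get_dataset(68)
print(_get_rest_api_type_alias(dataset))  # prints 'data'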
def get_dataset(self) -> datasets.OpenMLDataset:
"""Download dataset associated with task"""
return datasets.get_dataset(self.dataset_id)
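############################################################################
# This method lives on OpenML task objects, so it is called on a downloaded
# task. A hedged sketch, again using test-server task 403 from the examples
# above:
task = openml.tasks.get_task(403)
data = task.get_dataset()
print(data.name)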
# ^^^^^^^
# When downloading datasets, tasks, runs and flows, they will be cached to
# retrieve them without calling the server later. As with the API key, the
# cache directory can be either specified through the config file or through
# the API:
#
# * Add the line **cachedir = 'MYDIR'** to the config file, replacing 'MYDIR'
#   with the path to the cache directory. By default, OpenML will use
#   **~/.openml/cache** as the cache directory.
# * Run the code below, replacing 'YOURDIR' with the path to the cache directory.
import os
# Uncomment and set your OpenML cache directory
# openml.config.cache_directory = os.path.expanduser('YOURDIR')
############################################################################
# Simple Example
# ^^^^^^^^^^^^^^
# Download the OpenML task for the eeg-eye-state dataset.
task = openml.tasks.get_task(403)
data = openml.datasets.get_dataset(task.dataset_id)
clf = neighbors.KNeighborsClassifier(n_neighbors=5)
flow = openml.flows.sklearn_to_flow(clf)
run = openml.runs.run_flow_on_task(flow, task, avoid_duplicate_runs=False)
# Publish the experiment on OpenML (optional, requires an API key).
# For this tutorial, our configuration publishes to the test server
# so as not to pollute the main server.
myrun = run.publish()
print("kNN on %s: http://test.openml.org/r/%d" % (data.name, myrun.run_id))
# * Find datasets with more than 10000 examples.
# * Find a dataset called 'eeg-eye-state'.
# * Find all datasets with more than 50 classes.
datalist[datalist.NumberOfInstances > 10000
         ].sort_values(['NumberOfInstances']).head(n=20)
############################################################################
datalist.query('name == "eeg-eye-state"')
############################################################################
datalist.query('NumberOfClasses > 50')
############################################################################
# Download datasets
# =================
# This is done based on the dataset ID ('did').
dataset = openml.datasets.get_dataset(68)
# NOTE: Dataset 68 exists on the test server https://test.openml.org/d/68
# Print a summary
print("This is dataset '%s', the target feature is '%s'" %
(dataset.name, dataset.default_target_attribute))
print("URL: %s" % dataset.url)
print(dataset.description[:500])
############################################################################
# Get the actual data.
#
# The dataset can be returned in two formats, controlled by the parameter
# ``dataset_format``: 'array' (default), which yields a NumPy array or a SciPy
# sparse matrix, or 'dataframe', which yields a Pandas DataFrame (or
# SparseDataFrame). Let's first build our dataset from a NumPy array and
# manually create a dataframe.
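############################################################################
# A hedged sketch of that array-based access (``dataset`` is the object
# fetched above; pandas is assumed to be available):
import pandas as pd

X, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format='array',
    target=dataset.default_target_attribute
)
frame = pd.DataFrame(X, columns=attribute_names)
frame[dataset.default_target_attribute] = y
print(frame.head())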