How to use the openml.datasets.list_datasets function in openml

To help you get started, we’ve selected a few openml examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github openml / openml-python / tests / test_datasets / test_dataset.py View on Github external
def test_tagging(self):
    """Pushing a tag makes the dataset listable by it; removing it undoes that."""
    # Unique per test run so earlier or concurrent runs cannot collide.
    unique_tag = "testing_tag_{}_{}".format(self.id(), time())

    def tagged_datasets():
        return openml.datasets.list_datasets(tag=unique_tag)

    # Nothing carries this tag yet.
    self.assertEqual(len(tagged_datasets()), 0)

    self.dataset.push_tag(unique_tag)
    listing = tagged_datasets()
    self.assertEqual(len(listing), 1)
    self.assertIn(125, listing)

    self.dataset.remove_tag(unique_tag)
    self.assertEqual(len(tagged_datasets()), 0)
github openml / openml-python / tests / test_utils / test_utils.py View on Github external
def test_list_datasets_with_high_size_parameter(self):
    """``size=np.inf`` must return the same listing as no size limit at all."""
    # Run against production: concurrent deletion of uploaded datasets makes
    # this flaky on the test server.
    openml.config.server = self.production_server
    try:
        datasets_a = openml.datasets.list_datasets()
        datasets_b = openml.datasets.list_datasets(size=np.inf)
    finally:
        # Fix: always revert to the test server, even when a listing call
        # raises — otherwise the production server leaks into later tests.
        openml.config.server = self.test_server

    self.assertEqual(len(datasets_a), len(datasets_b))
github openml / openml-python / tests / test_datasets / test_dataset.py View on Github external
def test_tagging(self):
    """Tag round-trip: a pushed tag is visible in listings, a removed one is not."""
    # Build a tag unique to this run to avoid collisions with stale state.
    run_tag = "testing_tag_{}_{}".format(self.id(), time())

    # Before tagging: no dataset matches.
    self.assertEqual(len(openml.datasets.list_datasets(tag=run_tag)), 0)

    # After tagging: exactly one match, and it is dataset 125.
    self.dataset.push_tag(run_tag)
    matches = openml.datasets.list_datasets(tag=run_tag)
    self.assertEqual(len(matches), 1)
    self.assertIn(125, matches)

    # After untagging: back to no matches.
    self.dataset.remove_tag(run_tag)
    self.assertEqual(len(openml.datasets.list_datasets(tag=run_tag)), 0)
github openml / openml-python / tests / test_datasets / test_dataset_functions.py View on Github external
def test_list_datasets_by_number_features(self):
    """Filtering by a feature-count range returns a valid, non-trivial listing."""
    listing = openml.datasets.list_datasets(number_features="50..100")
    # The server is expected to host at least 8 datasets in this range.
    self.assertGreaterEqual(len(listing), 8)
    self._check_datasets(listing)
github openml / openml.org / src / dashboard / layouts.py View on Github external
def get_dataset_overview():
    """Build the overview figure for the dashboard's datasets page.

    :return: overview of datasets page
    """
    # Full dataset listing as a DataFrame; drop rows with any missing
    # meta-feature so the binning below never sees NaN.
    df = datasets.list_datasets(output_format='dataframe')
    df.dropna(inplace=True)
    # Histogram bin edges. The last instance-count edge is the observed
    # maximum so the largest dataset still falls inside a bin.
    # NOTE(review): max() raises on an empty DataFrame — verify the listing
    # can never be empty after dropna.
    bins_1 = [1, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000, max(df["NumberOfInstances"])]
    bins_2 = [1, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000]
    # pd.cut yields Interval categories; cast to str to use them as labels.
    df["Number of instances"] = pd.cut(df["NumberOfInstances"], bins=bins_1).astype(str)
    df["Number of features"] = pd.cut(df["NumberOfFeatures"], bins=bins_2).astype(str)

    title = ["Number of instances across datasets",
             "Number of features across datasets",
             "Attribute Type percentage distribution",
             "Number of classes"]

    fig = plotly.subplots.make_subplots(rows=4, cols=1, subplot_titles=tuple(title))

    # Rewrite interval labels like "(1, 500]" into the friendlier "1 - 500]".
    for col in ["Number of instances", "Number of features"]:
        df[col] = df[col].str.replace(',', ' -')
        df[col] = df[col].str.replace('(', "")
github openml / openml-python / develop / _downloads / b95c071188526f5ef5d991e382df9fa5 / datasets_tutorial.py View on Github external
# License: BSD 3-Clauses

import openml
import pandas as pd

############################################################################
# Exercise 0
# **********
#
# * List datasets
#
#   * Use the output_format parameter to select output type
#   * Default gives 'dict' (other option: 'dataframe', see below)

# List every dataset; the default output format is a dict keyed by dataset id.
openml_list = openml.datasets.list_datasets()  # returns a dict

# Build a DataFrame from the dict and keep a handful of key properties
# so the table stays readable.
key_properties = [
    'did', 'name', 'NumberOfInstances',
    'NumberOfFeatures', 'NumberOfClasses',
]
datalist = pd.DataFrame.from_dict(openml_list, orient='index')[key_properties]

print(f"First 10 of {len(datalist)} datasets...")
datalist.head(n=10)

# Alternatively, request a dataframe directly and skip the manual conversion
openml_df = openml.datasets.list_datasets(output_format='dataframe')
openml_df.head(n=10)
############################################################################
github alan-turing-institute / mlaut / download_openml.py View on Github external
import openml
from mlaut.data import Data
import pandas as pd
from sklearn import preprocessing
import os
import sys

# NOTE(review): hard-coded API key committed to source — load it from an
# environment variable or config file instead.
apikey = 'd2b1d13981d4abfb22895337baca924c'
openml.config.apikey = apikey
openml.config.set_cache_directory(os.path.expanduser('~/.openml/cache'))
# Datasets with more instances than this are skipped below.
NUMBER_OF_INSTANCES_CUTOFF_NUMBER = 10000 #

# Full dataset listing: dict keyed by dataset id, values are metadata dicts.
all_datasets = openml.datasets.list_datasets()

data = Data()
input_io = data.open_hdf5('data/openml.h5', mode='a')

num_saved_datasets = 0

# Filter the listing: keep only classification datasets with no missing
# values and at most the cutoff number of instances.
# NOTE(review): `id` shadows the builtin — consider renaming to `did`.
for id in all_datasets.keys():
    #regression datasets have a value of -1. Classification datasets specify the number of classes
    if all_datasets[id]['NumberOfClasses'] == -1:
        print(f"Skipping dataset {id}, {all_datasets[id]['name']}. This is a regression dataset.")
        continue
    if all_datasets[id]['NumberOfMissingValues'] > 0:
        print(f"Skipping dataset {id}, {all_datasets[id]['name']} due to missing values.")
        continue
    if all_datasets[id]['NumberOfInstances'] > NUMBER_OF_INSTANCES_CUTOFF_NUMBER:
        print(f"Skipping dataset {id}, {all_datasets[id]['name']}. It has more than {NUMBER_OF_INSTANCES_CUTOFF_NUMBER} instances.")
def write_csv(name, id_list, datasets):
    """Write summary metadata for the given datasets to ``<name>.csv``.

    :param name: output file name, without the ``.csv`` extension
    :param id_list: iterable of dataset ids (keys into ``datasets``)
    :param datasets: mapping of dataset id -> metadata dict, as returned by
        ``openml.datasets.list_datasets()``
    """
    # Fix: csv.writer needs a text-mode file opened with newline='' on
    # Python 3 — the original 'wb' mode only worked on Python 2 and raises
    # a TypeError on Python 3. The with-block also guarantees the file is
    # closed even if a row write fails.
    with open(name + '.csv', 'w', newline='') as ofile:
        writer = csv.writer(ofile, delimiter=",")

        writer.writerow(['Dataset ID', 'Dataset Name', 'Number of Instances',
                         'Number of Instances With Missing Values',
                         'Number of Classes', 'Number of Features'])

        for did in id_list:
            dataset = datasets[did]
            writer.writerow([did, dataset['name'],
                             dataset['NumberOfInstances'],
                             dataset['NumberOfInstancesWithMissingValues'],
                             dataset['NumberOfClasses'],
                             dataset['NumberOfFeatures']])

# NOTE(review): hard-coded API key committed to source — load it from an
# environment variable or config file instead.
apikey = 'c0bbf61f0ca7139a3db5562edcbe10e5'
openml.config.apikey = apikey

# Full listing: dict of dataset id -> metadata dict.
datasets = openml.datasets.list_datasets()
# Maps dataset id -> instances per missing-value (a completeness score).
metric_dict = {}

for key in datasets:
	dataset = datasets[key]

	try:
		# Only rank datasets that are currently active on the server.
		if dataset['status'] != 'active':
			continue
		else:
			data_id = key
			num_instances = dataset['NumberOfInstances']
			num_missing_instances = dataset['NumberOfInstancesWithMissingValues']
			num_features = dataset['NumberOfFeatures']

			# +1.0 avoids division by zero for complete datasets.
			metric_dict[data_id] = num_instances / (num_missing_instances + 1.0)
github openml / openml-python / examples / 20_basic / simple_datasets_tutorial.py View on Github external
========

A basic tutorial on how to list, load and visualize datasets.
"""
############################################################################
# In general, we recommend working with tasks, so that the results can
# be easily reproduced. Furthermore, the results can be compared to existing results
# at OpenML. However, for the purposes of this tutorial, we are going to work with
# the datasets directly.

import openml
############################################################################
# List datasets
# =============

# Fetch the full dataset listing as a pandas DataFrame
# (the default output format is a dict keyed by dataset id).
datasets_df = openml.datasets.list_datasets(output_format='dataframe')
print(datasets_df.head(n=10))

############################################################################
# Download a dataset
# ==================

# Iris dataset https://www.openml.org/d/61
dataset = openml.datasets.get_dataset(61)

# Print a summary
print(f"This is dataset '{dataset.name}', the target feature is "
      f"'{dataset.default_target_attribute}'")
print(f"URL: {dataset.url}")
# Show only the first 500 characters of the (potentially long) description.
print(dataset.description[:500])

############################################################################