def test_tagging(self):
    tag = "testing_tag_{}_{}".format(self.id(), time())
    ds_list = openml.datasets.list_datasets(tag=tag)
    self.assertEqual(len(ds_list), 0)
    self.dataset.push_tag(tag)
    ds_list = openml.datasets.list_datasets(tag=tag)
    self.assertEqual(len(ds_list), 1)
    self.assertIn(125, ds_list)
    self.dataset.remove_tag(tag)
    ds_list = openml.datasets.list_datasets(tag=tag)
    self.assertEqual(len(ds_list), 0)
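# The test above round-trips a tag: push_tag makes the dataset discoverable
# via list_datasets(tag=...), and remove_tag hides it again. A minimal sketch
# of the same flow outside a test suite; the dataset ID 125 matches the test,
# the tag name is a hypothetical example, and tagging requires an API key.
import time

import openml

tag = "demo_tag_{}".format(int(time.time()))  # any unique string works

dataset = openml.datasets.get_dataset(125)
dataset.push_tag(tag)  # attach the tag on the server

tagged = openml.datasets.list_datasets(tag=tag)
assert 125 in tagged  # the dataset is now discoverable by tag

dataset.remove_tag(tag)  # clean up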
def test_list_datasets_with_high_size_parameter(self):
    # Test on the production server, since concurrent deletion of uploaded
    # datasets makes the test fail on the test server
    openml.config.server = self.production_server
    datasets_a = openml.datasets.list_datasets()
    datasets_b = openml.datasets.list_datasets(size=np.inf)
    # Revert to the test server
    openml.config.server = self.test_server
    self.assertEqual(len(datasets_a), len(datasets_b))
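# The size parameter caps how many records the listing returns, and
# size=np.inf (as in the test above) requests the complete listing. Together
# with offset, both real parameters of list_datasets, it supports paging;
# a small sketch with arbitrary page sizes:
import openml

page_1 = openml.datasets.list_datasets(size=100, offset=0)
page_2 = openml.datasets.list_datasets(size=100, offset=100)
print(len(page_1), len(page_2))  # 100 each, assuming enough datasets exist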
def test_list_datasets_by_number_features(self):
    datasets = openml.datasets.list_datasets(number_features="50..100")
    self.assertGreaterEqual(len(datasets), 8)
    self._check_datasets(datasets)
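# "50..100" is OpenML's inclusive range syntax for listing filters. The same
# style works for the other quality filters that list_datasets accepts; the
# ranges below are arbitrary illustrations.
import openml

filtered = openml.datasets.list_datasets(
    number_features="50..100",
    number_instances="1000..100000",
    number_classes="2..10",
    output_format='dataframe',
)
print(filtered[['did', 'name', 'NumberOfFeatures', 'NumberOfClasses']].head())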
import pandas as pd
import plotly.subplots

from openml import datasets


def get_dataset_overview():
    """
    :return: overview of datasets page
    """
    df = datasets.list_datasets(output_format='dataframe')
    df.dropna(inplace=True)
    # Bin instance and feature counts into human-readable ranges
    bins_1 = [1, 500, 1000, 5000, 10000, 50000, 100000,
              500000, 1000000, max(df["NumberOfInstances"])]
    bins_2 = [1, 500, 1000, 5000, 10000, 50000, 100000, 500000, 1000000]
    df["Number of instances"] = pd.cut(df["NumberOfInstances"], bins=bins_1).astype(str)
    df["Number of features"] = pd.cut(df["NumberOfFeatures"], bins=bins_2).astype(str)
    title = ["Number of instances across datasets",
             "Number of features across datasets",
             "Attribute Type percentage distribution",
             "Number of classes"]
    fig = plotly.subplots.make_subplots(rows=4, cols=1, subplot_titles=tuple(title))
    # Turn interval labels like "(1, 500]" into "1 - 500"
    for col in ["Number of instances", "Number of features"]:
        df[col] = df[col].str.replace(',', ' -')
        df[col] = df[col].str.replace('(', "")
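# The snippet above is truncated before any traces are added to fig. A
# minimal, self-contained sketch of how the binned columns could feed such
# subplots, using plotly's Histogram trace; the two-row layout is an
# assumption, not the dashboard's actual code.
import plotly.graph_objects as go
import plotly.subplots

from openml import datasets

df = datasets.list_datasets(output_format='dataframe').dropna()
fig = plotly.subplots.make_subplots(
    rows=2, cols=1,
    subplot_titles=("Number of instances across datasets",
                    "Number of features across datasets"),
)
fig.add_trace(go.Histogram(x=df["NumberOfInstances"]), row=1, col=1)
fig.add_trace(go.Histogram(x=df["NumberOfFeatures"]), row=2, col=1)
fig.show()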
# License: BSD 3-Clause
import openml
import pandas as pd
############################################################################
# Exercise 0
# **********
#
# * List datasets
#
# * Use the output_format parameter to select output type
# * Default gives 'dict' (other option: 'dataframe', see below)
openml_list = openml.datasets.list_datasets() # returns a dict
# Show a nice table with some key data properties
datalist = pd.DataFrame.from_dict(openml_list, orient='index')
datalist = datalist[[
'did', 'name', 'NumberOfInstances',
'NumberOfFeatures', 'NumberOfClasses'
]]
print(f"First 10 of {len(datalist)} datasets...")
datalist.head(n=10)
# The same can be done with fewer lines of code
openml_df = openml.datasets.list_datasets(output_format='dataframe')
openml_df.head(n=10)
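# With the dataframe output, ordinary pandas filtering applies directly.
# Building on openml_df above; the column names are real listing columns,
# the thresholds are arbitrary.
small_binary = openml_df[
    (openml_df['NumberOfClasses'] == 2)
    & (openml_df['NumberOfInstances'] < 10000)
]
print(small_binary[['did', 'name', 'NumberOfInstances']].head())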
############################################################################
import os
import sys

import openml
import pandas as pd
from mlaut.data import Data
from sklearn import preprocessing

apikey = 'd2b1d13981d4abfb22895337baca924c'
openml.config.apikey = apikey
openml.config.set_cache_directory(os.path.expanduser('~/.openml/cache'))

# Skip datasets larger than this many instances
NUMBER_OF_INSTANCES_CUTOFF_NUMBER = 10000

all_datasets = openml.datasets.list_datasets()
data = Data()
input_io = data.open_hdf5('data/openml.h5', mode='a')
num_saved_datasets = 0
for id in all_datasets.keys():
    # Regression datasets have a value of -1; classification datasets specify the number of classes
    if all_datasets[id]['NumberOfClasses'] == -1:
        print(f"Skipping dataset {id}, {all_datasets[id]['name']}. This is a regression dataset.")
        continue
    if all_datasets[id]['NumberOfMissingValues'] > 0:
        print(f"Skipping dataset {id}, {all_datasets[id]['name']} due to missing values.")
        continue
    if all_datasets[id]['NumberOfInstances'] > NUMBER_OF_INSTANCES_CUTOFF_NUMBER:
        print(f"Skipping dataset {id}, {all_datasets[id]['name']}. It has more than "
              f"{NUMBER_OF_INSTANCES_CUTOFF_NUMBER} instances.")
        continue
import csv


def write_csv(name, id_list, datasets):
    # 'w' with newline='' is the correct mode for csv files in Python 3
    # (the original 'wb' only works in Python 2)
    with open(name + '.csv', 'w', newline='') as ofile:
        writer = csv.writer(ofile, delimiter=",")
        writer.writerow(['Dataset ID', 'Dataset Name', 'Number of Instances',
                         'Number of Instances With Missing Values',
                         'Number of Classes', 'Number of Features'])
        for did in id_list:
            dataset = datasets[did]
            writer.writerow([did, dataset['name'], dataset['NumberOfInstances'],
                             dataset['NumberOfInstancesWithMissingValues'],
                             dataset['NumberOfClasses'], dataset['NumberOfFeatures']])
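# A hypothetical call to the function above: select some dataset IDs from
# the listing and dump their metadata to selected.csv.
import openml

datasets = openml.datasets.list_datasets()
selected_ids = [did for did, d in datasets.items() if d.get('NumberOfClasses') == 2]
write_csv('selected', selected_ids, datasets)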
apikey = 'c0bbf61f0ca7139a3db5562edcbe10e5'
openml.config.apikey = apikey

datasets = openml.datasets.list_datasets()
metric_dict = {}
for key in datasets:
    dataset = datasets[key]
    try:
        if dataset['status'] != 'active':
            continue
        data_id = key
        num_instances = dataset['NumberOfInstances']
        num_missing_instances = dataset['NumberOfInstancesWithMissingValues']
        num_features = dataset['NumberOfFeatures']
        # Score each dataset by instances per missing-value instance
        metric_dict[data_id] = num_instances / (num_missing_instances + 1.0)
    except KeyError:
        # Assumed handler: the snippet is truncated before the except clause;
        # some listings lack these quality fields, so skip them
        continue
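# A plausible next step: rank datasets by the score above and keep the top
# few (the cutoff of 10 is arbitrary).
best = sorted(metric_dict, key=metric_dict.get, reverse=True)[:10]
for did in best:
    print(did, datasets[did]['name'], metric_dict[did])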
"""
A basic tutorial on how to list, load and visualize datasets.
"""
############################################################################
# In general, we recommend working with tasks, so that results can be easily
# reproduced and compared to existing results at OpenML. However, for the
# purposes of this tutorial, we are going to work with the datasets directly.
import openml
############################################################################
# List datasets
# =============
datasets_df = openml.datasets.list_datasets(output_format='dataframe')
print(datasets_df.head(n=10))
############################################################################
# Download a dataset
# ==================
# Iris dataset https://www.openml.org/d/61
dataset = openml.datasets.get_dataset(61)
# Print a summary
print(f"This is dataset '{dataset.name}', the target feature is "
f"'{dataset.default_target_attribute}'")
print(f"URL: {dataset.url}")
print(dataset.description[:500])
############################################################################
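# Load the actual data
# ====================
# get_data materializes the dataset itself; given a target attribute, it
# returns the feature matrix, the target column, a categorical indicator
# list and the attribute names.
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)
print(X.shape, y.shape)     # feature matrix and target vector
print(attribute_names[:5])  # first few feature names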