How to use the sdgym.utils.data.utils.CATEGORICAL constant in sdgym

To help you get started, we’ve selected a few sdgym examples based on popular ways the library is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github DAI-Lab / SDGym / sdgym / utils / data / real / adult.py View on Github external
except:
        pass

    df = pd.read_csv("data/raw/adult/adult.data", dtype='str', header=-1)
    df = df.apply(lambda x: x.str.strip(' \t.'))

    col_type = [
        ("age", CONTINUOUS),
        ("workclass", CATEGORICAL),
        ("fnlwgt", CONTINUOUS),
        ("education", ORDINAL, ["Preschool", "1st-4th", "5th-6th", "7th-8th", "9th", "10th", "11th", "12th", "HS-grad", "Prof-school", "Assoc-voc", "Assoc-acdm", "Some-college", "Bachelors", "Masters", "Doctorate"]),
        ("education-num", CONTINUOUS),
        ("marital-status", CATEGORICAL),
        ("occupation", CATEGORICAL),
        ("relationship", CATEGORICAL),
        ("race", CATEGORICAL),
        ("sex", CATEGORICAL),
        ("capital-gain", CONTINUOUS),
        ("capital-loss", CONTINUOUS),
        ("hours-per-week", CONTINUOUS),
        ("native-country", CATEGORICAL),
        ("label", CATEGORICAL)
    ]

    meta = []
    for id_, info in enumerate(col_type):
        if info[1] == CONTINUOUS:
            meta.append({
                "name": info[0],
                "type": info[1],
                "min": np.min(df.iloc[:, id_].values.astype('float')),
                "max": np.max(df.iloc[:, id_].values.astype('float'))
github DAI-Lab / SDGym / sdgym / utils / data / real / credit.py View on Github external
for i in range(28):
        meta.append({
            "name": "V%d" % i,
            "type": CONTINUOUS,
            "min": np.min(values[:, i]),
            "max": np.max(values[:, i])
        })
    meta.append({
        "name": "Amount",
        "type": CONTINUOUS,
        "min": np.min(values[:, 28]),
        "max": np.max(values[:, 28])
    })
    meta.append({
        "name": "label",
        "type": CATEGORICAL,
        "size": 2,
        "i2s": ["0", "1"]
    })

    np.random.seed(0)
    np.random.shuffle(values)
    t_train = values[:-20000].astype('float32')
    t_test = values[-20000:].astype('float32')

    name = "credit"
    with open("{}/{}.json".format(output_dir, name), 'w') as f:
        json.dump(meta, f, sort_keys=True, indent=4, separators=(',', ': '))
    np.savez("{}/{}.npz".format(output_dir, name), train=t_train, test=t_test)

    verify("{}/{}.npz".format(output_dir, name),
            "{}/{}.json".format(output_dir, name))
github DAI-Lab / SDGym / sdgym / utils / data / real / intrusion.py View on Github external
("dst_host_rerror_rate", CONTINUOUS),
        ("dst_host_srv_rerror_rate", CONTINUOUS),
        ("label", CATEGORICAL)
    ]

    meta = []
    for id_, info in enumerate(col_type):
        if info[1] == CONTINUOUS:
            meta.append({
                "name": info[0],
                "type": info[1],
                "min": np.min(df.iloc[:, id_].values.astype('float')),
                "max": np.max(df.iloc[:, id_].values.astype('float'))
            })
        else:
            if info[1] == CATEGORICAL:
                value_count = list(dict(df.iloc[:, id_].value_counts()).items())
                value_count = sorted(value_count, key=lambda x: -x[1])
                mapper = list(map(lambda x: x[0], value_count))
            else:
                mapper = info[2]

            meta.append({
                "name": info[0],
                "type": info[1],
                "size": len(mapper),
                "i2s": mapper
            })


    tdata = project_table(df, meta)
github DAI-Lab / SDGym / sdgym / utils / data / real / census.py View on Github external
("capital gains", CONTINUOUS),
        ("capital losses", CONTINUOUS),
        ("dividends from stocks", CONTINUOUS),
        ("tax filer stat", CATEGORICAL),
        ("region of previous residence", CATEGORICAL),
        ("state of previous residence", CATEGORICAL),
        ("detailed household and family stat", CATEGORICAL),
        ("detailed household summary in household", CATEGORICAL),
        ("migration code-change in msa", CATEGORICAL),
        ("migration code-change in reg", CATEGORICAL),
        ("migration code-move within reg", CATEGORICAL),
        ("live in this house 1 year ago", CATEGORICAL),
        ("migration prev res in sunbelt", CATEGORICAL),
        ("num persons worked for employer", CONTINUOUS),
        ("family members under 18", CATEGORICAL),
        ("country of birth father", CATEGORICAL),
        ("country of birth mother", CATEGORICAL),
        ("country of birth self", CATEGORICAL),
        ("citizenship", CATEGORICAL),
        ("own business or self employed", CATEGORICAL),
        ("fill inc questionnaire for veteran's admin", CATEGORICAL),
        ("veterans benefits", CATEGORICAL),
        ("weeks worked in year", CONTINUOUS),
        ("year", CATEGORICAL),
        ("label", CATEGORICAL)
    ]

    meta = []
    for id_, info in enumerate(col_type):
        if info[1] == CONTINUOUS:
            meta.append({
                "name": info[0],
github DAI-Lab / SDGym / sdgym / utils / data / real / mnist.py View on Github external
def make_data(t_train, t_test, wh, name):
    np.random.seed(0)

    assert t_train.shape[1] == wh * wh + 1
    assert t_test.shape[1] == wh * wh + 1

    meta = []
    for i in range(wh):
        for j in range(wh):
            meta.append({
                "name": "%02d%02d" % (i, j),
                "type": CATEGORICAL,
                "size": 2,
                "i2s": ["0", "1"]
            })
    meta.append({
        "name": "label",
        "type": CATEGORICAL,
        "size": 10,
        "i2s": [str(x) for x in range(10)]
    })

    with open("{}/{}.json".format(output_dir, name), 'w') as f:
        json.dump(meta, f, sort_keys=True, indent=4, separators=(',', ': '))


    np.random.shuffle(t_train)
github DAI-Lab / SDGym / sdgym / utils / data / real / news.py View on Github external
try:
        os.mkdir(temp_dir)
    except:
        pass

    df = pd.read_csv("data/raw/news/OnlineNewsPopularity.csv", dtype='str', header=0)
    df = df.apply(lambda x: x.str.strip(' \t.'))
    df.drop(['url', ' timedelta'], axis=1, inplace=True)

    meta = []
    for col_name in df.columns:
        if "is_" in col_name:
            meta.append({
                "name": col_name,
                "type": CATEGORICAL,
                "size": 2,
                "i2s": ['0', '1']
            })
        else:
            meta.append({
                "name": "label" if col_name.strip() == "shares" else col_name.strip(),
                "type": CONTINUOUS,
                "min": np.min(df[col_name].values.astype('float')),
                "max": np.max(df[col_name].values.astype('float'))
            })

    tdata = df.values.astype('float32')

    np.random.seed(0)
    np.random.shuffle(tdata)
github DAI-Lab / SDGym / sdgym / utils / data / real / covtype.py View on Github external
df = pd.read_csv("data/raw/covtype/covtype.data", dtype='str', header=-1)

    col_type = [
        ("Elevation", CONTINUOUS),
        ("Aspect", CONTINUOUS),
        ("Slope", CONTINUOUS),
        ("Horizontal_Distance_To_Hydrology", CONTINUOUS),
        ("Vertical_Distance_To_Hydrology", CONTINUOUS),
        ("Horizontal_Distance_To_Roadways", CONTINUOUS),
        ("Hillshade_9am", CONTINUOUS),
        ("Hillshade_Noon", CONTINUOUS),
        ("Hillshade_3pm", CONTINUOUS),
        ("Horizontal_Distance_To_Fire_Points", CONTINUOUS)
    ] + [
        ("Wilderness_Area_{}".format(i), CATEGORICAL) for i in range(4)
    ] + [
        ("Soil_Type{}".format(i), CATEGORICAL) for i in range(40)
    ] + [
        ("label", CATEGORICAL)
    ]

    meta = []
    for id_, info in enumerate(col_type):
        if info[1] == CONTINUOUS:
            meta.append({
                "name": info[0],
                "type": info[1],
                "min": np.min(df.iloc[:, id_].values.astype('float')),
                "max": np.max(df.iloc[:, id_].values.astype('float'))
            })
        else: