# (Stray scraped banner, converted to a comment so it does not break the module.)
# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
except:
pass
df = pd.read_csv("data/raw/adult/adult.data", dtype='str', header=-1)
df = df.apply(lambda x: x.str.strip(' \t.'))
col_type = [
("age", CONTINUOUS),
("workclass", CATEGORICAL),
("fnlwgt", CONTINUOUS),
("education", ORDINAL, ["Preschool", "1st-4th", "5th-6th", "7th-8th", "9th", "10th", "11th", "12th", "HS-grad", "Prof-school", "Assoc-voc", "Assoc-acdm", "Some-college", "Bachelors", "Masters", "Doctorate"]),
("education-num", CONTINUOUS),
("marital-status", CATEGORICAL),
("occupation", CATEGORICAL),
("relationship", CATEGORICAL),
("race", CATEGORICAL),
("sex", CATEGORICAL),
("capital-gain", CONTINUOUS),
("capital-loss", CONTINUOUS),
("hours-per-week", CONTINUOUS),
("native-country", CATEGORICAL),
("label", CATEGORICAL)
]
meta = []
for id_, info in enumerate(col_type):
if info[1] == CONTINUOUS:
meta.append({
"name": info[0],
"type": info[1],
"min": np.min(df.iloc[:, id_].values.astype('float')),
"max": np.max(df.iloc[:, id_].values.astype('float'))
# --- Credit-card fraud fragment ---
# 28 anonymized PCA components (V0..V27), then Amount, then a binary label.
for idx in range(28):
    component = values[:, idx]
    meta.append({
        "name": "V%d" % idx,
        "type": CONTINUOUS,
        "min": np.min(component),
        "max": np.max(component),
    })
meta.append({
    "name": "Amount",
    "type": CONTINUOUS,
    "min": np.min(values[:, 28]),
    "max": np.max(values[:, 28]),
})
meta.append({
    "name": "label",
    "type": CATEGORICAL,
    "size": 2,
    "i2s": ["0", "1"],
})
# Fixed seed so the shuffle (and hence the train/test split) is reproducible.
np.random.seed(0)
np.random.shuffle(values)
# Last 20k rows held out for testing.
t_train = values[:-20000].astype('float32')
t_test = values[-20000:].astype('float32')
name = "credit"
json_path = "{}/{}.json".format(output_dir, name)
npz_path = "{}/{}.npz".format(output_dir, name)
with open(json_path, 'w') as f:
    json.dump(meta, f, sort_keys=True, indent=4, separators=(',', ': '))
np.savez(npz_path, train=t_train, test=t_test)
verify(npz_path, json_path)
("dst_host_rerror_rate", CONTINUOUS),
("dst_host_srv_rerror_rate", CONTINUOUS),
("label", CATEGORICAL)
]
meta = []
for id_, info in enumerate(col_type):
if info[1] == CONTINUOUS:
meta.append({
"name": info[0],
"type": info[1],
"min": np.min(df.iloc[:, id_].values.astype('float')),
"max": np.max(df.iloc[:, id_].values.astype('float'))
})
else:
if info[1] == CATEGORICAL:
value_count = list(dict(df.iloc[:, id_].value_counts()).items())
value_count = sorted(value_count, key=lambda x: -x[1])
mapper = list(map(lambda x: x[0], value_count))
else:
mapper = info[2]
meta.append({
"name": info[0],
"type": info[1],
"size": len(mapper),
"i2s": mapper
})
tdata = project_table(df, meta)
# --- Census-income (KDD) fragment: tail of the column-type table. ---
# NOTE(review): the opening `col_type = [` of this list lies outside this
# chunk; the lines below are the remaining (name, kind) entries.
("capital gains", CONTINUOUS),
("capital losses", CONTINUOUS),
("dividends from stocks", CONTINUOUS),
("tax filer stat", CATEGORICAL),
("region of previous residence", CATEGORICAL),
("state of previous residence", CATEGORICAL),
("detailed household and family stat", CATEGORICAL),
("detailed household summary in household", CATEGORICAL),
("migration code-change in msa", CATEGORICAL),
("migration code-change in reg", CATEGORICAL),
("migration code-move within reg", CATEGORICAL),
("live in this house 1 year ago", CATEGORICAL),
("migration prev res in sunbelt", CATEGORICAL),
("num persons worked for employer", CONTINUOUS),
("family members under 18", CATEGORICAL),
("country of birth father", CATEGORICAL),
("country of birth mother", CATEGORICAL),
("country of birth self", CATEGORICAL),
("citizenship", CATEGORICAL),
("own business or self employed", CATEGORICAL),
("fill inc questionnaire for veteran's admin", CATEGORICAL),
("veterans benefits", CATEGORICAL),
("weeks worked in year", CONTINUOUS),
("year", CATEGORICAL),
("label", CATEGORICAL)
]
# Build per-column metadata. Only the start of the CONTINUOUS branch is
# visible — this fragment is cut off mid-`meta.append` (the dict body and
# the categorical branch continue beyond this chunk). Code left byte-identical.
meta = []
for id_, info in enumerate(col_type):
if info[1] == CONTINUOUS:
meta.append({
"name": info[0],
# make_data: package a square binary-image dataset (wh x wh pixels
# flattened into wh*wh binary columns plus one trailing 10-class label
# column) into the project's json metadata format.
# NOTE(review): indentation was stripped from this chunk, and the function
# appears truncated — presumably the original saves the shuffled arrays
# after the final line shown here. Code left byte-identical.
def make_data(t_train, t_test, wh, name):
np.random.seed(0)
# every row must be wh*wh pixel columns + 1 label column
assert t_train.shape[1] == wh * wh + 1
assert t_test.shape[1] == wh * wh + 1
meta = []
# one binary categorical entry per pixel, named by its (row, col) position
for i in range(wh):
for j in range(wh):
meta.append({
"name": "%02d%02d" % (i, j),
"type": CATEGORICAL,
"size": 2,
"i2s": ["0", "1"]
})
# trailing label column: 10 classes, "0".."9"
meta.append({
"name": "label",
"type": CATEGORICAL,
"size": 10,
"i2s": [str(x) for x in range(10)]
})
with open("{}/{}.json".format(output_dir, name), 'w') as f:
json.dump(meta, f, sort_keys=True, indent=4, separators=(',', ': '))
# in-place shuffle, deterministic thanks to the seed set above
np.random.shuffle(t_train)
# --- OnlineNewsPopularity fragment ---
# Create the scratch directory if missing. Only swallow the
# "already exists" case — the original bare `except: pass` would also
# hide permission failures or a bad temp_dir path.
try:
    os.mkdir(temp_dir)
except FileExistsError:
    pass
# The news CSV ships a header row (header=0); read everything as str so
# whitespace can be stripped uniformly first.
df = pd.read_csv("data/raw/news/OnlineNewsPopularity.csv", dtype='str', header=0)
df = df.apply(lambda x: x.str.strip(' \t.'))
# `url` is a row identifier and ` timedelta` is non-predictive — drop both.
# (the leading space in ' timedelta' matches the padded raw header name)
df.drop(['url', ' timedelta'], axis=1, inplace=True)
meta = []
for col_name in df.columns:
    # Columns whose names contain "is_" are one-hot indicators -> binary
    # categorical; everything else is treated as continuous.
    if "is_" in col_name:
        meta.append({
            "name": col_name,
            "type": CATEGORICAL,
            "size": 2,
            "i2s": ['0', '1']
        })
    else:
        # The regression target `shares` is renamed to the conventional "label".
        meta.append({
            "name": "label" if col_name.strip() == "shares" else col_name.strip(),
            "type": CONTINUOUS,
            "min": np.min(df[col_name].values.astype('float')),
            "max": np.max(df[col_name].values.astype('float'))
        })
tdata = df.values.astype('float32')
# Fixed seed so the shuffle is reproducible across runs.
np.random.seed(0)
np.random.shuffle(tdata)
df = pd.read_csv("data/raw/covtype/covtype.data", dtype='str', header=-1)
col_type = [
("Elevation", CONTINUOUS),
("Aspect", CONTINUOUS),
("Slope", CONTINUOUS),
("Horizontal_Distance_To_Hydrology", CONTINUOUS),
("Vertical_Distance_To_Hydrology", CONTINUOUS),
("Horizontal_Distance_To_Roadways", CONTINUOUS),
("Hillshade_9am", CONTINUOUS),
("Hillshade_Noon", CONTINUOUS),
("Hillshade_3pm", CONTINUOUS),
("Horizontal_Distance_To_Fire_Points", CONTINUOUS)
] + [
("Wilderness_Area_{}".format(i), CATEGORICAL) for i in range(4)
] + [
("Soil_Type{}".format(i), CATEGORICAL) for i in range(40)
] + [
("label", CATEGORICAL)
]
meta = []
for id_, info in enumerate(col_type):
if info[1] == CONTINUOUS:
meta.append({
"name": info[0],
"type": info[1],
"min": np.min(df.iloc[:, id_].values.astype('float')),
"max": np.max(df.iloc[:, id_].values.astype('float'))
})
else: