Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Assert that the dataframe we passed in didn't change.
# DataFrame.equals() only *returns* a bool, so without the `assert` this
# line would never fail. copy_test_df is presumably a snapshot of test_df
# taken before the transforms ran (created outside this fragment).
assert copy_test_df.equals(test_df)

# Test that the conversion gives us the same columns on a df with different category values
# This also tests NaN in a category column
print('TRANSFORM2')
matrix2 = to_matrix.transform(test_df2)
assert matrix.shape == matrix2.shape

# First two ROWS should be the same
np_test_utils.assert_equal(matrix[0], matrix2[0])
np_test_utils.assert_equal(matrix[1], matrix2[1])

# Test normalize: after fit_transform the first column must span [0, 1]
to_matrix_norm = DataFrameToMatrix()
print('FIT-TRANSFORM')
norm_matrix = to_matrix_norm.fit_transform(test_df)
print(norm_matrix)
assert norm_matrix[:, 0].min() == 0
assert norm_matrix[:, 0].max() == 1

# Make sure normalize 'does the right thing' when doing transform
print('TRANSFORM')
norm_matrix2 = to_matrix_norm.transform(test_df2)
assert norm_matrix2[:, 0].min() == 0
assert norm_matrix2[:, 0].max() == 2  # Normalization is based on FIT range

# Test div by zero in normalize
test_df3 = test_df2.copy()
test_df3['D'] = [1, 1, 1, 1]  # every value identical, so (max - min) is 0
# NOTE(review): test_df3 is not referenced again in the visible lines; the
# call that actually exercises it appears to be outside this fragment.
print('FIT-TRANSFORM')
# Convert the dataframe to a matrix and cluster its rows into two groups.
to_matrix = DataFrameToMatrix()
my_matrix = to_matrix.fit_transform(test_df)
kmeans = KMeans(n_clusters=2).fit_predict(my_matrix)

# Now we can put our ML results back onto our dataframe!
test_df['cluster'] = kmeans
cluster_groups = test_df.groupby('cluster')

# Now print out the details for each cluster
for key, group in cluster_groups:
    print('Rows in Cluster: {:d}'.format(len(group)))
    print(group.head(), '\n')

# Remove the temporary label column so test_df is back to its original columns
del test_df['cluster']

# Now we're going to intentionally introduce NaNs in the categorical output just to see what happens
# (fit on test_df, then transform the *different* dataframe test_df2)
to_matrix = DataFrameToMatrix()
_ = to_matrix.fit_transform(test_df)  # result unused; only the fitted state matters
my_matrix2 = to_matrix.transform(test_df2)
kmeans = KMeans(n_clusters=2).fit_predict(my_matrix2)

# Now we can put our ML results back onto our dataframe!
test_df2['cluster'] = kmeans
cluster_groups = test_df2.groupby('cluster')

# Now print out the details for each cluster
for key, group in cluster_groups:
    print('Rows in Cluster: {:d}'.format(len(group)))
    print(group.head(), '\n')
# Second test dataframe: same column names as used below, but with different
# category values than the first one.
test_df2 = pd.DataFrame(
    {
        'A': pd.Categorical(['a', 'b', 'b', 'a'], ordered=True),
        'B': pd.Categorical(['a', 'b', 'd', 'a'], ordered=False),
        # 'y' is not in the declared categories, so pandas stores it as NaN
        'C': pd.Categorical(['a', 'b', 'z', 'y'], categories=['a', 'b', 'z', 'd']),
        'D': [1, 2, 3, 7],
        'E': ['w', 'x', 'z', 'foo'],
        'F': [1.1, 2.2, 3.3, 4.4],
        'H': [True, False, False, False]
    }
)

# Copy the test_df for testing later
copy_test_df = test_df.copy()

# Test the transformation from dataframe to numpy ndarray and back again
to_matrix = DataFrameToMatrix()
print('FIT-TRANSFORM')
matrix = to_matrix.fit_transform(test_df)
print('TRANSFORM')
matrix_test = to_matrix.transform(test_df)

# These two matrices should be the same
np_test_utils.assert_equal(matrix, matrix_test)

# Assert that the dataframe we passed in didn't change.
# DataFrame.equals() only *returns* a bool, so it must be wrapped in an
# assert for the check to be able to fail.
assert copy_test_df.equals(test_df)

# Test that the conversion gives us the same columns on a df with different category values
# This also tests NaN in a category column
print('TRANSFORM2')
matrix2 = to_matrix.transform(test_df2)
assert matrix.shape == matrix2.shape
# Remove the temporary file (temp is created outside this fragment)
os.unlink(temp.name)

# Try 'nullable' integer arrays
null_df = test_df2.copy()
# np.nan (lowercase) is the supported spelling; the np.NaN alias was removed
# in NumPy 2.0 and raises AttributeError there.
null_df['I'] = pd.Series([10, 11, 12, np.nan], dtype='UInt64')
print('FIT-TRANSFORM')
matrix = to_matrix.fit_transform(null_df)
print('TRANSFORM')
matrix_test = to_matrix.transform(null_df)

# These two matrices should be the same
np_test_utils.assert_equal(matrix, matrix_test)

# Now actually try the matrix with a scikit-learn algo
from sklearn.cluster import KMeans
to_matrix = DataFrameToMatrix()
my_matrix = to_matrix.fit_transform(test_df)
kmeans = KMeans(n_clusters=2).fit_predict(my_matrix)

# Now we can put our ML results back onto our dataframe!
test_df['cluster'] = kmeans
cluster_groups = test_df.groupby('cluster')

# Now print out the details for each cluster
for key, group in cluster_groups:
    print('Rows in Cluster: {:d}'.format(len(group)))
    print(group.head(), '\n')

# Remove the temporary label column so test_df is back to its original columns
del test_df['cluster']

# Now we're going to intentionally introduce NaNs in the categorical output just to see what happens
to_matrix = DataFrameToMatrix()
_ = to_matrix.fit_transform(test_df)  # result unused; only the fitted state matters