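# Imports assumed by the snippets below (a sketch of the header; module paths
# match the names referenced in the code, and the argparse/'args' handling from
# the original scripts is omitted):
import sys
import time
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from zat import bro_log_reader, live_simulator, dataframe_cache
from zat import log_to_dataframe, dataframe_to_matrix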
# Every 10 seconds grab the dataframe from the cache
if time.time() > timer:
    timer = time.time() + time_delta

    # Get the windowed dataframe (10 minute window)
    bro_df = df_cache.dataframe()

    # Compute some additional data
    bro_df['query_length'] = bro_df['query'].str.len()
    bro_df['answer_length'] = bro_df['answers'].str.len()
    bro_df['entropy'] = bro_df['query'].map(lambda x: entropy(x))

    # Use the zat DataframeToMatrix class
    features = ['Z', 'proto', 'qtype_name', 'query_length', 'answer_length', 'entropy', 'id.resp_p']
    to_matrix = dataframe_to_matrix.DataFrameToMatrix()
    bro_matrix = to_matrix.fit_transform(bro_df[features])
    print(bro_matrix.shape)

    # Print out the time range of the dataframe
    print('DataFrame TimeRange: {:s} --> {:s}'.format(str(bro_df['ts'].min()), str(bro_df['ts'].max())))

    # Train/fit and predict anomalous instances using the Isolation Forest model
    odd_clf = IsolationForest(contamination=0.2)  # Marking 20% as odd
    predictions = odd_clf.fit_predict(bro_matrix)
    odd_df = bro_df[predictions == -1].copy()  # copy() so the cluster column below is safe to add

    # Now we're going to explore our odd observations with help from KMeans
    odd_matrix = to_matrix.transform(odd_df[features])
    batch_kmeans.partial_fit(odd_matrix)
    clusters = batch_kmeans.predict(odd_matrix).tolist()
    odd_df['cluster'] = clusters
    print(bro_df.head())
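# The entropy() helper used in these snippets is never defined here; a minimal
# Shannon-entropy sketch (standard character-frequency formula):
import math
from collections import Counter

def entropy(string):
    """Shannon entropy (bits per character) of a string."""
    counts, length = Counter(string), float(len(string))
    return -sum(count / length * math.log(count / length, 2) for count in counts.values())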
try:
    log_to_df = log_to_dataframe.LogToDataFrame()
    bro_df = log_to_df.create_dataframe(args.bro_log)
except IOError:
    print('Could not open or parse the specified logfile: {:s}'.format(args.bro_log))
    sys.exit(1)
print('Read in {:d} Rows...'.format(len(bro_df)))
# Using Pandas we can easily and efficiently compute additional data metrics
# Here we use the vectorized operations of Pandas/Numpy to compute query length
# We'll also compute entropy of the query
if log_type == 'dns':
    bro_df['query_length'] = bro_df['query'].str.len()
    bro_df['answer_length'] = bro_df['answers'].str.len()
    bro_df['entropy'] = bro_df['query'].map(lambda x: entropy(x))

# Use the zat DataframeToMatrix class
features = ['Z', 'proto', 'qtype_name', 'query_length', 'answer_length', 'entropy', 'id.resp_p']  # Example feature list (mirrors the streaming snippet above)
to_matrix = dataframe_to_matrix.DataFrameToMatrix()
bro_matrix = to_matrix.fit_transform(bro_df[features])
print(bro_matrix.shape)
# Train/fit and predict anomalous instances using the Isolation Forest model
odd_clf = IsolationForest(contamination=0.2)  # Marking 20% as odd
odd_clf.fit(bro_matrix)

# Now we create a new dataframe using the predictions from our classifier
predictions = odd_clf.predict(bro_matrix)
odd_df = bro_df[features][predictions == -1]
display_df = bro_df[predictions == -1].copy()  # copy() so the cluster column below is safe to add

# Now we're going to explore our odd observations with help from KMeans
odd_matrix = to_matrix.fit_transform(odd_df)
num_clusters = min(len(odd_df), 4)  # 4 clusters unless we have fewer than 4 observations
display_df['cluster'] = KMeans(n_clusters=num_clusters).fit_predict(odd_matrix)
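# A quick way to see how the anomalies spread across clusters (a sketch;
# value_counts() is standard pandas):
print(display_df['cluster'].value_counts())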
# Create a Pandas dataframe from a Zeek log
log_to_df = log_to_dataframe.LogToDataFrame()
bro_df = log_to_df.create_dataframe(args.bro_log)
# Add query length
bro_df['query_length'] = bro_df['query'].str.len()
# Normalize this field
#ql = bro_df['query_length']
#bro_df['query_length_norm'] = (ql - ql.min()) / (ql.max()-ql.min())
# These are the features we want (note some of these are categorical!)
features = ['AA', 'RA', 'RD', 'TC', 'Z', 'rejected', 'proto', 'qtype_name', 'rcode_name', 'query_length']
feature_df = bro_df[features]
# Use the super awesome DataframeToMatrix class (handles categorical data!)
to_matrix = dataframe_to_matrix.DataFrameToMatrix()
bro_matrix = to_matrix.fit_transform(feature_df)
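# For intuition only (a sketch, not zat's actual implementation): the
# categorical columns get one-hot encoded and the numeric ones pass through,
# similar in spirit to:
#   pd.get_dummies(feature_df).to_numpy()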
# Now we're ready for scikit-learn!
# Just some simple stuff for this example, KMeans and PCA
kmeans = KMeans(n_clusters=5).fit_predict(bro_matrix)
pca = PCA(n_components=2).fit_transform(bro_matrix)
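# The next lines call a jitter() helper that this excerpt never defines; a
# minimal sketch (assumption: small Gaussian noise, proportional to the value
# range, keeps overlapping PCA points visible when plotted):
import numpy as np

def jitter(arr, scale=0.02):
    """Return arr plus noise proportional to its value range."""
    stdev = scale * (arr.max() - arr.min())
    return arr + np.random.randn(len(arr)) * stdev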
# Now we can put our ML results back onto our dataframe!
bro_df['x'] = jitter(pca[:, 0]) # PCA X Column
bro_df['y'] = jitter(pca[:, 1]) # PCA Y Column
bro_df['cluster'] = kmeans
# Now use dataframe group by cluster
show_fields = ['query', 'Z', 'proto', 'qtype_name', 'x', 'y', 'cluster']
cluster_groups = bro_df[show_fields].groupby('cluster')
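# 'cluster_groups' is not used further in this excerpt; a typical follow-up
# prints each cluster (sketch):
for key, group in cluster_groups:
    print('\nCluster {:d}: {:d} observations'.format(int(key), len(group)))
    print(group.head())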
# Create a Zeek log reader
print('Opening Data File: {:s}'.format(args.bro_log))
reader = bro_log_reader.BroLogReader(args.bro_log, tail=True)
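# Typical use of the reader above (a sketch): readrows() yields each log entry
# as a Python dict, and tail=True keeps following the file as it grows.
for row in reader.readrows():
    print(row)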
# Create a Zeek IDS log live simulator
print('Opening Data File: {:s}'.format(args.bro_log))
reader = live_simulator.LiveSimulator(args.bro_log, eps=10) # 10 events per second
# Create a Dataframe Cache
df_cache = dataframe_cache.DataFrameCache(max_cache_time=600) # 10 minute cache
# Streaming Clustering Class
batch_kmeans = MiniBatchKMeans(n_clusters=5, verbose=True)
# Use the zat DataframeToMatrix class
to_matrix = dataframe_to_matrix.DataFrameToMatrix()
# Set up a simple timer for periodic processing
time_delta = 10
timer = time.time() + time_delta
FIRST_TIME = True

# Add each new row into the cache
for row in reader.readrows():
    df_cache.add_row(row)

    # Every 10 seconds grab the dataframe from the cache
    if time.time() > timer:
        timer = time.time() + time_delta

        # Get the windowed dataframe (10 minute window)
        bro_df = df_cache.dataframe()

        # Compute some additional data