How to use the zat.dataframe_to_matrix module in zat

To help you get started, we’ve selected a few zat examples based on popular ways the library is used in public projects.

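Before the project examples, here is a minimal, self-contained sketch of the core idea (the DataFrame columns and values below are made up for illustration): the DataFrameToMatrix class in zat.dataframe_to_matrix converts a Pandas DataFrame with mixed numeric and categorical columns into a numeric matrix that scikit-learn estimators can consume.

import pandas as pd
from zat import dataframe_to_matrix

# A tiny DataFrame with one categorical and two numeric columns (illustrative data)
df = pd.DataFrame({
    'proto': ['udp', 'tcp', 'udp'],   # categorical
    'query_length': [12, 40, 7],      # numeric
    'entropy': [2.1, 3.4, 1.8]        # numeric
})

# Convert the DataFrame into a numeric matrix (categorical columns get encoded)
to_matrix = dataframe_to_matrix.DataFrameToMatrix()
matrix = to_matrix.fit_transform(df)
print(matrix.shape)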

SuperCowPowers / zat / examples / anomaly_detection_streaming.py (view on GitHub)
            # Every time_delta seconds grab the dataframe from the cache
            if time.time() > timer:
                timer = time.time() + time_delta

                # Get the windowed dataframe (10 minute window)
                bro_df = df_cache.dataframe()

                # Compute some additional data
                bro_df['query_length'] = bro_df['query'].str.len()
                bro_df['answer_length'] = bro_df['answers'].str.len()
                bro_df['entropy'] = bro_df['query'].map(lambda x: entropy(x))

                # Use the zat DataframeToMatrix class
                features = ['Z', 'proto', 'qtype_name', 'query_length', 'answer_length', 'entropy', 'id.resp_p']
                to_matrix = dataframe_to_matrix.DataFrameToMatrix()
                bro_matrix = to_matrix.fit_transform(bro_df[features])
                print(bro_matrix.shape)

                # Print out the time range of the DataFrame
                print('DataFrame TimeRange: {:s} --> {:s}'.format(str(bro_df['ts'].min()), str(bro_df['ts'].max())))

                # Train/fit and Predict anomalous instances using the Isolation Forest model
                odd_clf = IsolationForest(contamination=0.2)  # Marking 20% as odd
                predictions = odd_clf.fit_predict(bro_matrix)
                odd_df = bro_df[predictions == -1]

                # Now we're going to explore our odd observations with help from KMeans
                odd_matrix = to_matrix.transform(odd_df[features])
                batch_kmeans.partial_fit(odd_matrix)
                clusters = batch_kmeans.predict(odd_matrix).tolist()
                odd_df['cluster'] = clusters
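
Note the pattern in this streaming example: fit_transform() is called once on the full windowed DataFrame and transform() is then reused on the anomalous subset, so both matrices share the same column encoding. A small sketch of that idea (column names are illustrative, not taken from the example):

import pandas as pd
from zat import dataframe_to_matrix

full_df = pd.DataFrame({'proto': ['udp', 'tcp', 'udp', 'udp'],
                        'query_length': [10, 52, 8, 13]})

to_matrix = dataframe_to_matrix.DataFrameToMatrix()
full_matrix = to_matrix.fit_transform(full_df)          # learn the column encodings
subset_matrix = to_matrix.transform(full_df.iloc[:2])   # reuse them on a subset

# Same number of columns, so a model fit on one matrix can score the other
assert full_matrix.shape[1] == subset_matrix.shape[1]
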
SuperCowPowers / zat / examples / anomaly_detection.py (view on GitHub)
            print(bro_df.head())
        except IOError:
            print('Could not open or parse the specified logfile: %s' % args.bro_log)
            sys.exit(1)
        print('Read in {:d} Rows...'.format(len(bro_df)))

        # Using Pandas we can easily and efficiently compute additional data metrics
        # Here we use the vectorized operations of Pandas/Numpy to compute query length
        # We'll also compute entropy of the query
        if log_type == 'dns':
            bro_df['query_length'] = bro_df['query'].str.len()
            bro_df['answer_length'] = bro_df['answers'].str.len()
            bro_df['entropy'] = bro_df['query'].map(lambda x: entropy(x))

        # Use the zat DataframeToMatrix class
        to_matrix = dataframe_to_matrix.DataFrameToMatrix()
        bro_matrix = to_matrix.fit_transform(bro_df[features])
        print(bro_matrix.shape)

        # Train/fit and Predict anomalous instances using the Isolation Forest model
        odd_clf = IsolationForest(contamination=0.2)  # Marking 20% as odd
        odd_clf.fit(bro_matrix)

        # Now we create a new dataframe using the prediction from our classifier
        predictions = odd_clf.predict(bro_matrix)
        odd_df = bro_df[features][predictions == -1]
        display_df = bro_df[predictions == -1]

        # Now we're going to explore our odd observations with help from KMeans
        odd_matrix = to_matrix.fit_transform(odd_df)
        num_clusters = min(len(odd_df), 4)  # 4 clusters unless we have fewer than 4 observations
        display_df['cluster'] = KMeans(n_clusters=num_clusters).fit_predict(odd_matrix)
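
IsolationForest labels anomalous rows with a prediction of -1, which is why both anomaly-detection examples filter with predictions == -1. A minimal sketch of that filtering step on its own (the matrix below is random stand-in data, not Zeek output):

import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

matrix = np.random.rand(100, 3)               # stand-in for a DataFrameToMatrix result
df = pd.DataFrame(matrix, columns=['a', 'b', 'c'])

odd_clf = IsolationForest(contamination=0.2)  # mark roughly 20% of rows as odd
predictions = odd_clf.fit_predict(matrix)     # -1 = anomalous, 1 = normal
odd_df = df[predictions == -1]
print('{:d} rows flagged as odd'.format(len(odd_df)))
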
SuperCowPowers / zat / examples / zeek_to_scikit.py (view on GitHub)
        log_to_df = log_to_dataframe.LogToDataFrame()
        bro_df = log_to_df.create_dataframe(args.bro_log)

        # Add query length
        bro_df['query_length'] = bro_df['query'].str.len()

        # Normalize this field
        #ql = bro_df['query_length']
        #bro_df['query_length_norm'] = (ql - ql.min()) / (ql.max()-ql.min())

        # These are the features we want (note some of these are categorical!)
        features = ['AA', 'RA', 'RD', 'TC', 'Z', 'rejected', 'proto', 'qtype_name', 'rcode_name', 'query_length']
        feature_df = bro_df[features]

        # Use the super awesome DataframeToMatrix class (handles categorical data!)
        to_matrix = dataframe_to_matrix.DataFrameToMatrix()
        bro_matrix = to_matrix.fit_transform(feature_df)

        # Now we're ready for scikit-learn!
        # Just some simple stuff for this example, KMeans and PCA
        kmeans = KMeans(n_clusters=5).fit_predict(bro_matrix)
        pca = PCA(n_components=2).fit_transform(bro_matrix)

        # Now we can put our ML results back onto our dataframe!
        bro_df['x'] = jitter(pca[:, 0]) # PCA X Column
        bro_df['y'] = jitter(pca[:, 1]) # PCA Y Column
        bro_df['cluster'] = kmeans

        # Now use dataframe group by cluster
        show_fields = ['query', 'Z', 'proto', 'qtype_name', 'x', 'y', 'cluster']
        cluster_groups = bro_df[show_fields].groupby('cluster')
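
The groupby at the end of this example is typically followed by a loop over the groups to report each KMeans cluster; a short sketch of that reporting step, reusing the cluster_groups variable from the snippet above:

# Print out the rows in each cluster
for key, group in cluster_groups:
    print('\nCluster {!r}: {:d} observations'.format(key, len(group)))
    print(group.head())
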
SuperCowPowers / zat / examples / anomaly_detection_streaming.py (view on GitHub)
        # Create a Zeek log reader
        print('Opening Data File: {:s}'.format(args.bro_log))
        reader = bro_log_reader.BroLogReader(args.bro_log, tail=True)

        # Create a Zeek IDS log live simulator
        print('Opening Data File: {:s}'.format(args.bro_log))
        reader = live_simulator.LiveSimulator(args.bro_log, eps=10)  # 10 events per second

        # Create a Dataframe Cache
        df_cache = dataframe_cache.DataFrameCache(max_cache_time=600)  # 10 minute cache

        # Streaming Clustering Class
        batch_kmeans = MiniBatchKMeans(n_clusters=5, verbose=True)

        # Use the zat DataframeToMatrix class
        to_matrix = dataframe_to_matrix.DataFrameToMatrix()

        # Add each new row into the cache
        time_delta = 10
        timer = time.time() + time_delta
        FIRST_TIME = True
        for row in reader.readrows():
            df_cache.add_row(row)

            # Every time_delta seconds grab the dataframe from the cache
            if time.time() > timer:
                timer = time.time() + time_delta

                # Get the windowed dataframe (10 minute window)
                bro_df = df_cache.dataframe()

                # Compute some additional data