diff --git a/contrib/deploy_timesketch.ps1 b/contrib/deploy_timesketch.ps1 index 3c303823f4..126db5a82e 100644 --- a/contrib/deploy_timesketch.ps1 +++ b/contrib/deploy_timesketch.ps1 @@ -66,7 +66,8 @@ Write-Host "* Fetching configuration files.." (Invoke-webrequest -URI $GITHUB_BASE_URL/data/tags.yaml).Content | out-file timesketch\etc\timesketch\tags.yaml -encoding UTF8NoBOM (Invoke-webrequest -URI $GITHUB_BASE_URL/data/plaso.mappings).Content | out-file timesketch\etc\timesketch\plaso.mappings -encoding UTF8NoBOM (Invoke-webrequest -URI $GITHUB_BASE_URL/data/generic.mappings).Content | out-file timesketch\etc\timesketch\generic.mappings -encoding UTF8NoBOM -(Invoke-webrequest -URI $GITHUB_BASE_URL/data/features.yaml).Content | out-file timesketch\etc\timesketch\features.yaml -encoding UTF8NoBOM +(Invoke-webrequest -URI $GITHUB_BASE_URL/data/regex_features.yaml).Content | out-file timesketch\etc\timesketch\regex_features.yaml -encoding UTF8NoBOM +(Invoke-webrequest -URI $GITHUB_BASE_URL/data/winevt_features.yaml).Content | out-file timesketch\etc\timesketch\winevt_features.yaml -encoding UTF8NoBOM (Invoke-webrequest -URI $GITHUB_BASE_URL/data/ontology.yaml).Content | out-file timesketch\etc\timesketch\ontology.yaml -encoding UTF8NoBOM (Invoke-webrequest -URI $GITHUB_BASE_URL/data/intelligence_tag_metadata.yaml).Content | out-file timesketch\etc\timesketch\intelligence_tag_metadata.yaml -encoding UTF8NoBOM (Invoke-webrequest -URI $GITHUB_BASE_URL/data/sigma_config.yaml).Content | out-file timesketch\etc\timesketch\sigma_config.yaml -encoding UTF8NoBOM diff --git a/contrib/deploy_timesketch.sh b/contrib/deploy_timesketch.sh index 42ad9d9ece..1b55dee775 100755 --- a/contrib/deploy_timesketch.sh +++ b/contrib/deploy_timesketch.sh @@ -89,7 +89,8 @@ curl -s $GITHUB_BASE_URL/data/timesketch.conf > timesketch/etc/timesketch/timesk curl -s $GITHUB_BASE_URL/data/tags.yaml > timesketch/etc/timesketch/tags.yaml curl -s $GITHUB_BASE_URL/data/plaso.mappings > timesketch/etc/timesketch/plaso.mappings curl -s $GITHUB_BASE_URL/data/generic.mappings > timesketch/etc/timesketch/generic.mappings -curl -s $GITHUB_BASE_URL/data/features.yaml > timesketch/etc/timesketch/features.yaml +curl -s $GITHUB_BASE_URL/data/regex_features.yaml > timesketch/etc/timesketch/regex_features.yaml +curl -s $GITHUB_BASE_URL/data/winevt_features.yaml > timesketch/etc/timesketch/winevt_features.yaml curl -s $GITHUB_BASE_URL/data/ontology.yaml > timesketch/etc/timesketch/ontology.yaml curl -s $GITHUB_BASE_URL/data/sigma_rule_status.csv > timesketch/etc/timesketch/sigma_rule_status.csv curl -s $GITHUB_BASE_URL/data/tags.yaml > timesketch/etc/timesketch/tags.yaml diff --git a/data/features.yaml b/data/regex_features.yaml similarity index 66% rename from data/features.yaml rename to data/regex_features.yaml index ecbd1e1763..3d30ae4647 100644 --- a/data/features.yaml +++ b/data/regex_features.yaml @@ -183,84 +183,6 @@ ssh_failed_method: store_as: 'authentication_method' re: 'Failed (?P[^\s]+) for .*ssh\d' -win_login_subject_username: - query_string: 'source_name:Microsoft-Windows-Security-Auditing AND (event_identifier:4624 OR event_identifier:4625)' - attribute: 'xml_string' - store_as: 'subject_username' - re: '"SubjectUserName">(?P[^<]+)' - -win_login_subject_domain: - query_string: 'source_name:Microsoft-Windows-Security-Auditing AND (event_identifier:4624 OR event_identifier:4625)' - attribute: 'xml_string' - store_as: 'subject_domain' - re: '"SubjectDomainName">(?P[^<]+)' - -win_login_subject_logon_id: - query_string: 'source_name:Microsoft-Windows-Security-Auditing AND (event_identifier:4624 OR event_identifier:4625)' - attribute: 'xml_string' - store_as: 'subject_logon_id' - re: '"SubjectLogonId">(?P[^<]+)' - -win_login_username: - query_string: 'source_name:Microsoft-Windows-Security-Auditing AND (event_identifier:4624 OR event_identifier:4625)' - attribute: 'xml_string' - store_as: 'username' - re: '"TargetUserName">(?P[^<]+)' - -win_login_domain: - query_string: 'source_name:Microsoft-Windows-Security-Auditing AND (event_identifier:4624 OR event_identifier:4625)' - attribute: 'xml_string' - store_as: 'domain' - re: '"TargetDomainName">(?P[^<]+)' - -win_login_logon_id: - query_string: 'source_name:Microsoft-Windows-Security-Auditing AND (event_identifier:4624 OR event_identifier:4625)' - attribute: 'xml_string' - store_as: 'logon_id' - re: '"TargetLogonId">(?P[^<]+)' - -win_login_logon_type: - query_string: 'source_name:Microsoft-Windows-Security-Auditing AND (event_identifier:4624 OR event_identifier:4625)' - attribute: 'xml_string' - store_as: 'logon_type' - re: '"LogonType">(?P[^<]+)' - -win_login_logon_process_name: - query_string: 'source_name:Microsoft-Windows-Security-Auditing AND (event_identifier:4624 OR event_identifier:4625)' - attribute: 'xml_string' - store_as: 'logon_process_name' - re: '"LogonProcessName">(?P[^<]+)' - -win_login_workstation_name: - query_string: 'source_name:Microsoft-Windows-Security-Auditing AND (event_identifier:4624 OR event_identifier:4625)' - attribute: 'xml_string' - store_as: 'workstation_name' - re: '"WorkstationName">(?P[^<]+)' - -win_login_process_id: - query_string: 'source_name:Microsoft-Windows-Security-Auditing AND (event_identifier:4624 OR event_identifier:4625)' - attribute: 'xml_string' - store_as: 'process_id' - re: '"ProcessId">(?P[^<]+)' - -win_login_process_name: - query_string: 'source_name:Microsoft-Windows-Security-Auditing AND (event_identifier:4624 OR event_identifier:4625)' - attribute: 'xml_string' - store_as: 'process_name' - re: '"ProcessName">(?P[^<]+)' - -win_login_ip_address: - query_string: 'source_name:Microsoft-Windows-Security-Auditing AND (event_identifier:4624 OR event_identifier:4625)' - attribute: 'xml_string' - store_as: 'ip_address' - re: '"IpAddress">(?P[^<]+)' - -win_login_port: - query_string: 'source_name:Microsoft-Windows-Security-Auditing AND (event_identifier:4624 OR event_identifier:4625)' - attribute: 'xml_string' - store_as: 'port' - re: '"IpPort">(?P[^<]+)' - win_bits_client_ipv4_addresses: query_string: 'data_type:"windows:evtx:record" AND source_name:Microsoft-Windows-Bits-Client' attribute: 'strings' diff --git a/docker/dev/build/docker-entrypoint.sh b/docker/dev/build/docker-entrypoint.sh index 8c665f0ca4..5d685d9b3a 100755 --- a/docker/dev/build/docker-entrypoint.sh +++ b/docker/dev/build/docker-entrypoint.sh @@ -9,7 +9,8 @@ if [ "$1" = 'timesketch' ]; then # Copy config files mkdir /etc/timesketch cp /usr/local/src/timesketch/data/timesketch.conf /etc/timesketch/ - cp /usr/local/src/timesketch/data/features.yaml /etc/timesketch/ + cp /usr/local/src/timesketch/data/regex_features.yaml /etc/timesketch/ + cp /usr/local/src/timesketch/data/winevt_features.yaml /etc/timesketch/ cp /usr/local/src/timesketch/data/tags.yaml /etc/timesketch/ cp /usr/local/src/timesketch/data/intelligence_tag_metadata.yaml /etc/timesketch/ cp /usr/local/src/timesketch/data/plaso.mappings /etc/timesketch/ diff --git a/docker/e2e/Dockerfile b/docker/e2e/Dockerfile index 2007d2d479..24b119c2b3 100644 --- a/docker/e2e/Dockerfile +++ b/docker/e2e/Dockerfile @@ -37,7 +37,8 @@ RUN cp /tmp/timesketch/data/timesketch.conf /etc/timesketch/ RUN cp /tmp/timesketch/data/ontology.yaml /etc/timesketch/ RUN cp /tmp/timesketch/data/tags.yaml /etc/timesketch/ RUN cp /tmp/timesketch/data/intelligence_tag_metadata.yaml /etc/timesketch/ -RUN cp /tmp/timesketch/data/features.yaml /etc/timesketch/ +RUN cp /tmp/timesketch/data/regex_features.yaml /etc/timesketch/ +RUN cp /tmp/timesketch/data/winevt_features.yaml /etc/timesketch/ RUN cp /tmp/timesketch/data/plaso.mappings /etc/timesketch/ RUN cp /tmp/timesketch/data/generic.mappings /etc/timesketch/ RUN cp /tmp/timesketch/data/sigma_config.yaml /etc/timesketch/ diff --git a/docs/developers/analyzer-development.md b/docs/developers/analyzer-development.md index 682491e289..a811d07303 100644 --- a/docs/developers/analyzer-development.md +++ b/docs/developers/analyzer-development.md @@ -50,8 +50,8 @@ of the following. If you just want to extract a simple feature, e.g. want to extract a hostname or IP that is somewhere in the message field, or inside another attribute you don't have to write a new analyzer, you can take advantage of the feature_extraction -analyzer. All you need to do is to edit the `features.yaml` file found here: -https://github.com/google/timesketch/blob/master/data/features.yaml +analyzer. All you need to do is to edit the `regex_features.yaml` file found here: +https://github.com/google/timesketch/blob/master/data/regex_features.yaml An example extraction entry looks like this: diff --git a/docs/guides/analyzers/feature_extraction.md b/docs/guides/analyzers/feature_extraction.md index 373fa7b71e..9796aabfdc 100644 --- a/docs/guides/analyzers/feature_extraction.md +++ b/docs/guides/analyzers/feature_extraction.md @@ -3,7 +3,7 @@ hide: - footer --- The feature extraction analyzer creates attributes out of event data based on regular expressions. Different -features can be specified in the `data/features.yaml` file. +features can be specified in the `data/regex_features.yaml` file. Please be aware that this analyzer does *not* extract ipv4, email-addresses and similar from *all* events, but only those that match the query_string. diff --git a/timesketch/lib/analyzers/__init__.py b/timesketch/lib/analyzers/__init__.py index 2f24932576..dd63913dbd 100644 --- a/timesketch/lib/analyzers/__init__.py +++ b/timesketch/lib/analyzers/__init__.py @@ -20,7 +20,6 @@ from timesketch.lib.analyzers import chain from timesketch.lib.analyzers import domain from timesketch.lib.analyzers import expert_sessionizers -from timesketch.lib.analyzers import feature_extraction_plugin from timesketch.lib.analyzers import feature_extraction from timesketch.lib.analyzers import gcp_logging from timesketch.lib.analyzers import geoip diff --git a/timesketch/lib/analyzers/feature_extraction.py b/timesketch/lib/analyzers/feature_extraction.py index 1a7cbbd7a0..258b646fda 100644 --- a/timesketch/lib/analyzers/feature_extraction.py +++ b/timesketch/lib/analyzers/feature_extraction.py @@ -1,353 +1,140 @@ -"""Sketch analyzer plugin for feature extraction.""" -from __future__ import unicode_literals +# Copyright 2023 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Main sketch analyzer for feature extraction.""" import logging +from typing import List, Optional, Dict -import six - -from timesketch.lib import emojis from timesketch.lib.analyzers import interface from timesketch.lib.analyzers import manager -from timesketch.lib.analyzers import utils - +from timesketch.lib.analyzers.feature_extraction_plugins import ( + manager as feature_manager, +) -logger = logging.getLogger("timesketch.analyzers.feature") -RE_FLAGS = [ - "re.ASCII", - "re.IGNORECASE", - "re.LOCALE", - "re.MULTILINE", - "re.DOTALL", - "re.VERBOSE", -] +logger = logging.getLogger("timesketch.analyzers.feature_extraction") class FeatureExtractionSketchPlugin(interface.BaseAnalyzer): - """Analyzer for FeatureExtraction.""" - - NAME = "feature_extraction" - DISPLAY_NAME = "Feature extractor" - DESCRIPTION = "Extract features from event based on stored definitions" - - FORM_FIELDS = [ - { - "name": "query_string", - "type": "ts-dynamic-form-text-input", - "label": "The filter query to narrow down the result set", - "placeholder": "Query", - "default_value": "", - }, - { - "name": "query_dsl", - "type": "ts-dynamic-form-text-input", - "label": "The filter query DSL to narrow down the result", - "placeholder": "Query DSL", - "default_value": "", - }, - { - "name": "attribute", - "type": "ts-dynamic-form-text-input", - "label": "Name of the field to apply regular expression against", - "placeholder": "Field Name", - "default_value": "", - }, - { - "name": "store_as", - "type": "ts-dynamic-form-text-input", - "label": "Name of the field to store the extracted results in", - "placeholder": "Store results as field name", - "default_value": "", - }, - { - "name": "re", - "type": "ts-dynamic-form-text-input", - "label": "The regular expression to extract data from field", - "placeholder": "Regular Expression", - "default_value": "", - }, - { - "name": "re_flags", - "type": "ts-dynamic-form-multi-select-input", - "label": "List of flags to pass to the regular expression", - "placeholder": "Regular Expression flags", - "default_value": [], - "options": RE_FLAGS, - "optional": True, - }, - { - "name": "emojis", - "type": "ts-dynamic-form-multi-select-input", - "label": "List of emojis to add to events with matches", - "placeholder": "Emojis to add to events", - "default_value": [], - "options": [x.code for x in emojis.EMOJI_MAP.values()], - "options-label": [ - "{0:s} - {1:s}".format(x, y.help) for x, y in emojis.EMOJI_MAP.items() - ], - "optional": True, - }, - { - "name": "tags", - "type": "ts-dynamic-form-text-input", - "label": "Tag to add to events with matches", - "placeholder": "Tag to add to events", - "default_value": "", - "optional": True, - }, - { - "name": "create_view", - "type": "ts-dynamic-form-boolean", - "label": "Should a view be created if there is a match", - "placeholder": "Create a view", - "default_value": False, - "optional": True, - }, - { - "name": "store_type_list", - "type": "ts-dynamic-form-boolean", - "label": "Store extracted result in type List", - "placeholder": "Store results as field type list", - "default_value": False, - "optional": True, - }, - { - "name": "overwrite_store_as", - "type": "ts-dynamic-form-boolean", - "label": "Overwrite the field to store if already exist", - "placeholder": "Overwrite the field to store", - "default_value": True, - "optional": True, - }, - { - "name": "overwrite_and_merge_store_as", - "type": "ts-dynamic-form-boolean", - "label": "Overwrite the field to store and merge value if exist", - "placeholder": "Overwrite the field to store and merge value", - "default_value": False, - "optional": True, - }, - { - "name": "keep_multimatch", - "type": "ts-dynamic-form-boolean", - "label": "Keep multi match datas", - "placeholder": "Keep multi match", - "default_value": False, - "optional": True, - }, - { - "name": "aggregate", - "type": "ts-dynamic-form-boolean", - "label": "Should results be aggregated if there is a match", - "placeholder": "Aggregate results", - "default_value": False, - "optional": True, - }, - ] + """Main sketch analyzer for feature extraction. - def __init__(self, index_name, sketch_id, timeline_id=None, **kwargs): - """Initialize The Sketch Analyzer. + This analyzer runs all the feature extractions within the feature_plugins directory. + """ - Args: - index_name: OpenSearch index name - sketch_id: Sketch ID - timeline_id: The ID of the timeline. - """ - self.index_name = index_name - self._feature_name = kwargs.get("feature") - self._feature_config = kwargs.get("feature_config") - super().__init__(index_name, sketch_id, timeline_id=timeline_id) - - def run(self): - """Entry point for the analyzer. - - Returns: - String with summary of the analyzer result. - """ - return self.extract_feature(self._feature_name, self._feature_config) - - @staticmethod - def _get_attribute_value( - current_val, extracted_value, keep_multi, merge_values, type_list - ): - """Returns the attribute value as it should be stored. - - Args: - current_val: current value of store_as. - extracted_value: values matched from regexp (type list). - keep_multi: choice if you keep all match from regex (type boolean). - merge_values: choice if you merge value from extracted - and current (type boolean). - type_list: choice if you store values in list type(type boolean). - - Returns: - Value to store - """ - if not current_val: - merge_values = False - if len(extracted_value) == 1: - keep_multi = False - if type_list: - if merge_values and keep_multi: - return sorted(list(set(current_val) | set(extracted_value))) - if merge_values: - if extracted_value[0] not in current_val: - current_val.append(extracted_value[0]) - return sorted(current_val) - if keep_multi: - return sorted(extracted_value) - return [extracted_value[0]] - if merge_values and keep_multi: - list_cur = current_val.split(",") - merge_list = sorted(list(set(list_cur) | set(extracted_value))) - return ",".join(merge_list) - if merge_values: - if extracted_value[0] in current_val: - return current_val - return f"{current_val},{extracted_value[0]}" - if keep_multi: - return ",".join(extracted_value) - return extracted_value[0] - - def extract_feature(self, name, config): - """Extract features from events. + NAME = "feature_extraction" + DISPLAY_NAME = "Feature Extractions" + DESCRIPTION = ( + "Runs all feature extraction plugins on selected timelines. " + "Currently implemented extractions: * regex features * winevt features." + ) + + DEPENDENCIES = frozenset() + + def __init__( + self, + index_name: str, + sketch_id: int, + timeline_id: Optional[int] = None, + **kwargs, + ) -> None: + """Initializes the sketch analyzer. Args: - name: String with the name describing the feature to be extracted. - config: A dict that contains the configuration for the feature - extraction. See data/features.yaml for fields and further - documentation of what needs to be defined. - - Returns: - String with summary of the analyzer result. + index_name (str): OpenSearch index name. + sketch_id (int): TimeSketch's sketch ID. + timeline_id (int): The ID of the timeline. """ - query = config.get("query_string") - query_dsl = config.get("query_dsl") - attribute = config.get("attribute") - store_type_list = config.get("store_type_list", False) - keep_multimatch = config.get("keep_multimatch", False) - overwrite_store_as = config.get("overwrite_store_as", True) - overwrite_and_merge_store_as = config.get("overwrite_and_merge_store_as", False) - - if not attribute: - logger.warning("No attribute defined.") - return "" - - store_as = config.get("store_as") - if not store_as: - logger.warning("No attribute defined to store results in.") - return "" + self._plugin_name: str = kwargs.get("plugin_name") + self._feature_name: str = kwargs.get("feature_name") + self._feature_config: Dict = kwargs.get("feature_config") - tags = config.get("tags", []) - - expression_string = config.get("re") - if not expression_string: - logger.warning("No regular expression defined.") - return "" - - expression = utils.compile_regular_expression( - expression_string=expression_string, expression_flags=config.get("re_flags") + super().__init__( + index_name=index_name, sketch_id=sketch_id, timeline_id=timeline_id ) - emoji_names = config.get("emojis", []) - emojis_to_add = [emojis.get_emoji(x) for x in emoji_names] + @property + def plugin_name(self) -> str: + return self._plugin_name - return_fields = [attribute, store_as] + @plugin_name.setter + def plugin_name(self, value: str) -> None: + self._plugin_name = value - events = self.event_stream( - query_string=query, query_dsl=query_dsl, return_fields=return_fields - ) + @property + def feature_name(self) -> str: + return self._feature_name - event_counter = 0 - for event in events: - attribute_field = event.source.get(attribute) - if isinstance(attribute_field, six.text_type): - attribute_value = attribute_field - elif isinstance(attribute_field, (list, tuple)): - attribute_value = ",".join(attribute_field) - elif isinstance(attribute_field, (int, float)): - attribute_value = attribute_field - else: - attribute_value = None + @feature_name.setter + def feature_name(self, value: str) -> None: + self._feature_name = value - if not attribute_value: - continue + @property + def feature_config(self) -> Dict: + return self._feature_config - result = expression.findall(attribute_value) - if not result: - continue - result = list(set(result)) + @feature_config.setter + def feature_config(self, value: Dict) -> None: + self._feature_config = value - event_counter += 1 - store_as_current_val = event.source.get(store_as) - if store_as_current_val and not overwrite_store_as: - continue - if isinstance(store_as_current_val, six.text_type): - store_type_list = False - elif isinstance(store_as_current_val, (list, tuple)): - store_type_list = True - new_value = self._get_attribute_value( - store_as_current_val, - result, - keep_multimatch, - overwrite_and_merge_store_as, - store_type_list, - ) - if not new_value: - continue - event.add_attributes({store_as: new_value}) - event.add_emojis(emojis_to_add) - event.add_tags(tags) + def run(self) -> str: + """Entry point for the sketch analyzer. - # Commit the event to the datastore. - event.commit() - - aggregate_results = config.get("aggregate", False) - create_view = config.get("create_view", False) - - # If aggregation is turned on, we automatically create an aggregation. - if aggregate_results: - create_view = True - - if create_view and event_counter: - view = self.sketch.add_view( - name, self.NAME, query_string=query, query_dsl=query_dsl + Returns: + str: A summary of sketch analyzer result. + """ + # Handling unset self._plugin_name + if not self._plugin_name: + logger.debug("Feature extraction plugin name is empty") + return "Feature extraction plugin name is empty" + + try: + plugin_class = feature_manager.PluginManager.get_plugin( + self._plugin_name, self ) - - if aggregate_results: - params = { - "field": store_as, - "limit": 20, - "index": [self.timeline_id], - } - self.sketch.add_aggregation( - name="Top 20 for: {0:s} [{1:s}]".format(store_as, name), - agg_name="field_bucket", - agg_params=params, - description="Created by the feature extraction analyzer", - view_id=view.id, - chart_type="hbarchart", + if not plugin_class: + raise ValueError( + f"Feature extraction plugin {self._plugin_name} is not " + "registered. Check if the feature is registered in " + "feature_plugins." ) - return "Feature extraction [{0:s}] extracted {1:d} features.".format( - name, event_counter - ) + return plugin_class.run_plugin(self._feature_name, self._feature_config) + except ValueError as exception: + logger.error(str(exception)) + return f"Error: {str(exception)}" @staticmethod - def get_kwargs(): + def get_kwargs() -> List[Dict]: """Get kwargs for the analyzer. Returns: - List of features to search for. + List[dict]: A list of dict containing plugin name, feature name and feature + config. """ - features_config = interface.get_yaml_config("features.yaml") - if not features_config: - return "Unable to parse the config features file." + feature_kwargs_list = [] + + plugin_classes = feature_manager.PluginManager.get_plugins(None) + for plugin in plugin_classes: + feature_list = plugin.get_kwargs() + if not feature_list: + logger.debug("No configuration for %s", plugin.NAME) + continue + + for feature_config in feature_list: + feature_config["plugin_name"] = plugin.NAME.lower() + feature_kwargs_list.append(feature_config) - features_kwargs = [ - {"feature": feature, "feature_config": config} - for feature, config in features_config.items() - ] - return features_kwargs + return feature_kwargs_list manager.AnalysisManager.register_analyzer(FeatureExtractionSketchPlugin) diff --git a/timesketch/lib/analyzers/feature_extraction_plugin.py b/timesketch/lib/analyzers/feature_extraction_plugin.py deleted file mode 100644 index 21bb1df84f..0000000000 --- a/timesketch/lib/analyzers/feature_extraction_plugin.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright 2023 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Main sketch analyzer for feature extraction.""" - -import logging -from typing import List, Optional, Dict - -from timesketch.lib.analyzers import interface -from timesketch.lib.analyzers import manager -from timesketch.lib.analyzers.feature_extraction_plugins import ( - manager as feature_manager, -) - -logger = logging.getLogger("timesketch.analyzers.feature_extraction") - - -class FeatureExtractionSketchPlugin(interface.BaseAnalyzer): - """Main sketch analyzer for feature extraction. - - This analyzer runs all the feature extractions within the feature_plugins directory. - """ - - NAME = "feature_extraction_plugin" - DISPLAY_NAME = "Feature Extractions" - DESCRIPTION = ( - "Runs all feature extraction plugins on selected timelines. " - "Currently implemented extractions: * regex features * winevt features." - ) - - DEPENDENCIES = frozenset() - - def __init__( - self, - index_name: str, - sketch_id: int, - timeline_id: Optional[int] = None, - **kwargs, - ) -> None: - """Initializes the sketch analyzer. - - Args: - index_name (str): OpenSearch index name. - sketch_id (int): TimeSketch's sketch ID. - timeline_id (int): The ID of the timeline. - """ - self._plugin_name: str = kwargs.get("plugin_name") - self._feature_name: str = kwargs.get("feature_name") - self._feature_config: Dict = kwargs.get("feature_config") - - super().__init__( - index_name=index_name, sketch_id=sketch_id, timeline_id=timeline_id - ) - - @property - def plugin_name(self) -> str: - return self._plugin_name - - @plugin_name.setter - def plugin_name(self, value: str) -> None: - self._plugin_name = value - - @property - def feature_name(self) -> str: - return self._feature_name - - @feature_name.setter - def feature_name(self, value: str) -> None: - self._feature_name = value - - @property - def feature_config(self) -> Dict: - return self._feature_config - - @feature_config.setter - def feature_config(self, value: Dict) -> None: - self._feature_config = value - - def run(self) -> str: - """Entry point for the sketch analyzer. - - Returns: - str: A summary of sketch analyzer result. - """ - # Handling unset self._plugin_name - if not self._plugin_name: - logger.debug("Feature extraction plugin name is empty") - return "Feature extraction plugin name is empty" - - try: - plugin_class = feature_manager.PluginManager.get_plugin( - self._plugin_name, self - ) - if not plugin_class: - raise ValueError( - f"Feature extraction plugin {self._plugin_name} is not " - "registered. Check if the feature is registered in " - "feature_plugins." - ) - - return plugin_class.run_plugin(self._feature_name, self._feature_config) - except ValueError as exception: - logger.error(str(exception)) - return f"Error: {str(exception)}" - - @staticmethod - def get_kwargs() -> List[Dict]: - """Get kwargs for the analyzer. - - Returns: - List[dict]: A list of dict containing plugin name, feature name and feature - config. - """ - feature_kwargs_list = [] - - plugin_classes = feature_manager.PluginManager.get_plugins(None) - for plugin in plugin_classes: - feature_list = plugin.get_kwargs() - if not feature_list: - logger.debug("No configuration for %s", plugin.NAME) - continue - - for feature_config in feature_list: - feature_config["plugin_name"] = plugin.NAME.lower() - feature_kwargs_list.append(feature_config) - - return feature_kwargs_list - - -manager.AnalysisManager.register_analyzer(FeatureExtractionSketchPlugin) diff --git a/timesketch/lib/analyzers/feature_extraction_plugin_test.py b/timesketch/lib/analyzers/feature_extraction_plugin_test.py deleted file mode 100644 index 47188d5afb..0000000000 --- a/timesketch/lib/analyzers/feature_extraction_plugin_test.py +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright 2023 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Unit tests for feature extraction.""" - -import os -import re -import textwrap -from typing import List, Dict - -import yaml -import mock - -from timesketch.lib import emojis -from timesketch.lib.analyzers.feature_extraction_plugin import ( - FeatureExtractionSketchPlugin, -) -from timesketch.lib.analyzers.feature_extraction_plugins import regex_features -from timesketch.lib.analyzers.sequence_sessionizer_test import _create_eventObj -from timesketch.lib.testlib import BaseTest -from timesketch.lib.testlib import MockDataStore - - -class TestFeatureExtractionSketchPlugin(BaseTest): - """A class to test FeatureExtractionSketchPlugin class methods.""" - - EXPECTED_RESULT = textwrap.dedent( - """winevt feature extraction: [security_4624_v2] extracted 1 features.""" - ) - - def test_winevt_config(self): - """Tests Windows event log feature extraction config.""" - config_file = os.path.join("data", "winevt_features.yaml") - self.assertTrue(os.path.isfile(config_file)) - - with open(config_file, "r", encoding="utf-8") as fh: - config = yaml.safe_load(fh) - - self.assertIsInstance(config, dict) - - for key, value in config.items(): - self.assertIsInstance(key, str) - self.assertIsInstance(value, dict) - - @mock.patch("timesketch.lib.analyzers.interface.OpenSearchDataStore", MockDataStore) - def test_run(self) -> None: - """Tests run method.""" - plugin_object = FeatureExtractionSketchPlugin( - index_name="test", sketch_id=1, timeline_id=1 - ) - - plugin_object.datastore.client = mock.Mock() - datastore = plugin_object.datastore - - self._create_mock_events(datastore) - - plugin_object.plugin_name = "winevt_extraction_plugin" - plugin_object.feature_name = "security_4624_v2" - plugin_object.feature_config = self._get_feature_config( - "winevt_features.yaml", plugin_object.feature_name - ) - - result = plugin_object.run() - self.assertEqual(self.EXPECTED_RESULT, result) - - def _get_feature_config(self, file_name: str, feature_name: str) -> Dict: - """Returns the feature configuration. - - Args: - file_name (str): Feature configuration file name. - feature_name (str): Feature name in the configuration file. - - Returns: - Dict: Configuration parameter for the feature. - """ - path = os.path.join("data", file_name) - - with open(path, "r", encoding="utf-8") as fh: - config = yaml.safe_load(fh) - - for name, config in config.items(): - if name == feature_name: - return config - - return None # Return None if no match. - - def _create_mock_events(self, datastore) -> None: - """Creates mock events.""" - events: List[Dict] = [] - events.extend(self._create_mock_winevt_events()) - - # Adding new events - # Use the following example to extend the events add add mock events. - # Example: events.extend(self._create_mock_xyz_events()) - - event_id = 0 - timestamp = 1672097149681987 - - for event in events: - _create_eventObj(datastore, event_id, timestamp, event) - event_id += 1 - timestamp += 1000000 - - def _create_mock_winevt_events(self) -> List[Dict]: - """Creates mock Windows event log events. - - Returns: - List[Dict]: A list of dictionary containing Windows event logs. - """ - events = [] - - security_4624_v2_event = { - "source_name": "Microsoft-Windows-Security-Auditing", - "event_identifier": 4624, - "event_version": 2, - "strings": [ - "S-1-5-18", - "WIN-MDLVGLNGOM0$", - "WORKGROUP", - "0x00000000000003e7", - "S-1-5-18", - "SYSTEM", - "NT AUTHORITY", - "0x00000000000003e7", - "5", - "Advapi ", - "Negotiate", - "-", - "{00000000-0000-0000-0000-000000000000}", - "-", - "-", - "0", - "0x000000000000026c", - "C:\\Windows\\System32\\services.exe", - "-", - "-", - "%%1833", - "-", - "-", - "-", - "%%1843", - "0x0000000000000000", - "%%1842", - ], - } - events.append(security_4624_v2_event) - - return events - - # Copied from feature_extraction_test - - def _config_validation(self, config): - """Validate that all items of a config are valid.""" - query = config.get("query_string", config.get("query_dsl")) - self.assertIsNotNone(query) - self.assertIsInstance(query, str) - - attribute = config.get("attribute") - self.assertIsNotNone(attribute) - - store_as = config.get("store_as") - self.assertIsNotNone(store_as) - - expression = config.get("re") - self.assertIsNotNone(expression) - try: - _ = re.compile(expression) - except re.error as exception: - self.assertIsNone(exception) - - emojis_to_add = config.get("emojis") - if emojis_to_add: - self.assertIsInstance(emojis_to_add, (list, tuple)) - for emoji_name in emojis_to_add: - emoji_code = emojis.get_emoji(emoji_name) - self.assertNotEqual(emoji_code, "") - - tags = config.get("tags") - if tags: - self.assertIsInstance(tags, (list, tuple)) - - create_view = config.get("create_view") - if create_view: - self.assertIsInstance(create_view, bool) - - aggregate = config.get("aggregate") - if aggregate: - self.assertIsInstance(aggregate, bool) - - # TODO: Add tests for the feature extraction. - def test_config(self): - """Tests that the config file is valid.""" - config_file = os.path.join("data", "features.yaml") - self.assertTrue(os.path.isfile(config_file)) - - with open(config_file) as fh: - config = yaml.safe_load(fh) - - self.assertIsInstance(config, dict) - - for key, value in iter(config.items()): - self.assertIsInstance(key, str) - self.assertIsInstance(value, dict) - self._config_validation(value) - - # Mock the OpenSearch datastore. - @mock.patch("timesketch.lib.analyzers.interface.OpenSearchDataStore", MockDataStore) - def test_get_attribute_value(self): - """Test function _get_attribute_value().""" - analyzer = FeatureExtractionSketchPlugin( - index_name="test_index", sketch_id=1, timeline_id=1 - ) - plugin = regex_features.RegexFeatureExtractionPlugin(analyzer) - - current_val = ["hello"] - extracted_value = ["hello"] - # pylint: disable=protected-access - new_val = plugin._get_attribute_value( - current_val=current_val, - extracted_value=extracted_value, - keep_multi=True, - merge_values=True, - type_list=True, - ) - new_val.sort() - - self.assertEqual(new_val, ["hello"]) - - current_val = ["hello"] - extracted_value = ["hello2", "hello3"] - # pylint: disable=protected-access - new_val = plugin._get_attribute_value( - current_val, extracted_value, True, True, True - ) - new_val.sort() - - self.assertEqual(new_val, ["hello", "hello2", "hello3"]) - - current_val = ["hello"] - extracted_value = ["hello2", "hello3"] - # pylint: disable=protected-access - new_val = plugin._get_attribute_value( - current_val, extracted_value, False, True, True - ) - new_val.sort() - - self.assertEqual(new_val, ["hello", "hello2"]) - - current_val = ["hello"] - extracted_value = ["hello2", "hello3"] - # pylint: disable=protected-access - new_val = plugin._get_attribute_value( - current_val, extracted_value, False, False, True - ) - new_val.sort() - - self.assertEqual(new_val, ["hello2"]) - - current_val = ["hello"] - extracted_value = ["hello2", "hello3"] - # pylint: disable=protected-access - new_val = plugin._get_attribute_value( - current_val, extracted_value, True, False, True - ) - new_val.sort() - - self.assertEqual(new_val, ["hello2", "hello3"]) - - current_val = "hello" - extracted_value = ["hello2", "hello3"] - # pylint: disable=protected-access - new_val = plugin._get_attribute_value( - current_val, extracted_value, True, True, False - ) - - self.assertEqual(new_val, "hello,hello2,hello3") - - current_val = "hello" - extracted_value = ["hello2", "hello3"] - # pylint: disable=protected-access - new_val = plugin._get_attribute_value( - current_val, extracted_value, False, True, False - ) - - self.assertEqual(new_val, "hello,hello2") - - current_val = "hello" - extracted_value = ["hello2", "hello3"] - # pylint: disable=protected-access - new_val = plugin._get_attribute_value( - current_val, extracted_value, True, False, False - ) - - self.assertEqual(new_val, "hello2,hello3") - - current_val = "hello" - extracted_value = ["hello2", "hello3"] - # pylint: disable=protected-access - new_val = plugin._get_attribute_value( - current_val, extracted_value, False, False, False - ) - - self.assertEqual(new_val, "hello2") diff --git a/timesketch/lib/analyzers/feature_extraction_plugins/__init__.py b/timesketch/lib/analyzers/feature_extraction_plugins/__init__.py index 7022ecfb80..8742ef099f 100644 --- a/timesketch/lib/analyzers/feature_extraction_plugins/__init__.py +++ b/timesketch/lib/analyzers/feature_extraction_plugins/__init__.py @@ -13,5 +13,5 @@ # limitations under the License. """Imports for the feature extraction plugins.""" -from timesketch.lib.analyzers.feature_extraction_plugins import winevt_features from timesketch.lib.analyzers.feature_extraction_plugins import regex_features +from timesketch.lib.analyzers.feature_extraction_plugins import winevt_features diff --git a/timesketch/lib/analyzers/feature_extraction_plugins/regex_features.py b/timesketch/lib/analyzers/feature_extraction_plugins/regex_features.py index b383d1591c..030c05ca25 100644 --- a/timesketch/lib/analyzers/feature_extraction_plugins/regex_features.py +++ b/timesketch/lib/analyzers/feature_extraction_plugins/regex_features.py @@ -210,7 +210,7 @@ def extract_feature(self, name, config): Args: name: String with the name describing the feature to be extracted. config: A dict that contains the configuration for the feature - extraction. See data/features.yaml for fields and further + extraction. See data/regex_features.yaml for fields and further documentation of what needs to be defined. Returns: diff --git a/timesketch/lib/analyzers/feature_extraction_plugins/winevt_features.py b/timesketch/lib/analyzers/feature_extraction_plugins/winevt_features.py index d90b1413a0..223219673b 100644 --- a/timesketch/lib/analyzers/feature_extraction_plugins/winevt_features.py +++ b/timesketch/lib/analyzers/feature_extraction_plugins/winevt_features.py @@ -184,10 +184,18 @@ def extract_features(self, name: str, config: dict) -> str: attribute_name, event.event_id, ) - event.add_comment( - f"Analyzer[{self.NAME}]: [{name}] The index '{string_index}' " - f"for field '{attribute_name}' does not exist in strings!" + comment_message = ( + f"Analyzer[{self.NAME}]: [{name}] The index '{string_index}'" + f" for field '{attribute_name}' does not exist in strings!" ) + add_comment = True + if len(event.get_comments()) > 0: + for comment in event.get_comments(): + if comment_message == comment.comment: + add_comment = False + break + if add_comment: + event.add_comment(comment_message) continue attributes[attribute_name] = attribute_value diff --git a/timesketch/lib/analyzers/feature_extraction_test.py b/timesketch/lib/analyzers/feature_extraction_test.py index 60565c0802..6ce8529e8e 100644 --- a/timesketch/lib/analyzers/feature_extraction_test.py +++ b/timesketch/lib/analyzers/feature_extraction_test.py @@ -1,21 +1,163 @@ -"""Tests for FeatureExtractionPlugin.""" -from __future__ import unicode_literals +# Copyright 2023 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Unit tests for feature extraction.""" import os import re -import yaml +import textwrap +from typing import List, Dict +import yaml import mock from timesketch.lib import emojis -from timesketch.lib.analyzers import feature_extraction +from timesketch.lib.analyzers.feature_extraction import ( + FeatureExtractionSketchPlugin, +) +from timesketch.lib.analyzers.feature_extraction_plugins import regex_features +from timesketch.lib.analyzers.sequence_sessionizer_test import _create_eventObj from timesketch.lib.testlib import BaseTest - from timesketch.lib.testlib import MockDataStore -class TestFeatureExtractionPlugin(BaseTest): - """Tests the functionality of the analyzer.""" +class TestFeatureExtractionSketchPlugin(BaseTest): + """A class to test FeatureExtractionSketchPlugin class methods.""" + + EXPECTED_RESULT = textwrap.dedent( + """winevt feature extraction: [security_4624_v2] extracted 1 features.""" + ) + + def test_winevt_config(self): + """Tests Windows event log feature extraction config.""" + config_file = os.path.join("data", "winevt_features.yaml") + self.assertTrue(os.path.isfile(config_file)) + + with open(config_file, "r", encoding="utf-8") as fh: + config = yaml.safe_load(fh) + + self.assertIsInstance(config, dict) + + for key, value in config.items(): + self.assertIsInstance(key, str) + self.assertIsInstance(value, dict) + + @mock.patch("timesketch.lib.analyzers.interface.OpenSearchDataStore", MockDataStore) + def test_run(self) -> None: + """Tests run method.""" + plugin_object = FeatureExtractionSketchPlugin( + index_name="test", sketch_id=1, timeline_id=1 + ) + + plugin_object.datastore.client = mock.Mock() + datastore = plugin_object.datastore + + self._create_mock_events(datastore) + + plugin_object.plugin_name = "winevt_extraction_plugin" + plugin_object.feature_name = "security_4624_v2" + plugin_object.feature_config = self._get_feature_config( + "winevt_features.yaml", plugin_object.feature_name + ) + + result = plugin_object.run() + self.assertEqual(self.EXPECTED_RESULT, result) + + def _get_feature_config(self, file_name: str, feature_name: str) -> Dict: + """Returns the feature configuration. + + Args: + file_name (str): Feature configuration file name. + feature_name (str): Feature name in the configuration file. + + Returns: + Dict: Configuration parameter for the feature. + """ + path = os.path.join("data", file_name) + + with open(path, "r", encoding="utf-8") as fh: + config = yaml.safe_load(fh) + + for name, config in config.items(): + if name == feature_name: + return config + + return None # Return None if no match. + + def _create_mock_events(self, datastore) -> None: + """Creates mock events.""" + events: List[Dict] = [] + events.extend(self._create_mock_winevt_events()) + + # Adding new events + # Use the following example to extend the events add add mock events. + # Example: events.extend(self._create_mock_xyz_events()) + + event_id = 0 + timestamp = 1672097149681987 + + for event in events: + _create_eventObj(datastore, event_id, timestamp, event) + event_id += 1 + timestamp += 1000000 + + def _create_mock_winevt_events(self) -> List[Dict]: + """Creates mock Windows event log events. + + Returns: + List[Dict]: A list of dictionary containing Windows event logs. + """ + events = [] + + security_4624_v2_event = { + "source_name": "Microsoft-Windows-Security-Auditing", + "event_identifier": 4624, + "event_version": 2, + "strings": [ + "S-1-5-18", + "WIN-MDLVGLNGOM0$", + "WORKGROUP", + "0x00000000000003e7", + "S-1-5-18", + "SYSTEM", + "NT AUTHORITY", + "0x00000000000003e7", + "5", + "Advapi ", + "Negotiate", + "-", + "{00000000-0000-0000-0000-000000000000}", + "-", + "-", + "0", + "0x000000000000026c", + "C:\\Windows\\System32\\services.exe", + "-", + "-", + "%%1833", + "-", + "-", + "-", + "%%1843", + "0x0000000000000000", + "%%1842", + ], + } + events.append(security_4624_v2_event) + + return events + + # Test for regex_extraction_plugin (old feature extractor): def _config_validation(self, config): """Validate that all items of a config are valid.""" @@ -58,7 +200,7 @@ def _config_validation(self, config): # TODO: Add tests for the feature extraction. def test_config(self): """Tests that the config file is valid.""" - config_file = os.path.join("data", "features.yaml") + config_file = os.path.join("data", "regex_features.yaml") self.assertTrue(os.path.isfile(config_file)) with open(config_file) as fh: @@ -75,11 +217,15 @@ def test_config(self): @mock.patch("timesketch.lib.analyzers.interface.OpenSearchDataStore", MockDataStore) def test_get_attribute_value(self): """Test function _get_attribute_value().""" - analyzer = feature_extraction.FeatureExtractionSketchPlugin("test_index", 1) + analyzer = FeatureExtractionSketchPlugin( + index_name="test_index", sketch_id=1, timeline_id=1 + ) + plugin = regex_features.RegexFeatureExtractionPlugin(analyzer) + current_val = ["hello"] extracted_value = ["hello"] # pylint: disable=protected-access - new_val = analyzer._get_attribute_value( + new_val = plugin._get_attribute_value( current_val=current_val, extracted_value=extracted_value, keep_multi=True, @@ -93,7 +239,7 @@ def test_get_attribute_value(self): current_val = ["hello"] extracted_value = ["hello2", "hello3"] # pylint: disable=protected-access - new_val = analyzer._get_attribute_value( + new_val = plugin._get_attribute_value( current_val, extracted_value, True, True, True ) new_val.sort() @@ -103,7 +249,7 @@ def test_get_attribute_value(self): current_val = ["hello"] extracted_value = ["hello2", "hello3"] # pylint: disable=protected-access - new_val = analyzer._get_attribute_value( + new_val = plugin._get_attribute_value( current_val, extracted_value, False, True, True ) new_val.sort() @@ -113,7 +259,7 @@ def test_get_attribute_value(self): current_val = ["hello"] extracted_value = ["hello2", "hello3"] # pylint: disable=protected-access - new_val = analyzer._get_attribute_value( + new_val = plugin._get_attribute_value( current_val, extracted_value, False, False, True ) new_val.sort() @@ -123,7 +269,7 @@ def test_get_attribute_value(self): current_val = ["hello"] extracted_value = ["hello2", "hello3"] # pylint: disable=protected-access - new_val = analyzer._get_attribute_value( + new_val = plugin._get_attribute_value( current_val, extracted_value, True, False, True ) new_val.sort() @@ -133,7 +279,7 @@ def test_get_attribute_value(self): current_val = "hello" extracted_value = ["hello2", "hello3"] # pylint: disable=protected-access - new_val = analyzer._get_attribute_value( + new_val = plugin._get_attribute_value( current_val, extracted_value, True, True, False ) @@ -142,7 +288,7 @@ def test_get_attribute_value(self): current_val = "hello" extracted_value = ["hello2", "hello3"] # pylint: disable=protected-access - new_val = analyzer._get_attribute_value( + new_val = plugin._get_attribute_value( current_val, extracted_value, False, True, False ) @@ -151,7 +297,7 @@ def test_get_attribute_value(self): current_val = "hello" extracted_value = ["hello2", "hello3"] # pylint: disable=protected-access - new_val = analyzer._get_attribute_value( + new_val = plugin._get_attribute_value( current_val, extracted_value, True, False, False ) @@ -160,7 +306,7 @@ def test_get_attribute_value(self): current_val = "hello" extracted_value = ["hello2", "hello3"] # pylint: disable=protected-access - new_val = analyzer._get_attribute_value( + new_val = plugin._get_attribute_value( current_val, extracted_value, False, False, False )