From a400752a82010999b5025d843009db905d8a9758 Mon Sep 17 00:00:00 2001 From: Saif Kandil <74428638+k0T0z@users.noreply.github.com> Date: Fri, 24 Nov 2023 20:15:37 +0200 Subject: [PATCH] Added .gitignore file, merging https://github.com/elastic/ember/pull/110, https://github.com/elastic/ember/pull/109, https://github.com/elastic/ember/pull/108, https://github.com/elastic/ember/pull/99, and https://github.com/elastic/ember/pull/93 Signed-off-by: Saif Kandil <74428638+k0T0z@users.noreply.github.com> --- .gitignore | 34 ++++++++++++++++++++++ ember/features.py | 72 +++++++++++++++++++++++++++-------------------- 2 files changed, 75 insertions(+), 31 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0b32243 --- /dev/null +++ b/.gitignore @@ -0,0 +1,34 @@ +/**/* + +!/.gitignore + +!/ember/ +!/ember/__init__.py +!/ember/features.py + +!/licenses/ +!/licenses/AGPL-LICENSE-3.0.txt +!/licenses/MIT-LICENSE.txt + +!/malconv/ +!/malconv/malconv.h5 +!/malconv/malconv.py +!/malconv/multi_gpu.py +!/malconv/README.md + +!/resources/ +!/resources/ember-notebook.ipynb +!/resources/ember2018-notebook.ipynb +!/resources/logo.png + +!/scripts/ +!/scripts/classify_binaries.py +!/scripts/init_ember.py + +!/Dockerfile +!/LICENSE.txt +!/README.md +!/requirements_conda.txt +!/requirements_notebook.txt +!/requirements.txt +!/setup.py diff --git a/ember/features.py b/ember/features.py index bbaa138..839fdf7 100644 --- a/ember/features.py +++ b/ember/features.py @@ -12,16 +12,17 @@ for your modeling problem. ''' +import hashlib +import json +import os import re + import lief -import hashlib import numpy as np -import os -import json from sklearn.feature_extraction import FeatureHasher LIEF_MAJOR, LIEF_MINOR, _ = lief.__version__.split('.') -LIEF_EXPORT_OBJECT = int(LIEF_MAJOR) > 0 or ( int(LIEF_MAJOR)==0 and int(LIEF_MINOR) >= 10 ) +LIEF_EXPORT_OBJECT = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 10) LIEF_HAS_SIGNATURE = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 11) @@ -97,7 +98,7 @@ def _entropy_bin_counts(self, block): return Hbin, c def raw_features(self, bytez, lief_binary): - output = np.zeros((16, 16), dtype=np.int) + output = np.zeros((16, 16), dtype=int) a = np.frombuffer(bytez, dtype=np.uint8) if a.shape[0] < self.window: Hbin, c = self._entropy_bin_counts(a) @@ -142,22 +143,23 @@ def raw_features(self, bytez, lief_binary): return {"entry": "", "sections": []} # properties of entry point, or if invalid, the first executable section - + not_found_error_class = lief.lief_errors.not_found if not lief.__version__.startswith("0.9.0") else lief.not_found try: if int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 12): section = lief_binary.section_from_rva(lief_binary.entrypoint - lief_binary.imagebase) + if section is None: - raise lief.not_found + raise not_found_error_class entry_section = section.name - else: # lief < 0.12 + else: # lief < 0.12 entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name - except lief.not_found: - # bad entry point, let's find the first executable section - entry_section = "" - for s in lief_binary.sections: - if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists: - entry_section = s.name - break + except not_found_error_class: + # bad entry point, let's find the first executable section + entry_section = "" + for s in lief_binary.sections: + if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists: + entry_section = s.name + break raw_obj = {"entry": entry_section} raw_obj["sections"] = [{ @@ -189,7 +191,7 @@ def process_raw_features(self, raw_obj): section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0] section_vsize = [(s['name'], s['vsize']) for s in sections] section_vsize_hashed = FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0] - entry_name_hashed = FeatureHasher(50, input_type="string").transform([raw_obj['entry']]).toarray()[0] + entry_name_hashed = FeatureHasher(50, input_type="string").transform([[raw_obj['entry']]]).toarray()[0] characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']] characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0] @@ -267,7 +269,6 @@ def raw_features(self, bytez, lief_binary): # export is a string (LIEF 0.9.0 and earlier) clipped_exports = [export[:10000] for export in lief_binary.exported_functions] - return clipped_exports def process_raw_features(self, raw_obj): @@ -318,7 +319,7 @@ def process_raw_features(self, raw_obj): raw_obj['has_relocations'], raw_obj['has_resources'], raw_obj['has_signature'], raw_obj['has_tls'], raw_obj['symbols'] ], - dtype=np.float32) + dtype=np.float32) class HeaderFileInfo(FeatureType): @@ -499,15 +500,15 @@ class PEFeatureExtractor(object): def __init__(self, feature_version=2, print_feature_warning=True, features_file=''): self.features = [] features = { - 'ByteHistogram': ByteHistogram(), - 'ByteEntropyHistogram': ByteEntropyHistogram(), - 'StringExtractor': StringExtractor(), - 'GeneralFileInfo': GeneralFileInfo(), - 'HeaderFileInfo': HeaderFileInfo(), - 'SectionInfo': SectionInfo(), - 'ImportsInfo': ImportsInfo(), - 'ExportsInfo': ExportsInfo() - } + 'ByteHistogram': ByteHistogram(), + 'ByteEntropyHistogram': ByteEntropyHistogram(), + 'StringExtractor': StringExtractor(), + 'GeneralFileInfo': GeneralFileInfo(), + 'HeaderFileInfo': HeaderFileInfo(), + 'SectionInfo': SectionInfo(), + 'ImportsInfo': ImportsInfo(), + 'ExportsInfo': ExportsInfo() + } if os.path.exists(features_file): with open(features_file, encoding='utf8') as f: @@ -520,22 +521,31 @@ def __init__(self, feature_version=2, print_feature_warning=True, features_file= if not lief.__version__.startswith("0.8.3"): if print_feature_warning: print(f"WARNING: EMBER feature version 1 were computed using lief version 0.8.3-18d5b75") - print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies") + print( + f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies") print(f"WARNING: in the feature calculations.") elif feature_version == 2: self.features.append(DataDirectories()) if not lief.__version__.startswith("0.9.0"): if print_feature_warning: print(f"WARNING: EMBER feature version 2 were computed using lief version 0.9.0-") - print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies") + print( + f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies") print(f"WARNING: in the feature calculations.") else: raise Exception(f"EMBER feature version must be 1 or 2. Not {feature_version}") self.dim = sum([fe.dim for fe in self.features]) def raw_features(self, bytez): - lief_errors = (lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound, - RuntimeError) + if lief.__version__.startswith("0.9.0"): + lief_errors = ( + lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound, RuntimeError) + else: + lief_errors = ( + lief.lief_errors.conversion_error, lief.lief_errors.file_error, lief.lief_errors.file_format_error, + lief.lief_errors.corrupted, lief.lief_errors.parsing_error, lief.lief_errors.read_out_of_bound, + RuntimeError) + try: lief_binary = lief.PE.parse(list(bytez)) except lief_errors as e: