Skip to content

Commit

Permalink
Added .gitignore file, merging elastic/ember#110, elastic/ember#109, e…
Browse files Browse the repository at this point in the history
  • Loading branch information
k0T0z committed Nov 24, 2023
1 parent 5c8f362 commit a400752
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 31 deletions.
34 changes: 34 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Allowlist-style ignore file: the first pattern ignores everything in the
# repository, and each "!" pattern below re-includes one path to be tracked.
/**/*

# Keep this ignore file itself tracked.
!/.gitignore

# A directory is re-included before the files inside it: Git cannot
# re-include a file while its parent directory is still excluded.
!/ember/
!/ember/__init__.py
!/ember/features.py

!/licenses/
!/licenses/AGPL-LICENSE-3.0.txt
!/licenses/MIT-LICENSE.txt

!/malconv/
!/malconv/malconv.h5
!/malconv/malconv.py
!/malconv/multi_gpu.py
!/malconv/README.md

!/resources/
!/resources/ember-notebook.ipynb
!/resources/ember2018-notebook.ipynb
!/resources/logo.png

!/scripts/
!/scripts/classify_binaries.py
!/scripts/init_ember.py

# Top-level files that remain tracked.
!/Dockerfile
!/LICENSE.txt
!/README.md
!/requirements_conda.txt
!/requirements_notebook.txt
!/requirements.txt
!/setup.py
72 changes: 41 additions & 31 deletions ember/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,17 @@
for your modeling problem.
'''

import hashlib
import json
import os
import re

import lief
import hashlib
import numpy as np
import os
import json
from sklearn.feature_extraction import FeatureHasher

LIEF_MAJOR, LIEF_MINOR, _ = lief.__version__.split('.')
LIEF_EXPORT_OBJECT = int(LIEF_MAJOR) > 0 or ( int(LIEF_MAJOR)==0 and int(LIEF_MINOR) >= 10 )
LIEF_EXPORT_OBJECT = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 10)
LIEF_HAS_SIGNATURE = int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 11)


Expand Down Expand Up @@ -97,7 +98,7 @@ def _entropy_bin_counts(self, block):
return Hbin, c

def raw_features(self, bytez, lief_binary):
output = np.zeros((16, 16), dtype=np.int)
output = np.zeros((16, 16), dtype=int)
a = np.frombuffer(bytez, dtype=np.uint8)
if a.shape[0] < self.window:
Hbin, c = self._entropy_bin_counts(a)
Expand Down Expand Up @@ -142,22 +143,23 @@ def raw_features(self, bytez, lief_binary):
return {"entry": "", "sections": []}

# properties of entry point, or if invalid, the first executable section

not_found_error_class = lief.lief_errors.not_found if not lief.__version__.startswith("0.9.0") else lief.not_found
try:
if int(LIEF_MAJOR) > 0 or (int(LIEF_MAJOR) == 0 and int(LIEF_MINOR) >= 12):
section = lief_binary.section_from_rva(lief_binary.entrypoint - lief_binary.imagebase)

if section is None:
raise lief.not_found
raise not_found_error_class
entry_section = section.name
else: # lief < 0.12
else: # lief < 0.12
entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name
except lief.not_found:
# bad entry point, let's find the first executable section
entry_section = ""
for s in lief_binary.sections:
if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists:
entry_section = s.name
break
except not_found_error_class:
# bad entry point, let's find the first executable section
entry_section = ""
for s in lief_binary.sections:
if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists:
entry_section = s.name
break

raw_obj = {"entry": entry_section}
raw_obj["sections"] = [{
Expand Down Expand Up @@ -189,7 +191,7 @@ def process_raw_features(self, raw_obj):
section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0]
section_vsize = [(s['name'], s['vsize']) for s in sections]
section_vsize_hashed = FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0]
entry_name_hashed = FeatureHasher(50, input_type="string").transform([raw_obj['entry']]).toarray()[0]
entry_name_hashed = FeatureHasher(50, input_type="string").transform([[raw_obj['entry']]]).toarray()[0]
characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']]
characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0]

Expand Down Expand Up @@ -267,7 +269,6 @@ def raw_features(self, bytez, lief_binary):
# export is a string (LIEF 0.9.0 and earlier)
clipped_exports = [export[:10000] for export in lief_binary.exported_functions]


return clipped_exports

def process_raw_features(self, raw_obj):
Expand Down Expand Up @@ -318,7 +319,7 @@ def process_raw_features(self, raw_obj):
raw_obj['has_relocations'], raw_obj['has_resources'], raw_obj['has_signature'], raw_obj['has_tls'],
raw_obj['symbols']
],
dtype=np.float32)
dtype=np.float32)


class HeaderFileInfo(FeatureType):
Expand Down Expand Up @@ -499,15 +500,15 @@ class PEFeatureExtractor(object):
def __init__(self, feature_version=2, print_feature_warning=True, features_file=''):
self.features = []
features = {
'ByteHistogram': ByteHistogram(),
'ByteEntropyHistogram': ByteEntropyHistogram(),
'StringExtractor': StringExtractor(),
'GeneralFileInfo': GeneralFileInfo(),
'HeaderFileInfo': HeaderFileInfo(),
'SectionInfo': SectionInfo(),
'ImportsInfo': ImportsInfo(),
'ExportsInfo': ExportsInfo()
}
'ByteHistogram': ByteHistogram(),
'ByteEntropyHistogram': ByteEntropyHistogram(),
'StringExtractor': StringExtractor(),
'GeneralFileInfo': GeneralFileInfo(),
'HeaderFileInfo': HeaderFileInfo(),
'SectionInfo': SectionInfo(),
'ImportsInfo': ImportsInfo(),
'ExportsInfo': ExportsInfo()
}

if os.path.exists(features_file):
with open(features_file, encoding='utf8') as f:
Expand All @@ -520,22 +521,31 @@ def __init__(self, feature_version=2, print_feature_warning=True, features_file=
if not lief.__version__.startswith("0.8.3"):
if print_feature_warning:
print(f"WARNING: EMBER feature version 1 were computed using lief version 0.8.3-18d5b75")
print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies")
print(
f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies")
print(f"WARNING: in the feature calculations.")
elif feature_version == 2:
self.features.append(DataDirectories())
if not lief.__version__.startswith("0.9.0"):
if print_feature_warning:
print(f"WARNING: EMBER feature version 2 were computed using lief version 0.9.0-")
print(f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies")
print(
f"WARNING: lief version {lief.__version__} found instead. There may be slight inconsistencies")
print(f"WARNING: in the feature calculations.")
else:
raise Exception(f"EMBER feature version must be 1 or 2. Not {feature_version}")
self.dim = sum([fe.dim for fe in self.features])

def raw_features(self, bytez):
lief_errors = (lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound,
RuntimeError)
if lief.__version__.startswith("0.9.0"):
lief_errors = (
lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, lief.read_out_of_bound, RuntimeError)
else:
lief_errors = (
lief.lief_errors.conversion_error, lief.lief_errors.file_error, lief.lief_errors.file_format_error,
lief.lief_errors.corrupted, lief.lief_errors.parsing_error, lief.lief_errors.read_out_of_bound,
RuntimeError)

try:
lief_binary = lief.PE.parse(list(bytez))
except lief_errors as e:
Expand Down

0 comments on commit a400752

Please sign in to comment.