Skip to content

Commit

Permalink
Added a regex in the config file for each resource to exclude certain…
Browse files Browse the repository at this point in the history
… types of files
  • Loading branch information
JulienTD committed Feb 23, 2022
1 parent 3f1e9c9 commit a8b9a76
Show file tree
Hide file tree
Showing 11 changed files with 121 additions and 41 deletions.
20 changes: 14 additions & 6 deletions hashtheplanet/config/config.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
"""
This module handles the config file.
"""
from enum import Enum
import json
from typing import Dict, List

from hashtheplanet.resources.git_resource import GitResource
from hashtheplanet.resources.npm_resource import NpmResource

class ConfigField(Enum):
"""
This enum contains every field that can be found in the config file
"""
TARGETS = "targets"
EXCLUDE_REGEX = "exclude_regex"

class Config():
"""
This class implements methods to manipulate the config file.
Expand All @@ -25,15 +33,15 @@ def parse(self, config_path: str):
with open(config_path, "r", encoding="utf-8") as file_fp:
self._config = json.load(file_fp)

def get_targets(self, resource_name: str) -> List[str]:
def get(self, resource_name: str, config_field: ConfigField):
"""
This methods returns the targets used by the given resource.
This methods returns a field content used by the given resource.
"""
module_info: Dict = self._config.get(resource_name)
field_content: Dict = self._config.get(resource_name)

if module_info is None:
return []
return module_info.get("targets")
if not config_field or not field_content:
return None
return field_content.get(config_field.value)

def get_used_resources(self) -> List[str]:
"""
Expand Down
7 changes: 4 additions & 3 deletions hashtheplanet/core/hashtheplanet.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from sqlalchemy.orm import sessionmaker

# project imports
from hashtheplanet.config.config import Config
from hashtheplanet.config.config import Config, ConfigField
from hashtheplanet.executor.executor import Executor
from hashtheplanet.sql.db_connector import Base, DbConnector, Hash

Expand Down Expand Up @@ -90,10 +90,11 @@ def compute_hashs(self):
self._config.parse(self._input_file)

for resource_name in self._config.get_used_resources():
targets = self._config.get_targets(resource_name)
targets = self._config.get(resource_name, ConfigField.TARGETS) or []
exclude_regex = self._config.get(resource_name, ConfigField.EXCLUDE_REGEX)

for target in targets:
self._executor.execute(resource_name, target)
self._executor.execute(resource_name, target, exclude_regex)

logger.info("Computing done")

Expand Down
5 changes: 3 additions & 2 deletions hashtheplanet/executor/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
This module handles the resource executions.
"""
from importlib import import_module
from typing import Optional

from loguru import logger

Expand All @@ -17,7 +18,7 @@ def __init__(self, database: DbConnector, session_scope):
self._database = database
self._session_scope = session_scope

def execute(self, resource_name: str, target: str):
def execute(self, resource_name: str, target: str, exclude_regex: Optional[str] = None):
"""
This method executes a resource to compute hashes.
"""
Expand All @@ -31,4 +32,4 @@ def execute(self, resource_name: str, target: str):
return

resource_instance: Resource = getattr(module, resource_class_name)(self._database)
resource_instance.compute_hashes(self._session_scope, target)
resource_instance.compute_hashes(self._session_scope, target, exclude_regex)
14 changes: 9 additions & 5 deletions hashtheplanet/resources/git_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import subprocess
import tempfile
from stat import S_ISDIR, S_ISREG
from typing import List, Tuple
from typing import List, Optional, Tuple

# third party imports
from git import GitCommandError, Repo
Expand Down Expand Up @@ -54,10 +54,11 @@ def get_all_files_from_commit(commit: Commit) -> List[Tuple[FilePath, BlobHash]]
file_list.append((blob.path, blob.hexsha))
return file_list

@staticmethod
def _hash_files(
self,
files: List[GitFileMetadata],
repo_dir_path: str
repo_dir_path: str,
exclude_regex: Optional[str]
) -> List[FileMetadata]:
"""
This method calculates the SHA256 hashes of input files.
Expand All @@ -69,6 +70,8 @@ def _hash_files(
os.chdir(repo_dir_path)

for (file_path, tag_name, blob_hash) in files:
if not self.can_save(exclude_regex, file_path):
continue
try:
# We need to use a subprocess and not the GitPython library
# because when we execute "git cat-file -p [blob]" with it, it always removes the \n from the last line.
Expand Down Expand Up @@ -192,7 +195,7 @@ def _filter_stored_tags(stored_versions: List[VersionTable], found_tags: List[Ta
result.append(found_tag)
return result

def compute_hashes(self, session_scope, target: str):
def compute_hashes(self, session_scope, target: str, exclude_regex: Optional[str]):
"""
This method clones the repository from url, retrieves tags, compares each tags to retrieve only modified files,
computes their hashes and then stores the tags & files information in the database.
Expand All @@ -203,6 +206,7 @@ def compute_hashes(self, session_scope, target: str):

with tempfile.TemporaryDirectory() as tmp_dir_name:
try:
logger.info(f"Cloning {target}")
repo = self.clone_repository(target, tmp_dir_name)
except GitCommandError as error:
logger.warning(f"Error while cloning repository on {target}: {error}")
Expand All @@ -225,7 +229,7 @@ def compute_hashes(self, session_scope, target: str):
files += self._get_diff_files(tags)

logger.info("Generating hashes ...")
files_info = self._hash_files(files, tmp_dir_name)
files_info = self._hash_files(files, tmp_dir_name, exclude_regex)

logger.info("Saving hashes ...")
self._save_hashes(session_scope, files_info, tags, technology)
11 changes: 5 additions & 6 deletions hashtheplanet/resources/npm_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#standard imports
import tarfile
import tempfile
from typing import Dict, List, Set, Tuple
from typing import Dict, List, Optional, Set, Tuple
import requests

# third party imports
Expand Down Expand Up @@ -61,8 +61,7 @@ def save_tar_to_disk(file_path: str, npm_module_name: str, version: str):
with open(file_path, 'wb') as file_fd:
file_fd.write(request.content)

@staticmethod
def extract_hashes_from_tar(file_path: str) -> List[FileMetadata]:
def extract_hashes_from_tar(self, file_path: str, exclude_regex: Optional[str]) -> List[FileMetadata]:
"""
This method returns all hashes of all files contained in a tar file.
"""
Expand All @@ -72,7 +71,7 @@ def extract_hashes_from_tar(file_path: str) -> List[FileMetadata]:
for member in tar.getmembers():
file = tar.extractfile(member)

if file is None:
if file is None or not self.can_save(exclude_regex, member.path):
continue
files.append((member.path, Hash.hash_bytes(file.read())))
return files
Expand All @@ -94,7 +93,7 @@ def _save_hashes(
self._database.insert_file(session, npm_module_name, file_path)
self._database.insert_or_update_hash(session, file_hash, npm_module_name, [version])

def compute_hashes(self, session_scope, target: str):
def compute_hashes(self, session_scope, target: str, exclude_regex: Optional[str]):
"""
This method downloads all versions of an npm module and stores all the versions with their associated files
and hashes and stores them in the database.
Expand All @@ -108,6 +107,6 @@ def compute_hashes(self, session_scope, target: str):
file_path = f"{tmp_dir_name}/{target}-{version}.tgz"

self.save_tar_to_disk(file_path, target, version)
files_info[version] = self.extract_hashes_from_tar(file_path)
files_info[version] = self.extract_hashes_from_tar(file_path, exclude_regex)

self._save_hashes(session_scope, files_info, versions, target)
16 changes: 15 additions & 1 deletion hashtheplanet/resources/resource.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
"""
This module contains the base class for the resources.
"""
import re
from typing import Optional

from hashtheplanet.sql.db_connector import DbConnector

class Resource(): # pylint: disable=too-few-public-methods
Expand All @@ -12,8 +15,19 @@ class Resource(): # pylint: disable=too-few-public-methods
def __init__(self, database: DbConnector):
self._database = database

def compute_hashes(self, session_scope, target: str):
def compute_hashes(self, session_scope, target: str, exclude_regex: Optional[str]):
"""
This method computes all the versions and their associated files & hashes and stores them in the database.
"""
raise NotImplementedError()

@staticmethod
def can_save(exclude_regex: str, file_path: str):
"""
This method permits to verify if the specified file can be saved in the database or not
"""
if not file_path:
return False
if not exclude_regex:
return True
return not re.search(exclude_regex, file_path)
6 changes: 4 additions & 2 deletions src/tech_list.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
"https://github.com/drupal/drupal.git",
"https://github.com/magento/magento2.git",
"https://github.com/joomla/joomla-cms.git"
]
],
"exclude_regex": "\\.php|tests\\/|test\\/|\\.gitignore|\\.package|\\.idea|qunit|\\.editorconfig|vendor\\/|\\.vue|\\.scss|\\.less"
},
"npm": {
"targets": [
"underscore",
"jquery"
]
],
"exclude_regex": "tests\\/|test\\/|\\.jshintrc|\\.npmignore|\\.bowerrc|\\.jscsrc|\\.bower.json|\\.eslintrc.json|\\.jshintignore|\\.eslintrc|\\.scss|\\.less|\\.vue|\\.scss|\\.less|\\.idea"
}
}
6 changes: 3 additions & 3 deletions tests/config/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from unittest.mock import MagicMock, mock_open, patch

# project imports
from hashtheplanet.config.config import Config
from hashtheplanet.config.config import Config, ConfigField

def get_mock_open(files: Dict[str, str]):
def open_mock(filename, *args, **kwargs):
Expand Down Expand Up @@ -61,9 +61,9 @@ def test_get_targets():
config = Config()

with patch.dict(config._config, {"git": {"targets": ["target1", "target2"]}}):
assert len(config.get_targets("git")) == 2
assert len(config.get("git", ConfigField.TARGETS)) == 2

assert len(config.get_targets("npm")) == 0
assert not config.get("npm", ConfigField.TARGETS)

def test_get_used_resources():
config = Config()
Expand Down
33 changes: 27 additions & 6 deletions tests/resources/test_git_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,11 @@ def test_hash_files():

git_resource = GitResource(None)

# when there are no excluded files
with mock.patch("subprocess.check_output", subprocess_mock(blobs)) as sp_mock, \
mock.patch("os.getcwd", return_value="/foobar/") as getcwd_mock, \
mock.patch("os.chdir", return_value=None) as chdir_mock:
files_metadata = git_resource._hash_files(git_files_metadata, "repo_dir_path")
files_metadata = git_resource._hash_files(git_files_metadata, "repo_dir_path", None)

assert sp_mock.call_count == 3
sp_mock.assert_called_with(['git', 'cat-file', '-p', 'e42f952edc48e2c085c206166bf4f1ead4d4b058'], shell=False)
Expand All @@ -114,11 +115,31 @@ def test_hash_files():
assert files_metadata[1][1] == "1.2.5"
assert files_metadata[1][2] == hashlib.sha256(blobs.get("e42f952edc48e2c085c206166bf4f1ead4d4b058")).hexdigest()

git_resource = GitResource(None)

# When the *.cfg files are excluded
with mock.patch("subprocess.check_output", subprocess_mock(blobs)) as sp_mock, \
mock.patch("os.getcwd", return_value="/foobar/") as getcwd_mock, \
mock.patch("os.chdir", return_value=None) as chdir_mock:
files_metadata = git_resource._hash_files(git_files_metadata, "repo_dir_path", "\\.cfg$")

assert sp_mock.call_count == 2

getcwd_mock.assert_called_once()

assert chdir_mock.call_count == 2
chdir_mock.assert_called_with("/foobar/")

assert len(files_metadata) == 1

assert files_metadata[0][0] == "LICENSE"
assert files_metadata[0][1] == "1.2.3"
assert files_metadata[0][2] == hashlib.sha256(blobs.get("d159169d1050894d3ea3b98e1c965c4058208fe1")).hexdigest()

with mock.patch("subprocess.check_output", subprocess_mock(blobs)) as sp_mock, \
mock.patch("os.getcwd", return_value="/foobar/") as getcwd_mock, \
mock.patch("os.chdir", return_value=None) as chdir_mock:
files_metadata = git_resource._hash_files([["empty", "1.2.1", "empty"]], "repo_dir_path")
files_metadata = git_resource._hash_files([["empty", "1.2.1", "empty"]], "repo_dir_path", None)

assert sp_mock.call_count == 1
sp_mock.assert_called_with(['git', 'cat-file', '-p', 'empty'], shell=False)
Expand All @@ -134,7 +155,7 @@ def test_hash_files():
with mock.patch.object(subprocess, "check_output", MagicMock(side_effect=ValueError("error"))) as mock_exec, \
mock.patch("os.getcwd", return_value="/foobar/") as getcwd_mock, \
mock.patch("os.chdir", return_value=None) as chdir_mock:
git_resource._hash_files(git_files_metadata, "repo_dir_path")
git_resource._hash_files(git_files_metadata, "repo_dir_path", None)

getcwd_mock.assert_called_once()

Expand Down Expand Up @@ -411,7 +432,7 @@ def mock_tmp_dir():
session = MagicMock()
git_resource = GitResource(DbConnector())

git_resource.compute_hashes(session, repo_url)
git_resource.compute_hashes(session, repo_url, None)

# In this situation, we verify that by giving a good repo_url & a good tmp_dir_path
# we download the tags, calculate hash & store them in the database
Expand All @@ -420,7 +441,7 @@ def mock_tmp_dir():
mock_get_tag_files.assert_called_once_with(tags[0])
mock_filter_stored_tags.assert_called_once_with([], tags)
mock_get_diff_files.assert_called_once_with(tags)
mock_hash_files.assert_called_once_with([1, 2], tmp_dir_path)
mock_hash_files.assert_called_once_with([1, 2], tmp_dir_path, None)
mock_save_hashes.assert_called_once_with(session, "hashed files", tags, "foobar")

with patch.object(
Expand All @@ -435,7 +456,7 @@ def mock_tmp_dir():
patch.object(GitResource, "_hash_files", return_value="hashed files") as mock_hash_files, \
patch.object(GitResource, "_save_hashes") as mock_save_hashes, \
patch.object(DbConnector, "get_versions") as mock_get_versions:
git_resource.compute_hashes(MagicMock(), repo_url)
git_resource.compute_hashes(MagicMock(), repo_url, None)
mock_clone_repo.assert_called_once_with(repo_url, tmp_dir_path)

# In this situation, we verify that by giving a wrong repository we stop the function
Expand Down
Loading

0 comments on commit a8b9a76

Please sign in to comment.