From 6df1a2de83950160db4233534040274ad236125b Mon Sep 17 00:00:00 2001 From: Oliver Willekens Date: Wed, 30 Jan 2019 22:26:50 +0100 Subject: [PATCH 1/8] Simplify constructs, improve docstring --- dependencies/spark.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/dependencies/spark.py b/dependencies/spark.py index b3f45a1..8116945 100644 --- a/dependencies/spark.py +++ b/dependencies/spark.py @@ -8,8 +8,7 @@ import __main__ from os import environ, listdir, path -from json import loads - +import json from pyspark import SparkFiles from pyspark.sql import SparkSession @@ -29,7 +28,7 @@ def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[], This function also looks for a file ending in 'config.json' that can be sent with the Spark job. If it is found, it is opened, the contents parsed (assuming it contains valid JSON for the ETL job - configuration), into a dict of ETL job configuration parameters, + configuration) into a dict of ETL job configuration parameters, which are returned as the last element in the tuple returned by this function. If the file cannot be found then the return tuple only contains the Spark session and Spark logger objects and None @@ -37,9 +36,9 @@ def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[], The function checks the enclosing environment to see if it is being run from inside an interactive console session or from an - environment which has a `DEBUG` environment varibale set (e.g. + environment which has a `DEBUG` environment variable set (e.g. setting `DEBUG=1` as an environment variable as part of a debug - configuration within an IDE such as Visual Studio Code or PyCharm in + configuration within an IDE such as Visual Studio Code or PyCharm. In this scenario, the function uses all available function arguments to start a PySpark driver from the local PySpark package as opposed to using the spark-submit and Spark cluster defaults. This will also @@ -47,7 +46,7 @@ def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[], sent to spark via the --py-files flag in spark-submit. :param app_name: Name of Spark app. - :param master: Cluster connection details (defaults to local[*]. + :param master: Cluster connection details (defaults to local[*]). :param jar_packages: List of Spark JAR package names. :param files: List of files to send to Spark cluster (master and workers). 
@@ -57,8 +56,8 @@ def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[], """ # detect execution environment - flag_repl = False if hasattr(__main__, '__file__') else True - flag_debug = True if 'DEBUG' in environ.keys() else False + flag_repl = not(hasattr(__main__, '__file__')) + flag_debug = 'DEBUG' in environ.keys() if not (flag_repl or flag_debug): # get Spark session factory @@ -95,11 +94,10 @@ def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[], for filename in listdir(spark_files_dir) if filename.endswith('config.json')] - if len(config_files) != 0: + if config_files: path_to_config_file = path.join(spark_files_dir, config_files[0]) with open(path_to_config_file, 'r') as config_file: - config_json = config_file.read().replace('\n', '') - config_dict = loads(config_json) + config_dict = json.load(config_file) spark_logger.warn('loaded config from ' + config_files[0]) else: spark_logger.warn('no config file found') From 6bf6828baa39e88ae1bb44a9656f8f8fede26885 Mon Sep 17 00:00:00 2001 From: Oliver Willekens Date: Wed, 30 Jan 2019 23:19:31 +0100 Subject: [PATCH 2/8] get conf only once --- dependencies/logging.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dependencies/logging.py b/dependencies/logging.py index 25a56d2..96a6c0b 100644 --- a/dependencies/logging.py +++ b/dependencies/logging.py @@ -15,8 +15,9 @@ class Log4j(object): def __init__(self, spark): # get spark app details with which to prefix all messages - app_id = spark.sparkContext.getConf().get('spark.app.id') - app_name = spark.sparkContext.getConf().get('spark.app.name') + conf = spark.sparkContext.getConf() + app_id = conf.get('spark.app.id') + app_name = conf.get('spark.app.name') log4j = spark._jvm.org.apache.log4j message_prefix = '<' + app_name + ' ' + app_id + '>' From 7ec4d62a62c019e2d65dfee58b9ff0d2151dc105 Mon Sep 17 00:00:00 2001 From: Oliver Willekens Date: Thu, 31 Jan 2019 00:42:13 +0100 Subject: [PATCH 3/8] Added missing subdir annotation, improved spelling. --- README.md | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index ec03c89..d7e853f 100644 --- a/README.md +++ b/README.md @@ -13,14 +13,14 @@ The basic project structure is as follows: ```bash root/ + |-- configs/ + | |-- etl_config.json |-- dependencies/ | |-- logging.py | |-- spark.py |-- jobs/ | |-- etl_job.py - |-- configs/ - | |-- etl_config.json - | tests/ + |-- tests/ | |-- test_data/ | |-- | -- employees/ | |-- | -- employees_report/ @@ -31,19 +31,19 @@ root/ | Pipfile.lock ``` -The main Python module containing the ETL job (which will be sent to the Spark cluster), is `jobs/etl_job.py`. Any external configuration parameters required by `etl_job.py` are stored in JSON format in `configs/etl_config.json`. Additional modules that support this job can be kept in the `dependencies` folder (more on this later). In the project's root we include `build_dependencies.sh`, which is bash script for building these dependencies into a zip-file to be sent to the cluster (`packages.zip`). Unit test modules are kept in the `tests` folder and small chunks of representative input and output data, to be use with the tests, are kept in `tests/test_data` folder. +The main Python module containing the ETL job (which will be sent to the Spark cluster), is `jobs/etl_job.py`. Any external configuration parameters required by `etl_job.py` are stored in JSON format in `configs/etl_config.json`. 
Additional modules that support this job can be kept in the `dependencies` folder (more on this later). In the project's root we include `build_dependencies.sh`, which is a bash script for building these dependencies into a zip-file to be sent to the cluster (`packages.zip`). Unit test modules are kept in the `tests` folder and small chunks of representative input and output data, to be used with the tests, are kept in `tests/test_data` folder. ## Structure of an ETL Job -In order to facilitate easy debugging and testing, we recommend that the 'Transformation' step be isolated from the 'Extract' and 'Load' steps, into it's own function - taking input data arguments in the form of DataFrames and returning the transformed data as a single DataFrame. Then, the code that surrounds the use of the transformation function in the `main()` job function, is concerned with Extracting the data, passing it to the transformation function and then Loading (or writing) the results to their ultimate destination. Testing is simplified, as mock or test data can be passed to the transformation function and the results explicitly verified, which would not be possible if all of the ETL code resided in `main()` and referenced production data sources and destinations. +In order to facilitate easy debugging and testing, we recommend that the 'Transformation' step be isolated from the 'Extract' and 'Load' steps, into its own function - taking input data arguments in the form of DataFrames and returning the transformed data as a single DataFrame. Then, the code that surrounds the use of the transformation function in the `main()` job function, is concerned with Extracting the data, passing it to the transformation function and then Loading (or writing) the results to their ultimate destination. Testing is simplified, as mock or test data can be passed to the transformation function and the results explicitly verified, which would not be possible if all of the ETL code resided in `main()` and referenced production data sources and destinations. -More generally, transformation functions should be designed to be idempotent. This is technical way of saying that the repeated application of the transformation function should have no impact on the fundamental state of output data, until the instance when the input data changes. One of the key advantages of idempotent ETL jobs, is that they can be set to run repeatedly (e.g. by using `cron` to trigger the `spark-submit` command above, on a pre-defined schedule), rather than having to factor-in potential dependencies on other ETL jobs completing successfully. +More generally, transformation functions should be designed to be _idempotent_. This is a technical way of saying that the repeated application of the transformation function should have no impact on the fundamental state of output data, until the moment the input data changes. One of the key advantages of idempotent ETL jobs, is that they can be set to run repeatedly (e.g. by using `cron` to trigger the `spark-submit` command above, on a pre-defined schedule), rather than having to factor-in potential dependencies on other ETL jobs completing successfully. 
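For illustration, an isolated transformation function and the `main()` that wraps it might look like the following sketch - a minimal example, not the project's actual `jobs/etl_job.py`, with hypothetical column names, paths and config key:

```python
from dependencies.spark import start_spark
from pyspark.sql import DataFrame
from pyspark.sql.functions import col


def transform_data(df: DataFrame, steps_per_floor: int) -> DataFrame:
    """Transform the input DataFrame (hypothetical schema).

    A pure function of its inputs - no extraction or loading here -
    so it can be unit tested with small in-memory DataFrames and
    re-run repeatedly without side effects (idempotent).
    """
    return df.select(
        col('id'),
        col('floor'),
        (col('floor') * steps_per_floor).alias('steps_to_desk')
    )


def main():
    # only main() touches real data sources and destinations
    spark, log, config = start_spark(app_name='my_etl_job')
    data = spark.read.parquet('path/to/input')                          # Extract
    data_transformed = transform_data(data, config['steps_per_floor'])  # Transform
    data_transformed.write.mode('overwrite').parquet('path/to/output')  # Load
```

Writing the output with `mode('overwrite')` is one way of keeping the Load step idempotent as well - re-running the job replaces, rather than appends to, the previous result.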
## Passing Configuration Parameters to the ETL Job -Although it is possible to pass arguments to `etl_job.py`, as you would for any generic Python module running as a 'main' program - by specifying them after the module's filename and then parsing these command line arguments - this can get very complicated, very quickly, especially when there are lot of parameters (e.g. credentials for multiple databases, table names, SQL snippets, etc.). This also makes debugging the code from within a Python interpreter is extremely awkward, as you don't have access to the command line arguments that would ordinarily be passed to the code, when calling it from the command line. +Although it is possible to pass arguments to `etl_job.py`, as you would for any generic Python module running as a 'main' program - by specifying them after the module's filename and then parsing these command line arguments - this can get very complicated, very quickly, especially when there are lot of parameters (e.g. credentials for multiple databases, table names, SQL snippets, etc.). This also makes debugging the code from within a Python interpreter extremely awkward, as you don't have access to the command line arguments that would ordinarily be passed to the code, when calling it from the command line. -A much more effective solution is to send Spark a separate file - e.g. using the `--files configs/etl_config.json` flag with `spark-subit` - containing the configuration in JSON format, which can be parsed into a Python dictionary in one line of code with `json.loads(config_file_contents)`. Testing the code from within a Python interactive console session is also greatly simplified, as all one has to do to access configuration parameters for testing, is to copy and paste the contents of the file - e.g., +A much more effective solution is to send Spark a separate file - e.g. using the `--files configs/etl_config.json` flag with `spark-submit` - containing the configuration in JSON format, which can be parsed into a Python dictionary in one line of code with `json.loads(config_file_contents)`. Testing the code from within a Python interactive console session is also greatly simplified, as all one has to do to access configuration parameters for testing, is to copy and paste the contents of the file - e.g., ```python import json @@ -61,10 +61,10 @@ In this project, functions that can be used across different ETL jobs are kept i from dependencies.spark import start_spark ``` -This package, together with any additional dependencies referenced within it, must be to copied to each Spark node for all jobs that use `dependencies` to run. This can be achieved in one of several ways: +This package, together with any additional dependencies referenced within it, must be copied to each Spark node for all jobs that use `dependencies` to run. This can be achieved in one of several ways: 1. send all dependencies as a `zip` archive together with the job, using `--py-files` with Spark submit; -2. formally package and upload `dependencies` to somewhere like the `PyPi` archive (or a private version) and then run `pip3 install dependencies` on each node; or, +2. formally package and upload `dependencies` to somewhere like the `PyPI` archive (or a private version) and then run `pip3 install dependencies` on each node; or, 3. a combination of manually copying new modules (e.g. `dependencies`) to the Python path of each node and using `pip3 install` for additional dependencies (e.g. for `requests`). 
Option (1) is by far the easiest and most flexible approach, so we will make use of this for now. To make this task easier, especially when modules such as `dependencies` have additional dependencies (e.g. the `requests` package), we have provided the `build_dependencies.sh` bash script for automating the production of `packages.zip`, given a list of dependencies documented in `Pipfile` and managed by the `pipenv` python application (discussed below). @@ -96,7 +96,7 @@ Full details of all possible options can be found [here](http://spark.apache.org It is not practical to test and debug Spark jobs by sending them to a cluster using `spark-submit` and examining stack traces for clues on what went wrong. A more productive workflow is to use an interactive console session (e.g. IPython) or a debugger (e.g. the `pdb` package in the Python standard library or the Python debugger in Visual Studio Code). In practice, however, it can be hard to test and debug Spark jobs in this way, as they implicitly rely on arguments that are sent to `spark-submit`, which are not available in a console or debug session. -We wrote the `start_spark` function - found in `dependencies/spark.py` - to facilitate the development of Spark jobs that are aware of the context in which they are being executed - i.e. as `spark-submit` jobs or within an IPython console, etc. The expected location of the Spark and job configuration parameters required by the job, is contingent on which execution context has been detected. The doscstring for `start_spark` gives the precise details, +We wrote the `start_spark` function - found in `dependencies/spark.py` - to facilitate the development of Spark jobs that are aware of the context in which they are being executed - i.e. as `spark-submit` jobs or within an IPython console, etc. The expected location of the Spark and job configuration parameters required by the job, is contingent on which execution context has been detected. The docstring for `start_spark` gives the precise details, ```python def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[], @@ -112,7 +112,7 @@ def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[], This function also looks for a file ending in 'config.json' that can be sent with the Spark job. If it is found, it is opened, the contents parsed (assuming it contains valid JSON for the ETL job - configuration), into a dict of ETL job configuration parameters, + configuration) into a dict of ETL job configuration parameters, which are returned as the last element in the tuple returned by this function. If the file cannot be found then the return tuple only contains the Spark session and Spark logger objects and None @@ -120,9 +120,9 @@ def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[], The function checks the enclosing environment to see if it is being run from inside an interactive console session or from an - environment which has a `DEBUG` environment varibale set (e.g. + environment which has a `DEBUG` environment variable set (e.g. setting `DEBUG=1` as an environment variable as part of a debug - configuration within an IDE such as Visual Studio Code or PyCharm in + configuration within an IDE such as Visual Studio Code or PyCharm. In this scenario, the function uses all available function arguments to start a PySpark driver from the local PySpark package as opposed to using the spark-submit and Spark cluster defaults. 
This will also @@ -130,7 +130,7 @@ def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[], sent to spark via the --py-files flag in spark-submit. :param app_name: Name of Spark app. - :param master: Cluster connection details (defaults to local[*]. + :param master: Cluster connection details (defaults to local[*]). :param jar_packages: List of Spark JAR package names. :param files: List of files to send to Spark cluster (master and workers). From 1f099fa895633ce6d036888712f73ca4d08ff707 Mon Sep 17 00:00:00 2001 From: Oliver Willekens Date: Thu, 31 Jan 2019 00:54:34 +0100 Subject: [PATCH 4/8] bump pyspark version: unit tests don't run on 2.3.1 --- Pipfile | 2 +- Pipfile.lock | 89 ++++++++++++++++++++++++---------------------------- 2 files changed, 42 insertions(+), 49 deletions(-) diff --git a/Pipfile b/Pipfile index 3d3a8a5..3e98a72 100644 --- a/Pipfile +++ b/Pipfile @@ -6,7 +6,7 @@ name = "pypi" [packages] [dev-packages] -pyspark = "==2.3.1" +pyspark = "==2.4.0" ipython = "*" "flake8" = "*" diff --git a/Pipfile.lock b/Pipfile.lock index 6936186..4f7a48c 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "e253f2db7634a033c099a981183ae534c048ccaa06a10d067764900ba438c718" + "sha256": "6233634aaea8fcac205da57dd6cf6994a6cd2163c027a93883786699f221f912" }, "pipfile-spec": 6, "requires": { @@ -17,14 +17,6 @@ }, "default": {}, "develop": { - "appnope": { - "hashes": [ - "sha256:5b26757dc6f79a3b7dc9fab95359328d5747fcb2409d331ea66d0272b90ab2a0", - "sha256:8b995ffe925347a2138d7ac0fe77155e4311a0ea6d6da4f5128fe4b3cbe5ed71" - ], - "markers": "sys_platform == 'darwin'", - "version": "==0.1.0" - }, "backcall": { "hashes": [ "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4", @@ -34,26 +26,33 @@ }, "decorator": { "hashes": [ - "sha256:2c51dff8ef3c447388fe5e4453d24a2bf128d3a4c32af3fabef1f01c6851ab82", - "sha256:c39efa13fbdeb4506c476c9b3babf6a718da943dab7811c206005a4a956c080c" + "sha256:33cd704aea07b4c28b3eb2c97d288a06918275dac0ecebdaf1bc8a48d98adb9e", + "sha256:cabb249f4710888a2fc0e13e9a16c343d932033718ff62e1e9bc93a9d3a9122b" + ], + "version": "==4.3.2" + }, + "entrypoints": { + "hashes": [ + "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19", + "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451" ], - "version": "==4.3.0" + "version": "==0.3" }, "flake8": { "hashes": [ - "sha256:7253265f7abd8b313e3892944044a365e3f4ac3fcdcfb4298f55ee9ddf188ba0", - "sha256:c7841163e2b576d435799169b78703ad6ac1bbb0f199994fc05f700b2a90ea37" + "sha256:0323db2e3a72faa2c4cdd61ea87594b9cb343fc4dfa5c24d6b43059d7ba29d0e", + "sha256:a7951ade4814e5e5364bdce1e73862cbacf2bbb9b509a9bf8c130a0414cf0722" ], "index": "pypi", - "version": "==3.5.0" + "version": "==3.7.2" }, "ipython": { "hashes": [ - "sha256:47b17ea874454a5c2eacc2732b04a750d260b01ba479323155ac8a39031f5535", - "sha256:9fed506c3772c875a3048bc134a25e6f5e997b1569b2636f6a5d891f34cbfd46" + "sha256:6a9496209b76463f1dec126ab928919aaf1f55b38beb9219af3fe202f6bbdd12", + "sha256:f69932b1e806b38a7818d9a1e918e5821b685715040b48e59c657b3c7961b742" ], "index": "pypi", - "version": "==7.0.1" + "version": "==7.2.0" }, "ipython-genutils": { "hashes": [ @@ -64,10 +63,10 @@ }, "jedi": { "hashes": [ - "sha256:0ad328f5d9d0a6c8b22a0ca429c7b0cea1974e2b2d5a00e0bc45074dcd44d255", - "sha256:e4db7a2e08980e48c6aec6588483629c81fdcf9b6d9e6a372b40ed7fec91f310" + "sha256:571702b5bd167911fe9036e5039ba67f820d6502832285cde8c881ab2b2149fd", + 
"sha256:c8481b5e59d34a5c7c42e98f6625e633f6ef59353abea6437472c7ec2093f191" ], - "version": "==0.13.0" + "version": "==0.13.2" }, "mccabe": { "hashes": [ @@ -78,10 +77,10 @@ }, "parso": { "hashes": [ - "sha256:35704a43a3c113cce4de228ddb39aab374b8004f4f2407d070b6a2ca784ce8a2", - "sha256:895c63e93b94ac1e1690f5fdd40b65f07c8171e3e53cbd7793b5b96c0e0a7f24" + "sha256:4b8f9ed80c3a4a3191aa3261505d868aa552dd25649cb13a7d73b6b7315edf2d", + "sha256:5a120be2e8863993b597f1c0437efca799e90e0793c98ae5d4e34ebd00140e31" ], - "version": "==0.3.1" + "version": "==0.3.2" }, "pexpect": { "hashes": [ @@ -100,11 +99,11 @@ }, "prompt-toolkit": { "hashes": [ - "sha256:5eff0c9fd652384ecfe730bbcdf3658868725c6928fbf608d9338834d7a974b6", - "sha256:81da9ecf6ca6806a549697529af8ec3ac5b739c13ac14607218e650db1b53131", - "sha256:c67c1c264d8a0d9e1070e9272bacee00f76c81daab7bc4bf09ff991bd1e224a7" + "sha256:88002cc618cacfda8760c4539e76c3b3f148ecdb7035a3d422c7ecdc90c2a3ba", + "sha256:c6655a12e9b08edb8cf5aeab4815fd1e1bdea4ad73d3bbf269cf2e0c4eb75d5e", + "sha256:df5835fb8f417aa55e5cafadbaeb0cf630a1e824aad16989f9f0493e679ec010" ], - "version": "==2.0.5" + "version": "==2.0.8" }, "ptyprocess": { "hashes": [ @@ -122,44 +121,38 @@ }, "pycodestyle": { "hashes": [ - "sha256:682256a5b318149ca0d2a9185d365d8864a768a28db66a84a2ea946bcc426766", - "sha256:6c4245ade1edfad79c3446fadfc96b0de2759662dc29d07d80a6f27ad1ca6ba9" + "sha256:95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56", + "sha256:e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c" ], - "version": "==2.3.1" + "version": "==2.5.0" }, "pyflakes": { "hashes": [ - "sha256:08bd6a50edf8cffa9fa09a463063c425ecaaf10d1eb0335a7e8b1401aef89e6f", - "sha256:8d616a382f243dbf19b54743f280b80198be0bca3a5396f1d2e1fca6223e8805" + "sha256:5e8c00e30c464c99e0b501dc160b13a14af7f27d4dffb529c556e30a159e231d", + "sha256:f277f9ca3e55de669fba45b7393a1449009cff5a37d1af10ebb76c52765269cd" ], - "version": "==1.6.0" + "version": "==2.1.0" }, "pygments": { "hashes": [ - "sha256:78f3f434bcc5d6ee09020f92ba487f95ba50f1e3ef83ae96b9d5ffa1bab25c5d", - "sha256:dbae1046def0efb574852fab9e90209b23f556367b5a320c0bcb871c77c3e8cc" + "sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a", + "sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d" ], - "version": "==2.2.0" + "version": "==2.3.1" }, "pyspark": { "hashes": [ - "sha256:52d77a7ef43088b0235742cfcafc83435d0d98c5fdded1d8c600f1887e9e0213" + "sha256:c9d7b7c5e91b13488b657e364ff392a80b2e374b182138e5ec8702a1822bffdc" ], "index": "pypi", - "version": "==2.3.1" - }, - "simplegeneric": { - "hashes": [ - "sha256:dc972e06094b9af5b855b3df4a646395e43d1c9d0d39ed345b7393560d0b9173" - ], - "version": "==0.8.1" + "version": "==2.4.0" }, "six": { "hashes": [ - "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", - "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb" + "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", + "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" ], - "version": "==1.11.0" + "version": "==1.12.0" }, "traitlets": { "hashes": [ From 7bbafbbb5e02017c913848d7abd76da734dc9dc3 Mon Sep 17 00:00:00 2001 From: alexioannides Date: Tue, 19 Feb 2019 10:20:44 +0000 Subject: [PATCH 5/8] Updated .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 3ce4bae..bf4eb99 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,9 @@ */__pycache__/* *?metastore_db/* */spark_warehouse/* 
+.mypy_cache/ .vscode/* +.venv venv/* loaded_data/* derby.log From e43111da69a3467a4c93d795423e280e15cf1e22 Mon Sep 17 00:00:00 2001 From: alexioannides Date: Tue, 19 Feb 2019 10:20:44 +0000 Subject: [PATCH 6/8] Updated .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 3ce4bae..bf4eb99 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,9 @@ */__pycache__/* *?metastore_db/* */spark_warehouse/* +.mypy_cache/ .vscode/* +.venv venv/* loaded_data/* derby.log From 1a1813624b69ba508009fac3d53048fa92f8dade Mon Sep 17 00:00:00 2001 From: alexioannides Date: Tue, 19 Feb 2019 11:07:05 +0000 Subject: [PATCH 7/8] Corrected error in spark-submit example --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d7e853f..15b04c5 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Assuming that the `$SPARK_HOME` environment variable points to your local Spark $SPARK_HOME/bin/spark-submit \ --master local[*] \ --packages 'com.somesparkjar.dependency:1.0.0' \ ---py-files dependencies.zip \ +--py-files packages.zip \ --files configs/etl_config.json \ jobs/etl_job.py ``` From 6175a7923ee43d0d24c63983f8047c5e36f6164a Mon Sep 17 00:00:00 2001 From: alexioannides Date: Tue, 19 Feb 2019 11:12:58 +0000 Subject: [PATCH 8/8] Enhancements to build_dependencies.sh --- build_dependencies.sh | 13 +++++++------ packages.zip | Bin 2886 -> 2884 bytes 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/build_dependencies.sh b/build_dependencies.sh index 85e61a2..d390a51 100755 --- a/build_dependencies.sh +++ b/build_dependencies.sh @@ -3,22 +3,23 @@ # check to see if pipenv is installed if [ -x "$(which pipenv)" ] then - # check that there Pipfile exists in root directory - if [ ! -e Pipfile ] + # check that Pipfile.lock exists in root directory + if [ ! -e Pipfile.lock ] then - echo 'ERROR - cannot find Pipfile' + echo 'ERROR - cannot find Pipfile.lock' exit 1 fi # use Pipenv to create a requirement.txt file - echo '... creating requirements.txt from Pipfile' + echo '... creating requirements.txt from Pipfile.lock' pipenv lock -r > requirements.txt # install packages to a temporary directory and zip it + touch requirements.txt # safeguard in case there are no packages pip3 install -r requirements.txt --target ./packages # check to see if there are any external dependencies - # if not then create an empty file to see zip with + # if not then create an empty file to seed zip with if [ -z "$(ls -A packages)" ] then touch packages/empty.txt @@ -46,6 +47,6 @@ then exit 0 else - echo 'ERROR - pipenv is not installed --> run pip3 install pipenv to load pipenv into global site packages or install via a system package manager.' + echo 'ERROR - pipenv is not installed --> run `pip3 install pipenv` to load pipenv into global site packages or install via a system package manager.' 
exit 1 fi diff --git a/packages.zip b/packages.zip index 231a1ce2b8ff39e2eb394bb1ef787f53c8492aa1..a691372a107603283a3da3ae2e3f508fc9587bb4 100644 GIT binary patch delta 2405 [base85-encoded binary delta omitted] delta 2416 [base85-encoded binary delta omitted]