diff --git a/pyveg/README.md b/pyveg/README.md index 59e69742..927cdc93 100644 --- a/pyveg/README.md +++ b/pyveg/README.md @@ -64,9 +64,19 @@ The download job is fully specified by a configuration file, which you point to Note that we use the [BNG convention](https://britishnationalgrid.uk/) for coordinates, i.e. `(eastings,northings)` and we set bounds for regions to download in the convention `(left, bottom, right, top)`. -#### Generating a download configuration file +### Downloading data on a loop from multiple configuration files + +You might want to download images from several configuration files in one go. This can be done with the following command: + +``` +pyveg_run_pipeline_loop --config_dir configs +``` + +where `config_dir` is the path to a directory where all the config files you want to run are found. The script runs a loop and +executes the `pyveg_run_pipeline` command on each available file found in that path. -**TODO: To be updated with peep instructions!** + +#### Generating a download configuration file To create a configuration file for use in the pyveg pipeline described above, use the command ``` @@ -83,11 +93,13 @@ this allows the user to specify various characteristics of the data they want to * Landsat5: [Available from 1984-03-10 to 2013-01-31 at 60m resolution.](https://developers.google.com/earth-engine/datasets/catalog/LANDSAT_LT05_C01_T1) * Landsat4: [Available from 1982-07-16 to 1993-12-14 at 60m resolution.](https://developers.google.com/earth-engine/datasets/catalog/LANDSAT_LT04_C01_T1) -* `--latitude`: The latitude (in degrees north) of the centre point of the image collection. +* `--left`: The left bound of the region to be downloaded (in Easting BNG coordinates) . + +* `--bottom`: The bottom bound of the region to be downloaded (in Northing BNG coordinates) . -* `--longitude`: The longitude (in degrees east) of the centre point of the image collection. 
+* `--right`: The right bound of the region to be downloaded (in Easting BNG coordinates) . -* `--country`: The country (for the file name) can either be entered, or use the specified coordinates to look up the country name from the OpenCage database. +* `--top`: The top bound of the region to be downloaded (in Northing BNG coordinates) . * `--start_date`: The start date in the format ‘YYYY-MM-DD’, the default is ‘2015-01-01’ (or ‘2019-01-01’ for a test config file). @@ -95,36 +107,48 @@ this allows the user to specify various characteristics of the data they want to * `--time_per_point`: The option to run the image collection either monthly (‘1m’) or weekly (‘1w’), with the default being monthly. -* `--run_mode`: The option to run time-consuming functions on Azure (‘batch’) or running locally on your own computer (‘local’). The default is local. For info about running on Azure go [here](UsingAzure.md). - * `--output_dir`: The option to write the output to a specified directory, with the default being the current directory. -* `--test_mode`: The option to make a test config file, containing fewer months and a subset of sub-images, with a default option to have a normal config file. - * By choosing the test config file, the start and end dates (see below) are defaulted to cover a smaller time span. - * It is recommended that the test config option should be used purely to determine if the options specified by the user are correct. - - -* `--n_threads`: Finally, how many threads the user would like to use for the time-consuming processes, either 4 (default) or 8. 
- For example: ``` - pyveg_generate_config --configs_dir "pyveg/configs" --collection_name "Sentinel2" --latitude 11.58 --longitude 27.94 --start_date "2016-01-01" --end_date "2020-06-30" --time_per_point "1m" --run_mode "local" --n_threads 4 + pyveg_generate_config --configs_dir "pyveg/configs" --collection_name "Sentinel2" --left 419840 --bottom 0235520 --right 430080 --top 0245760 --start_date "2016-01-01" --end_date "2016-02-01" --time_per_point "1m" ``` -This generates a file named `config_Sentinel2_11.58N_27.94E_Sudan_2016-01-01_2020-06-30_1m_local.py` along with instructions on how to use this configuration file to download data through the pipeline, in this case the following: +This generates a file named `config_Sentinel2_419840_0235520_430080_0245760_2016-01-01_2016-02-01_1m.py` along with instructions on how to use this configuration file to download data through the pipeline, in this case the following: ``` -pyveg_run_pipeline --config_file pyveg/configs/config_Sentinel2_11.58N_27.94E_Sudan_2016-01-01_2020-06-30_1m_local.py +pyveg_run_pipeline --config_file pyveg/configs/config_Sentinel2_419840_0235520_430080_0245760_2016-01-01_2016-02-01_1m.py ``` Individual options can be specified by the user via prompt. The options for this can be found by typing ```pyveg_generate_config --help```. +#### Generating many configuration files from a geoparquet file -### More Details on Downloading +If you want to generate a large number of configuration files from a dataset of geometries you can do this using the command `pyveg_generate_config` and +a [geoparquet](https://pypi.org/project/geoparquet/) file (`bounds_file`). + +The geoparquet file must contain a column named `geometry`, and each row must correspond to an individual geometry (an example of this +file can be found as `images_1024.parquet` in the `testdata` directory). 
If +the `--bounds_file` flag is provided, the `pyveg_generate_config` script will read the geoparquet file and loop over its rows, creating a config file +for each geometry. + +If a column `on_land` is available in the geoparquet file, the script will keep only the rows where that column is True. +If this column is not found, the script will loop over all rows of the data. + +Flags such as `start_date`, `end_date`, `time_per_point` must be provided as well, and they will be used for all the config files created. + +Flags such as `configs_dir`, `output_dir` are optional and define where the config files and the downloads are written, +with the default being the current directory. -During the download job, `pyveg` will break up your specified date range into a time series, and download data at each point in the series. Note that by default the vegetation images downloaded from GEE will be split up into 50x50 pixel images, vegetation metrics are then calculated on the sub-image level. Both colour (RGB) and Normalised Difference Vegetation Index (NDVI) images are downloaded and stored. Vegetation metrics include the mean NDVI pixel intensity across sub-images, and also network centrality metrics, discussed in more detail below. +An example: + +``` +pyveg_generate_config --bounds_file testdata/images_1024.parquet --start_date 2018-04-02 --end_date 2018-10-01 --time_per_point 5m --configs_dir configs --output_dir output_downloads +``` + +### More Details on Downloading -For weather collections e.g. the ERA5, due to coarser resolution, the precipitation and temperature "images" are averaged into a single value at each point in the time series. +During the download job, `pyveg` will break up your specified date range into a time series defined by the `time_per_point` flag, and download data at each point in the series. Note that by default the images downloaded from GEE will be split up into 32x32 pixel images. 
Both colour (RGB) and a mosaic with counts of images used in the composite (COUNT) images are downloaded and stored. ### Rerunning partially succeeded jobs diff --git a/pyveg/configs/config_template.py b/pyveg/configs/config_template.py index 9e1915f7..ca67c1b0 100644 --- a/pyveg/configs/config_template.py +++ b/pyveg/configs/config_template.py @@ -12,34 +12,24 @@ # Define location to save all outputs. Note that every time the pipeline job # is rerun, a datestamp will be appended to the output_location. output_location = "OUTPUT_LOCATION" -output_location_type = "OUTPUT_LOCATION_TYPE" +output_location_type = "local" # parse selection. Note (long, lat) GEE convention. -coordinates = [LEFT, BOTTOM, RIGHT, TOP] - -# optional coords_id setting -COORDS_ID_STRING - -# pattern_type description -pattern_type = "PATTERN_TYPE" +bounds = [LEFT, BOTTOM, RIGHT, TOP] date_range = ["START_DATE", "END_DATE"] # From the dictionary entries in data_collections.py, which shall we use # (these will become "Sequences") -collections_to_use = ["COLLECTION_NAME", "WEATHER_COLL_NAME"] +collections_to_use = ["COLLECTION_NAME"] # Dictionary defining what Modules should run in each Sequence. 
modules_to_use = { - "WEATHER_COLL_NAME": ["WeatherDownloader", "WeatherImageToJSON"], "COLLECTION_NAME": [ "VegetationDownloader", "VegetationImageProcessor", - "NetworkCentralityCalculator", - "NDVICalculator", ], - "combine": ["VegAndWeatherJsonCombiner"], } # The following demonstrates how parameters can be set for individual Modules: @@ -47,19 +37,4 @@ "COLLECTION_NAME": { "time_per_point": "TIME_PER_POINT" }, - "WEATHER_COLL_NAME": { - "time_per_point": "TIME_PER_POINT", - "date_range": ["WEATHER_STARTDATE", "END_DATE"] - }, - "VegetationDownloader": {"region_size": REGION_SIZE}, - "VegetationImageProcessor": {"run_mode": "RUN_MODE"}, - "NetworkCentralityCalculator": { - "n_threads": NUM_THREADS, - "run_mode": "RUN_MODE", - "n_sub_images": NUM_SUBIMAGES - }, - "NDVICalculator": { - "run_mode": "RUN_MODE", - "n_sub_images": NUM_SUBIMAGES - } } diff --git a/pyveg/requirements.txt b/pyveg/requirements.txt index 9f24a2a1..0cbea48e 100644 --- a/pyveg/requirements.txt +++ b/pyveg/requirements.txt @@ -25,3 +25,4 @@ pypandoc mdutils pandoc sphinx_rtd_theme +pyarrow diff --git a/pyveg/scripts/generate_config_file.py b/pyveg/scripts/generate_config_file.py index 8566b1fc..020850f1 100644 --- a/pyveg/scripts/generate_config_file.py +++ b/pyveg/scripts/generate_config_file.py @@ -35,9 +35,9 @@ import re import time +import geopandas + from pyveg.configs import collections -from pyveg.coordinates import coordinate_store -from pyveg.src.coordinate_utils import lookup_country def get_template_text(): @@ -49,51 +49,35 @@ def get_template_text(): return open(template_filepath).read() -def make_output_location(coords_id, collection_name, latitude, longitude, country): - # quite restricted on characters allowed in Azure container names - - # use NSEW rather than negative numbers in coordinates - if latitude.startswith("-"): - latitude = latitude[1:] + "S" - else: - latitude = latitude + "N" - if longitude.startswith("-"): - longitude = longitude[1:] + "W" - else: - longitude = 
def _has_coords_id(coords_id):
    """Return True when *coords_id* should appear in generated names.

    ``None`` and the empty string mean "no ID". Every other value counts,
    including the integer ``0`` -- a plain truthiness test would drop it,
    which matters when the ID is a zero-based row index.
    """
    if coords_id is None:
        return False
    return not (isinstance(coords_id, str) and coords_id == "")


def make_output_location(coords_id, collection_name, left, bottom, right, top):
    """Build the output-location name for one download region.

    Parameters
    ----------
    coords_id
        Optional identifier placed first in the name. ``None`` or ``""``
        omits it; any other value (including ``0``) is used.
    collection_name
        Name of the image collection, e.g. ``"Sentinel2"``.
    left, bottom, right, top
        Bounds of the region, already formatted the way they should appear.

    Returns
    -------
    str
        The pieces joined with ``-``:
        ``[<coords_id>-]<collection_name>-<left>-<bottom>-<right>-<top>``.
    """
    pieces = [collection_name, left, bottom, right, top]
    if _has_coords_id(coords_id):
        pieces.insert(0, coords_id)
    return "-".join(str(piece) for piece in pieces)


def make_filename(
    configs_dir,
    left,
    bottom,
    right,
    top,
    start_date,
    end_date,
    time_per_point,
    collection_name,
    coords_id,
):
    """
    Construct a config-file path from the specified parameters.

    The basename is
    ``config[_<coords_id>]_<collection_name>_<left>_<bottom>_<right>_<top>_<start_date>_<end_date>_<time_per_point>.py``
    and it is joined onto ``configs_dir``. ``coords_id`` is omitted only
    when it is ``None`` or ``""``; an ID of ``0`` is kept so that the first
    row of a bounds file still gets a distinct filename.
    """
    filename_start = "config"
    if _has_coords_id(coords_id):
        filename_start += f"_{coords_id}"
    basename = (
        f"{filename_start}_{collection_name}_{left}_{bottom}_{right}_{top}"
        f"_{start_date}_{end_date}_{time_per_point}.py"
    )
    return os.path.join(configs_dir, basename)
weather_collection_name = "ERA5" - if test_mode: - weather_start_date = start_date - else: - # also include historical weather data - weather_start_date = collections.data_collections[weather_collection_name][ - "min_date" - ] - text = get_template_text() current_time = time.strftime("%y-%m-%d %H:%M:%S") text = text.replace("CURRENT_TIME", current_time) - output_location_type = "azure" if run_mode == "batch" else "local" text = text.replace("COLLECTION_NAME", collection_name) - text = text.replace("WEATHER_COLL_NAME", weather_collection_name) - text = text.replace("OUTPUT_LOCATION_TYPE", output_location_type) text = text.replace("OUTPUT_LOCATION", output_location) - text = text.replace("LATITUDE", latitude) - text = text.replace("LONGITUDE", longitude) - text = text.replace("PATTERN_TYPE", pattern_type) + text = text.replace( + "RIGHT", str(int(right)) + ) # hacky way of removing unnecesary zeros + text = text.replace("LEFT", str(int(left))) + text = text.replace("TOP", str(int(top))) + text = text.replace("BOTTOM", str(int(bottom))) text = text.replace("START_DATE", start_date) - text = text.replace("WEATHER_STARTDATE", weather_start_date) text = text.replace("END_DATE", end_date) text = text.replace("TIME_PER_POINT", time_per_point) - text = text.replace("REGION_SIZE", region_size) - text = text.replace("RUN_MODE", run_mode) - text = text.replace("NUM_THREADS", str(n_threads)) - n_subimages = "10" if test_mode else "-1" - text = text.replace("NUM_SUBIMAGES", n_subimages) - if coords_id: - text = text.replace("COORDS_ID_STRING", 'coords_id = "{}"'.format(coords_id)) - else: - text = text.replace("COORDS_ID_STRING", "") + with open(filename, "w") as configfile: configfile.write(text) print( @@ -188,13 +143,12 @@ def main(): for k in collections.data_collections.keys() if collections.data_collections[k]["data_type"] == "vegetation" ] - run_modes = ["local", "batch"] date_regex = re.compile("[\d]{4}-[01][\d]-[0123][\d]") time_per_point_regex = 
re.compile("[\d]+[dwmy]") - lat_range = [-90.0, 90.0] - long_range = [-180.0, 180.0] - n_threads_range = range(1, 17) - default_n_threads = 4 + left_bound = [0, 700000] + bottom_bound = [0, 1300000] + top_bound = [0, 1300000] + right_bound = [0, 700000] # create argument parser in case user wants to use command line args parser = argparse.ArgumentParser( @@ -203,58 +157,36 @@ def main(): """ ) parser.add_argument( - "--coords_id", help="(optional) ID of location in coordinates.py", type=str + "--bounds_file", + help="Path to a geoparket file. The file should include a geometry column (named 'geometry'). A config file will be created for each row in the geoparket file.", + type=str, ) parser.add_argument( "--configs_dir", help="path to directory containing config files" ) - parser.add_argument("--collection_name", help="collection name (e.g. 'Sentinel2')") parser.add_argument( - "--output_dir", help="Directory for local output data", type=str + "--collection_name", + help="collection name (e.g. 
'Sentinel2')", + default="Sentinel2", ) parser.add_argument( - "--test_mode", - help="Run in test mode, over fewer months and with fewer sub-images", - action="store_true", + "--output_dir", help="Directory for local output data", type=str ) - parser.add_argument("--latitude", help="latitude in degrees N", type=float) - parser.add_argument("--longitude", help="longitude in degrees E", type=float) - parser.add_argument("--country", help="Country of location", type=str) + parser.add_argument("--left", help="left bound in Eastings", type=float) + parser.add_argument("--right", help="right bound in Eastings", type=float) + parser.add_argument("--bottom", help="bottom bound in Nothings", type=float) + parser.add_argument("--top", help="top bound in Nothings", type=float) parser.add_argument("--start_date", help="start date, format YYYY-MM-DD", type=str) parser.add_argument("--end_date", help="end date, format YYYY-MM-DD", type=str) parser.add_argument( "--time_per_point", help="frequency of image, e.g. '1m', '1w'", type=str ) - parser.add_argument( - "--region_size", - help="Size of region to download, in degrees lat/long", - type=float, - ) - parser.add_argument( - "--pattern_type", - help="Type of patterned vegetation, e.g. 'spots', 'labyrinths'", - type=str, - ) - parser.add_argument( - "--run_mode", - help=""" - 'local' for running on local machine, 'azure' for running some time-consuming parts (i.e. 
vegetation image processing) on Azure batch - """, - type=str, - ) - parser.add_argument( - "--n_threads", - help=""" - How many threads (cores) to parallelize some processing functions over - """, - type=int, - ) args = parser.parse_args() # sanity check - if args.coords_id and (args.latitude or args.longitude): - print("Please select EITHER coords_id OR latitude/longitude") + if args.bounds_file and (args.bottom or args.right or args.left or args.right): + print("Please select EITHER coords_id OR bounds") return ############# @@ -278,14 +210,6 @@ def main(): if len(configs_dir) == 0: configs_dir = default_configs_dir - # test mode - test_mode = args.test_mode if args.test_mode else False - if not test_mode: - do_test = input( - "Would you like to make a test config file, with fewer months, and only a subset of sub-images? Press 'y' if so, or press Return for a normal config. : " - ) - test_mode = do_test.startswith("y") or do_test.startswith("Y") - # collection name collection_name = args.collection_name if args.collection_name else None while not collection_name in collection_names: @@ -294,73 +218,9 @@ def main(): collection_names ) ) - - # (optional) ID from coordinates.py - coords_id = args.coords_id if args.coords_id else None - latitude = None - longitude = None - country = None - region_size = None - pattern_type = None - if coords_id: - try: - row = coordinate_store.loc[coords_id] - latitude = row["latitude"] - longitude = row["longitude"] - country = row["country"] - region_size = row["region_size"] - pattern_type = row["type"] - except (KeyError): - print("Unknown id {} - please enter coordinates manually".format(coords_id)) - - # latitude and longitude - if not latitude: - latitude = args.latitude if args.latitude else -999.0 - - while not ( - isinstance(latitude, float) - and latitude > lat_range[0] - and latitude < lat_range[1] - ): - latitude = float( - input( - "please enter Latitude (degrees N) in the range {} : ".format( - lat_range - ) - ) - ) - 
if not longitude: - longitude = args.longitude if args.longitude else -999.0 - while not ( - isinstance(longitude, float) - and longitude > long_range[0] - and longitude < long_range[1] - ): - longitude = float( - input( - "please enter Longitude (degrees E) in the range {} : ".format( - long_range - ) - ) - ) - - # country - country = args.country if args.country else "" - if not country: - country = input( - "Enter name of country, or press return to use OpenCage country lookup based on coordinates : " - ) - if len(country) == 0: - country = lookup_country(latitude, longitude) - # remove spaces - country = re.sub("[\s]+", "", country) - # start date start_date = args.start_date if args.start_date else "" - if test_mode: - default_start_date = "2019-01-01" - else: - default_start_date = collections.data_collections[collection_name]["min_date"] + default_start_date = collections.data_collections[collection_name]["min_date"] while not date_regex.search(start_date): start_date = input( "Enter start date in format YYYY-MM-DD, or press Return for default ({}) : ".format( @@ -372,10 +232,7 @@ def main(): # end date end_date = args.end_date if args.end_date else "" - if test_mode: - default_end_date = "2019-03-01" - else: - default_end_date = collections.data_collections[collection_name]["max_date"] + default_end_date = collections.data_collections[collection_name]["max_date"] while not date_regex.search(end_date): end_date = input( "Enter end date in format YYYY-MM-DD, or press Return for default ({}) : ".format( @@ -398,146 +255,208 @@ def main(): ) if len(time_per_point) == 0: time_per_point = default_time_per_point - - # region size - if not region_size: - region_size = args.region_size if args.region_size else -1.0 - default_region_size = 0.08 - while not ( - isinstance(region_size, float) and region_size > 0.0 and region_size <= 0.08 - ): - region_size = input( - "Enter region size in degrees latitude/longitude, or press Return for max/default ({}) : ".format( - 
default_region_size - ) - ) - if len(region_size) == 0: - region_size = default_region_size - else: - region_size = float(region_size) - # now we've established it fulfils the requirements, convert to a str - region_size = str(region_size) - - # pattern_type - if not pattern_type: - pattern_type = args.pattern_type if args.pattern_type else "" - default_pattern_type = "unknown" - while len(pattern_type) < 1: - pattern_type = input( - "Enter type of patterned vegetation (e.g. 'spots', 'labyrinths', or press Return for default ('{}') : ".format( - default_pattern_type - ) - ) - if len(pattern_type) == 0: - pattern_type = default_pattern_type - pattern_type = pattern_type.replace(" ", "-").lower() - - # run mode - run_mode = args.run_mode if args.run_mode else "" - default_run_mode = "local" - while not run_mode in run_modes: - run_mode = input( - "Would you like time-consuming functions to be run on the cloud? Choose from the following: {}, or press Return for default option '{}': ".format( - run_modes, default_run_mode - ) - ) - if len(run_mode) == 0: - run_mode = default_run_mode - - # output directory + # output directory output_dir = args.output_dir if args.output_dir else "" - if run_mode == "local" and not output_dir: + if not output_dir: output_dir = input( "Enter location for output, or press Return for default ('.') : " ) if len(output_dir) == 0: output_dir = "." 
- lat_string = "{:.2f}".format(latitude) - long_string = "{:.2f}".format(longitude) - output_location = make_output_location( - coords_id, collection_name, lat_string, long_string, country - ) + # (optional) ID from coordinates.py + bounds_file = args.bounds_file if args.bounds_file else None + left = None + right = None + bottom = None + top = None + if bounds_file: + bounds_gdf = geopandas.read_parquet(bounds_file) + bounds_gdf.to_crs("EPSG:27700") + + if "on_land" in bounds_gdf: + index = bounds_gdf[bounds_gdf["on_land"] == True].index + else: + index = bounds_gdf.index + + for i in index: + row = bounds_gdf.iloc[i] + bottom = int(row["geometry"].bounds[1]) + left = int(row["geometry"].bounds[0]) + right = int(row["geometry"].bounds[2]) + top = int(row["geometry"].bounds[3]) + + left_string = "{:0>6}".format(left) + bottom_string = "{:0>7}".format(bottom) + right_string = "{:0>6}".format(right) + top_string = "{:0>7}".format(top) + + output_location = make_output_location( + i, collection_name, left_string, bottom_string, right_string, top_string + ) + + output_location = os.path.join(output_dir, output_location) + + print( + """ + output_location {} + collection: {} + left: {} + bottom: {} + right: {} + top: {} + start_date: {} + end_date: {} + time_per_point: {} + """.format( + output_location, + collection_name, + left_string, + bottom_string, + right_string, + top_string, + start_date, + end_date, + time_per_point, + ) + ) + + config_filename = write_file( + configs_dir, + output_location, + left_string, + bottom_string, + right_string, + top_string, + start_date, + end_date, + time_per_point, + collection_name, + i, + ) - if run_mode == "local": - output_location = os.path.join(output_dir, output_location) + print( + """ + To run pyveg using this configuration, do: - # num threads - n_threads = args.n_threads if args.n_threads else 0 - while not (isinstance(n_threads, int) and n_threads in n_threads_range): - if run_mode == "local": - n_threads = input( - 
"How many threads would you like time-consuming processing functions to use? (Many computers will have 4 or 8 threads available). Press return for default value {} : ".format( - default_n_threads + pyveg_run_pipeline --config_file {} + """.format( + config_filename ) ) - if len(n_threads) == 0: - n_threads = default_n_threads - else: - try: - n_threads = int(n_threads) - except: - print("Please enter an integer value") - else: - n_threads = 1 + else: + # bounds + if not left: + left = args.left if args.left else 0 + + while not ( + isinstance(left, float) + and left >= left_bound[0] + and left <= left_bound[1] + ): + left = float( + input( + "please enter left bound (eastings) in the range {} : ".format( + left_bound[1] + ) + ) + ) + if not right: + right = args.right if args.right else 0 + while not ( + isinstance(right, float) + and right >= right_bound[0] + and right <= right_bound[1] + ): + right = float( + input( + "please enter right bound (eastings) in the range {} : ".format( + right[1] + ) + ) + ) + if not top: + top = args.top if args.top else 0 + while not ( + isinstance(top, float) and top >= top_bound[0] and top <= top_bound[1] + ): + top = float( + input( + "please enter top bound (degrees northings) in the range {} : ".format( + top_bound[1] + ) + ) + ) + if not bottom: + bottom = args.bottom if args.bottom else 0 + while not ( + isinstance(bottom, float) + and bottom >= bottom_bound[0] + and top <= bottom_bound[1] + ): + bottom = float( + input( + "please enter bottom bound (degrees northings) in the range {} : ".format( + bottom_bound[1] + ) + ) + ) - print( - """ - output_location {} - collection: {} - latitude: {} - longitude: {} - country: {} - pattern_type: {} - start_date: {} - end_date: {} - time_per_point: {} - region_size: {} - run_mode: {} - n_threads: {} - """.format( + left_string = "{:0>6}".format(left) + bottom_string = "{:0>7}".format(bottom) + right_string = "{:0>6}".format(right) + top_string = "{:0>7}".format(top) + + 
output_location = make_output_location( + None, collection_name, left_string, bottom_string, right_string, top_string + ) + + print( + """ + output_location {} + collection: {} + left: {} + bottom: {} + right: {} + top: {} + start_date: {} + end_date: {} + time_per_point: {} + """.format( + output_location, + collection_name, + left_string, + bottom_string, + right_string, + top_string, + start_date, + end_date, + time_per_point, + ) + ) + + config_filename = write_file( + configs_dir, output_location, - collection_name, - lat_string, - long_string, - country, - pattern_type, + left_string, + bottom_string, + right_string, + top_string, start_date, end_date, time_per_point, - region_size, - run_mode, - n_threads, + collection_name, ) - ) - - config_filename = write_file( - configs_dir, - output_location, - long_string, - lat_string, - country, - pattern_type, - start_date, - end_date, - time_per_point, - region_size, - collection_name, - run_mode, - n_threads, - test_mode, - coords_id, - ) - print( - """ -To run pyveg using this configuration, do: + print( + """ + To run pyveg using this configuration, do: -pyveg_run_pipeline --config_file {} + pyveg_run_pipeline --config_file {} -""".format( - config_filename + """.format( + config_filename + ) ) - ) if __name__ == "__main__": diff --git a/pyveg/scripts/run_pipeline_loop.py b/pyveg/scripts/run_pipeline_loop.py new file mode 100644 index 00000000..864a18a3 --- /dev/null +++ b/pyveg/scripts/run_pipeline_loop.py @@ -0,0 +1,73 @@ +import argparse +import logging +import shlex +import subprocess +import time +from logging.handlers import RotatingFileHandler +from os import listdir +from os.path import isfile, join + +logger = logging.getLogger("pyveg_bulk_donwload_job") +formatter = logging.Formatter("%(asctime)s [%(levelname)s] %(message)s") +logger.setLevel(logging.INFO) + +c_handler = logging.StreamHandler() +c_handler.setFormatter(formatter) +f_handler = RotatingFileHandler( + 
def run_pipeline(config_directory, command=("pyveg_run_pipeline",)):
    """Run the pipeline command once for every file in a directory.

    Each regular file directly inside ``config_directory`` is passed to the
    command as ``<command> --config_file <path>``. Sub-directories are
    ignored. Files are processed in sorted order so runs are reproducible.

    Parameters
    ----------
    config_directory
        Path to the directory holding the config files.
    command
        Sequence with the executable (and any leading arguments) to invoke.
        Defaults to the ``pyveg_run_pipeline`` entry point.

    Returns
    -------
    int
        Number of config files whose command failed. A run counts as failed
        when the process exits with a non-zero status or cannot be started
        at all (for example, the executable is not on ``PATH``).
    """
    # Looked up by name (spelling kept as configured at module level) so the
    # messages go through the handlers attached to that logger.
    log = logging.getLogger("pyveg_bulk_donwload_job")

    config_paths = sorted(
        join(config_directory, entry)
        for entry in listdir(config_directory)
        if isfile(join(config_directory, entry))
    )

    failed = 0
    for config_path in config_paths:
        # An argument list with no shell: paths containing spaces or shell
        # metacharacters are passed through verbatim.
        cmd = [*command, "--config_file", str(config_path)]
        printable = " ".join(shlex.quote(part) for part in cmd)
        log.info("Running gee download using the command: %s", printable)
        try:
            # check=True turns a non-zero exit status into CalledProcessError;
            # without it a failed download would go unnoticed.
            subprocess.run(cmd, check=True)
        except (subprocess.SubprocessError, OSError) as err:
            failed += 1
            log.error("Download using the command: %s failed (%s)", printable, err)

    return failed
Number of failed dowloads {n}") + + +if __name__ == "__main__": + main() diff --git a/pyveg/testdata/images_1024.parquet b/pyveg/testdata/images_1024.parquet new file mode 100644 index 00000000..72fb3471 Binary files /dev/null and b/pyveg/testdata/images_1024.parquet differ diff --git a/pyveg/tests/test_image_utils.py b/pyveg/tests/test_image_utils.py index 4d78e53e..9212858b 100644 --- a/pyveg/tests/test_image_utils.py +++ b/pyveg/tests/test_image_utils.py @@ -67,17 +67,6 @@ def test_compare_opposite_images(): assert compare_binary_images(img1, img2) < 0.1 -def test_create_gif_from_images(): - path_dir = os.path.join(os.path.dirname(__file__), "..", "testdata/") - create_gif_from_images(path_dir, "test", "black_and_white") - list_png_files = [ - f - for f in os.listdir(path_dir) - if (os.path.isfile(os.path.join(path_dir, f)) and f == "test.gif") - ] - assert len(list_png_files) == 1 - - def test_pillow_to_numpy(): img = Image.open( os.path.join(os.path.dirname(__file__), "..", "testdata", "white.png") diff --git a/setup.py b/setup.py index e7bf97bd..b70f98f8 100644 --- a/setup.py +++ b/setup.py @@ -26,6 +26,7 @@ "pyveg_generate_config=pyveg.scripts.generate_config_file:main", "pyveg_create_analysis_report=pyveg.scripts.create_analysis_report:main", "pyveg_analysis_summary_data=pyveg.scripts.analyse_pyveg_summary_data:main", + "pyveg_run_pipeline_loop=pyveg.scripts.run_pipeline_loop:main", ] }, )