Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Optimize TaskVineExecutor #3724

Draft
wants to merge 15 commits into
base: master
Choose a base branch
from
79 changes: 73 additions & 6 deletions parsl/executors/taskvine/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import uuid
from concurrent.futures import Future
from datetime import datetime
from typing import List, Literal, Optional, Union
from typing import Dict, List, Literal, Optional, Union

# Import other libraries
import typeguard
Expand Down Expand Up @@ -81,8 +81,12 @@ class TaskVineExecutor(BlockProviderExecutor, putils.RepresentationMixin):
pre-warmed forked python process.
Default is 'regular'.

use_tmp_dir_for_staging: bool
Whether to use tmp dir for staging functions, arguments, and results.
Default is True.

manager_config: TaskVineManagerConfig
Configuration for the TaskVine manager. Default
Configuration for the TaskVine manager.

factory_config: TaskVineFactoryConfig
Configuration for the TaskVine factory.
Expand All @@ -104,6 +108,7 @@ def __init__(self,
label: str = "TaskVineExecutor",
worker_launch_method: Union[Literal['provider'], Literal['factory'], Literal['manual']] = 'factory',
function_exec_mode: Union[Literal['regular'], Literal['serverless']] = 'regular',
use_tmp_dir_for_staging: bool = True,
manager_config: TaskVineManagerConfig = TaskVineManagerConfig(),
factory_config: TaskVineFactoryConfig = TaskVineFactoryConfig(),
provider: Optional[ExecutionProvider] = LocalProvider(init_blocks=1),
Expand All @@ -129,6 +134,7 @@ def __init__(self,
self.label = label
self.worker_launch_method = worker_launch_method
self.function_exec_mode = function_exec_mode
self.use_tmp_dir_for_staging = use_tmp_dir_for_staging
self.manager_config = manager_config
self.factory_config = factory_config
self.storage_access = storage_access
Expand Down Expand Up @@ -172,6 +178,13 @@ def __init__(self,
# Path to directory that holds all tasks' data and results.
self._function_data_dir = ""

# Mapping of function names to function details.
# Currently the values include function objects, path to serialized functions,
# path to serialized function contexts, and whether functions are serialized.
# Helpful to detect inconsistencies in serverless functions.
# Helpful to deduplicate the same function.
self._map_func_names_to_func_details: Dict[str, Dict] = {}

# Helper scripts to prepare package tarballs for Parsl apps
self._package_analyze_script = shutil.which("poncho_package_analyze")
self._package_create_script = shutil.which("poncho_package_create")
Expand Down Expand Up @@ -218,8 +231,12 @@ def __create_data_and_logging_dirs(self):
# Create directories for data and results
log_dir = os.path.join(run_dir, self.label)
os.makedirs(log_dir)
tmp_prefix = f'{self.label}-{getpass.getuser()}-{datetime.now().strftime("%Y%m%d%H%M%S%f")}-'
self._function_data_dir = tempfile.TemporaryDirectory(prefix=tmp_prefix)

if self.use_tmp_dir_for_staging:
tmp_prefix = f'{self.label}-{getpass.getuser()}-{datetime.now().strftime("%Y%m%d%H%M%S%f")}-'
self._function_data_dir = tempfile.TemporaryDirectory(prefix=tmp_prefix)
else:
self._function_data_dir = os.path.join(log_dir, 'function')

# put TaskVine logs outside of a Parsl run as TaskVine caches between runs while
# Parsl does not.
Expand Down Expand Up @@ -318,11 +335,35 @@ def submit(self, func, resource_specification, *args, **kwargs):
Keyword arguments to the Parsl app
"""

# a Parsl function must have a name
if func.__name__ is None:
raise ValueError('A Parsl function must have a name')

logger.debug(f'Got resource specification: {resource_specification}')

# If `_parsl_monitoring_task_id` is in kwargs, Parsl monitoring code is enabled.
is_monitoring_enabled = '_parsl_monitoring_task_id' in kwargs

# Default execution mode of apps is regular
exec_mode = resource_specification.get('exec_mode', self.function_exec_mode)

# Fall back to regular execution if a function is Parsl-monitored as a monitored function is invocation-specific.
# Note that it is possible to get the wrapped function by calling the `__wrapped__` attribute when monitoring is enabled.
# It will disable the monitoring wrapper code however.
if exec_mode == 'serverless' and is_monitoring_enabled:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a bit suspicious to me -- what is it about monitoring that is special here?

is it that there is a new function definition for every submission? If so, there are other parsl modes like some file staging scenarios, where that can happen too.

Although I think that is also what the block below, with function equality checking and fallback to regular is trying to do too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are right that, for the serverless execution model to work, a function func must be a generic function rather than an invocation-specific one. The monitoring code replaces the original function with an invocation-specific version carrying the task_id and so on. For file staging, I have been poking around and found that as long as the storage_access attribute is None — which it is for TaskVineExecutor — the DFK won't replace the original function with a file-staging wrapper (I might be wrong here); file staging is instead delegated to TaskVine's specialized file-staging API.

The code block with function id checking is as you said a detection mechanism to fall back to the regular execution method if functions do change.

Thanks for reviewing it early! I was gonna fill this PR with more descriptions, tests, and documentation before pinging you.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

people don't use file staging much, but it might not be None — for example, I know one user who uses Work Queue plus zip file staging in Parsl, and that should translate directly to TaskVine + zip file staging.

does that monitoring test detect anything that the line below it doesn't already detect? or is it for getting a nicer error message?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is for getting a nicer message so users may know what's wrong specifically with each case. The monitoring detection can be removed without affecting functionality.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, the first monitoring check handles the case where the first invocation of a function runs serverlessly and subsequent invocations run regularly.

logger.warning("A serverless task cannot run with Parsl monitoring enabled. Falling back to execute this task as a regular task.")
exec_mode = 'regular'

if exec_mode == 'serverless':
if func.__name__ not in self._map_func_names_to_func_details:
self._map_func_names_to_func_details[func.__name__] = {'func_obj': func}
else:
if id(func) != id(self._map_func_names_to_func_details[func.__name__]['func_obj']):
logger.warning('Inconsistency in a serverless function call detected.\
A function name cannot point to two different function objects.\
Falling back to executing it as a regular task.')
exec_mode = 'regular'

# Detect resources and features of a submitted Parsl app
cores = None
memory = None
Expand Down Expand Up @@ -386,19 +427,43 @@ def submit(self, func, resource_specification, *args, **kwargs):
argument_file = None
result_file = None
map_file = None
function_context_file = None

# Get path to files that will contain the pickled function,
# arguments, result, and map of input and output files
function_file = self._path_in_task(executor_task_id, "function")
if exec_mode == 'serverless':
if 'function_file' not in self._map_func_names_to_func_details[func.__name__]:
function_file = os.path.join(self._function_data_dir.name, func.__name__, 'function')
os.makedirs(os.path.join(self._function_data_dir.name, func.__name__))
self._map_func_names_to_func_details[func.__name__].update({'function_file': function_file, 'is_serialized': False})
else:
function_file = self._map_func_names_to_func_details[func.__name__]['function_file']
else:
function_file = self._path_in_task(executor_task_id, "function")
argument_file = self._path_in_task(executor_task_id, "argument")
result_file = self._path_in_task(executor_task_id, "result")
map_file = self._path_in_task(executor_task_id, "map")

if exec_mode == 'serverless':
if 'function_context' in resource_specification:
if 'function_context_file' not in self._map_func_names_to_func_details[func.__name__]:
function_context = resource_specification.get('function_context')
function_context_args = resource_specification.get('function_context_args', [])
function_context_kwargs = resource_specification.get('function_context_kwargs', {})
function_context_file = os.path.join(self._function_data_dir.name, func.__name__, 'function_context')
self._serialize_object_to_file(function_context_file, [function_context, function_context_args, function_context_kwargs])
self._map_func_names_to_func_details[func.__name__].update({'function_context_file': function_context_file})
else:
function_context_file = self._map_func_names_to_func_details[func.__name__]['function_context_file']

logger.debug("Creating executor task {} with function at: {}, argument at: {}, \
and result to be found at: {}".format(executor_task_id, function_file, argument_file, result_file))

# Serialize function object and arguments, separately
self._serialize_object_to_file(function_file, func)
if exec_mode == 'regular' or not self._map_func_names_to_func_details[func.__name__]['is_serialized']:
self._serialize_object_to_file(function_file, func)
if exec_mode == 'serverless':
self._map_func_names_to_func_details[func.__name__]['is_serialized'] = True
args_dict = {'args': args, 'kwargs': kwargs}
self._serialize_object_to_file(argument_file, args_dict)

Expand All @@ -419,6 +484,7 @@ def submit(self, func, resource_specification, *args, **kwargs):
category = func.__name__ if self.manager_config.autocategory else 'parsl-default'

task_info = ParslTaskToVine(executor_id=executor_task_id,
func_name=func.__name__,
exec_mode=exec_mode,
category=category,
input_files=input_files,
Expand All @@ -427,6 +493,7 @@ def submit(self, func, resource_specification, *args, **kwargs):
function_file=function_file,
argument_file=argument_file,
result_file=result_file,
function_context_file=function_context_file,
cores=cores,
memory=memory,
disk=disk,
Expand Down
Loading