dask-contrib · martindurant · Mar 28, 2024 · Mar 28, 2024 · Apr 4, 2024 · Apr 5, 2024
diff --git a/src/dask_awkward/layers/layers.py b/src/dask_awkward/layers/layers.py
@@ -30,13 +30,6 @@ def from_blockwise(cls, layer: Blockwise) -> AwkwardBlockwiseLayer:
         ob.__dict__.update(layer.__dict__)
         return ob
 
-    def mock(self) -> AwkwardBlockwiseLayer:
-        layer = copy.copy(self)
-        nb = layer.numblocks
-        layer.numblocks = {k: tuple(1 for _ in v) for k, v in nb.items()}
-        layer.__dict__.pop("_dims", None)
-        return layer
-
     def __getstate__(self) -> dict:
         # Indicator that this layer has been serialised
         state = self.__dict__.copy()
@@ -54,10 +47,6 @@ def __call__(self, *args, **kwargs): ...
 T = TypeVar("T")
 
 
-class ImplementsMocking(ImplementsIOFunction, Protocol):
-    def mock(self) -> AwkwardArray: ...
-
-
 class ImplementsMockEmpty(ImplementsIOFunction, Protocol):
     def mock_empty(self, backend: BackendT) -> AwkwardArray: ...
 
@@ -67,10 +56,8 @@ class ImplementsReport(ImplementsIOFunction, Protocol):
     def return_report(self) -> bool: ...
 
 
-class ImplementsProjection(ImplementsMocking, Protocol[T]):
-    def prepare_for_projection(self) -> tuple[AwkwardArray, TypeTracerReport, T]: ...
-
-    def project(self, report: TypeTracerReport, state: T) -> ImplementsIOFunction: ...
+class ImplementsProjection(Protocol[T]):
+    def project(self, columns: list[str]) -> ImplementsIOFunction: ...
 
 
 class ImplementsNecessaryColumns(ImplementsProjection[T], Protocol):
@@ -79,7 +66,7 @@ def necessary_columns(
     ) -> frozenset[str]: ...
 
 
-class IOFunctionWithMocking(ImplementsMocking, ImplementsIOFunction):
+class IOFunctionWithMocking(ImplementsIOFunction):
     def __init__(self, meta: AwkwardArray, io_func: ImplementsIOFunction):
         self._meta = meta
         self._io_func = io_func
@@ -92,21 +79,9 @@ def __getstate__(self) -> dict:
     def __call__(self, *args, **kwargs):
         return self._io_func(*args, **kwargs)
 
-    def mock(self) -> AwkwardArray:
-        assert self._meta is not None
-        return self._meta
-
 
 def io_func_implements_projection(func: ImplementsIOFunction) -> bool:
-    return hasattr(func, "prepare_for_projection")
-
-
-def io_func_implements_mocking(func: ImplementsIOFunction) -> bool:
-    return hasattr(func, "mock")
-
-
-def io_func_implements_mock_empty(func: ImplementsIOFunction) -> bool:
-    return hasattr(func, "mock_empty")
+    return hasattr(func, "project")
 
 
 def io_func_implements_columnar(func: ImplementsIOFunction) -> bool:
@@ -179,87 +154,14 @@ def is_projectable(self) -> bool:
             io_func_implements_projection(self.io_func) and not self.has_been_unpickled
         )
 
-    @property
-    def is_mockable(self) -> bool:
-        # isinstance(self.io_func, ImplementsMocking)
-        return io_func_implements_mocking(self.io_func)
-
     @property
     def is_columnar(self) -> bool:
         return io_func_implements_columnar(self.io_func)
 
-    def mock(self) -> AwkwardInputLayer:
-        assert self.is_mockable
-        return AwkwardInputLayer(
-            name=self.name,
-            inputs=[None][: int(list(self.numblocks.values())[0][0])],
-            io_func=lambda *_, **__: cast(ImplementsMocking, self.io_func).mock(),
-            label=self.label,
-            produces_tasks=self.produces_tasks,
-            creation_info=self.creation_info,
-            annotations=self.annotations,
-        )
-
-    def prepare_for_projection(self) -> tuple[AwkwardInputLayer, TypeTracerReport, T]:
-        """Mock the input layer as starting with a data-less typetracer.
-        This method is used to create new dask task graphs that
-        operate purely on typetracer Arrays (that is, array with
-        awkward structure but without real data buffers). This allows
-        us to test which parts of a real awkward array will be used in
-        a real computation. We do this by running a graph which starts
-        with mocked AwkwardInputLayers.
-
-        We mock an AwkwardInputLayer in these steps:
-        1. Ask the IO function to prepare a new meta array, and return
-           any transient state.
-        2. Build a new AwkwardInputLayer whose IO function just returns
-           this meta (typetracer) array
-        3. Return the new input layer and the transient state
-
-        When this new layer is added to a dask task graph and that
-        graph is computed, the report object will be mutated.
-        Inspecting the report object after the compute tells us which
-        buffers from the original form would be required for a real
-        compute with the same graph.
-        Returns
-        -------
-        AwkwardInputLayer
-            Copy of the input layer with data-less input.
-        TypeTracerReport
-            The report object used to track touched buffers.
-        Any
-            The black-box state object returned by the IO function.
-        """
-        assert self.is_projectable
-        new_meta_array, report, state = cast(
-            ImplementsProjection, self.io_func
-        ).prepare_for_projection()
-
-        new_return = new_meta_array
-        if io_func_implements_report(self.io_func):
-            if cast(ImplementsReport, self.io_func).return_report:
-                new_return = (new_meta_array, type(new_meta_array)([]))
-
-        new_input_layer = AwkwardInputLayer(
-            name=self.name,
-            inputs=[None][: int(list(self.numblocks.values())[0][0])],
-            io_func=AwkwardTokenizable(new_return, self.name),
-            label=self.label,
-            produces_tasks=self.produces_tasks,
-            creation_info=self.creation_info,
-            annotations=self.annotations,
-        )
-        return new_input_layer, report, state
-
-    def project(
-        self,
-        report: TypeTracerReport,
-        state: T,
-    ) -> AwkwardInputLayer:
+    def project(self, columns: list[str]) -> AwkwardInputLayer:
         assert self.is_projectable
-        io_func = cast(ImplementsProjection, self.io_func).project(
-            report=report, state=state
-        )
+        # Narrow the IO function to the projection protocol for type checkers.
+        io_func = cast(ImplementsProjection, self.io_func).project(columns)
         return AwkwardInputLayer(
             name=self.name,
             inputs=self.inputs,
@@ -270,12 +172,6 @@ def project(
             annotations=self.annotations,
         )
 
-    def necessary_columns(self, report: TypeTracerReport, state: T) -> frozenset[str]:
-        assert self.is_columnar
-        return cast(ImplementsNecessaryColumns, self.io_func).necessary_columns(
-            report=report, state=state
-        )
-
 
 class AwkwardMaterializedLayer(MaterializedLayer):
     def __init__(
@@ -290,68 +186,5 @@ def __init__(
         self.fn = fn
         super().__init__(mapping, **kwargs)
 
-    def mock(self) -> MaterializedLayer:
-        mapping = copy.copy(self.mapping)
-        if not mapping:
-            # no partitions at all
-            return self
-        name = next(iter(mapping))[0]
-
-        npln = len(self.previous_layer_names)
-        # one previous layer name
-        #
-        # this case is used for mocking repartition or slicing where
-        # we maybe have multiple partitions that need to be included
-        # in a task.
-        if npln == 1:
-            prev_name: str = self.previous_layer_names[0]
-            if (name, 0) in mapping:
-                task = mapping[(name, 0)]
-                task = tuple(
-                    (
-                        (prev_name, 0)
-                        if isinstance(v, tuple) and len(v) == 2 and v[0] == prev_name
-                        else v
-                    )
-                    for v in task
-                )
-
-                # when using Array.partitions we need to mock that we
-                # just want the first partition.
-                if len(task) == 2 and isinstance(task[1], int) and task[1] > 0:
-                    task = (task[0], 0)
-                return MaterializedLayer({(name, 0): task})
-            return self
-
-        # zero previous layers; this is likely a known scalar.
-        #
-        # we just use the existing mapping
-        elif npln == 0:
-            return MaterializedLayer({(name, 0): mapping[(name, 0)]})
-
-        # more than one previous_layer_names
-        #
-        # this case is needed for dak.concatenate on axis=0; we need
-        # the first partition of _each_ of the previous layer names!
-        else:
-            if self.fn is None:
-                raise ValueError(
-                    "For multiple previous layers the fn argument cannot be None."
-                )
-            name0s = tuple((name, 0) for name in self.previous_layer_names)
-            task = (self.fn, *name0s)
-            return MaterializedLayer({(name, 0): task})
-
-
-class AwkwardTreeReductionLayer(DataFrameTreeReduction):
-    def mock(self) -> AwkwardTreeReductionLayer:
-        return AwkwardTreeReductionLayer(
-            name=self.name,
-            name_input=self.name_input,
-            npartitions_input=1,
-            concat_func=self.concat_func,
-            tree_node_func=self.tree_node_func,
-            finalize_func=self.finalize_func,
-            split_every=self.split_every,
-            tree_node_name=self.tree_node_name,
-        )
+
+class AwkwardTreeReductionLayer(DataFrameTreeReduction): ...
diff --git a/src/dask_awkward/lib/core.py b/src/dask_awkward/lib/core.py
@@ -386,6 +386,10 @@ def _rebuild(self, dsk, *, rename=None):
     def __reduce__(self):
         return (Scalar, (self.dask, self.name, None, self.dtype, self.known_value))
 
+    @property
+    def report(self):
+        return getattr(self._meta, "_report", set())
+
     @property
     def dask(self) -> HighLevelGraph:
         return self._dask
@@ -398,6 +402,10 @@ def name(self) -> str:
     def key(self) -> Key:
         return (self._name, 0)
 
+    @property
+    def report(self):
+        return getattr(self._meta, "_report", set())
+
     def _check_meta(self, m):
         if isinstance(m, MaybeNone):
             return ak.Array(m.content)
@@ -713,6 +721,8 @@ def _check_meta(self, m: Any | None) -> Any | None:
     def __getitem__(self, where):
         token = tokenize(self, where)
         new_name = f"{where}-{token}"
+        report = self.report
+        [_.commit(new_name) for _ in report]
         new_meta = self._meta[where]
 
         # first check for array type return
@@ -723,6 +733,7 @@ def __getitem__(self, where):
                 graphlayer,
                 dependencies=[self],
             )
+            new_meta._report = report
             return new_array_object(hlg, new_name, meta=new_meta, npartitions=1)
 
         # then check for scalar (or record) type
@@ -733,6 +744,7 @@ def __getitem__(self, where):
             dependencies=[self],
         )
         if isinstance(new_meta, ak.Record):
+            new_meta._report = report
             return new_record_object(hlg, new_name, meta=new_meta)
         else:
             return new_scalar_object(hlg, new_name, meta=new_meta)
@@ -806,7 +818,7 @@ def new_record_object(dsk: HighLevelGraph, name: str, *, meta: Any) -> Record:
         raise TypeError(
             f"meta Record must have a typetracer backend, not {ak.backend(meta)}"
         )
-    return Record(dsk, name, meta)
+    return out
 
 
 def _is_numpy_or_cupy_like(arr: Any) -> bool:
@@ -937,6 +949,10 @@ def reset_meta(self) -> None:
         """Assign an empty typetracer array as the collection metadata."""
         self._meta = empty_typetracer()
 
+    @property
+    def report(self):
+        return getattr(self._meta, "_report", set())
+
     def repartition(
         self,
         npartitions: int | None = None,
@@ -972,6 +988,7 @@ def repartition(
         new_graph = HighLevelGraph.from_collections(
             key, new_layer, dependencies=(self,)
         )
+        [_.commit(key) for _ in self.report]
         return new_array_object(
             new_graph,
             key,
@@ -1173,7 +1190,7 @@ def _partitions(self, index: Any) -> Array:
         # otherwise nullify the known divisions
         else:
             new_divisions = (None,) * (len(new_keys) + 1)  # type: ignore
-
+        [_.commit(name) for _ in self.report]
         return new_array_object(
             graph, name, meta=self._meta, divisions=tuple(new_divisions)
         )
@@ -1395,6 +1412,7 @@ def _getitem_slice_on_zero(self, where):
             AwkwardMaterializedLayer(dask, previous_layer_names=[self.name]),
             dependencies=[self],
         )
+        [_.commit(name) for _ in self.report]
         return new_array_object(
             hlg,
             name,
@@ -1943,6 +1961,15 @@ def _map_partitions(
         if meta is None:
             meta = map_meta(fn, *args, **kwargs)
 
+        reps = set()
+        for dep in to_meta(deps):
+            # Metas with no (or an empty) ``_report`` contribute nothing.
+            for rep in getattr(dep, "_report", None) or ():
+                rep.commit(name)
+                reps.add(rep)
+
+        meta._report = reps
+
         hlg = HighLevelGraph.from_collections(
             name,
             lay,
@@ -1964,7 +1991,6 @@ def _map_partitions(
             new_divisions = tuple(map(lambda x: x * output_divisions, in_divisions))
     else:
         new_divisions = in_divisions
-
     if output_divisions is not None:
         return new_array_object(
             hlg,
@@ -2195,10 +2221,6 @@ def non_trivial_reduction(
     if combiner is None:
         combiner = reducer
 
-    # is_positional == True is not implemented
-    # if is_positional:
-    #     assert combiner is reducer
-
     # For `axis=None`, we prepare each array to have the following structure:
     #   [[[ ... [x1 x2 x3 ... xN] ... ]]] (length-1 outer lists)
     # This makes the subsequent reductions an `axis=-1` reduction
@@ -2273,14 +2295,15 @@ def non_trivial_reduction(
     )
 
     graph = HighLevelGraph.from_collections(name_finalize, trl, dependencies=(chunked,))
-
+    [_.commit(name_finalize) for _ in array.report]
     meta = reducer(
         array._meta,
         axis=axis,
         keepdims=keepdims,
         mask_identity=mask_identity,
     )
     if isinstance(meta, ak.highlevel.Array):
+        meta._report = array.report
         return new_array_object(graph, name_finalize, meta=meta, npartitions=1)
     else:
         return new_scalar_object(graph, name_finalize, meta=meta)