From 327f9f992a01218480241fb20a1ebf09644b76f1 Mon Sep 17 00:00:00 2001
From: Melih Elibol
Date: Thu, 24 Feb 2022 14:45:57 -0800
Subject: [PATCH] initial pass.

---
 nums/core/array/blockarray.py | 236 +++++--------------------
 nums/core/array/ops.py        | 321 ++++++++++++++++++++++++++++++++++
 nums/core/array/utils.py      |   6 +-
 3 files changed, 364 insertions(+), 199 deletions(-)
 create mode 100644 nums/core/array/ops.py

diff --git a/nums/core/array/blockarray.py b/nums/core/array/blockarray.py
index 88c52f46..fa5aadc8 100644
--- a/nums/core/array/blockarray.py
+++ b/nums/core/array/blockarray.py
@@ -23,6 +23,7 @@
 from nums.core.array.view import ArrayView
 from nums.core.grid.grid import ArrayGrid
 from nums.core.compute.compute_manager import ComputeManager
+from nums.core.array import ops

 # pylint: disable=too-many-lines

@@ -630,10 +631,6 @@ def to_block_array(obj, cm: ComputeManager, block_shape=None):
             block_shape = cm.get_block_shape(np_array.shape, np_array.dtype)
         return BlockArray.from_np(np_array, block_shape, False, cm)

-    def check_or_convert_other(self, other, compute_block_shape=False):
-        block_shape = None if compute_block_shape else self.block_shape
-        return BlockArray.to_block_array(other, self.cm, block_shape=block_shape)
-
     def ufunc(self, op_name):
         result = self.copy()
         for grid_entry in self.grid.get_entry_iterator():
@@ -769,7 +766,7 @@ def reduce_axis(self, op_name, axis, keepdims=False):
         return result

     #################
-    # Linear Algebra
+    # Tensor Algebra
     #################

     def _compute_tensordot_syskwargs(self, self_block: Block, other_block: Block):
@@ -779,71 +776,6 @@ def _compute_tensordot_syskwargs(self, self_block: Block, other_block: Block):
         else:
             return other_block.true_grid_entry(), other_block.true_grid_shape()

-    def tensordot(self, other, axes=2):
-        if isinstance(axes, int):
-            pass
-        elif array_utils.is_array_like(axes):
-            raise NotImplementedError("Non-integer axes is currently not supported.")
-        else:
-            raise TypeError(f"Unexpected axes type '{type(axes).__name__}'")
-
-        other = self.check_or_convert_other(other, compute_block_shape=True)
-
-        if array_utils.np_tensordot_param_test(
-            self.shape, self.ndim, other.shape, other.ndim, axes
-        ):
-            raise ValueError("shape-mismatch for sum")
-
-        this_axes = self.grid.grid_shape[:-axes]
-        this_sum_axes = self.grid.grid_shape[-axes:]
-        other_axes = other.grid.grid_shape[axes:]
-        other_sum_axes = other.grid.grid_shape[:axes]
-        assert this_sum_axes == other_sum_axes
-        result_shape = tuple(self.shape[:-axes] + other.shape[axes:])
-        result_block_shape = tuple(self.block_shape[:-axes] + other.block_shape[axes:])
-        result_grid = ArrayGrid(
-            shape=result_shape,
-            block_shape=result_block_shape,
-            dtype=array_utils.get_bop_output_type(
-                "tensordot", self.dtype, other.dtype
-            ).__name__,
-        )
-        assert result_grid.grid_shape == tuple(this_axes + other_axes)
-        result = BlockArray(result_grid, self.cm)
-        this_dims = list(itertools.product(*map(range, this_axes)))
-        other_dims = list(itertools.product(*map(range, other_axes)))
-        sum_dims = list(itertools.product(*map(range, this_sum_axes)))
-        for i in this_dims:
-            for j in other_dims:
-                grid_entry = tuple(i + j)
-                result_block: Block = result.blocks[grid_entry]
-                sum_oids = []
-                for k in sum_dims:
-                    self_block: Block = self.blocks[tuple(i + k)]
-                    other_block: Block = other.blocks[tuple(k + j)]
-                    dot_grid_args = self._compute_tensordot_syskwargs(
-                        self_block, other_block
-                    )
-                    dotted_oid = self.cm.bop(
-                        "tensordot",
-                        self_block.oid,
-                        other_block.oid,
-                        self_block.transposed,
-                        other_block.transposed,
-                        axes=axes,
-                        syskwargs={
-                            "grid_entry": dot_grid_args[0],
-                            "grid_shape": dot_grid_args[1],
-                        },
-                    )
-                    sum_oids.append(
-                        (dotted_oid, dot_grid_args[0], dot_grid_args[1], False)
-                    )
-                result_block.oid = self._tree_reduce(
-                    "sum", sum_oids, result_block.grid_entry, result_block.grid_shape
-                )
-        return result
-
     def __matmul__(self, other):
         if len(self.shape) > 2:
             # TODO (bcp): NumPy's implementation does a stacked matmul, which is not supported yet.
             raise NotImplementedError(
                 "Matrix multiply for tensors of rank > 2 not supported yet."
             )
         else:
-            return self.tensordot(other, 1)
+            return ops.tensordot(self.cm, self, other, axes=1)

     def __rmatmul__(self, other):
-        other = self.check_or_convert_other(other)
-        return other @ self
+        return ops.tensordot(self.cm, other, self, axes=1)

     __imatmul__ = __matmul__

@@ -863,52 +794,6 @@ def __rmatmul__(self, other):
     #################
     # Arithmetic
     #################

-    def _fast_element_wise(self, op_name, other):
-        """
-        Implements fast scheduling for basic element-wise operations.
-        """
-        dtype = array_utils.get_bop_output_type(op_name, self.dtype, other.dtype)
-        # Schedule the op first.
-        blocks = np.empty(shape=self.grid.grid_shape, dtype=Block)
-        for grid_entry in self.grid.get_entry_iterator():
-            self_block: Block = self.blocks[grid_entry]
-            other_block: Block = other.blocks[grid_entry]
-            blocks[grid_entry] = block = Block(
-                grid_entry=grid_entry,
-                grid_shape=self_block.grid_shape,
-                rect=self_block.rect,
-                shape=self_block.shape,
-                dtype=dtype,
-                transposed=False,
-                cm=self.cm,
-            )
-            block.oid = self.cm.bop(
-                op_name,
-                self_block.oid,
-                other_block.oid,
-                self_block.transposed,
-                other_block.transposed,
-                axes={},
-                syskwargs={
-                    "grid_entry": grid_entry,
-                    "grid_shape": self.grid.grid_shape,
-                },
-            )
-        return BlockArray(
-            ArrayGrid(self.shape, self.block_shape, dtype.__name__),
-            self.cm,
-            blocks=blocks,
-        )
-
-    def __elementwise__(self, op_name, other):
-        other = self.check_or_convert_other(other)
-        if self.shape == other.shape and self.block_shape == other.block_shape:
-            return self._fast_element_wise(op_name, other)
-        blocks_op = self.blocks.__getattribute__("__%s__" % op_name)
-        return BlockArray.from_blocks(
-            blocks_op(other.blocks), result_shape=None, cm=self.cm
-        )
-
     def __neg__(self):
         return self.ufunc("negative")

     def __pos__(self):
         return self.ufunc("positive")

     def __abs__(self):
         return self.ufunc("abs")

     def __mod__(self, other):
-        return self.__elementwise__("mod", other)
+        return ops.elementwise("mod", self, other, self.cm)

     def __rmod__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("mod", self)
+        return ops.elementwise("mod", other, self, self.cm)

     __imod__ = __mod__

     def __add__(self, other):
-        return self.__elementwise__("add", other)
+        return ops.elementwise("add", self, other, self.cm)

     def __radd__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("add", self)
+        return ops.elementwise("add", other, self, self.cm)

     __iadd__ = __add__

     def __sub__(self, other):
-        return self.__elementwise__("sub", other)
+        return ops.elementwise("sub", self, other, self.cm)

     def __rsub__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("sub", self)
+        return ops.elementwise("sub", other, self, self.cm)

     __isub__ = __sub__

     def __mul__(self, other):
-        return self.__elementwise__("mul", other)
+        return ops.elementwise("mul", self, other, self.cm)

     def __rmul__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("mul", self)
+        return ops.elementwise("mul", other, self, self.cm)

     __imul__ = __mul__

     def __truediv__(self, other):
-        return self.__elementwise__("truediv", other)
+        return ops.elementwise("truediv", self, other, self.cm)

     def __rtruediv__(self, other):
-        other = self.check_or_convert_other(other)
-        return other / self
+        return ops.elementwise("truediv", other, self, self.cm)

     __itruediv__ = __truediv__

     def __floordiv__(self, other):
-        return self.__elementwise__("floor_divide", other)
+        return ops.elementwise("floor_divide", self, other, self.cm)

     def __rfloordiv__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("floor_divide", self)
+        return ops.elementwise("floor_divide", other, self, self.cm)

     __ifloordiv__ = __floordiv__

     def __pow__(self, other):
-        return self.__elementwise__("pow", other)
+        return ops.elementwise("pow", self, other, self.cm)

     def __rpow__(self, other):
-        other = self.check_or_convert_other(other)
-        return other ** self
+        return ops.elementwise("pow", other, self, self.cm)

     __ipow__ = __pow__

@@ -985,70 +863,41 @@ def __rpow__(self, other):
     #################
     # Inequalities
     #################

-    def __inequality__(self, op, other):
-        other = self.check_or_convert_other(other)
-        assert (
-            other.shape == () or other.shape == self.shape
-        ), "Currently supports comparison with scalars only."
-        shape = array_utils.broadcast(self.shape, other.shape).shape
-        block_shape = array_utils.broadcast_block_shape(
-            self.shape, other.shape, self.block_shape
-        )
-        dtype = bool.__name__
-        grid = ArrayGrid(shape, block_shape, dtype)
-        result = BlockArray(grid, self.cm)
-        for grid_entry in result.grid.get_entry_iterator():
-            if other.shape == ():
-                other_block: Block = other.blocks.item()
-            else:
-                other_block: Block = other.blocks[grid_entry]
-            result.blocks[grid_entry] = self.blocks[grid_entry].bop(
-                op, other_block, args={}
-            )
-
-        return result
-
     def __ge__(self, other):
-        return self.__inequality__("ge", other)
+        return ops.inequality("ge", self, other, self.cm)

     def __rge__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("ge", self)
+        return ops.inequality("ge", other, self, self.cm)

     def __gt__(self, other):
-        return self.__inequality__("gt", other)
+        return ops.inequality("gt", self, other, self.cm)

     def __rgt__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("gt", self)
+        return ops.inequality("gt", other, self, self.cm)

     def __le__(self, other):
-        return self.__inequality__("le", other)
+        return ops.inequality("le", self, other, self.cm)

     def __rle__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("le", self)
+        return ops.inequality("le", other, self, self.cm)

     def __lt__(self, other):
-        return self.__inequality__("lt", other)
+        return ops.inequality("lt", self, other, self.cm)

     def __rlt__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("lt", self)
+        return ops.inequality("lt", other, self, self.cm)

     def __eq__(self, other):
-        return self.__inequality__("eq", other)
+        return ops.inequality("eq", self, other, self.cm)

     def __req__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("eq", self)
+        return ops.inequality("eq", other, self, self.cm)

     def __ne__(self, other):
-        return self.__inequality__("ne", other)
+        return ops.inequality("ne", self, other, self.cm)

     def __rne__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__inequality__("ne", self)
+        return ops.inequality("ne", other, self, self.cm)

     ##################
     # Boolean
     ##################

@@ -1066,47 +915,42 @@ def __invert__(self):
         return self.ufunc("invert")

     def __or__(self, other):
-        return self.__elementwise__("bitwise_or", other)
+        return ops.elementwise("bitwise_or", self, other, self.cm)

     def __ror__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("bitwise_or", self)
+        return ops.elementwise("bitwise_or", other, self, self.cm)

     __ior__ = __or__

     def __and__(self, other):
-        return self.__elementwise__("bitwise_and", other)
+        return ops.elementwise("bitwise_and", self, other, self.cm)

     def __rand__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("bitwise_and", self)
+        return ops.elementwise("bitwise_and", other, self, self.cm)

     __iand__ = __and__

     def __xor__(self, other):
-        return self.__elementwise__("bitwise_xor", other)
+        return ops.elementwise("bitwise_xor", self, other, self.cm)

     def __rxor__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("bitwise_xor", self)
+        return ops.elementwise("bitwise_xor", other, self, self.cm)

     __ixor__ = __xor__

     def __lshift__(self, other):
-        return self.__elementwise__("left_shift", other)
+        return ops.elementwise("left_shift", self, other, self.cm)

     def __rlshift__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("left_shift", self)
+        return ops.elementwise("left_shift", other, self, self.cm)

     __ilshift__ = __lshift__

     def __rshift__(self, other):
-        return self.__elementwise__("right_shift", other)
+        return ops.elementwise("right_shift", self, other, self.cm)

     def __rrshift__(self, other):
-        other = self.check_or_convert_other(other)
-        return other.__elementwise__("right_shift", self)
+        return ops.elementwise("right_shift", other, self, self.cm)

     __irshift__ = __rshift__

diff --git a/nums/core/array/ops.py b/nums/core/array/ops.py
new file mode 100644
index 00000000..dc6b85c7
--- /dev/null
+++ b/nums/core/array/ops.py
@@ -0,0 +1,321 @@
import itertools

import numpy as np

from nums.core.array import utils as array_utils
from nums.core.array.base import Block
from nums.core.grid.grid import ArrayGrid
from nums.core.compute.compute_manager import ComputeManager


def atleast_nd_array(obj):
    # Normalize Python scalars, lists, and tuples to ndarrays so that shape,
    # ndim, and size are available downstream. ndarrays and BlockArrays pass
    # through unchanged.
    if isinstance(obj, (bool, int, float, complex, list, tuple)):
        return np.asarray(obj)
    return obj


def tree_reduce(
    cm: ComputeManager, op_name, blocks_or_oids, result_grid_entry, result_grid_shape
):
    """
    Basic tree reduce implementation.
    Schedules op on same node as left operand.
    :param op_name: The reduction op.
    :param blocks_or_oids: A list of type Block or a list of tuples.
    Tuples must be of the form
    (oid, grid_entry, grid_shape, transposed)
    :param result_grid_entry: The grid entry of the result block. This will be used
    to compute the final reduction step.
    :param result_grid_shape: The grid shape of the result block. This will be used
    to compute the final reduction step.
    :return: The oid of the result.
    """
    oid_list = blocks_or_oids
    if isinstance(blocks_or_oids[0], Block):
        oid_list = [
            (b.oid, b.grid_entry, b.grid_shape, b.transposed) for b in blocks_or_oids
        ]
    if len(oid_list) == 1:
        return oid_list[0][0]
    q = oid_list
    while len(q) > 1:
        a_oid, a_ge, a_gs, a_T = q.pop(0)
        b_oid, _, _, b_T = q.pop(0)
        ge, gs = (
            (result_grid_entry, result_grid_shape) if len(q) == 0 else (a_ge, a_gs)
        )
        c_oid = cm.bop_reduce(
            op_name,
            a_oid,
            b_oid,
            a_T,
            b_T,
            syskwargs={
                "grid_entry": ge,
                "grid_shape": gs,
            },
        )
        q.append((c_oid, ge, gs, False))
    r_oid, r_ge, r_gs, _ = q.pop(0)
    assert r_ge == result_grid_entry
    assert r_gs == result_grid_shape
    return r_oid
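
# Illustrative usage sketch (comment only; `cm` and `partial_blocks` are
# hypothetical names, not defined by this patch): reduce the partial products
# of one output block to a single object, finalizing the last step at grid
# entry (0, 0) of a 1x1 result grid.
#
#   oid = tree_reduce(
#       cm,               # a ComputeManager instance
#       "sum",            # name of the reduction op
#       partial_blocks,   # list of Block, or (oid, grid_entry, grid_shape, transposed) tuples
#       (0, 0),           # result grid entry
#       (1, 1),           # result grid shape
#   )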
+ """ + oid_list = blocks_or_oids + if isinstance(blocks_or_oids[0], Block): + oid_list = [ + (b.oid, b.grid_entry, b.grid_shape, b.transposed) + for b in blocks_or_oids + ] + if len(oid_list) == 1: + return oid_list[0][0] + q = oid_list + while len(q) > 1: + a_oid, a_ge, a_gs, a_T = q.pop(0) + b_oid, _, _, b_T = q.pop(0) + ge, gs = ( + (result_grid_entry, result_grid_shape) if len(q) == 0 else (a_ge, a_gs) + ) + c_oid = cm.bop_reduce( + op_name, + a_oid, + b_oid, + a_T, + b_T, + syskwargs={ + "grid_entry": ge, + "grid_shape": gs, + }, + ) + q.append((c_oid, ge, gs, False)) + r_oid, r_ge, r_gs, _ = q.pop(0) + assert r_ge == result_grid_entry + assert r_gs == result_grid_shape + return r_oid + + +def get_compatible_tensordot_operands(cm, left, right, axes: int): + from nums.core.array.blockarray import BlockArray + + def right_from_left_ba(left, right): + # Use this when left.size > right.size. + if not isinstance(left, BlockArray): + # Make left a block array if it's not already. + left = BlockArray.to_block_array(left, cm) + + if isinstance(right, BlockArray): + if left.block_shape[-axes:] == right.block_shape[:axes]: + return left, right + else: + compatible_right_block_shape = tuple(left.block_shape[-axes:] + right.block_shape[axes:]) + return left, right.reshape(block_shape=compatible_right_block_shape) + + assert isinstance(right, np.ndarray) + # Other is the right operand of a tensordot over k axes. + # Set the first k dims of the right operand's block shape + # equal to the last k dims of the left operand's block shape. + # Set the dim of the rest of the axes to the corresponding dim of the right operand's shape. + # This last step is a conservative solution to the partitioning choices we have for the + # remaining dims. + compatible_right_block_shape = tuple(left.block_shape[-axes:] + right.shape[axes:]) + return left, BlockArray.to_block_array(right, cm, + block_shape=compatible_right_block_shape) + + def left_from_right_ba(left, right): + # Use this when left.size <= right.size. + if not isinstance(right, BlockArray): + # Make right a block array if it's not already. + right = BlockArray.to_block_array(right, cm) + + if isinstance(left, BlockArray): + if left.block_shape[-axes:] == right.block_shape[:axes]: + return left, right + else: + compatible_left_block_shape = tuple(left.block_shape[:-axes] + right.block_shape[:axes]) + return left.reshape(block_shape=compatible_left_block_shape), right + + assert isinstance(left, np.ndarray) + compatible_left_block_shape = tuple(left.shape[:-axes] + right.block_shape[:axes]) + return BlockArray.to_block_array(left, cm, block_shape=compatible_left_block_shape), \ + right + + left = atleast_nd_array(left) + right = atleast_nd_array(right) + # Make sure we have enough dims for tensordot. + assert axes <= len(left.shape) + assert axes <= len(right.shape) + + if array_utils.np_tensordot_param_test( + left.shape, left.ndim, right.shape, right.ndim, axes + ): + raise ValueError("shape-mismatch for tensordot.") + + # Four possibilities: + # 1. Left is blockarray, right is ndarray. + # 2. Left is ndarray, right is blockarray. + # 3. Both left and right are ndarray. + # 4. Both left and right are blockarray. + if left.size > right.size: + # Covers all 4 cases when we prefer to repartition right array based on left array. + return right_from_left_ba(left, right) + else: + # Covers all 4 cases when we prefer to repartition left array based on right array. 


def _compute_tensordot_syskwargs(self_block: Block, other_block: Block):
    # Schedule on larger block.
    if np.product(self_block.shape) >= np.product(other_block.shape):
        return self_block.true_grid_entry(), self_block.true_grid_shape()
    else:
        return other_block.true_grid_entry(), other_block.true_grid_shape()


def tensordot(cm, left, right, axes=2):
    # Imported here to avoid a circular import with blockarray.py.
    from nums.core.array.blockarray import BlockArray

    if isinstance(axes, int):
        pass
    elif array_utils.is_array_like(axes):
        raise NotImplementedError("Non-integer axes is currently not supported.")
    else:
        raise TypeError(f"Unexpected axes type '{type(axes).__name__}'")

    left, right = get_compatible_tensordot_operands(cm, left, right, axes=axes)

    this_axes = left.grid.grid_shape[:-axes]
    this_sum_axes = left.grid.grid_shape[-axes:]
    other_axes = right.grid.grid_shape[axes:]
    other_sum_axes = right.grid.grid_shape[:axes]
    assert this_sum_axes == other_sum_axes
    result_shape = tuple(left.shape[:-axes] + right.shape[axes:])
    result_block_shape = tuple(left.block_shape[:-axes] + right.block_shape[axes:])
    result_grid = ArrayGrid(
        shape=result_shape,
        block_shape=result_block_shape,
        dtype=array_utils.get_bop_output_type(
            "tensordot", left.dtype, right.dtype
        ).__name__,
    )
    assert result_grid.grid_shape == tuple(this_axes + other_axes)
    result = BlockArray(result_grid, cm)
    this_dims = list(itertools.product(*map(range, this_axes)))
    other_dims = list(itertools.product(*map(range, other_axes)))
    sum_dims = list(itertools.product(*map(range, this_sum_axes)))
    for i in this_dims:
        for j in other_dims:
            grid_entry = tuple(i + j)
            result_block: Block = result.blocks[grid_entry]
            sum_oids = []
            for k in sum_dims:
                self_block: Block = left.blocks[tuple(i + k)]
                other_block: Block = right.blocks[tuple(k + j)]
                dot_grid_args = _compute_tensordot_syskwargs(self_block, other_block)
                dotted_oid = cm.bop(
                    "tensordot",
                    self_block.oid,
                    other_block.oid,
                    self_block.transposed,
                    other_block.transposed,
                    axes=axes,
                    syskwargs={
                        "grid_entry": dot_grid_args[0],
                        "grid_shape": dot_grid_args[1],
                    },
                )
                sum_oids.append(
                    (dotted_oid, dot_grid_args[0], dot_grid_args[1], False)
                )
            result_block.oid = tree_reduce(
                cm, "sum", sum_oids, result_block.grid_entry, result_block.grid_shape
            )
    return result
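
# Usage sketch (hedged; `x` and `y` are assumed to be rank-2 BlockArrays with
# matching inner dims): with the blockarray.py changes above, the matmul
# dunder now routes here, so these two forms are expected to be equivalent.
#
#   z1 = tensordot(cm, x, y, axes=1)
#   z2 = x @ y  # BlockArray.__matmul__ -> ops.tensordot(cm, x, y, axes=1)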


def get_compatible_elementwise_operands(left, right, cm):
    # Imported here to avoid a circular import with blockarray.py.
    from nums.core.array.blockarray import BlockArray

    def truncated_block_shape(block_shape_a, shape_b):
        # Creates block shape for b based on b's available dims.
        # Example: If a has shape NxMxK and b has shape K,
        # set b.block_shape = a.block_shape[-1:].
        assert len(block_shape_a) >= len(shape_b)
        block_shape_b = []
        for i in range(-len(shape_b), 0):
            block_shape_b.append(block_shape_a[i])
        return tuple(block_shape_b)

    def broadcast_a_to_b(a, b):
        # Use this when a has at least as many dims as b.
        if not isinstance(a, BlockArray):
            # Make a a block array if it's not already.
            a = BlockArray.to_block_array(a, cm)

        if len(a.shape) == len(b.shape):
            b_block_shape = a.block_shape
        else:
            b_block_shape = truncated_block_shape(a.block_shape, b.shape)

        if isinstance(b, BlockArray):
            if a.block_shape == b.block_shape:
                return a, b
            else:
                return a, b.reshape(block_shape=b_block_shape)
        else:
            return a, BlockArray.to_block_array(b, a.cm, block_shape=b_block_shape)

    left = atleast_nd_array(left)
    right = atleast_nd_array(right)
    assert array_utils.can_broadcast_shapes(left.shape, right.shape)

    # Handle cases where ndims don't match by repartitioning
    # the smaller operand to agree with the larger one.
    if left.ndim > right.ndim:
        left, right = broadcast_a_to_b(left, right)
    else:
        right, left = broadcast_a_to_b(right, left)
    return left, right


def fast_element_wise(op_name, left, right):
    """
    Implements fast scheduling for basic element-wise operations.
    """
    from nums.core.array.blockarray import BlockArray

    dtype = array_utils.get_bop_output_type(op_name, left.dtype, right.dtype)
    # Schedule the op first.
    blocks = np.empty(shape=left.grid.grid_shape, dtype=Block)
    for grid_entry in left.grid.get_entry_iterator():
        self_block: Block = left.blocks[grid_entry]
        other_block: Block = right.blocks[grid_entry]
        blocks[grid_entry] = block = Block(
            grid_entry=grid_entry,
            grid_shape=self_block.grid_shape,
            rect=self_block.rect,
            shape=self_block.shape,
            dtype=dtype,
            transposed=False,
            cm=left.cm,
        )
        block.oid = left.cm.bop(
            op_name,
            self_block.oid,
            other_block.oid,
            self_block.transposed,
            other_block.transposed,
            axes={},
            syskwargs={
                "grid_entry": grid_entry,
                "grid_shape": left.grid.grid_shape,
            },
        )
    return BlockArray(
        ArrayGrid(left.shape, left.block_shape, dtype.__name__),
        left.cm,
        blocks=blocks,
    )


def elementwise(op_name, left, right, cm):
    from nums.core.array.blockarray import BlockArray

    left, right = get_compatible_elementwise_operands(left, right, cm)
    if left.shape == right.shape and left.block_shape == right.block_shape:
        return fast_element_wise(op_name, left, right)
    blocks_op = left.blocks.__getattribute__("__%s__" % op_name)
    return BlockArray.from_blocks(
        blocks_op(right.blocks), result_shape=None, cm=left.cm
    )


def inequality(op, left, right, cm):
    from nums.core.array.blockarray import BlockArray

    left, right = get_compatible_elementwise_operands(left, right, cm)
    assert (
        right.shape == () or right.shape == left.shape
    ), "Currently supports comparison with scalars only."
    shape = array_utils.broadcast(left.shape, right.shape).shape
    block_shape = array_utils.broadcast_block_shape(
        left.shape, right.shape, left.block_shape
    )
    dtype = bool.__name__
    grid = ArrayGrid(shape, block_shape, dtype)
    result = BlockArray(grid, cm)
    for grid_entry in result.grid.get_entry_iterator():
        if right.shape == ():
            other_block: Block = right.blocks.item()
        else:
            other_block: Block = right.blocks[grid_entry]
        result.blocks[grid_entry] = left.blocks[grid_entry].bop(
            op, other_block, args={}
        )

    return result
diff --git a/nums/core/array/utils.py b/nums/core/array/utils.py
index b1b22798..07a5fb3b 100644
--- a/nums/core/array/utils.py
+++ b/nums/core/array/utils.py
@@ -123,9 +123,9 @@ def broadcast(a_shape, b_shape):


 def broadcast_block_shape(a_shape, b_shape, a_block_shape):
-    # Starting from last block shape dim and
-    # map each shape dim to block shape dim as already defined,
-    # and for the rest of dims, set block shape to 1.
+    # Start from the last block shape dim and
+    # map each shape dim to a block shape dim as already defined.
+    # For the remaining dims, set the block shape to 1.
     result_shape = broadcast(a_shape, b_shape).shape
     result_block_shape = []
     a_block_shape_r = list(reversed(a_block_shape))
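
Usage note (an illustrative sketch, not part of the patch): with these
changes, the BlockArray dunders delegate to nums.core.array.ops, and scalar
operands are normalized in ops rather than in check_or_convert_other. For a
BlockArray `x` and its ComputeManager `cm` (both assumed to be in scope), the
following two expressions should agree:

    import numpy as np
    from nums.core.array import ops

    y1 = x + 1                             # BlockArray.__add__ -> ops.elementwise
    y2 = ops.elementwise("add", x, 1, cm)  # direct call, scalar rhs converted internally
    assert np.all((y1 == y2).get())        # .get() fetches the materialized ndarray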