This repository has been archived by the owner on Oct 16, 2023. It is now read-only.

Implement MaskedSoftmax
danny.jang authored Oct 4, 2023
1 parent 21ab240 commit 04073ec
Showing 10 changed files with 512 additions and 3 deletions.
66 changes: 66 additions & 0 deletions benchmarks/benchmark_masked_softmax.py
@@ -0,0 +1,66 @@
# Copyright 2023 ⓒ Kakao Brain Corp.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import triton
import util

import trident


def masked_softmax(input: torch.Tensor, mask: torch.Tensor, dim: int):
    # fill positions where mask is nonzero with -inf so they receive zero probability
    input = torch.where(mask.bool(), float("-inf"), input)
output = torch.nn.functional.softmax(input, dim)

return output


def build_mask(y_size: int, x_size: int, device=None, dtype=None):
mask = torch.randint(0, 2, (y_size, x_size), device=device, dtype=dtype)
    # keep the first row and column unmasked so softmax along either dimension
    # always sees at least one finite entry (an all-masked slice would yield NaN)
    mask[0, :] = mask[:, 0] = 0

return mask


@util.report("masked softmax forward", ["x_size"], [128 * i for i in range(1, 21)], {"y_size": 16})
def bench_masked_softmax_forward(y_size, x_size, dtype, backend):
input = torch.randn(y_size, x_size, device="cuda", dtype=dtype)
mask = build_mask(y_size, x_size, "cuda", dtype)

if backend == "torch":
return triton.testing.do_bench_cudagraph(lambda: masked_softmax(input, mask, 1))
else:
return triton.testing.do_bench_cudagraph(lambda: trident.function.masked_softmax(input, mask, 1))


@util.report("masked softmax backward", ["x_size"], [128 * i for i in range(1, 21)], {"y_size": 16})
def bench_masked_softmax_backward(y_size, x_size, dtype, backend):
factory_kwargs = {"device": "cuda", "dtype": dtype}
input = torch.randn(y_size, x_size, **factory_kwargs, requires_grad=True)
mask = build_mask(y_size, x_size, "cuda", dtype)
grad_output = torch.randn(y_size, x_size, **factory_kwargs)

if backend == "torch":
output = masked_softmax(input, mask, 1)
else:
output = trident.function.masked_softmax(input, mask, 1)

return triton.testing.do_bench_cudagraph(lambda: output.backward(grad_output, retain_graph=True))


def run_benchmark(mode, show_plots, dtype):
if mode == "forward":
bench_masked_softmax_forward.run(print_data=True, show_plots=show_plots, dtype=dtype)
else:
bench_masked_softmax_backward.run(print_data=True, show_plots=show_plots, dtype=dtype)
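
Note on the masking convention used throughout this commit: positions where the mask is nonzero are filled with -inf before the softmax and therefore receive zero probability. A minimal sketch with hypothetical values illustrating the reference semantics:

import torch

x = torch.tensor([[1.0, 2.0, 3.0]])
mask = torch.tensor([[0, 0, 1]])  # 1 marks a position to exclude

masked = torch.where(mask.bool(), float("-inf"), x)
probs = torch.nn.functional.softmax(masked, dim=1)
# probs ≈ [[0.2689, 0.7311, 0.0000]]: softmax is taken over the unmasked entries only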
5 changes: 5 additions & 0 deletions benchmarks/benchmarker.py
@@ -26,6 +26,7 @@
import benchmark_layer_norm
import benchmark_leaky_relu
import benchmark_linear
import benchmark_masked_softmax
import benchmark_max
import benchmark_mean
import benchmark_prelu
@@ -58,6 +59,7 @@ def print_scenarios():
"layer-norm",
"leaky-relu",
"linear",
"masked-softmax",
"max",
"mean",
"prelu",
@@ -99,6 +101,8 @@ def run_benchmarks(scenario, mode, show_plots, dtype):
benchmark_leaky_relu.run_benchmark(mode, show_plots, dtype)
elif scenario == "linear":
benchmark_linear.run_benchmark(mode, show_plots, dtype)
elif scenario == "masked-softmax":
benchmark_masked_softmax.run_benchmark(mode, show_plots, dtype)
elif scenario == "max":
benchmark_max.run_benchmark(mode, show_plots, dtype)
elif scenario == "mean":
@@ -134,6 +138,7 @@ def run_benchmarks(scenario, mode, show_plots, dtype):
benchmark_layer_norm.run_benchmark(mode, show_plots, dtype)
benchmark_leaky_relu.run_benchmark(mode, show_plots, dtype)
benchmark_linear.run_benchmark(mode, show_plots, dtype)
benchmark_masked_softmax.run_benchmark(mode, show_plots, dtype)
benchmark_max.run_benchmark(mode, show_plots, dtype)
benchmark_mean.run_benchmark(mode, show_plots, dtype)
benchmark_prelu.run_benchmark(mode, show_plots, dtype)
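
The @util.report decorator used in benchmark_masked_softmax.py above is defined elsewhere in the repository and is not part of this diff. As a purely hypothetical sketch of the shape such a helper could take on top of triton.testing.perf_report (all names and defaults below are assumptions, not the repository's actual code):

import triton


def report(title, x_names, x_vals, args):
    # hypothetical: build a triton.testing.Benchmark that sweeps x_vals and
    # draws one line per backend, then defer to triton.testing.perf_report
    def decorator(func):
        return triton.testing.perf_report(
            triton.testing.Benchmark(
                x_names=x_names,
                x_vals=x_vals,
                line_arg="backend",
                line_vals=["torch", "trident"],
                line_names=["torch", "trident"],
                plot_name=title,
                args=args,
                ylabel="milliseconds",
            )
        )(func)

    return decorator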
77 changes: 77 additions & 0 deletions tests/test_masked_softmax.py
@@ -0,0 +1,77 @@
# Copyright 2023 ⓒ Kakao Brain Corp.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
import torch

import trident
from tests import util


def masked_softmax(input: torch.Tensor, mask: torch.Tensor, dim: int):
input = torch.where(mask.bool(), float("-inf"), input)
output = torch.nn.functional.softmax(input, dim)

return output


def build_mask(y_size: int, x_size: int, device=None):
mask = torch.randint(0, 2, (y_size, x_size), device=device)
mask[0, :] = mask[:, 0] = 0

return mask


@pytest.mark.parametrize("y_size, x_size, dim", [(2, 512, 0), (3, 1000, 1)])
def test_forward(y_size, x_size, dim, device):
input = torch.randn(y_size, x_size, device=device)
mask = build_mask(y_size, x_size, device)

assert util.equal(masked_softmax(input, mask, dim), trident.function.masked_softmax(input, mask, dim))


@pytest.mark.parametrize("y_size, x_size, dim", [(3, 1000, 0), (2, 512, 1)])
def test_backward(y_size, x_size, dim, device):
input = torch.randn(y_size, x_size, device=device)
mask = build_mask(y_size, x_size, device)
grad_output = torch.randn(y_size, x_size, device=device)

def train(func):
i = input.clone()
i.requires_grad = True
func(i, mask, dim).backward(grad_output, retain_graph=True)
return (i.grad,)

(x,) = train(masked_softmax)
(a,) = train(trident.function.masked_softmax)

assert util.equal(x, a)


@pytest.mark.parametrize("y_size, x_size, dim", [(1, 32, 1)])
def test_masked_softmax(y_size, x_size, dim, device, dtype):
factory_kwargs = {"device": device, "dtype": dtype}
input = torch.randn(y_size, x_size, **factory_kwargs, requires_grad=True)
mask = build_mask(y_size, x_size, device)
grad_output = torch.randn_like(input)

output = trident.MaskedSoftmax(dim).forward(input, mask)

assert output is not None
assert output.dtype == dtype

output.backward(grad_output)

assert input.grad is not None
assert input.grad.dtype == dtype
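
For intuition about what test_backward checks: the vector-Jacobian product of softmax along dim is y * (g - sum(g * y)), and masked positions contribute nothing because their output is exactly zero. A minimal sketch of that reference math (not Trident's actual kernel):

import torch


def softmax_vjp(grad_output: torch.Tensor, output: torch.Tensor, dim: int) -> torch.Tensor:
    # standard softmax backward: dL/dx_i = y_i * (g_i - sum_j g_j * y_j);
    # masked positions have output == 0, so their gradient is zero as well
    return output * (grad_output - (grad_output * output).sum(dim, keepdim=True))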
8 changes: 5 additions & 3 deletions tests/test_softmax.py
@@ -31,14 +31,14 @@ def test_backward(y_size, x_size, dim, device):
input = torch.randn(y_size, x_size, device=device)
grad_output = torch.randn(y_size, x_size, device=device)

-    def train(func, dim):
+    def train(func):
i = input.clone()
i.requires_grad = True
func(i, dim).backward(grad_output, retain_graph=True)
return (i.grad,)

-    (x,) = train(torch.nn.functional.softmax, dim)
-    (a,) = train(trident.function.softmax, dim)
+    (x,) = train(torch.nn.functional.softmax)
+    (a,) = train(trident.function.softmax)

assert util.equal(x, a)

@@ -50,9 +50,11 @@ def test_softmax(y_size, x_size, dim, device, dtype):
grad_output = torch.randn_like(input)

output = trident.Softmax(dim).forward(input)

assert output is not None
assert output.dtype == dtype

output.backward(grad_output)

assert input.grad is not None
assert input.grad.dtype == dtype
9 changes: 9 additions & 0 deletions trident/function/function.py
@@ -163,6 +163,15 @@ def linear(
return operation.Linear.apply(input, weight, bias, use_accelerator)


def masked_softmax(input: torch.Tensor, mask: torch.Tensor, dim: int):
"""
    Applies masked softmax to an input, rescaling it so that the output lies in the range [0, 1] and sums to 1 along the given dimension.
See MaskedSoftmax for more details.
"""
return operation.MaskedSoftmax.apply(input, mask, dim)


def max(input: torch.Tensor, dim: int):
"""
Returns the max along the specified dimension in an input.
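
A minimal usage sketch for the new function, assuming a CUDA device (shapes and values here are illustrative):

import torch

import trident

input = torch.randn(4, 128, device="cuda")
mask = torch.randint(0, 2, (4, 128), device="cuda")
mask[:, 0] = 0  # keep at least one unmasked element per row, as the tests do

output = trident.function.masked_softmax(input, mask, 1)
# each row of output sums to 1 over the unmasked positions along dim 1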
1 change: 1 addition & 0 deletions trident/kernel/__init__.py
Expand Up @@ -24,6 +24,7 @@
from .layer_norm import *
from .leaky_relu import *
from .linear import *
from .masked_softmax import *
from .max import *
from .mean import *
from .prelu import *
(4 more changed files not shown.)