ROCm · hliuca · Nov 20, 2024 · Nov 20, 2024 · Nov 20, 2024 · Nov 24, 2024
diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
@@ -93,20 +93,21 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
 
 // Launch activation and gating kernel.
 #ifdef USE_ROCM
-#define LAUNCH_SCALED_ACTIVATION_GATE_KERNEL(KERNEL)                           \
-  int d = input.size(-1) / 2;                                                  \
-  int64_t num_tokens = input.numel() / input.size(-1);                         \
-  dim3 grid(num_tokens);                                                       \
-  dim3 block(std::min(d, 1024));                                               \
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));            \
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();                \
-  VLLM_DISPATCH_FLOATING_TYPES(                                                \
-      input.scalar_type(), "scaled_act_and_mul_kernel", [&] {                  \
-        vllm::scaled_act_and_mul_kernel<scalar_t, KERNEL<scalar_t>>            \
-            <<<grid, block, 0, stream>>>(out.data_ptr<c10::Float8_e4m3fnuz>(), \
-                                         input.data_ptr<scalar_t>(), d,        \
-                                         1.0 / (*scale.data_ptr<float>()));    \
-      });
+  #define LAUNCH_SCALED_ACTIVATION_GATE_KERNEL(KERNEL)                \
+    int d = input.size(-1) / 2;                                       \
+    int64_t num_tokens = input.numel() / input.size(-1);              \
+    dim3 grid(num_tokens);                                            \
+    dim3 block(std::min(d, 1024));                                    \
+    const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
+    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();     \
+    VLLM_DISPATCH_FLOATING_TYPES(                                     \
+        input.scalar_type(), "scaled_act_and_mul_kernel", [&] {       \
+          vllm::scaled_act_and_mul_kernel<scalar_t, KERNEL<scalar_t>> \
+              <<<grid, block, 0, stream>>>(                           \
+                  out.data_ptr<c10::Float8_e4m3fnuz>(),               \
+                  input.data_ptr<scalar_t>(), d,                      \
+                  1.0 / (*scale.data_ptr<float>()));                  \
+        });
 #endif
 
 void silu_and_mul(torch::Tensor& out,    // [..., d]

diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu
@@ -247,7 +247,7 @@ void rms_norm(torch::Tensor& out,     // [..., hidden_size]
     LAUNCH_RMS_NORM(0);
   }
 #else
-    LAUNCH_RMS_NORM(0);
+  LAUNCH_RMS_NORM(0);
 #endif
 }
 

diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
@@ -218,12 +218,6 @@
             max_encoder_seq_len=self.max_encoder_seq_len,
             cross_slot_mapping=self.cross_slot_mapping,
             cross_block_tables=self.cross_block_tables)
-        # Batch may be composed of prefill|decodes, adjust query start indices
-        # to refer to the start of decodes when the two are split apart.
-        # E.g. in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6].
-        if self._cached_decode_metadata.query_start_loc is not None:
-            qs = self._cached_decode_metadata.query_start_loc
-            self._cached_decode_metadata.query_start_loc = qs - qs[0]
         return self._cached_decode_metadata
 
     def advance_step(self,
@@ -459,10 +453,12 @@
         if blocksparse_params is not None:
             raise ValueError(
                 "ROCmFlashAttention does not support blocksparse attention.")
-        if logits_soft_cap is not None:
-            raise ValueError(
-                "ROCmFlashAttention does not support attention logits soft "
-                "capping.")
+
+        if logits_soft_cap is None:
+            # In flash-attn, setting logits_soft_cap as 0 means no soft cap.
+            logits_soft_cap = 0
+        self.logits_soft_cap = logits_soft_cap
+
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
@@ -487,6 +483,14 @@
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
         self.use_triton_flash_attn = envs.VLLM_USE_TRITON_FLASH_ATTN
         if self.use_triton_flash_attn:
+            if logits_soft_cap is not None:
+                raise ValueError(
+                    "ROCm Triton FlashAttention does not support attention"
+                    "logits soft capping."
+                    " please try using the ROCm CK "
+                    "FA backend instead by setting the env var "
+                    "`VLLM_USE_TRITON_FLASH_ATTN=0`")
+
             from vllm.attention.ops.triton_flash_attention import (  # noqa: F401
                 triton_attention)
             self.attn_func = triton_attention
@@ -505,12 +509,17 @@
            else:
                try:
                    from flash_attn import flash_attn_varlen_func  # noqa: F401
                    self.attn_func = flash_attn_varlen_func
                    logger.debug("Using CK FA in ROCmBackend")
                except ModuleNotFoundError:
                     self.use_naive_attn = True
 
             if self.use_naive_attn:
+                if logits_soft_cap is not None:
+                    raise ValueError(
+                        "ROCm Naive FlashAttention does not support"
+                        "attention logits soft capping.")
+
                 self.attn_func = _sdpa_attention
                 logger.debug("Using naive (SDPA) attention in ROCmBackend")
 
@@ -662,7 +671,7 @@
                            query.dtype,
                            seq_lens,
                            make_attn_mask=False)  # type: ignore
                    out, _ = self.attn_func(
                        query,
                        key,
                        value,
@@ -691,7 +700,7 @@
                    key = key.movedim(0, key.dim() - 2)
                    value = value.movedim(0, value.dim() - 2)
                    # sdpa math backend attention
                    out = self.attn_func(
                        query,
                        key,
                        value,
@@ -704,7 +713,7 @@
                        attn_masks,
                    )
                else:
                    out = self.attn_func(
                        q=query,
                        k=key,
                        v=value,
@@ -716,6 +725,7 @@
                         causal=True,
                         window_size=self.sliding_window,
                         alibi_slopes=self.alibi_slopes,
+                        softcap=self.logits_soft_cap,
                     )
 
                 # common code for prefill