From 4415bd2296c31a5ddca05e34fadba8670ccaa983 Mon Sep 17 00:00:00 2001
From: qdavid1 <168590940+qdavid1@users.noreply.github.com>
Date: Tue, 17 Dec 2024 11:08:38 -0800
Subject: [PATCH] Update RoformerQKVLinear to support kv_state (#882)

---
 axlearn/common/attention.py      | 45 ++++++++++++++++++++++-----
 axlearn/common/attention_test.py | 53 ++++++++++++++++++++++++--------
 axlearn/common/lora.py           |  7 +++++
 3 files changed, 85 insertions(+), 20 deletions(-)

diff --git a/axlearn/common/attention.py b/axlearn/common/attention.py
index 26aceb797..37baf3d8b 100644
--- a/axlearn/common/attention.py
+++ b/axlearn/common/attention.py
@@ -867,6 +867,7 @@ def forward(
         *,
         key: Optional[Tensor] = None,
         value: Optional[Tensor] = None,
+        kv_state: Optional[Tensor] = None,
         time_step: Optional[Tensor] = None,
     ) -> BaseQKVLinear.Output:
         """Computes attention for the given query, key, value.
@@ -875,6 +876,12 @@
 
         See parent class for full docstring.
         """
+        if kv_state is not None:
+            raise ValueError(
+                "QKVLinear computes key and value projections "
+                "and does not expect external `kv_state`."
+            )
+
         key = query if key is None else key
         value = query if value is None else value
         q_proj = self.q_proj(query)
@@ -1019,6 +1026,7 @@ def forward(
         *,
         key: Optional[Tensor] = None,
         value: Optional[Tensor] = None,
+        kv_state: Optional[KVState] = None,
         time_step: Optional[Tensor] = None,
     ) -> BaseQKVLinear.Output:
         """Computes multi-head query, key, and value for the input query, key, value
@@ -1029,8 +1037,14 @@
         See parent class for full docstring.
 
         Raises:
-            ValueError: If key and value are not both set or both None.
+            ValueError: If key and value are not both set or both None; or if kv_state is not None.
         """
+        if kv_state is not None:
+            raise ValueError(
+                "FusedQKVLinear computes key and value projections "
+                "and does not expect external `kv_state`."
+            )
+
         with child_context("qkv_proj"):
             params = self.qkv_proj.parameters
             if key is None and value is None:
@@ -1111,12 +1125,18 @@ def forward(
         *,
         key: Optional[Tensor] = None,
         value: Optional[Tensor] = None,
+        kv_state: Optional[Tensor] = None,
         time_step: Optional[Tensor] = None,
     ) -> FusedQKVLinear.Output:
         """See FusedQKVLinear for full docstring.
 
         N.B. Only supports cases where key and value are both None.
         """
+        if kv_state is not None:
+            raise ValueError(
+                "FusedGroupedQKVLinear computes key and value projections "
+                "and does not expect external `kv_state`."
+            )
         if key is not None or value is not None:
             raise ValueError("Key and value should be both None.")
         cfg = self.config
@@ -1193,6 +1213,7 @@ def apply_rotary_position_embeddings(
     key: Tensor,
     value: Tensor,
     sinusoidal_pos: Tensor,
+    rotary_key: bool,
    rotary_value: bool,
 ) -> tuple[Tensor, Tensor, Tensor]:
     """This is a jax implementation (a copy) of the RoPE apply_rotary_position_embeddings.
@@ -1205,7 +1226,8 @@
         query: Query embeddings with shape [batch_size, seq_len, num_heads, dim].
         key: Key embeddings with shape [batch_size, seq_len, num_heads, dim].
         value: Value embeddings with shape [batch_size, seq_len, num_heads, dim].
         sinusoidal_pos: Rotary position embeddings with shape [batch_size, seq_len, 1, dim].
-        rotary_value: Whether to apply rotary position embeddings on value layer.
+        rotary_key: Whether to apply rotary position embeddings on key.
+        rotary_value: Whether to apply rotary position embeddings on value.
 
     Returns:
         A tuple of:
@@ -1226,9 +1248,13 @@
         jnp.stack([-query[..., 1::2], query[..., ::2]], axis=-1), query.shape
     )
     query = query * cos_pos + rotate_half_query * sin_pos
-    # rotate_half_key_layer [-k1,k0,-k3,k2......,-kd-1,kd-2]
-    rotate_half_key = jnp.reshape(jnp.stack([-key[..., 1::2], key[..., ::2]], axis=-1), key.shape)
-    key = key * cos_pos + rotate_half_key * sin_pos
+
+    if rotary_key:
+        # rotate_half_key_layer [-k1,k0,-k3,k2......,-kd-1,kd-2]
+        rotate_half_key = jnp.reshape(
+            jnp.stack([-key[..., 1::2], key[..., ::2]], axis=-1), key.shape
+        )
+        key = key * cos_pos + rotate_half_key * sin_pos
     if rotary_value:
         # rotate_half_value_layer [-v1,v0,-v3,v2......,-vd-1,vd-2]
         rotate_half_value = jnp.reshape(
@@ -1252,6 +1278,7 @@ class Config(BaseQKVLinear.Config):
             RoFormerSinusoidalPositionalEmbedding.default_config()
         )
         input_linear: BaseQKVLinear.Config = QKVLinear.default_config()
+        # Whether to apply RoPE rotations to the value embeddings.
         rotary_value: Required[bool] = REQUIRED
 
     def __init__(self, cfg: QKVLinear.Config, *, parent: Module):
@@ -1283,23 +1310,27 @@ def forward(
         *,
         key: Optional[Tensor] = None,
         value: Optional[Tensor] = None,
+        kv_state: Optional[KVState] = None,
         time_step: Optional[Tensor] = None,
     ) -> BaseQKVLinear.Output:
         cfg = self.config
         # Query should have shape of [batch_size, seq_len, num_heads, per_head_dim].
-        query, key, value = self.i_proj(query, key=key, value=value)
+        query, key, value = self.i_proj(query, key=key, value=value, kv_state=kv_state)
         query_pos = jnp.arange(query.shape[1])[None]  # [batch_size=1, seq_len].
         if time_step is not None:
             query_pos = query_pos + time_step[:, None]  # [batch_size, seq_len].
         sinusoidal_pos_emb = self.rope_pos_emb_layer.forward(query_pos).astype(query.dtype)
         # sinusoidal_pos_emb shape should be [batch_size, seq_len, 1, dim]
         sinusoidal_pos_emb = jnp.expand_dims(sinusoidal_pos_emb, 2)
+
+        i_proj_computes_kv = kv_state is None
         query, key, value = apply_rotary_position_embeddings(
             sinusoidal_pos=sinusoidal_pos_emb,
             query=query,
             key=key,
             value=value,
-            rotary_value=cfg.rotary_value,
+            rotary_key=i_proj_computes_kv,
+            rotary_value=i_proj_computes_kv and cfg.rotary_value,
         )
         return self.Output(query, key, value)
diff --git a/axlearn/common/attention_test.py b/axlearn/common/attention_test.py
index 5d4aeb623..1e188ecc0 100644
--- a/axlearn/common/attention_test.py
+++ b/axlearn/common/attention_test.py
@@ -737,18 +737,24 @@ def test_alibi_attention_mask(self):
 class RoFormerSinusoidalPositionalEmbeddingTest(TestCase):
     """Tests RoFormerSinusoidalPositionalEmbedding."""
 
-    @parameterized.parameters(
-        (2, 3, 10, 32, True),
-        (2, 3, 8, 32, False),
-        (2, 4, 6, 32, True),
-        (2, 4, 8, 16, False),
-        (2, 5, 8, 48, True),
-        (2, 5, 8, 64, False),
+    @parameterized.product(
+        tensor_dimensions=(
+            (2, 3, 10, 32),
+            (2, 3, 8, 32),
+            (2, 4, 6, 32),
+            (2, 4, 8, 16),
+            (2, 5, 8, 48),
+            (2, 5, 8, 64),
+        ),
+        rotary_key=(True, False),
+        rotary_value=(True, False),
     )
     def test_apply_rotary_position_embeddings(
-        self, batch_size, num_heads, max_len, dim, rotary_value
+        self, tensor_dimensions: tuple[int, int, int, int], rotary_key: bool, rotary_value: bool
     ):
         # Unittest against the apply_rotary_position_embeddings in HF.
+        batch_size, num_heads, max_len, dim = tensor_dimensions
+
         token_ids = np.random.randint(low=1, high=20, size=[batch_size, max_len])
         sinusoidal_pos_layer = hf_roformer.RoFormerSinusoidalPositionalEmbedding(max_len, dim)
         sinusoidal_pos = sinusoidal_pos_layer(as_torch_tensor(token_ids).shape)[None, None, :, :]
@@ -771,11 +777,15 @@
             sinusoidal_pos, as_torch_tensor(query), as_torch_tensor(key)
         )
         ref_v_proj = as_torch_tensor(value)
+        if not rotary_key:
+            ref_k_proj = as_torch_tensor(key)
+
         test_q_proj, test_k_proj, test_v_proj = test_layer(
             sinusoidal_pos=as_tensor(sinusoidal_pos),
             query=query,
             key=key,
             value=value,
+            rotary_key=rotary_key,
             rotary_value=rotary_value,
         )
         np.testing.assert_allclose(test_q_proj, ref_q_proj, atol=5e-7)
@@ -1128,6 +1138,7 @@ def test_against_llama_for_apply_rotary_emb(self):
             key=jnp.asarray(key),
             value=jnp.asarray(value),
             sinusoidal_pos=axlearn_rope,
+            rotary_key=True,
             rotary_value=False,
         )
 
@@ -1382,11 +1393,22 @@
         layer = cfg.instantiate(parent=None)
         self.assertEqual(expected, layer.num_kv_heads)
 
-    def test_qlinear(self):
+    @parameterized.parameters(
+        (QKVLinear.default_config(), QLinear.default_config()),
+        (
+            RoFormerQKVLinear.default_config().set(
+                input_linear=QKVLinear.default_config(), rotary_value=False
+            ),
+            RoFormerQKVLinear.default_config().set(
+                input_linear=QLinear.default_config(), rotary_value=False
+            ),
+        ),
+    )
+    def test_qlinear(self, base_cfg, test_cfg):
         """Tests that QLinear is equivalent to QKVLinear with the same kv_state."""
         with utils.numeric_checks(True):
             model_dim = 12
-            num_heads = 4
+            num_heads = 3
             per_head_dim = model_dim // num_heads
             layer_kwargs = dict(
                 query_dim=model_dim,
@@ -1395,8 +1417,8 @@
                 num_heads=num_heads,
                 per_head_dim=per_head_dim,
             )
-            base_cfg = QKVLinear.default_config().set(**layer_kwargs)
-            test_cfg = QLinear.default_config().set(**layer_kwargs)
+            base_cfg = base_cfg.set(**layer_kwargs)
+            test_cfg = test_cfg.set(**layer_kwargs)
             maybe_set_config(test_cfg, num_kv_heads=num_heads)
             base_layer = base_cfg.set(name="base").instantiate(parent=None)
             test_layer = test_cfg.set(name="test").instantiate(parent=None)
@@ -1404,7 +1426,12 @@
             # Construct base layer state.
             base_state = base_layer.initialize_parameters_recursively(jax.random.PRNGKey(0))
             # Map state to QLinear.
-            test_state = {"q_proj": base_state["q_proj"]}
+            if "q_proj" in base_state:
+                test_state = {"q_proj": base_state["q_proj"]}
+            elif "i_proj" in base_state:
+                test_state = {"i_proj": {"q_proj": base_state["i_proj"]["q_proj"]}}
+            else:
+                raise ValueError("Cannot find expected q_proj state.")
 
             # Construct test inputs.
             batch_size, src_len, tgt_len = 2, 6, 6
diff --git a/axlearn/common/lora.py b/axlearn/common/lora.py
index b968f1548..199cef603 100644
--- a/axlearn/common/lora.py
+++ b/axlearn/common/lora.py
@@ -516,8 +516,15 @@ def forward(
         *,
         key: Optional[Tensor] = None,
         value: Optional[Tensor] = None,
+        kv_state: Optional[Tensor] = None,
         time_step: Optional[Tensor] = None,
     ) -> BaseQKVLinear.Output:
+        if kv_state is not None:
+            raise ValueError(
+                "LoraFusedQKVLinear computes key and value projections "
+                "and does not expect external `kv_state`."
+            )
+
         cfg = self.config
         if key is None and value is None:
             inputs = query
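
Reviewer note (illustration, not part of the patch): the change threads a new `rotary_key` flag through `apply_rotary_position_embeddings` so that `RoFormerQKVLinear` rotates only the projections its own `i_proj` computed. When an external `kv_state` supplies key and value (e.g. with `QLinear` as the input linear), both `rotary_key` and `rotary_value` are disabled, presumably because such a key is already position-encoded by the layer that produced it. The sketch below restates the gated rotate-half update in a minimal, self-contained form; `rotate_half`, `apply_rope`, and the toy shapes are illustrative names, not axlearn APIs.

    import jax.numpy as jnp

    def rotate_half(x: jnp.ndarray) -> jnp.ndarray:
        # Pairs adjacent features: [x0, x1, x2, x3, ...] -> [-x1, x0, -x3, x2, ...].
        return jnp.reshape(jnp.stack([-x[..., 1::2], x[..., ::2]], axis=-1), x.shape)

    def apply_rope(query, key, value, sinusoidal_pos, *, rotary_key: bool, rotary_value: bool):
        # sinusoidal_pos is [batch, seq, 1, dim]: first half sin(theta), second half
        # cos(theta), following the HF RoFormerSinusoidalPositionalEmbedding layout.
        sin, cos = jnp.split(sinusoidal_pos, 2, axis=-1)
        # Duplicate each angle so it lines up with the (even, odd) feature pairs.
        sin_pos = jnp.reshape(jnp.stack([sin, sin], axis=-1), sinusoidal_pos.shape)
        cos_pos = jnp.reshape(jnp.stack([cos, cos], axis=-1), sinusoidal_pos.shape)
        query = query * cos_pos + rotate_half(query) * sin_pos
        if rotary_key:
            # Skipped when `key` comes from an external kv_state that was already rotated.
            key = key * cos_pos + rotate_half(key) * sin_pos
        if rotary_value:
            value = value * cos_pos + rotate_half(value) * sin_pos
        return query, key, value

    # Toy usage: sin=0, cos=1 encodes the identity rotation, so outputs equal inputs.
    q = k = v = jnp.ones((2, 4, 3, 8))  # [batch, seq, heads, per_head_dim].
    pos = jnp.concatenate([jnp.zeros((2, 4, 1, 4)), jnp.ones((2, 4, 1, 4))], axis=-1)
    q2, k2, v2 = apply_rope(q, k, v, pos, rotary_key=False, rotary_value=False)
    assert bool(jnp.all(q2 == q)) and k2 is k  # Key passes through when rotary_key=False.

This mirrors why the forward pass computes `i_proj_computes_kv = kv_state is None` and passes it as `rotary_key` (and ANDs it with `cfg.rotary_value`): the rotation is applied exactly once, by whichever layer owns the key/value projections.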