Upgrade Transformers to v4.45.x (#751)
Changes:
- add SDPA attention to RoBERTa, ALBERT, MBart, XLM-R (a usage sketch follows the changed-file summary below)
- re-copy GPT-J to follow the upstream Cache-based key/value handling (a sketch of that API follows the GPT-J diff below)

---------

Co-authored-by: Leon Engländer <[email protected]>
calpt and lenglaender authored Nov 2, 2024
1 parent: bcace97 · commit: 3a6cec3
Showing 7 changed files with 610 additions and 21 deletions.
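In practical terms, the adapter-patched classes can now run under Transformers' SDPA attention path. A minimal usage sketch, assuming a placeholder checkpoint and adapter name that are not part of this commit:

import torch
import adapters
from transformers import AutoModel

# Load one of the affected models with the SDPA attention implementation.
# Passing attn_implementation="eager" instead forces the original attention
# path, which is still needed for output_attentions=True or head masking.
model = AutoModel.from_pretrained(
    "albert-base-v2",                 # placeholder checkpoint
    attn_implementation="sdpa",
    torch_dtype=torch.float32,
)

# Add adapter support and a bottleneck adapter via the adapters library.
adapters.init(model)
model.add_adapter("demo_adapter", config="seq_bn")   # placeholder adapter name
model.set_active_adapters("demo_adapter")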
2 changes: 1 addition & 1 deletion hf_transformers
Submodule hf_transformers updated 972 files
9 changes: 5 additions & 4 deletions setup.py
@@ -21,9 +21,10 @@
# We try to follow their general layout wherever sensible.

_deps = [
"accelerate>=0.21.0",
"accelerate>=0.26.0",
"beautifulsoup4",
"black~=24.4.0",
"dataclasses",
"datasets!=2.5.0",
"dill<0.3.5",
"docutils==0.16.0",
@@ -38,7 +39,7 @@
"protobuf",
"psutil",
"pytest>=7.2.0,<8.0.0",
"pytest-subtests",
"pytest-rich",
"pytest-timeout",
"pytest-xdist",
"markupsafe==2.0.1",
@@ -58,7 +59,7 @@
"sphinx-multiversion==0.2.4",
"timeout-decorator",
"torch",
"transformers~=4.44.0",
"transformers~=4.45.2",
]


@@ -84,7 +85,7 @@ def deps_list(*pkgs):
extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
extras["testing"] = deps_list(
"pytest",
"pytest-subtests",
"pytest-rich",
"pytest-xdist",
"timeout-decorator",
"parameterized",
74 changes: 73 additions & 1 deletion src/adapters/models/albert/modeling_albert.py
@@ -20,14 +20,18 @@
import torch
from torch import nn

from transformers.models.albert.modeling_albert import AlbertAttention, AlbertLayer
from transformers.models.albert.modeling_albert import AlbertAttention, AlbertLayer, AlbertSdpaAttention
from transformers.pytorch_utils import apply_chunking_to_forward
from transformers.utils import logging

from ...composition import adjust_tensors_for_parallel, match_attn_matrices_for_parallel
from ...utils import prefix_attention_mask
from .mixin_albert import AlbertAttentionAdaptersMixin, AlbertEncoderLayerAdaptersMixin


logger = logging.get_logger(__name__)


class AlbertAttentionWithAdapters(AlbertAttentionAdaptersMixin, AlbertAttention):
def forward(
self,
@@ -101,6 +105,74 @@ def forward(
return (layernormed_context_layer, attention_probs) if output_attentions else (layernormed_context_layer,)


class AlbertSdpaAttentionWithAdapters(AlbertAttentionAdaptersMixin, AlbertSdpaAttention):
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
output_attentions: bool = False,
) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
# >>> START AH Changes <<<
attention_mask = prefix_attention_mask(attention_mask, [2, 3]) # type: ignore
# >>> END AH Changes <<<

if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None:
logger.warning(
"AlbertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support "
"non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to "
"the eager attention implementation, but specifying the eager implementation will be required from "
"Transformers version v5.0.0 onwards. This warning can be removed using the argument "
'`attn_implementation="eager"` when loading the model.'
)
return super().forward(hidden_states, attention_mask, head_mask, output_attentions)

batch_size, seq_len, _ = hidden_states.size()
query_layer = self.transpose_for_scores(self.query(hidden_states))
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))

# >>> START AH Changes <<<
query_layer, key_layer, value_layer = match_attn_matrices_for_parallel(query_layer, key_layer, value_layer)
(attention_mask,) = adjust_tensors_for_parallel(query_layer, attention_mask)

key_layer, value_layer, attention_mask = self.prefix_tuning(
key_layer, value_layer, hidden_states, attention_mask
)
(query_layer,) = adjust_tensors_for_parallel(key_layer, query_layer)
batch_size = query_layer.size(0)
# >>> END AH Changes <<<

# SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
# attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
# Reference: https://github.com/pytorch/pytorch/issues/112577
if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None:
query_layer = query_layer.contiguous()
key_layer = key_layer.contiguous()
value_layer = value_layer.contiguous()

attention_output = torch.nn.functional.scaled_dot_product_attention(
query=query_layer,
key=key_layer,
value=value_layer,
attn_mask=attention_mask,
dropout_p=self.dropout_prob if self.training else 0.0,
is_causal=False,
)

attention_output = attention_output.transpose(1, 2)
attention_output = attention_output.reshape(batch_size, seq_len, self.all_head_size)

projected_context_layer = self.dense(attention_output)
projected_context_layer_dropout = self.output_dropout(projected_context_layer)

layernormed_context_layer = self.attention_adapters(
hidden_states, projected_context_layer_dropout, self.LayerNorm
)

return (layernormed_context_layer,)


class AlbertLayerWithAdapters(AlbertEncoderLayerAdaptersMixin, AlbertLayer):
def forward(
self,
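For reference, the heart of the new Albert SDPA path above is a single call to torch.nn.functional.scaled_dot_product_attention. A minimal standalone sketch with made-up shapes (not taken from this commit) illustrating that call and the reshape that follows it:

import torch
import torch.nn.functional as F

batch_size, num_heads, seq_len, head_dim = 2, 12, 16, 64

# Projected query/key/value in (batch, heads, seq, head_dim) layout,
# matching the transpose_for_scores output used in the class above.
query = torch.randn(batch_size, num_heads, seq_len, head_dim)
key = torch.randn(batch_size, num_heads, seq_len, head_dim)
value = torch.randn(batch_size, num_heads, seq_len, head_dim)

# Additive attention mask: 0 for visible positions, a large negative value
# for masked ones, broadcastable to (batch, heads, seq, seq).
attn_mask = torch.zeros(batch_size, 1, 1, seq_len)

context = F.scaled_dot_product_attention(
    query=query,
    key=key,
    value=value,
    attn_mask=attn_mask,
    dropout_p=0.0,
    is_causal=False,
)

# (batch, heads, seq, head_dim) -> (batch, seq, heads * head_dim)
context = context.transpose(1, 2).reshape(batch_size, seq_len, num_heads * head_dim)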
28 changes: 15 additions & 13 deletions src/adapters/models/gptj/modeling_gptj.py
@@ -19,6 +19,7 @@
import torch
import torch.utils.checkpoint

from transformers.cache_utils import Cache
from transformers.models.gptj.modeling_gptj import GPTJAttention, GPTJBlock, apply_rotary_pos_emb, get_embed_positions
from transformers.utils.import_utils import is_torch_fx_proxy

Expand All @@ -30,12 +31,13 @@ class GPTJAttentionWithAdapters(GPTJAttentionAdaptersMixin, GPTJAttention):
def forward(
self,
hidden_states: torch.FloatTensor,
layer_past: Optional[Tuple[torch.Tensor]] = None,
layer_past: Optional[Cache] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Union[
Tuple[torch.Tensor, Tuple[torch.Tensor]],
Optional[Tuple[torch.Tensor, Tuple[torch.Tensor], Tuple[torch.Tensor, ...]]],
@@ -82,15 +84,13 @@ def forward(
query = query.permute(0, 2, 1, 3)

if layer_past is not None:
past_key = layer_past[0]
past_value = layer_past[1]
key = torch.cat((past_key, key), dim=-2)
value = torch.cat((past_value, value), dim=-2)

if use_cache is True:
present = (key, value)
else:
present = None
cache_kwargs = {
"sin": sin,
"cos": cos,
"partial_rotation_size": self.rotary_dim,
"cache_position": cache_position,
}
key, value = layer_past.update(key, value, self.layer_idx, cache_kwargs)

key, value, attention_mask = self.prefix_tuning(key, value, hidden_states, attention_mask)
(query,) = adjust_tensors_for_parallel(key, query)
@@ -102,7 +102,7 @@ def forward(
attn_output = self.out_proj(attn_output)
attn_output = self.resid_dropout(attn_output)

outputs = (attn_output, present)
outputs = (attn_output, layer_past)
if output_attentions:
outputs += (attn_weights,)

@@ -113,24 +113,26 @@ class GPTJBlockWithAdapters(GPTJDecoderBlockAdaptersMixin, GPTJBlock):
def forward(
self,
hidden_states: Optional[torch.FloatTensor],
layer_past: Optional[Tuple[torch.Tensor]] = None,
layer_past: Optional[Cache] = None,
attention_mask: Optional[torch.FloatTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = False,
output_attentions: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
adjust_tensors_for_parallel_(hidden_states, attention_mask)
residual = hidden_states
hidden_states = self.ln_1(hidden_states)
attn_outputs = self.attn(
hidden_states,
hidden_states=hidden_states,
layer_past=layer_past,
attention_mask=attention_mask,
position_ids=position_ids,
head_mask=head_mask,
use_cache=use_cache,
output_attentions=output_attentions,
cache_position=cache_position,
)
attn_output = attn_outputs[0] # output_attn: a, present, (attentions)
outputs = attn_outputs[1:]
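The re-copied GPT-J attention above replaces the old (past_key, past_value) tuples with the Transformers Cache API: layer_past.update(...) both stores the new key/value states and returns the concatenated ones. A rough standalone sketch of that API with dummy shapes (not part of this diff):

import torch
from transformers.cache_utils import DynamicCache

cache = DynamicCache()   # a concrete Cache implementation
layer_idx = 0

# First step: key/value states for 4 tokens are stored and returned unchanged.
k0 = torch.randn(1, 16, 4, 64)   # dummy (batch, heads, seq, head_dim) shapes
v0 = torch.randn(1, 16, 4, 64)
key, value = cache.update(k0, v0, layer_idx)
print(key.shape)   # torch.Size([1, 16, 4, 64])

# Second step: one new token is appended along the sequence dimension,
# which is what the manual torch.cat on the old tuples used to do.
k1 = torch.randn(1, 16, 1, 64)
v1 = torch.randn(1, 16, 1, 64)
key, value = cache.update(k1, v1, layer_idx)
print(key.shape)   # torch.Size([1, 16, 5, 64])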
(diffs for the remaining 3 changed files not shown)
