diff --git a/rl4co/models/nn/attention.py b/rl4co/models/nn/attention.py
index b65169f0..0dfa5973 100644
--- a/rl4co/models/nn/attention.py
+++ b/rl4co/models/nn/attention.py
@@ -19,7 +19,8 @@ def scaled_dot_product_attention_simple(
     q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False
 ):
-    """Simple Scaled Dot-Product Attention in PyTorch without Flash Attention"""
+    """Simple (exact) Scaled Dot-Product Attention in RL4CO without customized kernels (i.e. no Flash Attention)."""
+
     # Check for causal and attn_mask conflict
     if is_causal and attn_mask is not None:
         raise ValueError("Cannot set both is_causal and attn_mask")
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index a2494e80..c0f6041a 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,7 +2,9 @@
 import torch
 
 from tensordict import TensorDict
+from torch.nn.functional import scaled_dot_product_attention
 
+from rl4co.models.nn.attention import scaled_dot_product_attention_simple
 from rl4co.utils.decoding import process_logits
 from rl4co.utils.ops import batchify, unbatchify
 
@@ -35,3 +37,16 @@ def test_top_k_top_p_sampling(top_p, top_k):
     mask = torch.ones(8, 10).bool()
     logprobs = process_logits(logits, mask, top_p=top_p, top_k=top_k)
     assert len(logprobs) == logits.size(0)
+
+
+def test_scaled_dot_product_attention():
+    bs, ns, ds = 2, 3, 4
+    q = torch.rand(bs, ns, ds)
+    k = torch.rand(bs, ns, ds)
+    v = torch.rand(bs, ns, ds)
+    attn_mask = torch.rand(bs, ns, ns) > 0.5
+    attn_mask[:, 0, :] = True  # at least one row element is True
+    attn_mask[:, :, 0] = True  # at least one column element is True
+    attn_torch = scaled_dot_product_attention(q, k, v, attn_mask)
+    attn_rl4co = scaled_dot_product_attention_simple(q, k, v, attn_mask)
+    assert torch.allclose(attn_torch, attn_rl4co)
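
For context, a minimal sketch of the computation the new test compares against: exact softmax(q k^T / sqrt(d_k)) v attention with a boolean mask, using the convention of torch.nn.functional.scaled_dot_product_attention (True = attend, False = block). The name sdpa_reference and its exact form are illustrative only, not part of the patch or the RL4CO API.

import math

import torch


def sdpa_reference(q, k, v, attn_mask=None):
    """Illustrative exact attention: softmax(q k^T / sqrt(d_k)) v with an optional boolean mask."""
    # Scaled attention scores of shape [batch, n_queries, n_keys]
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))
    if attn_mask is not None:
        # Boolean mask convention of torch's scaled_dot_product_attention:
        # True keeps the score, False masks it out before the softmax.
        scores = scores.masked_fill(~attn_mask, float("-inf"))
    weights = torch.softmax(scores, dim=-1)
    return weights @ v

Note that the test forces at least one True per row and column of attn_mask; a fully masked row would leave the softmax with only -inf scores and produce NaNs, which would make the torch.allclose comparison meaningless.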