
Commit

attention re-use in lookup vit should use pre-softmax attention matrix
lucidrains committed Jul 20, 2024
1 parent 4b2c00c commit 9992a61
Showing 2 changed files with 13 additions and 12 deletions.
2 changes: 1 addition & 1 deletion setup.py
@@ -6,7 +6,7 @@
 setup(
   name = 'vit-pytorch',
   packages = find_packages(exclude=['examples']),
-  version = '1.7.3',
+  version = '1.7.4',
   license='MIT',
   description = 'Vision Transformer (ViT) - Pytorch',
   long_description=long_description,
23 changes: 12 additions & 11 deletions vit_pytorch/look_vit.py
@@ -99,8 +99,8 @@ def forward(
         self,
         x,
         context = None,
-        return_attn = False,
-        attn = None
+        return_qk_sim = False,
+        qk_sim = None
     ):
         x = self.norm(x)

@@ ... @@
             q, k = tuple(self.split_heads(t) for t in qk)

             q = q * self.scale
-            sim = einsum(q, k, 'b h i d, b h j d -> b h i j')
+            qk_sim = einsum(q, k, 'b h i d, b h j d -> b h i j')

-            attn = self.attend(sim)
-            attn = self.dropout(attn)
         else:
-            assert exists(attn), 'attention matrix must be passed in for reusing previous attention'
+            assert exists(qk_sim), 'qk sim matrix must be passed in for reusing previous attention'

+        attn = self.attend(qk_sim)
+        attn = self.dropout(attn)
+
         out = einsum(attn, v, 'b h i j, b h j d -> b h i d')
         out = self.to_out(out)

-        if not return_attn:
+        if not return_qk_sim:
             return out

-        return out, attn
+        return out, qk_sim

 # LookViT

@@ -228,17 +229,17 @@ def forward(self, img):

             # main tokens cross attends (lookup) on the high res tokens

-            lookup_out, lookup_attn = lookup_cross_attn(tokens, highres_tokens, return_attn = True) # return attention as they reuse the attention matrix
+            lookup_out, qk_sim = lookup_cross_attn(tokens, highres_tokens, return_qk_sim = True) # return attention as they reuse the attention matrix
             tokens = lookup_out + tokens

             tokens = attn(tokens) + tokens
             tokens = mlp(tokens) + tokens

             # attention-reuse

-            lookup_attn = rearrange(lookup_attn, 'b h i j -> b h j i') # transpose for reverse cross attention
+            qk_sim = rearrange(qk_sim, 'b h i j -> b h j i') # transpose for reverse cross attention

-            highres_tokens = highres_attn(highres_tokens, tokens, attn = lookup_attn) + highres_tokens
+            highres_tokens = highres_attn(highres_tokens, tokens, qk_sim = qk_sim) + highres_tokens
             highres_tokens = highres_norm(highres_tokens)

             highres_tokens = highres_mlp(highres_tokens) + highres_tokens
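
For context on why the pre-softmax matrix is the right thing to reuse: softmax normalizes each row of the attention matrix over the key dimension, so transposing the post-softmax attention does not give valid weights for the reverse cross-attention, whereas transposing the raw query-key similarities and applying softmax afterwards does. A minimal standalone sketch of the distinction (illustrative shapes and names only, not the library code):

import torch
from einops import einsum

# b = batch, h = heads, i = main (lookup) tokens, j = high-res tokens, d = head dim
b, h, i, j, d = 1, 2, 4, 6, 8
q = torch.randn(b, h, i, d)   # queries from the main tokens
k = torch.randn(b, h, j, d)   # keys from the high-res tokens

qk_sim = einsum(q, k, 'b h i d, b h j d -> b h i j')   # pre-softmax similarities

# forward cross-attention: softmax over the high-res (j) dimension
attn = qk_sim.softmax(dim = -1)

# naive reuse: transposing the post-softmax matrix gives rows that no longer sum to 1
bad_reverse = attn.transpose(-2, -1)
print(bad_reverse.sum(dim = -1)[0, 0])    # not a valid attention distribution

# what this commit switches to: transpose the pre-softmax similarities, then softmax
good_reverse = qk_sim.transpose(-2, -1).softmax(dim = -1)
print(good_reverse.sum(dim = -1)[0, 0])   # rows sum to 1 over the main-token (i) dimension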
