Skip to content

Commit 83aee5c

Browse files
committed
Add explicit GAP (avg pool) variants of other SigLIP models.
1 parent f04802f commit 83aee5c

File tree

1 file changed

+99
-0
lines changed

1 file changed

+99
-0
lines changed

timm/models/vision_transformer.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1778,6 +1778,35 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
17781778
input_size=(3, 384, 384),
17791779
num_classes=0),
17801780

1781+
'vit_base_patch16_siglip_gap_224.webli': _cfg(
1782+
hf_hub_id='timm/ViT-B-16-SigLIP',
1783+
hf_hub_filename='open_clip_pytorch_model.bin',
1784+
num_classes=0),
1785+
'vit_base_patch16_siglip_gap_256.webli': _cfg(
1786+
hf_hub_id='timm/ViT-B-16-SigLIP-256',
1787+
hf_hub_filename='open_clip_pytorch_model.bin',
1788+
input_size=(3, 256, 256),
1789+
num_classes=0),
1790+
'vit_base_patch16_siglip_gap_384.webli': _cfg(
1791+
hf_hub_id='timm/ViT-B-16-SigLIP-384',
1792+
hf_hub_filename='open_clip_pytorch_model.bin',
1793+
input_size=(3, 384, 384),
1794+
num_classes=0),
1795+
'vit_base_patch16_siglip_gap_512.webli': _cfg(
1796+
hf_hub_id='timm/ViT-B-16-SigLIP-512',
1797+
hf_hub_filename='open_clip_pytorch_model.bin',
1798+
input_size=(3, 512, 512),
1799+
num_classes=0),
1800+
'vit_large_patch16_siglip_gap_256.webli': _cfg(
1801+
hf_hub_id='timm/ViT-L-16-SigLIP-256',
1802+
hf_hub_filename='open_clip_pytorch_model.bin',
1803+
input_size=(3, 256, 256),
1804+
num_classes=0),
1805+
'vit_large_patch16_siglip_gap_384.webli': _cfg(
1806+
hf_hub_id='timm/ViT-L-16-SigLIP-384',
1807+
hf_hub_filename='open_clip_pytorch_model.bin',
1808+
input_size=(3, 384, 384),
1809+
num_classes=0),
17811810
'vit_so400m_patch14_siglip_gap_224.webli': _cfg(
17821811
hf_hub_id='timm/ViT-SO400M-14-SigLIP',
17831812
hf_hub_filename='open_clip_pytorch_model.bin',
@@ -2803,8 +2832,75 @@ def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionT
28032832
return model
28042833

28052834

2835+
@register_model
def vit_base_patch16_siglip_gap_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
    """ViT-B/16 SigLIP @ 224px using global average pooling (GAP) rather than attention pooling (MAP)."""
    # Base defaults for the ViT-B/16 SigLIP GAP configuration; caller kwargs take precedence.
    defaults = dict(
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        class_token=False,
        global_pool='avg',
        fc_norm=False,
    )
    defaults.update(kwargs)
    return _create_vision_transformer(
        'vit_base_patch16_siglip_gap_224', pretrained=pretrained, **defaults)
2844+
2845+
2846+
@register_model
def vit_base_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
    """ViT-B/16 SigLIP @ 256px using global average pooling (GAP) rather than attention pooling (MAP)."""
    # Base defaults for the ViT-B/16 SigLIP GAP configuration; caller kwargs take precedence.
    defaults = dict(
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        class_token=False,
        global_pool='avg',
        fc_norm=False,
    )
    defaults.update(kwargs)
    return _create_vision_transformer(
        'vit_base_patch16_siglip_gap_256', pretrained=pretrained, **defaults)
2855+
2856+
2857+
@register_model
def vit_base_patch16_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
    """ViT-B/16 SigLIP @ 384px using global average pooling (GAP) rather than attention pooling (MAP)."""
    # Base defaults for the ViT-B/16 SigLIP GAP configuration; caller kwargs take precedence.
    defaults = dict(
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        class_token=False,
        global_pool='avg',
        fc_norm=False,
    )
    defaults.update(kwargs)
    return _create_vision_transformer(
        'vit_base_patch16_siglip_gap_384', pretrained=pretrained, **defaults)
2866+
2867+
2868+
@register_model
def vit_base_patch16_siglip_gap_512(pretrained: bool = False, **kwargs) -> VisionTransformer:
    """ViT-B/16 SigLIP @ 512px using global average pooling (GAP) rather than attention pooling (MAP)."""
    # Base defaults for the ViT-B/16 SigLIP GAP configuration; caller kwargs take precedence.
    defaults = dict(
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        class_token=False,
        global_pool='avg',
        fc_norm=False,
    )
    defaults.update(kwargs)
    return _create_vision_transformer(
        'vit_base_patch16_siglip_gap_512', pretrained=pretrained, **defaults)
2877+
2878+
2879+
@register_model
def vit_large_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
    """ViT-L/16 SigLIP @ 256px using global average pooling (GAP) rather than attention pooling (MAP)."""
    # Base defaults for the ViT-L/16 SigLIP GAP configuration; caller kwargs take precedence.
    defaults = dict(
        patch_size=16,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        class_token=False,
        global_pool='avg',
        fc_norm=False,
    )
    defaults.update(kwargs)
    return _create_vision_transformer(
        'vit_large_patch16_siglip_gap_256', pretrained=pretrained, **defaults)
2888+
2889+
2890+
@register_model
def vit_large_patch16_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
    """ViT-L/16 SigLIP @ 384px using global average pooling (GAP) rather than attention pooling (MAP)."""
    # Base defaults for the ViT-L/16 SigLIP GAP configuration; caller kwargs take precedence.
    defaults = dict(
        patch_size=16,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        class_token=False,
        global_pool='avg',
        fc_norm=False,
    )
    defaults.update(kwargs)
    return _create_vision_transformer(
        'vit_large_patch16_siglip_gap_384', pretrained=pretrained, **defaults)
2899+
2900+
28062901
@register_model
28072902
def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
2903+
""" A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
28082904
model_args = dict(
28092905
patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
28102906
class_token=False, global_pool='avg', fc_norm=False,
@@ -2816,6 +2912,7 @@ def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> Vis
28162912

28172913
@register_model
28182914
def vit_so400m_patch14_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
2915+
""" A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
28192916
model_args = dict(
28202917
patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
28212918
class_token=False, global_pool='avg', fc_norm=False,
@@ -2827,6 +2924,7 @@ def vit_so400m_patch14_siglip_gap_384(pretrained: bool = False, **kwargs) -> Vis
28272924

28282925
@register_model
28292926
def vit_so400m_patch14_siglip_gap_448(pretrained: bool = False, **kwargs) -> VisionTransformer:
2927+
""" A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
28302928
model_args = dict(
28312929
patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
28322930
class_token=False, global_pool='avg', fc_norm=False,
@@ -2838,6 +2936,7 @@ def vit_so400m_patch14_siglip_gap_448(pretrained: bool = False, **kwargs) -> Vis
28382936

28392937
@register_model
28402938
def vit_so400m_patch14_siglip_gap_896(pretrained: bool = False, **kwargs) -> VisionTransformer:
2939+
""" A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
28412940
model_args = dict(
28422941
patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
28432942
class_token=False, global_pool='avg', fc_norm=False,

0 commit comments

Comments
 (0)