@@ -1778,6 +1778,35 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
        input_size=(3, 384, 384),
        num_classes=0),

+    'vit_base_patch16_siglip_gap_224.webli': _cfg(
+        hf_hub_id='timm/ViT-B-16-SigLIP',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=0),
+    'vit_base_patch16_siglip_gap_256.webli': _cfg(
+        hf_hub_id='timm/ViT-B-16-SigLIP-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_base_patch16_siglip_gap_384.webli': _cfg(
+        hf_hub_id='timm/ViT-B-16-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 384, 384),
+        num_classes=0),
+    'vit_base_patch16_siglip_gap_512.webli': _cfg(
+        hf_hub_id='timm/ViT-B-16-SigLIP-512',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 512, 512),
+        num_classes=0),
+    'vit_large_patch16_siglip_gap_256.webli': _cfg(
+        hf_hub_id='timm/ViT-L-16-SigLIP-256',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 256, 256),
+        num_classes=0),
+    'vit_large_patch16_siglip_gap_384.webli': _cfg(
+        hf_hub_id='timm/ViT-L-16-SigLIP-384',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        input_size=(3, 384, 384),
+        num_classes=0),
    'vit_so400m_patch14_siglip_gap_224.webli': _cfg(
        hf_hub_id='timm/ViT-SO400M-14-SigLIP',
        hf_hub_filename='open_clip_pytorch_model.bin',
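Note (commentary, not part of the diff): these `default_cfgs` entries are what `timm.create_model` resolves when a pretrained tag is requested; the `hf_hub_id`/`hf_hub_filename` pair points at the open_clip SigLIP checkpoint on the Hugging Face Hub, and `num_classes=0` exposes the model as a headless feature extractor. A minimal sanity-check sketch, assuming a timm build that includes this patch:

```python
import torch
import timm

# The '.webli' tag picks the pretrained cfg above; the hub id/filename point at the
# open_clip SigLIP checkpoint, which timm remaps onto the image tower.
model = timm.create_model('vit_base_patch16_siglip_gap_256.webli', pretrained=True)
model.eval()

# num_classes=0 in the cfg means no classifier head: the forward pass returns pooled features.
x = torch.randn(1, 3, 256, 256)   # matches input_size=(3, 256, 256) from the cfg entry above
with torch.no_grad():
    feats = model(x)
print(feats.shape)                # expected: torch.Size([1, 768]) for the ViT-B tower
```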
@@ -2803,8 +2832,75 @@ def vit_so400m_patch14_siglip_384(pretrained: bool = False, **kwargs) -> VisionT
    return model


+@register_model
+def vit_base_patch16_siglip_gap_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_base_patch16_siglip_gap_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_base_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_base_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_base_patch16_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_base_patch16_siglip_gap_384', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_base_patch16_siglip_gap_512(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=768, depth=12, num_heads=12, class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_base_patch16_siglip_gap_512', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_large_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16, class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_large_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
+@register_model
+def vit_large_patch16_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
+    model_args = dict(
+        patch_size=16, embed_dim=1024, depth=24, num_heads=16, class_token=False, global_pool='avg', fc_norm=False,
+    )
+    model = _create_vision_transformer(
+        'vit_large_patch16_siglip_gap_384', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
@register_model
def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
    model_args = dict(
        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
        class_token=False, global_pool='avg', fc_norm=False,
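Commentary, not part of the diff: the new `_gap_` registrations differ from the existing SigLIP ones only in the pooling head (`class_token=False, global_pool='avg', fc_norm=False`), i.e. a plain mean over patch tokens instead of the attention-pooling (MAP) head. A quick shape check on this branch might look like:

```python
import torch
import timm

# Instantiate the new GAP registration without weights; num_classes=0 keeps it headless.
model = timm.create_model('vit_base_patch16_siglip_gap_224', pretrained=False, num_classes=0)
model.eval()

x = torch.randn(2, 3, 224, 224)
with torch.no_grad():
    tokens = model.forward_features(x)   # (2, 196, 768): 14x14 patch tokens, no class token
    pooled = model.forward_head(tokens)  # (2, 768): mean over patch tokens (global_pool='avg')

assert tokens.shape == (2, 196, 768)
assert pooled.shape == (2, 768)
```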
@@ -2816,6 +2912,7 @@ def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> Vis

@register_model
def vit_so400m_patch14_siglip_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
    model_args = dict(
        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
        class_token=False, global_pool='avg', fc_norm=False,
@@ -2827,6 +2924,7 @@ def vit_so400m_patch14_siglip_gap_384(pretrained: bool = False, **kwargs) -> Vis

@register_model
def vit_so400m_patch14_siglip_gap_448(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
    model_args = dict(
        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
        class_token=False, global_pool='avg', fc_norm=False,
@@ -2838,6 +2936,7 @@ def vit_so400m_patch14_siglip_gap_448(pretrained: bool = False, **kwargs) -> Vis

@register_model
def vit_so400m_patch14_siglip_gap_896(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
    model_args = dict(
        patch_size=14, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
        class_token=False, global_pool='avg', fc_norm=False,
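Commentary, not part of the diff: combined with the pretrained `.webli` tags added above, these GAP towers can be used directly as image embedders (1152-dim output for the SO400M variants). An illustrative sketch only; the image path is a placeholder and the preprocessing helpers are standard timm APIs:

```python
import torch
import timm
from PIL import Image

model = timm.create_model('vit_so400m_patch14_siglip_gap_384.webli', pretrained=True)
model.eval()

# Resolve the preprocessing implied by the pretrained cfg (input size, mean/std, interpolation).
cfg = timm.data.resolve_model_data_config(model)
transform = timm.data.create_transform(**cfg, is_training=False)

img = Image.open('example.jpg').convert('RGB')        # placeholder path
with torch.no_grad():
    emb = model(transform(img).unsqueeze(0))           # (1, 1152) pooled image embedding
    emb = torch.nn.functional.normalize(emb, dim=-1)   # unit-norm, as used for SigLIP-style similarity
```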