@@ -697,6 +697,13 @@ def _cfg(url='', **kwargs):
 
 default_cfgs = generate_default_cfgs({
 
+    # re-finetuned augreg 21k FT on in1k weights
+    'vit_base_patch16_224.augreg2_in21k_ft_in1k': _cfg(
+        hf_hub_id='timm/'),
+    'vit_base_patch16_384.augreg2_in21k_ft_in1k': _cfg(),
+    'vit_base_patch8_224.augreg2_in21k_ft_in1k': _cfg(
+        hf_hub_id='timm/'),
+
     # How to train your ViT (augreg) weights, pretrained on 21k FT on in1k
     'vit_tiny_patch16_224.augreg_in21k_ft_in1k': _cfg(
         url='https://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz',
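For readers skimming the diff, `_cfg` (visible in the hunk header) is a small helper that fills in a baseline pretrained-config dict and lets each entry override individual fields, which `generate_default_cfgs` then registers per model. A minimal sketch of such a helper follows; the default field values shown are illustrative assumptions, not a verbatim copy of timm's implementation:

def _cfg(url='', **kwargs):
    # Baseline pretrained-config fields shared by every entry; the per-model
    # kwargs seen in the diff (hf_hub_id, crop_pct, input_size, mean/std,
    # num_classes, custom_load, crop_mode, ...) override these defaults.
    return {
        'url': url,
        'num_classes': 1000,
        'input_size': (3, 224, 224),
        'crop_pct': 0.9,
        'interpolation': 'bicubic',
        'mean': (0.5, 0.5, 0.5),  # assumed stand-in for IMAGENET_INCEPTION_MEAN
        'std': (0.5, 0.5, 0.5),   # assumed stand-in for IMAGENET_INCEPTION_STD
        **kwargs,
    }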
@@ -751,13 +758,6 @@ def _cfg(url='', **kwargs):
         hf_hub_id='timm/',
         custom_load=True, input_size=(3, 384, 384), crop_pct=1.0),
 
-    # re-finetuned augreg 21k FT on in1k weights
-    'vit_base_patch16_224.augreg2_in21k_ft_in1k': _cfg(
-        hf_hub_id='timm/'),
-    'vit_base_patch16_384.augreg2_in21k_ft_in1k': _cfg(),
-    'vit_base_patch8_224.augreg2_in21k_ft_in1k': _cfg(
-        hf_hub_id='timm/'),
-
     # patch models (weights from official Google JAX impl) pretrained on in21k FT on in1k
     'vit_base_patch16_224.orig_in21k_ft_in1k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth',
@@ -802,7 +802,6 @@ def _cfg(url='', **kwargs):
     'vit_giant_patch14_224.untrained': _cfg(url=''),
     'vit_gigantic_patch14_224.untrained': _cfg(url=''),
 
-
     # patch models, imagenet21k (weights from official Google JAX impl)
     'vit_large_patch32_224.orig_in21k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth',
@@ -869,7 +868,6 @@ def _cfg(url='', **kwargs):
         hf_hub_id='timm/',
         mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
 
-
     # ViT ImageNet-21K-P pretraining by MILL
     'vit_base_patch16_224_miil.in21k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/vit_base_patch16_224_in21k_miil-887286df.pth',
@@ -880,7 +878,7 @@ def _cfg(url='', **kwargs):
         hf_hub_id='timm/',
         mean=(0., 0., 0.), std=(1., 1., 1.), crop_pct=0.875, interpolation='bilinear'),
 
-    # custom timm variants
+    # Custom timm variants
     'vit_base_patch16_rpn_224.in1k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_base_patch16_rpn_224-sw-3b07e89d.pth',
         hf_hub_id='timm/'),
@@ -896,52 +894,6 @@ def _cfg(url='', **kwargs):
     'vit_base_patch16_gap_224': _cfg(),
 
     # CLIP pretrained image tower and related fine-tuned weights
-    'vit_base_patch32_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_base_patch16_clip_224.laion2b': _cfg(
-        #hf_hub_id='laion/CLIP-ViT-B-16-laion2B-s34B-b88K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
-    'vit_large_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=768),
-    'vit_huge_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
-    'vit_giant_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
-
-    'vit_base_patch32_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
-    'vit_base_patch16_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
-    'vit_base_patch16_clip_384.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=1.0, input_size=(3, 384, 384), crop_mode='squash'),
-    'vit_large_patch14_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0),
-    'vit_large_patch14_clip_336.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
-        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
-    'vit_huge_patch14_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
-    'vit_huge_patch14_clip_336.laion2b_ft_in1k': _cfg(
-        hf_hub_id='',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
-
     'vit_base_patch32_clip_224.laion2b_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
@@ -973,28 +925,52 @@ def _cfg(url='', **kwargs):
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
 
-    'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg(
-        #hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
-    'vit_base_patch16_clip_224.laion2b_ft_in12k': _cfg(
+    'vit_base_patch32_clip_224.openai_ft_in12k_in1k': _cfg(
+        # hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k_in1k',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
+    'vit_base_patch32_clip_384.openai_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
-    'vit_large_patch14_clip_224.laion2b_ft_in12k': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
+    'vit_base_patch16_clip_224.openai_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=11821),
-    'vit_huge_patch14_clip_224.laion2b_ft_in12k': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=0.95),
+    'vit_base_patch16_clip_384.openai_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
+    'vit_large_patch14_clip_224.openai_ft_in12k_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
+    'vit_large_patch14_clip_336.openai_ft_in12k_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
 
-    'vit_base_patch32_clip_224.openai': _cfg(
+    'vit_base_patch32_clip_224.laion2b_ft_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_base_patch16_clip_224.openai': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
+    'vit_base_patch16_clip_224.laion2b_ft_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_large_patch14_clip_224.openai': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
+    'vit_base_patch16_clip_384.laion2b_ft_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=1.0, input_size=(3, 384, 384), crop_mode='squash'),
+    'vit_large_patch14_clip_224.laion2b_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0),
+    'vit_large_patch14_clip_336.laion2b_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
+    'vit_huge_patch14_clip_224.laion2b_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
+    'vit_huge_patch14_clip_336.laion2b_ft_in1k': _cfg(
+        hf_hub_id='',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
 
     'vit_base_patch32_clip_224.openai_ft_in1k': _cfg(
         hf_hub_id='timm/',
@@ -1010,30 +986,21 @@ def _cfg(url='', **kwargs):
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
 
-    'vit_base_patch32_clip_224.openai_ft_in12k_in1k': _cfg(
-        #hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k_in1k',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
-    'vit_base_patch32_clip_384.openai_ft_in12k_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
-    'vit_base_patch16_clip_224.openai_ft_in12k_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=0.95),
-    'vit_base_patch16_clip_384.openai_ft_in12k_in1k': _cfg(
+    'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg(
+        #hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
+    'vit_base_patch16_clip_224.laion2b_ft_in12k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
-    'vit_large_patch14_clip_224.openai_ft_in12k_in1k': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
+    'vit_large_patch14_clip_224.laion2b_ft_in12k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
-    'vit_large_patch14_clip_336.openai_ft_in12k_in1k': _cfg(
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=11821),
+    'vit_huge_patch14_clip_224.laion2b_ft_in12k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
 
     'vit_base_patch32_clip_224.openai_ft_in12k': _cfg(
-        #hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k',
+        # hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
     'vit_base_patch16_clip_224.openai_ft_in12k': _cfg(
         hf_hub_id='timm/',
@@ -1042,6 +1009,37 @@ def _cfg(url='', **kwargs):
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
 
+    'vit_base_patch32_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_base_patch16_clip_224.laion2b': _cfg(
+        # hf_hub_id='laion/CLIP-ViT-B-16-laion2B-s34B-b88K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
+    'vit_large_patch14_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=768),
+    'vit_huge_patch14_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+    'vit_giant_patch14_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+
+    'vit_base_patch32_clip_224.openai': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_base_patch16_clip_224.openai': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_large_patch14_clip_224.openai': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
+
     # experimental (may be removed)
     'vit_base_patch32_plus_256': _cfg(url='', input_size=(3, 256, 256), crop_pct=0.95),
     'vit_base_patch16_plus_240': _cfg(url='', input_size=(3, 240, 240), crop_pct=0.95),
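Once registered, each key doubles as a model tag that resolves both the architecture and the pretrained-weight source (an `hf_hub_id` of 'timm/' is expanded to the model's own repository on the Hugging Face Hub). A hedged usage sketch against timm's public API, using one of the augreg2 tags added in this commit; the `pretrained_cfg` attribute name assumes a timm release recent enough to include these cfgs:

import torch
import timm

# Build the re-finetuned B/16 model; weights are pulled via the cfg's hf_hub_id.
model = timm.create_model('vit_base_patch16_224.augreg2_in21k_ft_in1k', pretrained=True)
model.eval()

# The resolved cfg carries the preprocessing details declared in the dict above.
cfg = model.pretrained_cfg
print(cfg['input_size'], cfg.get('crop_pct'), cfg.get('hf_hub_id'))

with torch.no_grad():
    logits = model(torch.randn(1, *cfg['input_size']))
print(logits.shape)  # torch.Size([1, 1000]) for the in1k fine-tuned head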