
Commit 60ebb6c

Re-order vit pretrained entries for more sensible default weights (no .tag specified)
1 parent e861b74 commit 60ebb6c
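
The diff below only re-orders entries inside the `default_cfgs = generate_default_cfgs({...})` dict in `timm/models/vision_transformer.py` (plus minor comment tweaks); no weight URLs or config values are changed. The part of each key after the dot is the pretrained tag, and the assumption behind this re-ordering is that the first entry listed for an architecture becomes the default used when a model name is given without a `.tag` suffix. A minimal usage sketch, with tag names taken from the diff (the attribute for inspecting the resolved config may be `pretrained_cfg` or `default_cfg` depending on the installed timm version):

import timm

# Untagged name: resolves to the default pretrained tag for this architecture,
# i.e. the first 'vit_base_patch16_224.*' entry listed in default_cfgs.
model = timm.create_model('vit_base_patch16_224', pretrained=True)
print(model.pretrained_cfg)  # shows which tagged weights were actually resolved

# Explicit tag: always selects that specific weight set, unaffected by ordering.
model = timm.create_model('vit_base_patch16_224.augreg_in21k_ft_in1k', pretrained=True)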

File tree: 1 file changed

timm/models/vision_transformer.py

Lines changed: 88 additions & 90 deletions
@@ -697,6 +697,13 @@ def _cfg(url='', **kwargs):
 
 default_cfgs = generate_default_cfgs({
 
+    # re-finetuned augreg 21k FT on in1k weights
+    'vit_base_patch16_224.augreg2_in21k_ft_in1k': _cfg(
+        hf_hub_id='timm/'),
+    'vit_base_patch16_384.augreg2_in21k_ft_in1k': _cfg(),
+    'vit_base_patch8_224.augreg2_in21k_ft_in1k': _cfg(
+        hf_hub_id='timm/'),
+
     # How to train your ViT (augreg) weights, pretrained on 21k FT on in1k
     'vit_tiny_patch16_224.augreg_in21k_ft_in1k': _cfg(
         url='https://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz',
@@ -751,13 +758,6 @@ def _cfg(url='', **kwargs):
         hf_hub_id='timm/',
         custom_load=True, input_size=(3, 384, 384), crop_pct=1.0),
 
-    # re-finetuned augreg 21k FT on in1k weights
-    'vit_base_patch16_224.augreg2_in21k_ft_in1k': _cfg(
-        hf_hub_id='timm/'),
-    'vit_base_patch16_384.augreg2_in21k_ft_in1k': _cfg(),
-    'vit_base_patch8_224.augreg2_in21k_ft_in1k': _cfg(
-        hf_hub_id='timm/'),
-
     # patch models (weights from official Google JAX impl) pretrained on in21k FT on in1k
     'vit_base_patch16_224.orig_in21k_ft_in1k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth',
@@ -802,7 +802,6 @@ def _cfg(url='', **kwargs):
     'vit_giant_patch14_224.untrained': _cfg(url=''),
     'vit_gigantic_patch14_224.untrained': _cfg(url=''),
 
-
     # patch models, imagenet21k (weights from official Google JAX impl)
     'vit_large_patch32_224.orig_in21k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth',
@@ -869,7 +868,6 @@ def _cfg(url='', **kwargs):
         hf_hub_id='timm/',
         mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
 
-
     # ViT ImageNet-21K-P pretraining by MILL
     'vit_base_patch16_224_miil.in21k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/vit_base_patch16_224_in21k_miil-887286df.pth',
@@ -880,7 +878,7 @@ def _cfg(url='', **kwargs):
         hf_hub_id='timm/',
         mean=(0., 0., 0.), std=(1., 1., 1.), crop_pct=0.875, interpolation='bilinear'),
 
-    # custom timm variants
+    # Custom timm variants
     'vit_base_patch16_rpn_224.in1k': _cfg(
         url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_base_patch16_rpn_224-sw-3b07e89d.pth',
         hf_hub_id='timm/'),
@@ -896,52 +894,6 @@ def _cfg(url='', **kwargs):
     'vit_base_patch16_gap_224': _cfg(),
 
     # CLIP pretrained image tower and related fine-tuned weights
-    'vit_base_patch32_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_base_patch16_clip_224.laion2b': _cfg(
-        #hf_hub_id='laion/CLIP-ViT-B-16-laion2B-s34B-b88K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
-    'vit_large_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=768),
-    'vit_huge_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
-    'vit_giant_patch14_clip_224.laion2b': _cfg(
-        hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K',
-        hf_hub_filename='open_clip_pytorch_model.bin',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
-
-    'vit_base_patch32_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
-    'vit_base_patch16_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
-    'vit_base_patch16_clip_384.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=1.0, input_size=(3, 384, 384), crop_mode='squash'),
-    'vit_large_patch14_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0),
-    'vit_large_patch14_clip_336.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
-        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
-    'vit_huge_patch14_clip_224.laion2b_ft_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
-    'vit_huge_patch14_clip_336.laion2b_ft_in1k': _cfg(
-        hf_hub_id='',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
-
     'vit_base_patch32_clip_224.laion2b_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
@@ -973,28 +925,52 @@ def _cfg(url='', **kwargs):
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
 
-    'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg(
-        #hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
-    'vit_base_patch16_clip_224.laion2b_ft_in12k': _cfg(
+    'vit_base_patch32_clip_224.openai_ft_in12k_in1k': _cfg(
+        # hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k_in1k',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
+    'vit_base_patch32_clip_384.openai_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
-    'vit_large_patch14_clip_224.laion2b_ft_in12k': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
+    'vit_base_patch16_clip_224.openai_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=11821),
-    'vit_huge_patch14_clip_224.laion2b_ft_in12k': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=0.95),
+    'vit_base_patch16_clip_384.openai_ft_in12k_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
+    'vit_large_patch14_clip_224.openai_ft_in12k_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
+    'vit_large_patch14_clip_336.openai_ft_in12k_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
 
-    'vit_base_patch32_clip_224.openai': _cfg(
+    'vit_base_patch32_clip_224.laion2b_ft_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_base_patch16_clip_224.openai': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
+    'vit_base_patch16_clip_224.laion2b_ft_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
-    'vit_large_patch14_clip_224.openai': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
+    'vit_base_patch16_clip_384.laion2b_ft_in1k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=1.0, input_size=(3, 384, 384), crop_mode='squash'),
+    'vit_large_patch14_clip_224.laion2b_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0),
+    'vit_large_patch14_clip_336.laion2b_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD,
+        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
+    'vit_huge_patch14_clip_224.laion2b_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
+    'vit_huge_patch14_clip_336.laion2b_ft_in1k': _cfg(
+        hf_hub_id='',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
 
     'vit_base_patch32_clip_224.openai_ft_in1k': _cfg(
         hf_hub_id='timm/',
@@ -1010,30 +986,21 @@ def _cfg(url='', **kwargs):
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
 
-    'vit_base_patch32_clip_224.openai_ft_in12k_in1k': _cfg(
-        #hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k_in1k',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD),
-    'vit_base_patch32_clip_384.openai_ft_in12k_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
-    'vit_base_patch16_clip_224.openai_ft_in12k_in1k': _cfg(
-        hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=0.95),
-    'vit_base_patch16_clip_384.openai_ft_in12k_in1k': _cfg(
+    'vit_base_patch32_clip_224.laion2b_ft_in12k': _cfg(
+        #hf_hub_id='timm/vit_base_patch32_clip_224.laion2b_ft_in12k',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
+    'vit_base_patch16_clip_224.laion2b_ft_in12k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=0.95, input_size=(3, 384, 384), crop_mode='squash'),
-    'vit_large_patch14_clip_224.openai_ft_in12k_in1k': _cfg(
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
+    'vit_large_patch14_clip_224.laion2b_ft_in12k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0),
-    'vit_large_patch14_clip_336.openai_ft_in12k_in1k': _cfg(
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=11821),
+    'vit_huge_patch14_clip_224.laion2b_ft_in12k': _cfg(
         hf_hub_id='timm/',
-        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
-        crop_pct=1.0, input_size=(3, 336, 336), crop_mode='squash'),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
 
     'vit_base_patch32_clip_224.openai_ft_in12k': _cfg(
-        #hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k',
+        # hf_hub_id='timm/vit_base_patch32_clip_224.openai_ft_in12k',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821),
     'vit_base_patch16_clip_224.openai_ft_in12k': _cfg(
         hf_hub_id='timm/',
@@ -1042,6 +1009,37 @@ def _cfg(url='', **kwargs):
         hf_hub_id='timm/',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=11821),
 
+    'vit_base_patch32_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_base_patch16_clip_224.laion2b': _cfg(
+        # hf_hub_id='laion/CLIP-ViT-B-16-laion2B-s34B-b88K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
+    'vit_large_patch14_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, crop_pct=1.0, num_classes=768),
+    'vit_huge_patch14_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+    'vit_giant_patch14_clip_224.laion2b': _cfg(
+        hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+
+    'vit_base_patch32_clip_224.openai': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_base_patch16_clip_224.openai': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_large_patch14_clip_224.openai': _cfg(
+        hf_hub_id='timm/',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
+
     # experimental (may be removed)
     'vit_base_patch32_plus_256': _cfg(url='', input_size=(3, 256, 256), crop_pct=0.95),
     'vit_base_patch16_plus_240': _cfg(url='', input_size=(3, 240, 240), crop_pct=0.95),
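
For readers unfamiliar with the tag mechanism, the selection rule this re-ordering relies on can be sketched roughly as follows. This is an illustrative stand-in, not timm's actual `generate_default_cfgs` implementation; `pick_default_tags` is a hypothetical helper that only captures the assumed rule that the first tag listed per architecture becomes the untagged default.

def pick_default_tags(cfgs: dict) -> dict:
    """Illustrative only: map each architecture to the first tag listed for it."""
    defaults = {}
    for key in cfgs:  # dicts preserve insertion order (Python 3.7+)
        arch, _, tag = key.partition('.')
        defaults.setdefault(arch, tag)
    return defaults

# With the entries ordered as in this commit, the untagged default for
# 'vit_base_patch16_224' becomes the augreg2 21k->1k re-finetune rather than
# an entry further down the dict.
example = {
    'vit_base_patch16_224.augreg2_in21k_ft_in1k': {},
    'vit_base_patch16_224.augreg_in21k_ft_in1k': {},
    'vit_base_patch16_224.orig_in21k_ft_in1k': {},
}
print(pick_default_tags(example))  # {'vit_base_patch16_224': 'augreg2_in21k_ft_in1k'}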
