
Commit 1b5cae6

Update some clip pretrained weights to point to new hub locations, add a few missing weights
1 parent 310ffa3 commit 1b5cae6

2 files changed: +120 −28 lines

timm/models/byobnet.py (+39 −0)
@@ -2315,6 +2315,27 @@ def _cfgr(url='', **kwargs):
         fixed_input_size=True, input_size=(3, 448, 448), pool_size=(14, 14),
         classifier='head.proj',
     ),
+    'resnet50_clip.cc12m': _cfgr(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
+        classifier='head.proj',
+    ),
+    'resnet50_clip.yfcc15m': _cfgr(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=1024, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
+        classifier='head.proj',
+    ),
+    'resnet101_clip.yfcc15m': _cfgr(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=512, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        fixed_input_size=True, input_size=(3, 224, 224), pool_size=(7, 7),
+        classifier='head.proj',
+    ),
 
     # avg-pool w/ optional standard classifier head variants
     'resnet50_clip_gap.openai': _cfgr(
@@ -2347,6 +2368,24 @@ def _cfgr(url='', **kwargs):
         num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         input_size=(3, 448, 448), pool_size=(14, 14),
     ),
+    'resnet50_clip_gap.cc12m': _cfgr(
+        hf_hub_id='timm/resnet50_clip.cc12m',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 224, 224), pool_size=(7, 7),
+    ),
+    'resnet50_clip_gap.yfcc15m': _cfgr(
+        hf_hub_id='timm/resnet50_clip.cc12m',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 224, 224), pool_size=(7, 7),
+    ),
+    'resnet101_clip_gap.yfcc15m': _cfgr(
+        hf_hub_id='timm/resnet101_clip_gap.yfcc15m',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        num_classes=0, mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 224, 224), pool_size=(7, 7),
+    ),
 
     'resnet50_mlp.untrained': _cfgr(
         input_size=(3, 256, 256), pool_size=(8, 8),
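Once the timm/ hub repos referenced above are live, the new ResNet CLIP tags should load through the usual timm API. A minimal sketch (assumes hub access; the output width follows the num_classes value in the cfg above):

```python
import timm
import torch

# Load the modified-ResNet CLIP image tower with its CC12M weights.
# 'resnet50_clip.cc12m' is one of the tags added above; classifier='head.proj'
# with num_classes=1024 means the forward pass returns the CLIP image embedding.
model = timm.create_model('resnet50_clip.cc12m', pretrained=True).eval()

with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))  # fixed 224x224 input per the cfg
print(out.shape)  # expected: torch.Size([1, 1024])
```

The *_clip_gap variants are the same trunks with the attention-pool head dropped (num_classes=0 in their cfgs), so they return pooled features rather than a projection.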

timm/models/vision_transformer.py (+81 −28)
@@ -23,6 +23,7 @@
 
 Hacked together by / Copyright 2020, Ross Wightman
 """
+import copy
 import logging
 import math
 from collections import OrderedDict
@@ -1601,6 +1602,21 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         hf_hub_filename='open_clip_pytorch_model.bin',
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
 
+    'vit_base_patch32_clip_224.laion400m_e32': _cfg(
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
+    'vit_base_patch16_clip_224.laion400m_e32': _cfg(
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
+    'vit_base_patch16_plus_clip_240.laion400m_e32': _cfg(
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
+        input_size=(3, 240, 240), crop_pct=1.0, num_classes=512),
+    'vit_large_patch14_clip_224.laion400m_e32': _cfg(
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
+
     'vit_base_patch32_clip_224.datacompxl': _cfg(
         hf_hub_id='laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K',
         hf_hub_filename='open_clip_pytorch_model.bin',
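A quick way to confirm the new LAION-400M tags are registered is timm's pretrained listing. A sketch below; list_pretrained is assumed to be available as in recent timm releases, and the exact return format can vary by version:

```python
import timm

# Wildcard filter over pretrained tags; the names come from the default_cfgs
# entries added in this hunk.
print(timm.list_pretrained('*laion400m_e32'))
# e.g. ['vit_base_patch16_clip_224.laion400m_e32',
#       'vit_base_patch16_plus_clip_240.laion400m_e32',
#       'vit_base_patch32_clip_224.laion400m_e32',
#       'vit_large_patch14_clip_224.laion400m_e32']
```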
@@ -1641,44 +1657,68 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         crop_pct=1.0, input_size=(3, 378, 378), num_classes=1024),
 
     'vit_base_patch32_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-b32-fullcc2.5b',
-        hf_hub_filename='metaclip_b32_fullcc2.5b.bin',
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_base_patch16_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-b16-fullcc2.5b',
-        hf_hub_filename='metaclip_b16_fullcc2.5b.bin',
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
     'vit_large_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-l14-fullcc2.5b',
-        hf_hub_filename='metaclip_l14_fullcc2.5b.bin',
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_huge_patch14_clip_224.metaclip_2pt5b': _cfg(
-        hf_hub_id='facebook/metaclip-h14-fullcc2.5b',
-        hf_hub_filename='metaclip_h14_fullcc2.5b.bin',
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
         license='cc-by-nc-4.0',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1024),
+    'vit_gigantic_patch14_clip_224.metaclip_2pt5b': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=1280),
+    'vit_base_patch32_clip_224.metaclip_400m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
+    'vit_base_patch16_clip_224.metaclip_400m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=512),
+    'vit_large_patch14_clip_224.metaclip_400m': _cfg(
+        hf_hub_id='timm/',
+        hf_hub_filename='open_clip_pytorch_model.bin',
+        license='cc-by-nc-4.0',
+        notes=('natively QuickGELU, use quickgelu model variant for original results',),
+        mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
 
     'vit_base_patch32_clip_224.openai': _cfg(
-        hf_hub_id='timm/vit_base_patch32_clip_224.openai',
+        hf_hub_id='timm/',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
     'vit_base_patch16_clip_224.openai': _cfg(
-        hf_hub_id='timm/vit_base_patch16_clip_224.openai',
+        hf_hub_id='timm/',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512),
     'vit_large_patch14_clip_224.openai': _cfg(
-        hf_hub_id='timm/vit_large_patch14_clip_224.openai',
+        hf_hub_id='timm/',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, crop_pct=1.0, num_classes=768),
     'vit_large_patch14_clip_336.openai': _cfg(
-        hf_hub_id='timm/vit_large_patch14_clip_336.openai', hf_hub_filename='open_clip_pytorch_model.bin',
+        hf_hub_id='timm/', hf_hub_filename='open_clip_pytorch_model.bin',
         notes=('natively QuickGELU, use quickgelu model variant for original results',),
         mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
         crop_pct=1.0, input_size=(3, 336, 336), num_classes=768),
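The metaclip and openai entries above drop the explicit per-model repo names in favor of a bare hf_hub_id='timm/'. As the comment in the quickgelu loop later in this diff implies, the convention is that a bare 'timm/' gets the model name, including its pretrained tag, appended when the default cfgs are generated. A simplified illustration of that resolution; resolve_hub_id is a hypothetical helper, not timm's actual function:

```python
def resolve_hub_id(model_name: str, hf_hub_id: str) -> str:
    """Hypothetical sketch of how a bare 'timm/' org prefix is expanded."""
    if hf_hub_id == 'timm/':
        # Bare org prefix: the hub repo is named after the model + pretrained tag.
        return hf_hub_id + model_name
    # Fully specified repos (e.g. 'laion/...' or 'timm/...') are used as-is.
    return hf_hub_id


assert resolve_hub_id('vit_base_patch16_clip_224.metaclip_2pt5b', 'timm/') == \
    'timm/vit_base_patch16_clip_224.metaclip_2pt5b'
assert resolve_hub_id(
    'vit_base_patch32_clip_224.datacompxl',
    'laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K',
) == 'laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K'
```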
@@ -2071,22 +2111,13 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         input_size=(3, 160, 160), crop_pct=0.95),
 }
 
-_quick_gelu_cfgs = [
-    'vit_large_patch14_clip_224.dfn2b',
-    'vit_huge_patch14_clip_224.dfn5b',
-    'vit_huge_patch14_clip_378.dfn5b',
-    'vit_base_patch32_clip_224.metaclip_2pt5b',
-    'vit_base_patch16_clip_224.metaclip_2pt5b',
-    'vit_large_patch14_clip_224.metaclip_2pt5b',
-    'vit_huge_patch14_clip_224.metaclip_2pt5b',
-    'vit_base_patch32_clip_224.openai',
-    'vit_base_patch16_clip_224.openai',
-    'vit_large_patch14_clip_224.openai',
-    'vit_large_patch14_clip_336.openai',
-]
-default_cfgs.update({
-    n.replace('_clip_', '_clip_quickgelu_'): default_cfgs[n] for n in _quick_gelu_cfgs
-})
+_quick_gelu_cfgs = [n for n, c in default_cfgs.items() if c.get('notes', ()) and 'quickgelu' in c['notes'][0]]
+for n in _quick_gelu_cfgs:
+    # generate quickgelu default cfgs based on contents of notes field
+    c = copy.deepcopy(default_cfgs[n])
+    if c['hf_hub_id'] == 'timm/':
+        c['hf_hub_id'] = 'timm/' + n  # need to use non-quickgelu model name for hub id
+    default_cfgs[n.replace('_clip_', '_clip_quickgelu_')] = c
 default_cfgs = generate_default_cfgs(default_cfgs)
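The hand-maintained _quick_gelu_cfgs list is replaced by a scan of the notes field, so any cfg tagged as natively QuickGELU automatically gets a *_clip_quickgelu_* twin. A self-contained toy version of the new loop, using plain dicts in place of the real default_cfgs, shows what it produces:

```python
import copy

# Toy stand-in for default_cfgs: plain dicts keyed by 'model.tag'.
default_cfgs = {
    'vit_base_patch32_clip_224.metaclip_400m': dict(
        hf_hub_id='timm/',
        notes=('natively QuickGELU, use quickgelu model variant for original results',),
    ),
    'vit_base_patch32_clip_224.datacompxl': dict(
        hf_hub_id='laion/CLIP-ViT-B-32-DataComp.XL-s13B-b90K',
        notes=(),
    ),
}

# Same selection rule as the committed code: any cfg whose notes mention quickgelu.
quick_gelu_cfgs = [n for n, c in default_cfgs.items() if c.get('notes', ()) and 'quickgelu' in c['notes'][0]]
for n in quick_gelu_cfgs:
    c = copy.deepcopy(default_cfgs[n])
    if c['hf_hub_id'] == 'timm/':
        # the hub repo is named after the non-quickgelu model, so expand it now
        c['hf_hub_id'] = 'timm/' + n
    default_cfgs[n.replace('_clip_', '_clip_quickgelu_')] = c

print(sorted(default_cfgs))
# ['vit_base_patch32_clip_224.datacompxl',
#  'vit_base_patch32_clip_224.metaclip_400m',
#  'vit_base_patch32_clip_quickgelu_224.metaclip_400m']
```

Applied to the real default_cfgs, this picks up the dfn, metaclip, and openai tags that were previously listed by hand, plus any newly added cfgs that carry the note, such as the metaclip_400m entries above.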

@@ -2510,6 +2541,16 @@ def vit_base_patch16_clip_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model
 
 
+@register_model
+def vit_base_patch16_plus_clip_240(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ ViT-Base (ViT-B/16+) CLIP image tower @ 240x240
+    """
+    model_args = dict(patch_size=16, embed_dim=896, depth=12, num_heads=14, pre_norm=True, norm_layer=nn.LayerNorm)
+    model = _create_vision_transformer(
+        'vit_base_patch16_plus_clip_240', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_large_patch14_clip_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ ViT-Large model (ViT-L/14) CLIP image tower
@@ -2656,6 +2697,18 @@ def vit_huge_patch14_clip_quickgelu_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model
 
 
+@register_model
+def vit_gigantic_patch14_clip_quickgelu_224(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ ViT-bigG model (ViT-G/14) w/ QuickGELU act
+    """
+    model_args = dict(
+        patch_size=14, embed_dim=1664, mlp_ratio=64/13, depth=48, num_heads=16, pre_norm=True,
+        norm_layer=nn.LayerNorm, act_layer='quick_gelu')
+    model = _create_vision_transformer(
+        'vit_gigantic_patch14_clip_quickgelu_224', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 # Experimental models below
 
 @register_model
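The new gigantic entrypoint differs from vit_gigantic_patch14_clip_224 only in act_layer='quick_gelu': QuickGELU is the x * sigmoid(1.702 * x) approximation the original CLIP/MetaCLIP weights were trained with, hence the "use quickgelu model variant for original results" notes. A small comparison sketch; it assumes get_act_layer and the 'quick_gelu' registry name are available in the installed timm version (otherwise the formula can be applied directly):

```python
import torch
from timm.layers import get_act_layer

# 'quick_gelu' resolves to x * sigmoid(1.702 * x); nn.GELU is the exact
# erf-based formulation. The two differ by up to roughly 1e-2 per activation,
# enough to shift zero-shot results if mismatched with the weights.
quick_gelu = get_act_layer('quick_gelu')()
gelu = torch.nn.GELU()

x = torch.linspace(-3, 3, steps=7)
print(quick_gelu(x))
print(gelu(x))
```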
