
Commit e8e2d9c

Add DropPath (stochastic depth) to ReXNet and VoVNet. RegNet DropPath impl tweak and dedupe se args.
1 parent e8ca458 commit e8e2d9c

File tree

timm/models/regnet.py
timm/models/rexnet.py
timm/models/vovnet.py

3 files changed: +47 -28 lines changed


Diff for: timm/models/regnet.py (+6 -3)

@@ -195,15 +195,18 @@ class RegStage(nn.Module):
     """Stage (sequence of blocks w/ the same output shape)."""

     def __init__(self, in_chs, out_chs, stride, dilation, depth, bottle_ratio, group_width,
-                 block_fn=Bottleneck, se_ratio=0., drop_path_rate=None, drop_block=None):
+                 block_fn=Bottleneck, se_ratio=0., drop_path_rates=None, drop_block=None):
         super(RegStage, self).__init__()
         block_kwargs = {}  # FIXME setup to pass various aa, norm, act layer common args
         first_dilation = 1 if dilation in (1, 2) else 2
         for i in range(depth):
             block_stride = stride if i == 0 else 1
             block_in_chs = in_chs if i == 0 else out_chs
             block_dilation = first_dilation if i == 0 else dilation
-            drop_path = DropPath(drop_path_rate[i]) if drop_path_rate is not None else None
+            if drop_path_rates is not None and drop_path_rates[i] > 0.:
+                drop_path = DropPath(drop_path_rates[i])
+            else:
+                drop_path = None
             if (block_in_chs != out_chs) or (block_stride != 1):
                 proj_block = downsample_conv(block_in_chs, out_chs, 1, block_stride, block_dilation)
             else:

@@ -301,7 +304,7 @@ def _get_stage_params(self, cfg, default_stride=2, output_stride=32, drop_path_r

         # Adjust the compatibility of ws and gws
         stage_widths, stage_groups = adjust_widths_groups_comp(stage_widths, stage_bottle_ratios, stage_groups)
-        param_names = ['out_chs', 'stride', 'dilation', 'depth', 'bottle_ratio', 'group_width', 'drop_path_rate']
+        param_names = ['out_chs', 'stride', 'dilation', 'depth', 'bottle_ratio', 'group_width', 'drop_path_rates']
         stage_params = [
             dict(zip(param_names, params)) for params in
             zip(stage_widths, stage_strides, stage_dilations, stage_depths, stage_bottle_ratios, stage_groups,
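
For reference, DropPath implements per-sample stochastic depth: during training it zeroes the residual-branch output for a random subset of samples and rescales the survivors, so a block is effectively skipped with probability equal to its drop rate. A minimal sketch of the idea (this is not timm's exact DropPath code, and the class name DropPathSketch is ours):

import torch
import torch.nn as nn

class DropPathSketch(nn.Module):
    """Per-sample stochastic depth sketch: zero the branch output for
    some samples in the batch and rescale survivors by 1 / keep_prob."""

    def __init__(self, drop_prob=0.):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        if self.drop_prob == 0. or not self.training:
            return x  # identity at inference time or with a zero rate
        keep_prob = 1. - self.drop_prob
        # one Bernoulli draw per sample, broadcast over C, H, W
        mask = (torch.rand(x.shape[0], 1, 1, 1, device=x.device) < keep_prob).to(x.dtype)
        return x * mask / keep_prob

Rescaling by 1 / keep_prob keeps the expected activation unchanged, which is why the eval path can be a plain identity. The `drop_path_rates is not None and drop_path_rates[i] > 0.` guard above just avoids instantiating a no-op module for zero-rate blocks.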

Diff for: timm/models/rexnet.py (+23 -18)

@@ -15,7 +15,7 @@
 
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .helpers import build_model_with_cfg
-from .layers import ClassifierHead, create_act_layer, ConvBnAct
+from .layers import ClassifierHead, create_act_layer, ConvBnAct, DropPath
 from .registry import register_model
 
 

@@ -56,10 +56,10 @@ def make_divisible(v, divisor=8, min_value=None):
 
 class SEWithNorm(nn.Module):
 
-    def __init__(self, channels, reduction=16, act_layer=nn.ReLU, divisor=1, reduction_channels=None,
+    def __init__(self, channels, se_ratio=1 / 12., act_layer=nn.ReLU, divisor=1, reduction_channels=None,
                  gate_layer='sigmoid'):
         super(SEWithNorm, self).__init__()
-        reduction_channels = reduction_channels or make_divisible(channels // reduction, divisor=divisor)
+        reduction_channels = reduction_channels or make_divisible(int(channels * se_ratio), divisor=divisor)
         self.fc1 = nn.Conv2d(channels, reduction_channels, kernel_size=1, bias=True)
         self.bn = nn.BatchNorm2d(reduction_channels)
         self.act = act_layer(inplace=True)

@@ -76,7 +76,7 @@ def forward(self, x):
 
 
 class LinearBottleneck(nn.Module):
-    def __init__(self, in_chs, out_chs, stride, exp_ratio=1.0, use_se=True, se_rd=12, ch_div=1):
+    def __init__(self, in_chs, out_chs, stride, exp_ratio=1.0, se_ratio=0., ch_div=1, drop_path=None):
         super(LinearBottleneck, self).__init__()
         self.use_shortcut = stride == 1 and in_chs <= out_chs
         self.in_channels = in_chs

@@ -90,10 +90,11 @@ def __init__(self, in_chs, out_chs, stride, exp_ratio=1.0, use_se=True, se_rd=12
             self.conv_exp = None
 
         self.conv_dw = ConvBnAct(dw_chs, dw_chs, 3, stride=stride, groups=dw_chs, apply_act=False)
-        self.se = SEWithNorm(dw_chs, reduction=se_rd, divisor=ch_div) if use_se else None
+        self.se = SEWithNorm(dw_chs, se_ratio=se_ratio, divisor=ch_div) if se_ratio > 0. else None
         self.act_dw = nn.ReLU6()
 
         self.conv_pwl = ConvBnAct(dw_chs, out_chs, 1, apply_act=False)
+        self.drop_path = drop_path
 
     def feat_channels(self, exp=False):
         return self.conv_dw.out_channels if exp else self.out_channels

@@ -107,12 +108,14 @@ def forward(self, x):
             x = self.se(x)
         x = self.act_dw(x)
         x = self.conv_pwl(x)
+        if self.drop_path is not None:
+            x = self.drop_path(x)
         if self.use_shortcut:
             x[:, 0:self.in_channels] += shortcut
         return x
 
 
-def _block_cfg(width_mult=1.0, depth_mult=1.0, initial_chs=16, final_chs=180, use_se=True, ch_div=1):
+def _block_cfg(width_mult=1.0, depth_mult=1.0, initial_chs=16, final_chs=180, se_ratio=0., ch_div=1):
     layers = [1, 2, 2, 3, 3, 5]
     strides = [1, 2, 2, 2, 1, 2]
     layers = [ceil(element * depth_mult) for element in layers]

@@ -127,29 +130,31 @@ def _block_cfg(width_mult=1.0, depth_mult=1.0, initial_chs=16, final_chs=180, us
         out_chs_list.append(make_divisible(round(base_chs * width_mult), divisor=ch_div))
         base_chs += final_chs / (depth // 3 * 1.0)
 
-    if use_se:
-        use_ses = [False] * (layers[0] + layers[1]) + [True] * sum(layers[2:])
-    else:
-        use_ses = [False] * sum(layers[:])
+    se_ratios = [0.] * (layers[0] + layers[1]) + [se_ratio] * sum(layers[2:])
 
-    return zip(out_chs_list, exp_ratios, strides, use_ses)
+    return list(zip(out_chs_list, exp_ratios, strides, se_ratios))
 
 
-def _build_blocks(block_cfg, prev_chs, width_mult, se_rd=12, ch_div=1, feature_location='bottleneck'):
+def _build_blocks(
+        block_cfg, prev_chs, width_mult, ch_div=1, drop_path_rate=0., feature_location='bottleneck'):
     feat_exp = feature_location == 'expansion'
     feat_chs = [prev_chs]
     feature_info = []
     curr_stride = 2
     features = []
-    for block_idx, (chs, exp_ratio, stride, se) in enumerate(block_cfg):
+    num_blocks = len(block_cfg)
+    for block_idx, (chs, exp_ratio, stride, se_ratio) in enumerate(block_cfg):
         if stride > 1:
             fname = 'stem' if block_idx == 0 else f'features.{block_idx - 1}'
             if block_idx > 0 and feat_exp:
                 fname += '.act_dw'
             feature_info += [dict(num_chs=feat_chs[-1], reduction=curr_stride, module=fname)]
             curr_stride *= stride
+        block_dpr = drop_path_rate * block_idx / (num_blocks - 1)  # stochastic depth linear decay rule
+        drop_path = DropPath(block_dpr) if block_dpr > 0. else None
         features.append(LinearBottleneck(
-            in_chs=prev_chs, out_chs=chs, exp_ratio=exp_ratio, stride=stride, use_se=se, se_rd=se_rd, ch_div=ch_div))
+            in_chs=prev_chs, out_chs=chs, exp_ratio=exp_ratio, stride=stride, se_ratio=se_ratio,
+            ch_div=ch_div, drop_path=drop_path))
         prev_chs = chs
         feat_chs += [features[-1].feat_channels(feat_exp)]
     pen_chs = make_divisible(1280 * width_mult, divisor=ch_div)

@@ -162,8 +167,8 @@ def _build_blocks(block_cfg, prev_chs, width_mult, se_rd=12, ch_div=1, feature_l
 
 class ReXNetV1(nn.Module):
     def __init__(self, in_chans=3, num_classes=1000, global_pool='avg', output_stride=32,
-                 initial_chs=16, final_chs=180, width_mult=1.0, depth_mult=1.0, use_se=True,
-                 se_rd=12, ch_div=1, drop_rate=0.2, feature_location='bottleneck'):
+                 initial_chs=16, final_chs=180, width_mult=1.0, depth_mult=1.0, se_ratio=1/12.,
+                 ch_div=1, drop_rate=0.2, drop_path_rate=0., feature_location='bottleneck'):
         super(ReXNetV1, self).__init__()
         self.drop_rate = drop_rate
         self.num_classes = num_classes

@@ -173,9 +178,9 @@ def __init__(self, in_chans=3, num_classes=1000, global_pool='avg', output_strid
         stem_chs = make_divisible(round(stem_base_chs * width_mult), divisor=ch_div)
         self.stem = ConvBnAct(in_chans, stem_chs, 3, stride=2, act_layer='swish')
 
-        block_cfg = _block_cfg(width_mult, depth_mult, initial_chs, final_chs, use_se, ch_div)
+        block_cfg = _block_cfg(width_mult, depth_mult, initial_chs, final_chs, se_ratio, ch_div)
         features, self.feature_info = _build_blocks(
-            block_cfg, stem_chs, width_mult, se_rd, ch_div, feature_location)
+            block_cfg, stem_chs, width_mult, ch_div, drop_path_rate, feature_location)
         self.num_features = features[-1].out_channels
         self.features = nn.Sequential(*features)
 
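The `stochastic depth linear decay rule` comment in `_build_blocks` is the per-block schedule from the stochastic depth paper: the first block keeps a zero drop rate and the final block gets the full `drop_path_rate`, so early blocks (which feed everything downstream) are dropped least. A quick check of the values (the rate 0.1 here is just an example; the new `drop_path_rate` argument defaults to 0., and 16 is the block count implied by the default layer config [1, 2, 2, 3, 3, 5] in the diff above):

drop_path_rate = 0.1  # example value for illustration
num_blocks = 16       # sum of the default ReXNet layer counts [1, 2, 2, 3, 3, 5]

# block_dpr = drop_path_rate * block_idx / (num_blocks - 1), as in _build_blocks
block_dprs = [drop_path_rate * i / (num_blocks - 1) for i in range(num_blocks)]
print([round(r, 3) for r in block_dprs[:4]], '...', round(block_dprs[-1], 3))
# [0.0, 0.007, 0.013, 0.02] ... 0.1

Since block 0 always gets a rate of 0., the `DropPath(block_dpr) if block_dpr > 0. else None` line leaves the first block without a drop-path module entirely.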

Diff for: timm/models/vovnet.py (+18 -7)

@@ -20,7 +20,7 @@
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .registry import register_model
 from .helpers import build_model_with_cfg
-from .layers import ConvBnAct, SeparableConvBnAct, BatchNormAct2d, ClassifierHead, \
+from .layers import ConvBnAct, SeparableConvBnAct, BatchNormAct2d, ClassifierHead, DropPath,\
     create_attn, create_norm_act, get_norm_act_layer
 
 

@@ -179,7 +179,7 @@ def forward(self, x: torch.Tensor, concat_list: List[torch.Tensor]) -> torch.Ten
 class OsaBlock(nn.Module):
 
     def __init__(self, in_chs, mid_chs, out_chs, layer_per_block, residual=False,
-                 depthwise=False, attn='', norm_layer=BatchNormAct2d, act_layer=nn.ReLU):
+                 depthwise=False, attn='', norm_layer=BatchNormAct2d, act_layer=nn.ReLU, drop_path=None):
         super(OsaBlock, self).__init__()
 
         self.residual = residual

@@ -212,6 +212,8 @@ def __init__(self, in_chs, mid_chs, out_chs, layer_per_block, residual=False,
         else:
             self.attn = None
 
+        self.drop_path = drop_path
+
     def forward(self, x):
         output = [x]
         if self.conv_reduction is not None:

@@ -220,6 +222,8 @@ def forward(self, x):
         x = self.conv_concat(x)
         if self.attn is not None:
             x = self.attn(x)
+        if self.drop_path is not None:
+            x = self.drop_path(x)
         if self.residual:
             x = x + output[0]
         return x

@@ -228,7 +232,8 @@ def forward(self, x):
 class OsaStage(nn.Module):
 
     def __init__(self, in_chs, mid_chs, out_chs, block_per_stage, layer_per_block, downsample=True,
-                 residual=True, depthwise=False, attn='ese', norm_layer=BatchNormAct2d, act_layer=nn.ReLU):
+                 residual=True, depthwise=False, attn='ese', norm_layer=BatchNormAct2d, act_layer=nn.ReLU,
+                 drop_path_rates=None):
         super(OsaStage, self).__init__()
 
         if downsample:

@@ -239,10 +244,15 @@ def __init__(self, in_chs, mid_chs, out_chs, block_per_stage, layer_per_block, d
         blocks = []
         for i in range(block_per_stage):
             last_block = i == block_per_stage - 1
+            if drop_path_rates is not None and drop_path_rates[i] > 0.:
+                drop_path = DropPath(drop_path_rates[i])
+            else:
+                drop_path = None
             blocks += [OsaBlock(
-                in_chs if i == 0 else out_chs, mid_chs, out_chs, layer_per_block, residual=residual and i > 0,
-                depthwise=depthwise, attn=attn if last_block else '', norm_layer=norm_layer, act_layer=act_layer)
+                in_chs, mid_chs, out_chs, layer_per_block, residual=residual and i > 0, depthwise=depthwise,
+                attn=attn if last_block else '', norm_layer=norm_layer, act_layer=act_layer, drop_path=drop_path)
             ]
+            in_chs = out_chs
         self.blocks = nn.Sequential(*blocks)
 
     def forward(self, x):

@@ -255,7 +265,7 @@ def forward(self, x):
 class VovNet(nn.Module):
 
     def __init__(self, cfg, in_chans=3, num_classes=1000, global_pool='avg', drop_rate=0., stem_stride=4,
-                 output_stride=32, norm_layer=BatchNormAct2d, act_layer=nn.ReLU):
+                 output_stride=32, norm_layer=BatchNormAct2d, act_layer=nn.ReLU, drop_path_rate=0.):
         """ VovNet (v2)
         """
         super(VovNet, self).__init__()

@@ -284,14 +294,15 @@ def __init__(self, cfg, in_chans=3, num_classes=1000, global_pool='avg', drop_ra
         current_stride = stem_stride
 
         # OSA stages
+        stage_dpr = torch.split(torch.linspace(0, drop_path_rate, sum(block_per_stage)), block_per_stage)
         in_ch_list = stem_chs[-1:] + stage_out_chs[:-1]
         stage_args = dict(residual=cfg["residual"], depthwise=cfg["depthwise"], attn=cfg["attn"], **conv_kwargs)
         stages = []
         for i in range(4):  # num_stages
             downsample = stem_stride == 2 or i > 0  # first stage has no stride/downsample if stem_stride is 4
             stages += [OsaStage(
                 in_ch_list[i], stage_conv_chs[i], stage_out_chs[i], block_per_stage[i], layer_per_block,
-                downsample=downsample, **stage_args)
+                downsample=downsample, drop_path_rates=stage_dpr[i], **stage_args)
             ]
             self.num_features = stage_out_chs[i]
             current_stride *= 2 if downsample else 1
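
The VovNet side builds the same linear ramp once over all blocks and then carves it into per-stage tensors with `torch.split`, so each `OsaStage` receives only its own slice of rates via `drop_path_rates`. A small check of what `stage_dpr` looks like (assumptions: `drop_path_rate=0.1` is an example value since the argument defaults to 0., and `[1, 1, 2, 2]` is a vovnet39-style `block_per_stage` layout):

import torch

drop_path_rate = 0.1            # example value for illustration
block_per_stage = [1, 1, 2, 2]  # assumed vovnet39-style stage layout

# one global linear ramp over all blocks, split into per-stage chunks
stage_dpr = torch.split(torch.linspace(0, drop_path_rate, sum(block_per_stage)), block_per_stage)
for i, rates in enumerate(stage_dpr):
    print(f'stage {i}:', [round(r.item(), 2) for r in rates])
# stage 0: [0.0]
# stage 1: [0.02]
# stage 2: [0.04, 0.06]
# stage 3: [0.08, 0.1]

Per-stage splitting keeps the global depth-wise schedule intact while letting `OsaStage` stay ignorant of where it sits in the network; it just indexes `drop_path_rates[i]` for each of its blocks.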
