4
4
5
5
import torch
6
6
from segment_anything .modeling import ImageEncoderViT
7
+ from torch import nn
8
+ from segment_anything .modeling .common import LayerNorm2d
7
9
8
10
from segmentation_models_pytorch .encoders ._base import EncoderMixin
9
11
def __init__(self, **kwargs):
    """SAM ViT image encoder adapted to the EncoderMixin interface.

    All keyword args are forwarded to ``ImageEncoderViT``; ``out_chans``,
    ``patch_size`` and ``embed_dim`` are also cached locally (the defaults
    presumably mirror ImageEncoderViT's own — confirm against upstream).
    """
    super().__init__(**kwargs)
    self._out_chans = kwargs.get("out_chans", 256)
    self._patch_size = kwargs.get("patch_size", 16)
    self._embed_dim = kwargs.get("embed_dim", 768)
    self._validate()
    # One lightweight projection neck per intermediate stage; the deepest
    # stage uses the original SAM neck in forward() instead.
    stage_channels = self.out_channels[:-1]
    self.intermediate_necks = nn.ModuleList(
        self.init_neck(self._embed_dim, ch) for ch in stage_channels
    )
27
@staticmethod
def init_neck(embed_dim: int, out_chans: int) -> nn.Module:
    """Build a projection neck: 1x1 conv -> LayerNorm2d -> 3x3 conv -> LayerNorm2d.

    Mirrors the neck structure used inside ``ImageEncoderViT``, mapping
    ``embed_dim`` channels down to ``out_chans``.
    """
    project = nn.Conv2d(embed_dim, out_chans, kernel_size=1, bias=False)
    refine = nn.Conv2d(out_chans, out_chans, kernel_size=3, padding=1, bias=False)
    return nn.Sequential(
        project,
        LayerNorm2d(out_chans),
        refine,
        LayerNorm2d(out_chans),
    )
47
+
48
+ @staticmethod
49
+ def neck_forward (neck : nn .Module , x : torch .Tensor , scale_factor : float = 1 ) -> torch .Tensor :
50
+ x = x .permute (0 , 3 , 1 , 2 )
51
+ if scale_factor != 1.0 :
52
+ x = nn .functional .interpolate (x , scale_factor = scale_factor , mode = "bilinear" )
53
+ return neck (x )
54
+
55
def requires_grad_(self, requires_grad: bool = True):
    """Toggle gradients on every parameter, then force the intermediate
    necks back to trainable regardless of the flag.

    This lets callers freeze the pretrained ViT trunk while keeping the
    randomly-initialized necks learnable. Returns ``self`` for chaining.
    """
    for p in self.parameters():
        p.requires_grad = requires_grad
    # Necks are always trainable, even when the trunk is frozen.
    for p in self.intermediate_necks.parameters():
        p.requires_grad = True
    return self
20
62
21
63
@property
def output_stride(self):
    # Total downsampling factor reported to segmentation heads.
    # NOTE(review): hard-coded to 32 regardless of patch size — confirm
    # this matches the deepest feature map's actual stride.
    return 32
24
66
25
- def _get_scale_factor ( self ) -> float :
26
- """Input image will be downscale by this factor"""
27
- return int ( math . log ( self ._patch_size , 2 ))
67
@property
def out_channels(self):
    """Per-stage channel counts, shallowest stage first.

    Yields ``_encoder_depth + 1`` values, halving ``_out_chans`` once per
    step away from the deepest stage (e.g. [16, 32, 64, 128, 256] for the
    defaults), so the last entry is ``_out_chans`` itself.
    """
    depth = self._encoder_depth
    return [self._out_chans // (2 ** (depth - i)) for i in range(depth + 1)]
28
70
29
71
def _validate (self ):
30
72
# check vit depth
@@ -39,15 +81,30 @@ def _validate(self):
39
81
"It is recommended to set encoder depth=4 with default vit patch_size=16."
40
82
)
41
83
42
- @property
43
- def out_channels (self ):
44
- # Fill up with leading zeros to be used in Unet
45
- scale_factor = self ._get_scale_factor ()
46
- return [0 ] * scale_factor + [self ._out_chans ]
84
def _get_scale_factor(self) -> int:
    """Return ``log2(patch_size)``, the number of 2x downscales the patch
    embedding applies (the input is downscaled by ``2 ** factor``, i.e. by
    ``patch_size`` — e.g. factor 4 for the default patch size of 16).

    Fixes: the annotation said ``-> float`` although ``int(...)`` is
    returned, and ``math.log2`` is more accurate than ``math.log(x, 2)``
    for exact powers of two.
    """
    return int(math.log2(self._patch_size))
47
87
48
88
def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
    """Run the ViT trunk and return one feature map per encoder stage.

    Produces ``_encoder_depth + 1`` tensors: one from an intermediate neck
    every ``skip_steps`` transformer blocks, plus the final SAM neck
    output, matching ``self.out_channels`` (assumes the blocks preserve
    spatial size — TODO confirm for windowed-attention configs).
    """
    x = self.patch_embed(x)
    if self.pos_embed is not None:
        x = x + self.pos_embed

    features = []
    # Tap an intermediate feature after every `skip_steps` blocks
    # (includes i == 0, i.e. right after the first block).
    skip_steps = self._vit_depth // self._encoder_depth
    scale_factor = self._get_scale_factor()
    for i, blk in enumerate(self.blocks):
        x = blk(x)
        if i % skip_steps == 0:
            # Upsample by 2**scale_factor; the factor shrinks by one per
            # tap, so each successive feature is half the spatial size of
            # the previous one while the neck sets the channel count per
            # self.out_channels.
            neck = self.intermediate_necks[i // skip_steps]
            features.append(self.neck_forward(neck, x, scale_factor=2 ** scale_factor))
            scale_factor -= 1

    # Final SAM neck expects channels-first input, hence the permute.
    x = self.neck(x.permute(0, 3, 1, 2))
    features.append(x)

    return features
51
108
52
109
def load_state_dict (self , state_dict : Mapping [str , Any ], strict : bool = True ) -> None :
53
110
# Exclude mask_decoder and prompt encoder weights
@@ -58,6 +115,7 @@ def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True) ->
58
115
if not k .startswith ("mask_decoder" ) and not k .startswith ("prompt_encoder" )
59
116
}
60
117
missing , unused = super ().load_state_dict (state_dict , strict = False )
118
+ missing = list (filter (lambda x : not x .startswith ("intermediate_necks" ), missing ))
61
119
if len (missing ) + len (unused ) > 0 :
62
120
n_loaded = len (state_dict ) - len (missing ) - len (unused )
63
121
warnings .warn (
0 commit comments