Use vit_depth to control the SAM ViT depth; depth now means the encoder depth

Rustem Galiullin · Rustem Galiullin · commit 5edc0ee9452b · 2023-06-02T13:10:05.000+04:00
diff --git a/segmentation_models_pytorch/decoders/unet/model.py b/segmentation_models_pytorch/decoders/unet/model.py
@@ -65,30 +65,20 @@ def __init__(
         classes: int = 1,
         activation: Optional[Union[str, callable]] = None,
         aux_params: Optional[dict] = None,
-        encoder_kwargs: Optional[dict] = None,
     ):
         super().__init__()
 
-        # if sam encoder, make sure to make num_hidden_skips is set
-        if encoder_name.startswith("sam-"):
-            encoder_kwargs = encoder_kwargs if encoder_kwargs is not None else {}
-            encoder_kwargs.update({"num_hidden_skips": len(decoder_channels)})
-            n_decoder_blocks = len(decoder_channels)
-        else:
-            n_decoder_blocks = encoder_depth
-
         self.encoder = get_encoder(
             encoder_name,
             in_channels=in_channels,
             depth=encoder_depth,
             weights=encoder_weights,
-            **encoder_kwargs if encoder_kwargs is not None else {},
         )
 
         self.decoder = UnetDecoder(
             encoder_channels=self.encoder.out_channels,
             decoder_channels=decoder_channels,
-            n_blocks=n_decoder_blocks,
+            n_blocks=encoder_depth,
             use_batchnorm=decoder_use_batchnorm,
             center=True if encoder_name.startswith("vgg") else False,
             attention_type=decoder_attention_type,
diff --git a/segmentation_models_pytorch/encoders/__init__.py b/segmentation_models_pytorch/encoders/__init__.py
@@ -97,13 +97,8 @@ def get_encoder(name, in_channels=3, depth=5, weights=None, output_stride=32, **
         raise KeyError("Wrong encoder name `{}`, supported encoders: {}".format(name, list(encoders.keys())))
 
     params = encoders[name]["params"]
-    if name.startswith("sam-"):
-        params.update(**kwargs)
-        params.update(dict(name=name[4:]))
-        if depth is not None:
-            params.update(depth=depth)
-    else:
-        params.update(depth=depth)
+    params.update(depth=depth)
+    params.update(kwargs)
     encoder = Encoder(**params)
 
     if weights is not None:
diff --git a/segmentation_models_pytorch/encoders/sam.py b/segmentation_models_pytorch/encoders/sam.py
@@ -9,33 +9,41 @@
 
 
 class SamVitEncoder(EncoderMixin, ImageEncoderViT):
-    def __init__(self, name: str, **kwargs):
-        patch_size = kwargs.get("patch_size", 16)
-        n_skips = kwargs.pop("num_hidden_skips", int(self._get_scale_factor(patch_size)))
+    def __init__(self, **kwargs):
+        self._vit_depth = kwargs.pop("vit_depth")
+        self._encoder_depth = kwargs.get("depth", 5)
+        kwargs.update({"depth": self._vit_depth})
         super().__init__(**kwargs)
-        self._name = name
-        self._depth = kwargs["depth"]
         self._out_chans = kwargs.get("out_chans", 256)
-        self._num_skips = n_skips
-        self._validate_output(patch_size)
+        self._patch_size = kwargs.get("patch_size", 16)
+        self._validate()
 
-    @staticmethod
-    def _get_scale_factor(patch_size: int) -> float:
+    @property
+    def output_stride(self):
+        return 32
+
+    def _get_scale_factor(self) -> float:
         """Input image will be downscale by this factor"""
-        return math.log(patch_size, 2)
+        return int(math.log(self._patch_size, 2))
 
-    def _validate_output(self, patch_size: int):
-        scale_factor = self._get_scale_factor(patch_size)
-        if scale_factor != self._num_skips:
+    def _validate(self):
+        # check vit depth
+        if self._vit_depth not in [12, 24, 32]:
+            raise ValueError(f"vit_depth must be one of [12, 24, 32], got {self._vit_depth}")
+        # check that the encoder depth equals int(log2(patch_size))
+        scale_factor = self._get_scale_factor()
+        if scale_factor != self._encoder_depth:
             raise ValueError(
-                f"With {patch_size=} and {self._num_skips} skip connection layers, "
-                "spatial dimensions of model output will not match input spatial dimensions"
+                f"With patch_size={self._patch_size} and depth={self._encoder_depth}, "
+                "spatial dimensions of model output will not match input spatial dimensions. "
+                "It is recommended to set encoder depth=4 with default vit patch_size=16."
             )
 
     @property
     def out_channels(self):
         # Fill up with leading zeros to be used in Unet
-        return [0] * self._num_skips + [self._out_chans]
+        scale_factor = self._get_scale_factor()
+        return [0] * scale_factor + [self._out_chans]
 
     def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
         # Return a list of tensors to match other encoders
@@ -66,7 +74,7 @@ def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True) ->
         },
         "params": dict(
             embed_dim=1280,
-            depth=32,
+            vit_depth=32,
             num_heads=16,
             global_attn_indexes=[7, 15, 23, 31],
         ),
@@ -78,7 +86,7 @@ def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True) ->
         },
         "params": dict(
             embed_dim=1024,
-            depth=24,
+            vit_depth=24,
             num_heads=16,
             global_attn_indexes=[5, 11, 17, 23],
         ),
@@ -90,7 +98,7 @@ def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True) ->
         },
         "params": dict(
             embed_dim=768,
-            depth=12,
+            vit_depth=12,
             num_heads=12,
             global_attn_indexes=[2, 5, 8, 11],
         ),
diff --git a/tests/test_sam.py b/tests/test_sam.py
@@ -8,11 +8,10 @@
 
 @pytest.mark.parametrize("encoder_name", ["sam-vit_b", "sam-vit_l"])
 @pytest.mark.parametrize("img_size", [64, 128])
-@pytest.mark.parametrize("patch_size", [8, 16])
-@pytest.mark.parametrize("depth", [6, 24, None])
-def test_sam_encoder(encoder_name, img_size, patch_size, depth):
-    encoder = get_encoder(encoder_name, img_size=img_size, patch_size=patch_size, depth=depth)
-    assert encoder._name == encoder_name[4:]
+@pytest.mark.parametrize("patch_size,depth", [(8, 3), (16, 4)])
+@pytest.mark.parametrize("vit_depth", [12, 24])
+def test_sam_encoder(encoder_name, img_size, patch_size, depth, vit_depth):
+    encoder = get_encoder(encoder_name, img_size=img_size, patch_size=patch_size, depth=depth, vit_depth=vit_depth)
     assert encoder.output_stride == 32
 
     sample = torch.ones(1, 3, img_size, img_size)
@@ -23,6 +22,13 @@ def test_sam_encoder(encoder_name, img_size, patch_size, depth):
     assert out[-1].size() == torch.Size([1, 256, expected_patches, expected_patches])
 
 
+def test_sam_encoder_validation_error():
+    with pytest.raises(ValueError):
+        get_encoder("sam-vit_b", img_size=64, patch_size=16, depth=5, vit_depth=12)
+        get_encoder("sam-vit_b", img_size=64, patch_size=16, depth=4, vit_depth=None)
+        get_encoder("sam-vit_b", img_size=64, patch_size=16, depth=4, vit_depth=6)
+
+
 @pytest.mark.skip(reason="Decoder has been removed, keeping this for future integration")
 @pytest.mark.parametrize("decoder_multiclass_output", [True, False])
 @pytest.mark.parametrize("n_classes", [1, 3])
@@ -43,14 +49,13 @@ def test_sam(decoder_multiclass_output, n_classes):
 
 
 @pytest.mark.parametrize("model_class", [smp.Unet])
-@pytest.mark.parametrize("decoder_channels,patch_size", [([64, 32, 16, 8], 16), ([64, 32, 16], 8)])
-def test_sam_as_encoder_only(model_class, decoder_channels, patch_size):
-    img_size = 64
+@pytest.mark.parametrize("decoder_channels,encoder_depth", [([64, 32, 16, 8], 4), ([64, 32, 16, 8], 4)])
+def test_sam_encoder_arch(model_class, decoder_channels, encoder_depth):
+    img_size = 1024
     model = model_class(
         "sam-vit_b",
         encoder_weights=None,
-        encoder_depth=3,
-        encoder_kwargs=dict(img_size=img_size, out_chans=decoder_channels[0], patch_size=patch_size),
+        encoder_depth=encoder_depth,
         decoder_channels=decoder_channels,
     )
     smp = torch.ones(1, 3, img_size, img_size)
@@ -65,5 +70,5 @@ def test_sam_weights():
 @pytest.mark.skip(reason="Run this test manually as it needs to download weights")
 def test_sam_encoder_weights():
     smp.create_model(
-        "unet", encoder_name="sam-vit_b", encoder_weights="sa-1b", encoder_depth=12, decoder_channels=[64, 32, 16, 8]
+        "unet", encoder_name="sam-vit_b", encoder_depth=4, encoder_weights="sa-1b", decoder_channels=[64, 32, 16, 8]
     )