
Commit e85836d

Added weight conversion script
1 parent: 71e2acb

File tree

6 files changed: +224 -277 lines changed

.gitignore

Lines changed: 5 additions & 1 deletion
@@ -75,6 +75,7 @@ target/
 
 # Jupyter Notebook
 .ipynb_checkpoints
+*ipynb*
 
 # pyenv
 .python-version
@@ -109,4 +110,7 @@ venv.bak/
 .mypy_cache/
 
 # ruff
-.ruff_cache/
+.ruff_cache/
+
+# model weight folder
+dpt_large-ade20k-b12dca68

segmentation_models_pytorch/decoders/dpt/decoder.py

Lines changed: 71 additions & 26 deletions
@@ -1,5 +1,7 @@
 import torch
 import torch.nn as nn
+from segmentation_models_pytorch.base.modules import Activation
+from typing import Optional
 
 
 def _get_feature_processing_out_channels(encoder_name: str) -> list[int]:
@@ -71,7 +73,7 @@ def forward(self, feature: torch.Tensor, cls_token: torch.Tensor):
         return feature
 
 
-class FeatureProcessBlock(nn.Module):
+class ReassembleBlock(nn.Module):
     """
     Processes the features such that they have progressively increasing embedding size and progressively decreasing
     spatial dimension
@@ -107,7 +109,11 @@ def __init__(
         )
 
         self.project_to_feature_dim = nn.Conv2d(
-            in_channels=out_channel, out_channels=feature_dim, kernel_size=3, padding=1
+            in_channels=out_channel,
+            out_channels=feature_dim,
+            kernel_size=3,
+            padding=1,
+            bias=False,
         )
 
     def forward(self, x: torch.Tensor):
@@ -121,29 +127,34 @@ def forward(self, x: torch.Tensor):
 class ResidualConvBlock(nn.Module):
     def __init__(self, feature_dim: int):
         super().__init__()
-        self.conv_block = nn.Sequential(
-            nn.ReLU(),
-            nn.Conv2d(
-                in_channels=feature_dim,
-                out_channels=feature_dim,
-                kernel_size=3,
-                padding=1,
-                bias=False,
-            ),
-            nn.BatchNorm2d(num_features=feature_dim),
-            nn.ReLU(),
-            nn.Conv2d(
-                in_channels=feature_dim,
-                out_channels=feature_dim,
-                kernel_size=3,
-                padding=1,
-                bias=False,
-            ),
-            nn.BatchNorm2d(num_features=feature_dim)
+
+        self.conv_1 = nn.Conv2d(
+            in_channels=feature_dim,
+            out_channels=feature_dim,
+            kernel_size=3,
+            padding=1,
+            bias=False,
         )
+        self.batch_norm_1 = nn.BatchNorm2d(num_features=feature_dim)
+        self.conv_2 = nn.Conv2d(
+            in_channels=feature_dim,
+            out_channels=feature_dim,
+            kernel_size=3,
+            padding=1,
+            bias=False,
+        )
+        self.batch_norm_2 = nn.BatchNorm2d(num_features=feature_dim)
+        self.activation = nn.ReLU()
 
     def forward(self, x: torch.Tensor):
-        return x + self.conv_block(x)
+        activated_x_1 = self.activation(x)
+        conv_1_out = self.conv_1(activated_x_1)
+        batch_norm_1_out = self.batch_norm_1(conv_1_out)
+        activated_x_2 = self.activation(batch_norm_1_out)
+        conv_2_out = self.conv_2(activated_x_2)
+        batch_norm_2_out = self.batch_norm_2(conv_2_out)
+
+        return x + batch_norm_2_out
 
 
 class FusionBlock(nn.Module):
@@ -172,7 +183,6 @@ def forward(self, feature: torch.Tensor, preceding_layer_feature: torch.Tensor):
             feature, scale_factor=2, align_corners=True, mode="bilinear"
         )
         feature = self.project(feature)
-        feature = self.activation(feature)
 
         return feature
 
@@ -230,9 +240,9 @@ def __init__(
             :encoder_depth
         ]
 
-        self.feature_processing_blocks = nn.ModuleList(
+        self.reassemble_blocks = nn.ModuleList(
             [
-                FeatureProcessBlock(
+                ReassembleBlock(
                     transformer_embed_dim, feature_dim, out_channel, upsample_factor
                 )
                 for upsample_factor, out_channel in zip(
@@ -253,7 +263,7 @@ def forward(
         # Process the encoder features to scale of [1/32,1/16,1/8,1/4]
         for index, (feature, cls_token) in enumerate(zip(features, cls_tokens)):
            readout_feature = self.readout_blocks[index](feature, cls_token)
-            processed_feature = self.feature_processing_blocks[index](readout_feature)
+            processed_feature = self.reassemble_blocks[index](readout_feature)
             processed_features.append(processed_feature)
 
         preceding_layer_feature = None
@@ -265,3 +275,38 @@
             preceding_layer_feature = out
 
         return out
+
+
+class DPTSegmentationHead(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        activation: Optional[str] = None,
+        kernel_size: int = 3,
+        upsampling: float = 2.0,
+    ):
+        super().__init__()
+
+        self.head = nn.Sequential(
+            nn.Conv2d(
+                in_channels, in_channels, kernel_size=kernel_size, padding=1, bias=False
+            ),
+            nn.BatchNorm2d(in_channels),
+            nn.ReLU(True),
+            nn.Dropout(0.1, False),
+            nn.Conv2d(in_channels, out_channels, kernel_size=1),
+        )
+        self.activation = Activation(activation)
+        self.upsampling_factor = upsampling
+
+    def forward(self, x):
+        head_output = self.head(x)
+        resized_output = nn.functional.interpolate(
+            head_output,
+            scale_factor=self.upsampling_factor,
+            mode="bilinear",
+            align_corners=True,
+        )
+        activation_output = self.activation(resized_output)
+        return activation_output
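The ResidualConvBlock rewrite unrolls the former nn.Sequential into named submodules (conv_1, batch_norm_1, conv_2, batch_norm_2) arranged as a pre-activation residual unit, which gives every parameter a stable state-dict key that the conversion script in this commit can address directly. A minimal shape sanity check, assuming both classes are imported from the decoder module exactly as defined in this diff:

import torch

from segmentation_models_pytorch.decoders.dpt.decoder import (
    DPTSegmentationHead,
    ResidualConvBlock,
)

x = torch.randn(1, 256, 32, 32)  # (batch, feature_dim, height, width)

# 3x3 convs with padding=1 preserve the spatial size, so the skip connection lines up.
block = ResidualConvBlock(feature_dim=256)
assert block(x).shape == x.shape

# 1x1 classifier conv maps to out_channels, then 2x bilinear upsampling doubles H and W.
head = DPTSegmentationHead(in_channels=256, out_channels=150, upsampling=2.0)
assert head(x).shape == (1, 150, 64, 64)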

segmentation_models_pytorch/decoders/dpt/model.py

Lines changed: 5 additions & 3 deletions
@@ -9,7 +9,7 @@
 from segmentation_models_pytorch.encoders import get_encoder
 from segmentation_models_pytorch.base.utils import is_torch_compiling
 from segmentation_models_pytorch.base.hub_mixin import supports_config_loading
-from .decoder import DPTDecoder
+from .decoder import DPTDecoder, DPTSegmentationHead
 
 
 class DPT(SegmentationModel):
@@ -75,6 +75,7 @@ def __init__(
         classes: int = 1,
         activation: Optional[Union[str, Callable]] = None,
         aux_params: Optional[dict] = None,
+        output_stride: Optional[int] = None,
         **kwargs: dict[str, Any],
     ):
         super().__init__()
@@ -86,6 +87,7 @@ def __init__(
             weights=encoder_weights,
             use_vit_encoder=True,
             allow_downsampling=False,
+            output_stride=output_stride,
             allow_output_stride_not_power_of_two=False,
             **kwargs,
         )
@@ -103,11 +105,11 @@ def __init__(
             cls_token_supported=self.cls_token_supported,
         )
 
-        self.segmentation_head = SegmentationHead(
+        self.segmentation_head = DPTSegmentationHead(
             in_channels=feature_dim,
             out_channels=classes,
             activation=activation,
-            kernel_size=1,
+            kernel_size=3,
             upsampling=2,
         )

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
+import segmentation_models_pytorch as smp
+import torch
+import huggingface_hub
+
+MODEL_WEIGHTS_PATH = r"C:\Users\vedan\Downloads\dpt_large-ade20k-b12dca68.pt"
+HF_HUB_PATH = "vedantdalimkar/DPT"
+
+if __name__ == "__main__":
+    smp_model = smp.DPT(encoder_name="tu-vit_large_patch16_384", classes=150)
+    dpt_model_dict = torch.load(MODEL_WEIGHTS_PATH)
+
+    for layer_index in range(0, 4):
+        for param in [
+            "running_mean",
+            "running_var",
+            "num_batches_tracked",
+            "weight",
+            "bias",
+        ]:
+            for block_index in [1, 2]:
+                for bn_index in [1, 2]:
+                    # Assign the weights of the 4th fusion layer of the original model
+                    # to the 1st layer of the SMP DPT model, the 3rd fusion layer to
+                    # the 2nd layer, and so on,
+
+                    # because the order in which the fusion layers are called is
+                    # reversed in the original DPT implementation.
+
+                    dpt_model_dict[
+                        f"decoder.fusion_blocks.{layer_index}.residual_conv_block{block_index}.batch_norm_{bn_index}.{param}"
+                    ] = dpt_model_dict.pop(
+                        f"scratch.refinenet{4 - layer_index}.resConfUnit{block_index}.bn{bn_index}.{param}"
+                    )
+
+            if param in ["weight", "bias"]:
+                if param == "weight":
+                    for block_index in [1, 2]:
+                        for conv_index in [1, 2]:
+                            dpt_model_dict[
+                                f"decoder.fusion_blocks.{layer_index}.residual_conv_block{block_index}.conv_{conv_index}.{param}"
+                            ] = dpt_model_dict.pop(
+                                f"scratch.refinenet{4 - layer_index}.resConfUnit{block_index}.conv{conv_index}.{param}"
+                            )
+
+                    dpt_model_dict[
+                        f"decoder.reassemble_blocks.{layer_index}.project_to_feature_dim.{param}"
+                    ] = dpt_model_dict.pop(f"scratch.layer{layer_index + 1}_rn.{param}")
+
+                dpt_model_dict[
+                    f"decoder.fusion_blocks.{layer_index}.project.{param}"
+                ] = dpt_model_dict.pop(
+                    f"scratch.refinenet{4 - layer_index}.out_conv.{param}"
+                )
+
+                dpt_model_dict[
+                    f"decoder.readout_blocks.{layer_index}.project.0.{param}"
+                ] = dpt_model_dict.pop(
+                    f"pretrained.act_postprocess{layer_index + 1}.0.project.0.{param}"
+                )
+
+                dpt_model_dict[
+                    f"decoder.reassemble_blocks.{layer_index}.project_to_out_channel.{param}"
+                ] = dpt_model_dict.pop(
+                    f"pretrained.act_postprocess{layer_index + 1}.3.{param}"
+                )
+
+                if layer_index != 2:
+                    dpt_model_dict[
+                        f"decoder.reassemble_blocks.{layer_index}.upsample.{param}"
+                    ] = dpt_model_dict.pop(
+                        f"pretrained.act_postprocess{layer_index + 1}.4.{param}"
+                    )
+
+    # Changing state dict keys for the segmentation head
+    dpt_model_dict = {
+        (
+            "segmentation_head.head" + name[len("scratch.output_conv") :]
+            if name.startswith("scratch.output_conv")
+            else name
+        ): parameter
+        for name, parameter in dpt_model_dict.items()
+    }
+
+    # Changing state dict keys for the encoder layers
+    dpt_model_dict = {
+        (
+            "encoder.model" + name[len("pretrained.model") :]
+            if name.startswith("pretrained.model")
+            else name
+        ): parameter
+        for name, parameter in dpt_model_dict.items()
+    }
+
+    # Removing key-value pairs associated with the auxiliary head
+    dpt_model_dict = {
+        name: parameter
+        for name, parameter in dpt_model_dict.items()
+        if not name.startswith("auxlayer")
+    }
+
+    smp_model.load_state_dict(dpt_model_dict, strict=True)
+
+    model_name = MODEL_WEIGHTS_PATH.split("\\")[-1].replace(".pt", "")
+
+    smp_model.save_pretrained(model_name)
+
+    repo_id = HF_HUB_PATH
+    api = huggingface_hub.HfApi()
+    api.create_repo(repo_id=repo_id, repo_type="model")
+    api.upload_folder(folder_path=model_name, repo_id=repo_id)
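Once the folder is uploaded, the converted checkpoint should be loadable back through the hub mixin referenced in model.py; a sketch assuming the standard from_pretrained entry point provided by that mixin and the repo id used above:

import segmentation_models_pytorch as smp

# Pull the converted DPT weights back from the Hugging Face Hub.
model = smp.DPT.from_pretrained("vedantdalimkar/DPT")
model.eval()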

segmentation_models_pytorch/encoders/timm_vit.py

Lines changed: 5 additions & 40 deletions
@@ -4,6 +4,8 @@
 import torch
 import torch.nn as nn
 
+from .timm_universal import _merge_kwargs_no_duplicates
+
 
 class TimmViTEncoder(nn.Module):
     """
@@ -26,6 +28,7 @@ def __init__(
         in_channels: int = 3,
         depth: int = 4,
         output_indices: Optional[Union[list[int], int]] = None,
+        output_stride: Optional[int] = None,
         **kwargs: dict[str, Any],
     ):
         """
@@ -49,7 +52,6 @@ def __init__(
         super().__init__()
         self.name = name
 
-        output_stride = kwargs.pop("output_stride", None)
         if output_stride is not None:
             raise ValueError("Dilated mode not supported, set output stride to None")
 
@@ -160,6 +162,8 @@ def forward(self, x: torch.Tensor) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
 
         cls_tokens = [None] * len(self.out_indices)
 
+        # If there are multiple prefix tokens, discard the register tokens if they are present and
+        # return the CLS token if it exists. Only patch features are retrieved if the CLS token is not supported.
         if self.num_prefix_tokens > 0:
             features, prefix_tokens = zip(*intermediate_outputs)
             if self.cls_token_supported:
@@ -205,42 +209,3 @@ def output_stride(self) -> int:
             int: The effective output stride.
         """
         return self._output_stride
-
-    def load_state_dict(self, state_dict, **kwargs):
-        # for compatibility of weights for
-        # timm-ported encoders with TimmUniversalEncoder
-        patterns = ["regnet", "res2", "resnest", "mobilenetv3", "gernet"]
-
-        is_deprecated_encoder = any(
-            self.name.startswith(pattern) for pattern in patterns
-        )
-
-        if is_deprecated_encoder:
-            keys = list(state_dict.keys())
-            for key in keys:
-                new_key = key
-                if not key.startswith("model."):
-                    new_key = "model." + key
-                if "gernet" in self.name:
-                    new_key = new_key.replace(".stages.", ".stages_")
-                state_dict[new_key] = state_dict.pop(key)
-
-        return super().load_state_dict(state_dict, **kwargs)
-
-
-def _merge_kwargs_no_duplicates(a: dict[str, Any], b: dict[str, Any]) -> dict[str, Any]:
-    """
-    Merge two dictionaries, ensuring no duplicate keys exist.
-
-    Args:
-        a (dict): Base dictionary.
-        b (dict): Additional parameters to merge.
-
-    Returns:
-        dict: A merged dictionary.
-    """
-    duplicates = a.keys() & b.keys()
-    if duplicates:
-        raise ValueError(f"'{duplicates}' already specified internally")
-
-    return a | b
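The removed _merge_kwargs_no_duplicates now lives in timm_universal (see the new import at the top of this file); its behavior, shown verbatim in the deleted lines above, is unchanged. A small sketch of the guard it provides, assuming the import path used in this diff:

from segmentation_models_pytorch.encoders.timm_universal import (
    _merge_kwargs_no_duplicates,
)

# Disjoint keys merge cleanly.
merged = _merge_kwargs_no_duplicates({"img_size": 384}, {"in_chans": 3})
assert merged == {"img_size": 384, "in_chans": 3}

# A duplicate key raises, so callers cannot silently override internal kwargs.
try:
    _merge_kwargs_no_duplicates({"img_size": 384}, {"img_size": 512})
except ValueError as exc:
    print(exc)  # "{'img_size'}" already specified internally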
