1
1
import torch
2
2
import torch .nn as nn
3
3
from segmentation_models_pytorch .base .modules import Activation
4
- from typing import Optional , Sequence , Union , Callable
4
+ from typing import Optional , Sequence , Union , Callable , Literal
5
5
6
6
7
class ReadoutConcatBlock(nn.Module):
    """
    Concatenates the prefix (cls) tokens with the spatial features to make use
    of the global information aggregated in the prefix tokens, then projects
    the combined feature map back to the original embedding dimension with a
    Linear + GELU MLP.

    According to:
    https://github.com/isl-org/DPT/blob/cd3fe90bb4c48577535cc4d51b602acca688a2ee/dpt/vit.py#L79-L90
    """

    def __init__(self, embed_dim: int, has_prefix_tokens: bool):
        super().__init__()
        # Concatenation doubles the channel count, but only when the encoder
        # actually supplies prefix tokens at forward time.
        in_features = embed_dim * 2 if has_prefix_tokens else embed_dim
        out_features = embed_dim
        self.project = nn.Sequential(
            nn.Linear(in_features, out_features),
            nn.GELU(),
        )

    def forward(
        self, features: torch.Tensor, prefix_tokens: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        batch_size, embed_dim, height, width = features.shape

        # Rearrange to (batch_size, height * width, embed_dim)
        features = features.view(batch_size, embed_dim, -1)
        features = features.transpose(1, 2).contiguous()

        if prefix_tokens is not None:
            # (batch_size, num_tokens, embed_dim) -> (batch_size, embed_dim):
            # only the first (cls) token is used, as in the DPT reference.
            prefix_tokens = prefix_tokens[:, 0]
            # unsqueeze(1) is required so expand_as broadcasts
            # (batch, embed) -> (batch, h*w, embed) for any batch size;
            # without it, expand aligns `batch` against `h*w` and fails
            # whenever batch_size > 1.
            prefix_tokens = prefix_tokens.unsqueeze(1).expand_as(features)
            features = torch.cat([features, prefix_tokens], dim=2)

        # Project to embedding dimension
        features = self.project(features)

        # Rearrange back to (batch_size, embed_dim, height, width)
        features = features.transpose(1, 2).contiguous()
        features = features.view(batch_size, embed_dim, height, width)

        return features
45
48
49
class ReadoutAddBlock(nn.Module):
    """
    Fuses the global information aggregated in the prefix (cls) tokens into the
    spatial features by broadcasting the token mean over the feature map and
    adding it.

    According to:
    https://github.com/isl-org/DPT/blob/cd3fe90bb4c48577535cc4d51b602acca688a2ee/dpt/vit.py#L71-L76
    """

    def forward(
        self, features: torch.Tensor, prefix_tokens: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        # No prefix tokens -> nothing to add; pass features through unchanged.
        if prefix_tokens is None:
            return features
        batch_size, embed_dim, _, _ = features.shape
        # Average over the token axis, then broadcast-add across every
        # spatial position of the (batch, embed, h, w) map.
        pooled = prefix_tokens.mean(dim=1).view(batch_size, embed_dim, 1, 1)
        return features + pooled
66
+
67
+
68
class ReadoutIgnoreBlock(nn.Module):
    """
    Discards any prefix (cls) tokens and returns the spatial features untouched.
    """

    def forward(self, features: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        # Extra positional/keyword arguments (e.g. prefix_tokens) are accepted
        # only for call-signature compatibility with the other readout blocks.
        return features
75
+
76
+
46
77
class ReassembleBlock (nn .Module ):
47
78
"""
48
79
Processes the features such that they have progressively increasing embedding size and progressively decreasing
@@ -182,20 +213,30 @@ def __init__(
182
213
self ,
183
214
encoder_out_channels : Sequence [int ] = (756 , 756 , 756 , 756 ),
184
215
encoder_output_strides : Sequence [int ] = (16 , 16 , 16 , 16 ),
216
+ encoder_has_prefix_tokens : bool = True ,
217
+ readout : Literal ["cat" , "add" , "ignore" ] = "cat" ,
185
218
intermediate_channels : Sequence [int ] = (256 , 512 , 1024 , 1024 ),
186
219
fusion_channels : int = 256 ,
187
- has_cls_token : bool = False ,
188
220
):
189
221
super ().__init__ ()
190
222
191
223
num_blocks = len (encoder_output_strides )
192
224
193
- # If encoder has cls token, then concatenate it with the features along the embedding dimension and project it
194
- # back to the feature_dim dimension. Else, ignore the non-existent cls token
195
- blocks = [
196
- ProjectionBlock (in_channels , has_cls_token )
197
- for in_channels in encoder_out_channels
198
- ]
225
+ # If encoder has prefix tokens (e.g. cls_token), then we can concat/add/ignore them
226
+ # according to the readout mode
227
+ if readout == "cat" :
228
+ blocks = [
229
+ ReadoutConcatBlock (in_channels , encoder_has_prefix_tokens )
230
+ for in_channels in encoder_out_channels
231
+ ]
232
+ elif readout == "add" :
233
+ blocks = [ReadoutAddBlock () for _ in encoder_out_channels ]
234
+ elif readout == "ignore" :
235
+ blocks = [ReadoutIgnoreBlock () for _ in encoder_out_channels ]
236
+ else :
237
+ raise ValueError (
238
+ f"Invalid readout mode: { readout } , should be one of: 'cat', 'add', 'ignore'"
239
+ )
199
240
self .projection_blocks = nn .ModuleList (blocks )
200
241
201
242
# Upsample factors to resize features to [1/4, 1/8, 1/16, 1/32, ...] scales
0 commit comments