vedantdalimkar
diff --git a/‎segmentation_models_pytorch/__init__.py
+1-1 b/‎segmentation_models_pytorch/__init__.py
+1-1
diff --git a/‎segmentation_models_pytorch/decoders/dpt/decoder.py
+4-5 b/‎segmentation_models_pytorch/decoders/dpt/decoder.py
+4-5
diff --git a/‎segmentation_models_pytorch/decoders/dpt/model.py
+34-8 b/‎segmentation_models_pytorch/decoders/dpt/model.py
+34-8
diff --git a/‎segmentation_models_pytorch/encoders/__init__.py
+1-1 b/‎segmentation_models_pytorch/encoders/__init__.py
+1-1
diff --git a/‎segmentation_models_pytorch/encoders/timm_vit.py
+47-44 b/‎segmentation_models_pytorch/encoders/timm_vit.py
+47-44
@@ -35,7 +35,7 @@
     PAN,
     UPerNet,
     Segformer,
-    DPT
+    DPT,
 ]
 MODEL_ARCHITECTURES_MAPPING = {a.__name__.lower(): a for a in _MODEL_ARCHITECTURES}
 
 
@@ -196,16 +196,16 @@ def __init__(
         encoder_output_stride: int,
         feature_dim: int = 256,
         encoder_depth: int = 4,
-        prefix_token_supported: bool = False,
+        cls_token_supported: bool = False,
     ):
         super().__init__()
 
-        self.prefix_token_supported = prefix_token_supported
+        self.cls_token_supported = cls_token_supported
 
         # If encoder has cls token, then concatenate it with the features along the embedding dimension and project it
         # back to the feature_dim dimension. Else, ignore the non-existent cls token
 
-        if prefix_token_supported:
+        if cls_token_supported:
             self.readout_blocks = nn.ModuleList(
                 [
                     ProjectionReadout(
@@ -246,9 +246,8 @@ def __init__(
         )
 
     def forward(
-        self, encoder_output: list[list[torch.Tensor], list[torch.Tensor]]
+        self, features: list[torch.Tensor], cls_tokens: list[torch.Tensor]
     ) -> torch.Tensor:
-        features, cls_tokens = encoder_output
         processed_features = []
 
         # Process the encoder features to scale of [1/32,1/16,1/8,1/4]
 
@@ -1,11 +1,13 @@
 from typing import Any, Optional, Union, Callable
+import torch
 
 from segmentation_models_pytorch.base import (
     ClassificationHead,
     SegmentationHead,
     SegmentationModel,
 )
 from segmentation_models_pytorch.encoders import get_encoder
+from segmentation_models_pytorch.base.utils import is_torch_compiling
 from segmentation_models_pytorch.base.hub_mixin import supports_config_loading
 from .decoder import DPTDecoder
 
@@ -46,8 +48,8 @@ class DPT(SegmentationModel):
                     (could be **None** to return logits)
         kwargs: Arguments passed to the encoder class ``__init__()`` function. Applies only to ``timm`` models. Keys with
                 ``None`` values are pruned before passing.
-                allow_downsampling : Allow ViT encoder to have progressive downsampling. Set to False for DPT as the architecture
-                    requires all encoder feature outputs to have the same spatial shape.
+                allow_downsampling : Allow ViT encoder to have progressive spatial downsampling for its representations.
+                    Set to False for DPT as the architecture requires all encoder feature outputs to have the same spatial shape.
                 allow_output_stride_not_power_of_two : Allow ViT encoders with output_stride not being a power of 2. This
                     is set False for DPT as the architecture requires the encoder output features to have an output stride of
                     [1/32,1/16,1/8,1/4]
@@ -58,6 +60,10 @@ class DPT(SegmentationModel):
 
     """
 
+    _is_torch_scriptable = False
+    _is_torch_compilable = False
+    requires_divisible_input_shape = True
+
     @supports_config_loading
     def __init__(
         self,
@@ -84,17 +90,17 @@ def __init__(
             **kwargs,
         )
 
-        transformer_embed_dim = self.encoder.embed_dim
-        encoder_output_stride = self.encoder.output_stride
-        cls_token_supported = self.encoder.prefix_token_supported
+        self.transformer_embed_dim = self.encoder.embed_dim
+        self.encoder_output_stride = self.encoder.output_stride
+        self.cls_token_supported = self.encoder.cls_token_supported
 
         self.decoder = DPTDecoder(
             encoder_name=encoder_name,
-            transformer_embed_dim=transformer_embed_dim,
+            transformer_embed_dim=self.transformer_embed_dim,
             feature_dim=feature_dim,
             encoder_depth=encoder_depth,
-            encoder_output_stride=encoder_output_stride,
-            prefix_token_supported=cls_token_supported,
+            encoder_output_stride=self.encoder_output_stride,
+            cls_token_supported=self.cls_token_supported,
         )
 
         self.segmentation_head = SegmentationHead(
@@ -114,3 +120,23 @@ def __init__(
 
         self.name = "dpt-{}".format(encoder_name)
         self.initialize()
+
+    def forward(self, x):
+        """Sequentially pass `x` through the model's encoder, decoder and heads."""
+
+        if not (
+            torch.jit.is_scripting() or torch.jit.is_tracing() or is_torch_compiling()
+        ):
+            self.check_input_shape(x)  # shape check is skipped while scripting/tracing/compiling
+
+        features, cls_tokens = self.encoder(x)  # encoder output is unpacked as a (features, cls_tokens) pair
+
+        decoder_output = self.decoder(features, cls_tokens)
+
+        masks = self.segmentation_head(decoder_output)
+
+        if self.classification_head is not None:
+            labels = self.classification_head(features[-1])  # labels come from the last encoder feature
+            return masks, labels
+
+        return masks
@@ -92,7 +92,7 @@ def get_encoder(name, in_channels=3, depth=5, weights=None, output_stride=32, **
                 in_channels=in_channels,
                 depth=depth,
                 pretrained=weights is not None,
-                output_stride = output_stride,
+                output_stride=output_stride,
                 **kwargs,
             )
             return encoder
 
@@ -1,4 +1,4 @@
-from typing import Any, Optional
+from typing import Any, Optional, Union
 
 import timm
 import torch
@@ -15,17 +15,17 @@ class TimmViTEncoder(nn.Module):
         - Ensures consistent multi-level feature extraction across all ViT models.
     """
 
-    _is_torch_scriptable = True
+    _is_torch_scriptable = False
     _is_torch_exportable = True
-    _is_torch_compilable = True
+    _is_torch_compilable = False
 
     def __init__(
         self,
         name: str,
         pretrained: bool = True,
         in_channels: int = 3,
         depth: int = 4,
-        output_indices: Optional[list[int] | int] = None,
+        output_indices: Optional[Union[list[int], int]] = None,
         **kwargs: dict[str, Any],
     ):
         """
@@ -49,16 +49,14 @@ def __init__(
         super().__init__()
         self.name = name
 
-        output_stride = kwargs.pop("output_stride",None)
+        output_stride = kwargs.pop("output_stride", None)
         if output_stride is not None:
-            raise ValueError(
-                "Dilated mode not supported, set output stride to None"
-            )
+            raise ValueError("Dilated mode not supported, set output stride to None")
 
         # Default model configuration for feature extraction
         common_kwargs = dict(
             in_chans=in_channels,
-            features_only=True,
+            features_only=False,
             pretrained=pretrained,
             out_indices=tuple(range(depth)),
         )
@@ -76,6 +74,23 @@ def __init__(
         feature_info = tmp_model.feature_info
         model_num_blocks = len(feature_info)
 
+        if output_indices is not None:  # validate user-supplied feature indices before they reach timm
+            if isinstance(output_indices, int):  # a single index is normalised to a one-element list
+                output_indices = [output_indices]
+
+            for output_index in output_indices:
+                if output_index < 0 or output_index >= model_num_blocks:  # valid block indices are 0..model_num_blocks-1
+                    raise ValueError(
+                        f"Output indices for feature extraction should be greater than or equal to 0 and less "
+                        f"than the number of blocks in the model ({model_num_blocks}), got {output_index}"
+                    )
+
+            if len(output_indices) != depth:  # one extracted feature map is expected per encoder stage
+                raise ValueError(
+                    f"Length of output indices for feature extraction should be equal to the depth of the encoder "
+                    f"architecture, got output indices length - {len(output_indices)}, encoder depth - {depth}"
+                )
+
         if depth > model_num_blocks:
             raise ValueError(
                 f"Depth of the encoder cannot exceed the number of blocks in the model \
@@ -87,9 +102,6 @@ def __init__(
                 int((model_num_blocks / 4) * index) - 1 for index in range(1, depth + 1)
             ]
 
-        if isinstance(output_indices,int):
-            output_indices = list(output_indices)
-
         common_kwargs["out_indices"] = self.out_indices = output_indices
         feature_info_obj = timm.models.FeatureInfo(
             feature_info=feature_info, out_indices=output_indices
@@ -109,18 +121,16 @@ def __init__(
         self._output_stride = reduction_scales[0]
 
         if (
-            int(self._output_stride).bit_count() != 1
+            bin(self._output_stride).count("1") != 1
             and not allow_output_stride_not_power_of_two
         ):
             raise ValueError(
                 f"Models with stride which is not a power of 2 are not supported, \
                               got output stride {self._output_stride}"
             )
 
-        self.prefix_token_supported = getattr(tmp_model, "has_class_token", False)
+        self.cls_token_supported = getattr(tmp_model, "has_class_token", False)
         self.num_prefix_tokens = getattr(tmp_model, "num_prefix_tokens", 0)
-        if self.prefix_token_supported:
-            common_kwargs["features_only"] = False
 
         self.model = timm.create_model(
             name, **_merge_kwargs_no_duplicates(common_kwargs, kwargs)
@@ -131,47 +141,40 @@ def __init__(
         self._depth = depth
         self._embed_dim = tmp_model.embed_dim
 
-    def forward(self, x: torch.Tensor) -> list[list[torch.Tensor], list[torch.Tensor]]:
+    def forward(self, x: torch.Tensor) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
         """
         Forward pass to extract multi-stage features.
 
         Args:
             x (torch.Tensor): Input tensor of shape (B, C, H, W).
 
         Returns:
-            list[torch.Tensor]: List of feature maps at different scales.
+            tuple[list[torch.Tensor], list[torch.Tensor]]: Tuple of feature maps and cls tokens (if supported) at different scales.
         """
-        if self.prefix_token_supported:
-            intermediate_outputs = self.model.forward_intermediates(
-                x,
-                indices=self.out_indices,
-                return_prefix_tokens=True,
-                intermediates_only=True,
-            )
-            features, cls_tokens = zip(*intermediate_outputs)
-
-            # Convert NHWC to NCHW if needed
-            if self._is_channel_last:
-                features = [
-                    feature.permute(0, 3, 1, 2).contiguous() for feature in features
-                ]
-
-            if self.num_prefix_tokens > 1:
-                cls_tokens = [cls_token[:, 0, :] for cls_token in cls_tokens]
+        intermediate_outputs = self.model.forward_intermediates(
+            x,
+            indices=self.out_indices,
+            return_prefix_tokens=True,
+            intermediates_only=True,
+        )
 
-            return [features, cls_tokens]
+        cls_tokens = [None] * len(self.out_indices)
 
-        features = self.model(x)
+        if self.num_prefix_tokens > 0:
+            features, prefix_tokens = zip(*intermediate_outputs)
+            if self.cls_token_supported:
+                if self.num_prefix_tokens == 1:
+                    cls_tokens = prefix_tokens
 
-        # Convert NHWC to NCHW if needed
-        if self._is_channel_last:
-            features = [
-                feature.permute(0, 3, 1, 2).contiguous() for feature in features
-            ]
+                elif self.num_prefix_tokens > 1:
+                    cls_tokens = [
+                        prefix_token[:, 0, :] for prefix_token in prefix_tokens
+                    ]
 
-        cls_tokens = [None] * len(features)
+        else:
+            features = intermediate_outputs
 
-        return [features, cls_tokens]
+        return features, cls_tokens
 
     @property
     def embed_dim(self) -> int:
Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@`
`35`	`35`	`PAN,`
`36`	`36`	`UPerNet,`
`37`	`37`	`Segformer,`
`38`		`- DPT`
	`38`	`+ DPT,`
`39`	`39`	`]`
`40`	`40`	`MODEL_ARCHITECTURES_MAPPING = {a.__name__.lower(): a for a in _MODEL_ARCHITECTURES}`
`41`	`41`
Original file line number	Diff line number	Diff line change
`@@ -92,7 +92,7 @@ def get_encoder(name, in_channels=3, depth=5, weights=None, output_stride=32, **`
`92`	`92`	`in_channels=in_channels,`
`93`	`93`	`depth=depth,`
`94`	`94`	`pretrained=weights is not None,`
`95`		`- output_stride = output_stride,`
	`95`	`+ output_stride=output_stride,`
`96`	`96`	`**kwargs,`
`97`	`97`	`)`
`98`	`98`	`return encoder`