from typing import Any, Callable, Literal, Optional, Union
import warnings

from segmentation_models_pytorch.base import (
    ClassificationHead,
    SegmentationHead,
    SegmentationModel,
)
from segmentation_models_pytorch.encoders import get_encoder
from segmentation_models_pytorch.base.hub_mixin import supports_config_loading

from .decoder import PANDecoder


class PAN(SegmentationModel):
"""Implementation of PAN_ (Pyramid Attention Network).
Note:
Currently works with shape of input tensor >= [B x C x 128 x 128] for pytorch <= 1.1.0
and with shape of input tensor >= [B x C x 256 x 256] for pytorch == 1.3.1
Args:
encoder_name: Name of the classification model that will be used as an encoder (a.k.a backbone)
to extract features of different spatial resolution
encoder_depth: A number of stages used in encoder in range [3, 5]. Each stage generate features
two times smaller in spatial dimensions than previous one (e.g. for depth 0 we will have features
with shapes [(N, C, H, W),], for depth 1 - [(N, C, H, W), (N, C, H // 2, W // 2)] and so on).
Default is 5
encoder_weights: One of **None** (random initialization), **"imagenet"** (pre-training on ImageNet) and
other pretrained weights (see table with available weights for each encoder_name)
encoder_output_stride: 16 or 32, if 16 use dilation in encoder last layer.
Doesn't work with ***ception***, **vgg***, **densenet*`** backbones.Default is 16.
        decoder_channels: A number of convolution layer filters in decoder blocks.
        decoder_interpolation: Interpolation mode used in the decoder of the model. Available options are
            **"nearest"**, **"bilinear"**, **"bicubic"**, **"area"**, **"nearest-exact"**. Default is **"bilinear"**.
        in_channels: A number of input channels for the model, default is 3 (RGB images).
        classes: A number of classes for the output mask (you can also think of it as the number of channels
            of the output mask).
        activation: An activation function to apply after the final convolution layer.
            Available options are **"sigmoid"**, **"softmax"**, **"logsoftmax"**, **"tanh"**, **"identity"**,
            **callable** and **None**. Default is **None**.
        upsampling: Final upsampling factor. Default is 4 to preserve input-output spatial shape identity.
        aux_params: Dictionary with parameters of the auxiliary output (classification head). The auxiliary
            output is built on top of the encoder if **aux_params** is not **None** (default). Supported params:
                - classes (int): A number of classes
                - pooling (str): One of "max", "avg". Default is "avg"
                - dropout (float): Dropout factor in [0, 1)
                - activation (str): An activation function to apply, "sigmoid"/"softmax"
                  (could be **None** to return logits)
        kwargs: Arguments passed to the encoder class ``__init__()`` function. Applies only to ``timm`` models.
            Keys with ``None`` values are pruned before passing.

    Returns:
        ``torch.nn.Module``: **PAN**
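
    Example:
        A minimal usage sketch (assumes the package is importable as ``smp``; pretrained
        weights are disabled here to avoid a download)::

            import torch
            import segmentation_models_pytorch as smp

            model = smp.PAN(encoder_name="resnet34", encoder_weights=None, classes=2)
            x = torch.randn(1, 3, 256, 256)  # input size per the note above
            mask = model(x)  # shape: (1, 2, 256, 256)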

    .. _PAN:
        https://arxiv.org/abs/1805.10180
    """

    @supports_config_loading
    def __init__(
        self,
        encoder_name: str = "resnet34",
        encoder_depth: Literal[3, 4, 5] = 5,
        encoder_weights: Optional[str] = "imagenet",
        encoder_output_stride: Literal[16, 32] = 16,
        decoder_channels: int = 32,
        decoder_interpolation: str = "bilinear",
        in_channels: int = 3,
        classes: int = 1,
        activation: Optional[Union[str, Callable]] = None,
        upsampling: int = 4,
        aux_params: Optional[dict] = None,
        **kwargs: dict[str, Any],
    ):
        super().__init__()

        if encoder_output_stride not in [16, 32]:
            raise ValueError(
                "PAN supports output stride 16 or 32, got {}".format(
                    encoder_output_stride
                )
            )
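
        # Backward compatibility: ``upscale_mode`` was the old name for ``decoder_interpolation``.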
        upscale_mode = kwargs.pop("upscale_mode", None)
        if upscale_mode is not None:
            warnings.warn(
                "The `upscale_mode` argument is deprecated; use `decoder_interpolation` instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            decoder_interpolation = upscale_mode
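
        # Encoder (backbone): produces a pyramid of feature maps at progressively coarser resolutions.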
        self.encoder = get_encoder(
            encoder_name,
            in_channels=in_channels,
            depth=encoder_depth,
            weights=encoder_weights,
            output_stride=encoder_output_stride,
            **kwargs,
        )
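
        # Decoder: fuses the encoder feature pyramid into a single feature map of decoder_channels channels.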
        self.decoder = PANDecoder(
            encoder_channels=self.encoder.out_channels,
            encoder_depth=encoder_depth,
            decoder_channels=decoder_channels,
            interpolation_mode=decoder_interpolation,
        )
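
        # Segmentation head: 3x3 convolution to ``classes`` channels plus final upsampling.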
        self.segmentation_head = SegmentationHead(
            in_channels=decoder_channels,
            out_channels=classes,
            activation=activation,
            kernel_size=3,
            upsampling=upsampling,
        )
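
        # Optional auxiliary classification head built on the deepest encoder feature map.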
        if aux_params is not None:
            self.classification_head = ClassificationHead(
                in_channels=self.encoder.out_channels[-1], **aux_params
            )
        else:
            self.classification_head = None

        self.name = "pan-{}".format(encoder_name)
        self.initialize()