
Commit ace4e96

Add per_gpu_scaling = False for distributed training (#1029)
* leave batch size the same for default distributed training
* Update CHANGELOG.md
* update all mentions about per_gpu_scaling
1 parent efbd292 commit ace4e96

File tree: 4 files changed (+14, −4 lines)


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - prevented modifying config during the experiment and runner initialization ([#1004](https://github.com/catalyst-team/catalyst/pull/1004))
 - a few test for RecSys MAP computation ([#1018](https://github.com/catalyst-team/catalyst/pull/1014))
+- leave batch size the same for default distributed training ([#1023](https://github.com/catalyst-team/catalyst/issues/1023))

catalyst/utils/loaders.py

Lines changed: 11 additions & 2 deletions
@@ -8,7 +8,7 @@
 from torch.utils.data.dataloader import default_collate as default_collate_fn

 from catalyst.registry import SAMPLER
-from catalyst.utils.distributed import get_rank
+from catalyst.utils.distributed import get_distributed_params, get_rank
 from catalyst.utils.misc import merge_dicts, set_global_seed


@@ -205,7 +205,7 @@ def get_loaders_from_params(
         drop_last: ``drop_last`` parameter
             from ``torch.utils.data.DataLoader``
         per_gpu_scaling: boolean flag,
-            if ``True``, uses ``batch_size=batch_size*num_available_gpus``
+            if ``True``, scales batch_size in proportion to the number of GPUs
         loaders_params (Dict[str, Any]): additional loaders parameters
         samplers_params (Dict[str, Any]): additional sampler parameters
         initial_seed: initial seed for ``torch.utils.data.DataLoader``
@@ -275,6 +275,15 @@ def get_loaders_from_params(
         num_gpus = max(1, torch.cuda.device_count())
         batch_size *= num_gpus
         num_workers *= num_gpus
+    elif not per_gpu_scaling and distributed:
+        world_size = get_distributed_params().pop("world_size", 1)
+        if batch_size % world_size == 0:
+            batch_size = int(batch_size / world_size)
+        else:
+            raise ValueError(
+                "For this distributed mode with per_gpu_scaling = False "
+                "you need to have batch_size divisible by number of GPUs"
+            )

     loader_params = {
         "batch_size": batch_size,

examples/configs/config-description-eng.yml

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ stages: # REQUIRED KEYWORD, dictionary of all stages of Catalyst, for training
   batch_size: 1 # KEYWORD, batch size for all the stages
   num_workers: 1 # KEYWORD, Number of parallel processes for DataLoader
   drop_last: False # KEYWORD, parameter for DataLoader (Default is False)
-  per_gpu_scaling: False # KEYWORD, if True and the working mode are not distributed, it increases the batch size and the number of workers in proportion to the number of GPUs
+  per_gpu_scaling: False # KEYWORD, if True it increases the batch size and the number of workers in proportion to the number of GPUs (for distributed increases only batch_size)
   loaders_params: # KEYWORD, parameters for loaders, optional
     # Example
     train:
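
Note: for the non-distributed case described in this comment, the scaling is the multiplication shown in the loaders.py hunk above. A short sketch of what a config resolves to when per_gpu_scaling is True (the helper and the example values are illustrative, not part of Catalyst's public API):

import torch

def scaled_loader_params(batch_size: int, num_workers: int) -> tuple:
    # per_gpu_scaling: True in a non-distributed run multiplies both values
    # by the number of visible GPUs (DataParallel-style batch splitting).
    num_gpus = max(1, torch.cuda.device_count())
    return batch_size * num_gpus, num_workers * num_gpus

# Example: with batch_size: 8, num_workers: 2 and 4 GPUs visible, the
# loaders would be built with batch_size=32 and num_workers=8.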

examples/configs/config-description-rus.yml

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ stages: # REQUIRED KEYWORD, словарь всех стадий Catalyst, дл
   batch_size: 1 # KEYWORD, размер батча для всех стейджей
   num_workers: 1 # KEYWORD, количество параллельных процессов для DataLoader
   drop_last: False # KEYWORD, параметр для DataLoader (по умолчанию False)
-  per_gpu_scaling: False # KEYWORD, если True и режим работы не distributed, то увеличивает батчсайз и количество воркеров пропорционально количиству видеокарт
+  per_gpu_scaling: False # KEYWORD, если True, то увеличивает батчсайз и количество воркеров пропорционально количиству видеокарт (для distributed увеличивает только батчсайз)
   loaders_params: # KEYWORD, параметры для лоадеров, опционально
     # Например
     train:
