From 4f7e994567fe3a7d3caf373568cddfac750346a1 Mon Sep 17 00:00:00 2001 From: Janusz Lisiecki Date: Fri, 20 Dec 2024 17:54:31 +0100 Subject: [PATCH 1/3] Add a typical data loading pipeline path for the EfficientNet - adds an option to run the EfficientNet network with a typical data loading pipeline without very advanced optimization that most users won't implement Signed-off-by: Janusz Lisiecki --- .../image_classification/dataloaders.py | 207 ++++++++++++------ .../use_cases/pytorch/efficientnet/main.py | 7 + qa/TL3_EfficientNet_benchmark/test_pytorch.sh | 21 +- 3 files changed, 156 insertions(+), 79 deletions(-) diff --git a/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py b/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py index f0e19aa4064..064d441e048 100644 --- a/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py +++ b/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py @@ -51,9 +51,14 @@ import torchvision.datasets as datasets import torchvision.transforms as transforms + def load_jpeg_from_file(path, cuda=True): img_transforms = transforms.Compose( - [transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor()] + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + ] ) img = img_transforms(Image.open(path)) @@ -74,6 +79,7 @@ def load_jpeg_from_file(path, cuda=True): return input + class DALIWrapper(object): def gen_wrapper(dalipipeline, num_classes, one_hot, memory_format): @@ -90,8 +96,11 @@ def gen_wrapper(dalipipeline, num_classes, one_hot, memory_format): def nhwc_to_nchw(t): return t[0], t[3], t[1], t[2] - input = torch.as_strided(data[0]["data"], size=nhwc_to_nchw(shape), - stride=nhwc_to_nchw(stride)) + input = torch.as_strided( + data[0]["data"], + size=nhwc_to_nchw(shape), + stride=nhwc_to_nchw(stride), + ) else: input = data[0]["data"].contiguous(memory_format=memory_format) target 
= torch.reshape(data[0]["label"], [-1]).cuda().long() @@ -108,7 +117,10 @@ def __init__(self, dalipipeline, num_classes, one_hot, memory_format): def __iter__(self): return DALIWrapper.gen_wrapper( - self.dalipipeline, self.num_classes, self.one_hot, self.memory_format + self.dalipipeline, + self.num_classes, + self.one_hot, + self.memory_format, ) @@ -145,16 +157,23 @@ def gdtl( traindir = os.path.join(data_path, "train") pipeline_kwargs = { - "batch_size" : batch_size, - "num_threads" : workers, - "device_id" : rank % torch.cuda.device_count(), + "batch_size": batch_size, + "num_threads": workers, + "device_id": rank % torch.cuda.device_count(), "seed": 12 + rank % torch.cuda.device_count(), } - pipe = training_pipe(data_dir=traindir, interpolation=interpolation, image_size=image_size, - output_layout=output_layout, automatic_augmentation=augmentation, - dali_device=dali_device, rank=rank, world_size=world_size, - **pipeline_kwargs) + pipe = training_pipe( + data_dir=traindir, + interpolation=interpolation, + image_size=image_size, + output_layout=output_layout, + automatic_augmentation=augmentation, + dali_device=dali_device, + rank=rank, + world_size=world_size, + **pipeline_kwargs, + ) pipe.build() train_loader = DALIClassificationIterator( @@ -201,15 +220,20 @@ def gdvl( valdir = os.path.join(data_path, "val") pipeline_kwargs = { - "batch_size" : batch_size, - "num_threads" : workers, - "device_id" : rank % torch.cuda.device_count(), + "batch_size": batch_size, + "num_threads": workers, + "device_id": rank % torch.cuda.device_count(), "seed": 12 + rank % torch.cuda.device_count(), } - pipe = validation_pipe(data_dir=valdir, interpolation=interpolation, - image_size=image_size + crop_padding, image_crop=image_size, - output_layout=output_layout, **pipeline_kwargs) + pipe = validation_pipe( + data_dir=valdir, + interpolation=interpolation, + image_size=image_size + crop_padding, + image_crop=image_size, + output_layout=output_layout, + **pipeline_kwargs, + ) 
pipe.build() val_loader = DALIClassificationIterator( @@ -224,11 +248,16 @@ def gdvl( return gdvl -def fast_collate(memory_format, batch): +def fast_collate(memory_format, typical_loader, batch): imgs = [img[0] for img in batch] targets = torch.tensor([target[1] for target in batch], dtype=torch.int64) - w = imgs[0].size[0] - h = imgs[0].size[1] + if typical_loader: + w = imgs[0].size()[1] + h = imgs[0].size()[2] + else: + w = imgs[0].size[0] + h = imgs[0].size[1] + tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8).contiguous( memory_format=memory_format ) @@ -236,7 +265,8 @@ def fast_collate(memory_format, batch): nump_array = np.asarray(img, dtype=np.uint8) if nump_array.ndim < 3: nump_array = np.expand_dims(nump_array, axis=-1) - nump_array = np.rollaxis(nump_array, 2) + if typical_loader is False: + nump_array = np.rollaxis(nump_array, 2) tensor[i] += torch.from_numpy(nump_array.copy()) @@ -252,57 +282,70 @@ def expand(num_classes, dtype, tensor): class PrefetchedWrapper(object): - def prefetched_loader(loader, num_classes, one_hot): - mean = ( - torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]) - .cuda() - .view(1, 3, 1, 1) - ) - std = ( - torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]) - .cuda() - .view(1, 3, 1, 1) - ) - - stream = torch.cuda.Stream() - first = True - - for next_input, next_target in loader: - with torch.cuda.stream(stream): - next_input = next_input.cuda(non_blocking=True) - next_target = next_target.cuda(non_blocking=True) - next_input = next_input.float() - if one_hot: - next_target = expand(num_classes, torch.float, next_target) - - next_input = next_input.sub_(mean).div_(std) - - if not first: - yield input, target - else: - first = False - - torch.cuda.current_stream().wait_stream(stream) - input = next_input - target = next_target + def prefetched_loader(loader, num_classes, one_hot, typical_loader ): + if typical_loader: + stream = torch.cuda.Stream() + for next_input, next_target in loader: + with 
torch.cuda.stream(stream): + next_input = next_input.to(device="cuda") + next_target = next_target.to(device="cuda") + next_input = next_input.float() + if one_hot: + next_target = expand(num_classes, torch.float, next_target) + yield next_input, next_target + else: + mean = ( + torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]) + .cuda() + .view(1, 3, 1, 1) + ) + std = ( + torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]) + .cuda() + .view(1, 3, 1, 1) + ) + + stream = torch.cuda.Stream() + first = True + + for next_input, next_target in loader: + with torch.cuda.stream(stream): + next_input = next_input.cuda(non_blocking=True) + next_target = next_target.cuda(non_blocking=True) + next_input = next_input.float() + if one_hot: + next_target = expand(num_classes, torch.float, next_target) + + # next_input = next_input.sub_(mean).div_(std) + + if not first: + yield input, target + else: + first = False + + torch.cuda.current_stream().wait_stream(stream) + input = next_input + target = next_target - yield input, target + yield input, target - def __init__(self, dataloader, start_epoch, num_classes, one_hot): + def __init__(self, dataloader, start_epoch, num_classes, one_hot, typical_loader): self.dataloader = dataloader self.epoch = start_epoch self.one_hot = one_hot self.num_classes = num_classes + self.typical_loader = typical_loader def __iter__(self): if self.dataloader.sampler is not None and isinstance( - self.dataloader.sampler, torch.utils.data.distributed.DistributedSampler + self.dataloader.sampler, + torch.utils.data.distributed.DistributedSampler, ): self.dataloader.sampler.set_epoch(self.epoch) self.epoch += 1 return PrefetchedWrapper.prefetched_loader( - self.dataloader, self.num_classes, self.one_hot + self.dataloader, self.num_classes, self.one_hot, self.typical_loader ) def __len__(self): @@ -322,6 +365,7 @@ def get_pytorch_train_loader( _worker_init_fn=None, prefetch_factor=2, memory_format=torch.contiguous_format, + typical_loader=False, ): 
interpolation = {"bicubic": Image.BICUBIC, "bilinear": Image.BILINEAR}[ interpolation @@ -336,9 +380,19 @@ def get_pytorch_train_loader( elif augmentation == "autoaugment": transforms_list.append(AutoaugmentImageNetPolicy()) else: - raise NotImplementedError(f"Automatic augmentation: '{augmentation}' is not supported" - " for PyTorch data loader.") - train_dataset = datasets.ImageFolder(traindir, transforms.Compose(transforms_list)) + raise NotImplementedError( + f"Automatic augmentation: '{augmentation}' is not supported" + " for PyTorch data loader." + ) + + if typical_loader: + transforms_list.append(transforms.ToTensor()) + transforms_list.append( + transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ) + train_dataset = datasets.ImageFolder( + traindir, transforms.Compose(transforms_list) + ) if torch.distributed.is_initialized(): train_sampler = torch.utils.data.distributed.DistributedSampler( @@ -355,14 +409,14 @@ def get_pytorch_train_loader( num_workers=workers, worker_init_fn=_worker_init_fn, pin_memory=True, - collate_fn=partial(fast_collate, memory_format), + collate_fn=partial(fast_collate, memory_format, typical_loader), drop_last=True, persistent_workers=True, prefetch_factor=prefetch_factor, ) return ( - PrefetchedWrapper(train_loader, start_epoch, num_classes, one_hot), + PrefetchedWrapper(train_loader, start_epoch, num_classes, one_hot, typical_loader), len(train_loader), ) @@ -379,21 +433,26 @@ def get_pytorch_val_loader( crop_padding=32, memory_format=torch.contiguous_format, prefetch_factor=2, + typical_loader=False, ): interpolation = {"bicubic": Image.BICUBIC, "bilinear": Image.BILINEAR}[ interpolation ] valdir = os.path.join(data_path, "val") + transforms_list = [ + transforms.Resize( + image_size + crop_padding, interpolation=interpolation + ), + transforms.CenterCrop(image_size), + ] + if typical_loader: + transforms_list.append(transforms.ToTensor()) + transforms_list.append( + transforms.Normalize([0.485, 0.456, 0.406], 
[0.229, 0.224, 0.225]) + ) val_dataset = datasets.ImageFolder( valdir, - transforms.Compose( - [ - transforms.Resize( - image_size + crop_padding, interpolation=interpolation - ), - transforms.CenterCrop(image_size), - ] - ), + transforms.Compose(transforms_list), ) if torch.distributed.is_initialized(): @@ -411,13 +470,15 @@ def get_pytorch_val_loader( num_workers=workers, worker_init_fn=_worker_init_fn, pin_memory=True, - collate_fn=partial(fast_collate, memory_format), + collate_fn=partial(fast_collate, memory_format, typical_loader), drop_last=False, persistent_workers=True, prefetch_factor=prefetch_factor, ) - return PrefetchedWrapper(val_loader, 0, num_classes, one_hot), len(val_loader) + return PrefetchedWrapper(val_loader, 0, num_classes, one_hot, typical_loader), len( + val_loader + ) class SynteticDataLoader(object): diff --git a/docs/examples/use_cases/pytorch/efficientnet/main.py b/docs/examples/use_cases/pytorch/efficientnet/main.py index d8d373ae53e..e7677b38388 100644 --- a/docs/examples/use_cases/pytorch/efficientnet/main.py +++ b/docs/examples/use_cases/pytorch/efficientnet/main.py @@ -78,6 +78,11 @@ def add_parser_arguments(parser, skip_arch=False): + " | ".join(DATA_BACKEND_CHOICES) + " (default: dali)", ) + parser.add_argument( + "--typical_loader", + action="store_true", + help="Skip advanced PyTorch data loader optimizations.", + ) parser.add_argument( "--interpolation", metavar="INTERPOLATION", @@ -510,6 +515,7 @@ def _worker_init_fn(id): _worker_init_fn=_worker_init_fn, memory_format=memory_format, prefetch_factor=args.prefetch, + typical_loader=args.typical_loader, ) if args.mixup != 0.0: train_loader = MixUpWrapper(args.mixup, train_loader) @@ -525,6 +531,7 @@ def _worker_init_fn(id): _worker_init_fn=_worker_init_fn, memory_format=memory_format, prefetch_factor=args.prefetch, + typical_loader=args.typical_loader, ) if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: diff --git 
a/qa/TL3_EfficientNet_benchmark/test_pytorch.sh b/qa/TL3_EfficientNet_benchmark/test_pytorch.sh index 609f9a7a834..abd7cbf4e54 100644 --- a/qa/TL3_EfficientNet_benchmark/test_pytorch.sh +++ b/qa/TL3_EfficientNet_benchmark/test_pytorch.sh @@ -56,22 +56,28 @@ export PATH_TO_IMAGENET=/imagenet export RESULT_WORKSPACE=./ # synthetic benchmark -python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 1 --prof 1000 --no-checkpoints --training-only --data-backend synthetic --workspace $RESULT_WORKSPACE --report-file bench_report_synthetic.json $PATH_TO_IMAGENET +python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --epochs 1 --prof 1000 --no-checkpoints --training-only --data-backend synthetic --workspace $RESULT_WORKSPACE --report-file bench_report_synthetic.json $PATH_TO_IMAGENET # DALI without automatic augmentations -python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 3 --no-checkpoints --training-only --data-backend dali --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --report-file bench_report_dali.json $PATH_TO_IMAGENET +python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 13 --epochs 3 --no-checkpoints --training-only --data-backend dali --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --report-file bench_report_dali.json $PATH_TO_IMAGENET # DALI with AutoAugment -python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 3 --no-checkpoints --training-only --data-backend dali --automatic-augmentation autoaugment --workspace $RESULT_WORKSPACE --report-file bench_report_dali_aa.json $PATH_TO_IMAGENET +python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 13 --epochs 3 --no-checkpoints --training-only --data-backend dali --automatic-augmentation 
autoaugment --workspace $RESULT_WORKSPACE --report-file bench_report_dali_aa.json $PATH_TO_IMAGENET # DALI with TrivialAugment -python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 3 --no-checkpoints --training-only --data-backend dali --automatic-augmentation trivialaugment --workspace $RESULT_WORKSPACE --report-file bench_report_dali_ta.json $PATH_TO_IMAGENET +python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 13 --epochs 3 --no-checkpoints --training-only --data-backend dali --automatic-augmentation trivialaugment --workspace $RESULT_WORKSPACE --report-file bench_report_dali_ta.json $PATH_TO_IMAGENET # PyTorch without automatic augmentations -python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --report-file bench_report_pytorch.json $PATH_TO_IMAGENET +python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --typical_loader --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --report-file bench_report_pytorch.json $PATH_TO_IMAGENET # PyTorch with AutoAugment: -python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 128 --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation autoaugment --workspace $RESULT_WORKSPACE --report-file bench_report_pytorch_aa.json $PATH_TO_IMAGENET +python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --typical_loader --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation autoaugment --workspace $RESULT_WORKSPACE --report-file bench_report_pytorch_aa.json 
$PATH_TO_IMAGENET + +# Optimized PyTorch without automatic augmentations +python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --report-file bench_report_optimized_pytorch.json $PATH_TO_IMAGENET + +# Optimized PyTorch with AutoAugment: +python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation autoaugment --workspace $RESULT_WORKSPACE --report-file bench_report_optimized_pytorch_aa.json $PATH_TO_IMAGENET # The line below finds the lines with `train.total_ips`, takes the last one (with the result we @@ -107,6 +113,9 @@ CHECK_PERF_THRESHOLD "bench_report_dali_aa.json" $DALI_AA_THRESHOLD CHECK_PERF_THRESHOLD "bench_report_dali_ta.json" $DALI_TA_THRESHOLD CHECK_PERF_THRESHOLD "bench_report_pytorch.json" $PYTORCH_NONE_THRESHOLD CHECK_PERF_THRESHOLD "bench_report_pytorch_aa.json" $PYTORCH_AA_THRESHOLD +CHECK_PERF_THRESHOLD "bench_report_optimized_pytorch.json" $PYTORCH_NONE_THRESHOLD +CHECK_PERF_THRESHOLD "bench_report_optimized_pytorch_aa.json" $PYTORCH_AA_THRESHOLD + # In the initial training we get significant increase in accuracy on the first few epochs, From fffd337ecf5e2ea79da438d110af113d1b4a2969 Mon Sep 17 00:00:00 2001 From: Janusz Lisiecki Date: Fri, 20 Dec 2024 18:36:18 +0100 Subject: [PATCH 2/3] Review fixes Signed-off-by: Janusz Lisiecki --- .../image_classification/dataloaders.py | 309 +++++++++++++----- .../use_cases/pytorch/efficientnet/main.py | 101 ++++-- qa/TL3_EfficientNet_benchmark/test_pytorch.sh | 8 +- 3 files changed, 307 insertions(+), 111 deletions(-) diff --git a/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py 
b/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py index 064d441e048..e97ca7d7765 100644 --- a/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py +++ b/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py @@ -35,7 +35,7 @@ from image_classification.autoaugment import AutoaugmentImageNetPolicy -DATA_BACKEND_CHOICES = ["pytorch", "synthetic"] +DATA_BACKEND_CHOICES = ["pytorch", "pytorch_optimized", "synthetic"] try: from nvidia.dali.plugin.pytorch import DALIClassificationIterator import nvidia.dali.types as types @@ -248,15 +248,11 @@ def gdvl( return gdvl -def fast_collate(memory_format, typical_loader, batch): +def fast_collate(memory_format, batch): imgs = [img[0] for img in batch] targets = torch.tensor([target[1] for target in batch], dtype=torch.int64) - if typical_loader: - w = imgs[0].size()[1] - h = imgs[0].size()[2] - else: - w = imgs[0].size[0] - h = imgs[0].size[1] + w = imgs[0].size()[1] + h = imgs[0].size()[2] tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8).contiguous( memory_format=memory_format @@ -265,8 +261,6 @@ def fast_collate(memory_format, typical_loader, batch): nump_array = np.asarray(img, dtype=np.uint8) if nump_array.ndim < 3: nump_array = np.expand_dims(nump_array, axis=-1) - if typical_loader is False: - nump_array = np.rollaxis(nump_array, 2) tensor[i] += torch.from_numpy(nump_array.copy()) @@ -282,59 +276,22 @@ def expand(num_classes, dtype, tensor): class PrefetchedWrapper(object): - def prefetched_loader(loader, num_classes, one_hot, typical_loader ): - if typical_loader: - stream = torch.cuda.Stream() - for next_input, next_target in loader: - with torch.cuda.stream(stream): - next_input = next_input.to(device="cuda") - next_target = next_target.to(device="cuda") - next_input = next_input.float() - if one_hot: - next_target = expand(num_classes, torch.float, next_target) - yield next_input, next_target - else: - mean = ( - 
torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]) - .cuda() - .view(1, 3, 1, 1) - ) - std = ( - torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]) - .cuda() - .view(1, 3, 1, 1) - ) - - stream = torch.cuda.Stream() - first = True - - for next_input, next_target in loader: - with torch.cuda.stream(stream): - next_input = next_input.cuda(non_blocking=True) - next_target = next_target.cuda(non_blocking=True) - next_input = next_input.float() - if one_hot: - next_target = expand(num_classes, torch.float, next_target) - - # next_input = next_input.sub_(mean).div_(std) - - if not first: - yield input, target - else: - first = False - - torch.cuda.current_stream().wait_stream(stream) - input = next_input - target = next_target - - yield input, target - - def __init__(self, dataloader, start_epoch, num_classes, one_hot, typical_loader): + def prefetched_loader(loader, num_classes, one_hot): + stream = torch.cuda.Stream() + for next_input, next_target in loader: + with torch.cuda.stream(stream): + next_input = next_input.to(device="cuda") + next_target = next_target.to(device="cuda") + next_input = next_input.float() + if one_hot: + next_target = expand(num_classes, torch.float, next_target) + yield next_input, next_target + + def __init__(self, dataloader, start_epoch, num_classes, one_hot): self.dataloader = dataloader self.epoch = start_epoch self.one_hot = one_hot self.num_classes = num_classes - self.typical_loader = typical_loader def __iter__(self): if self.dataloader.sampler is not None and isinstance( @@ -345,7 +302,7 @@ def __iter__(self): self.dataloader.sampler.set_epoch(self.epoch) self.epoch += 1 return PrefetchedWrapper.prefetched_loader( - self.dataloader, self.num_classes, self.one_hot, self.typical_loader + self.dataloader, self.num_classes, self.one_hot ) def __len__(self): @@ -365,7 +322,6 @@ def get_pytorch_train_loader( _worker_init_fn=None, prefetch_factor=2, memory_format=torch.contiguous_format, - typical_loader=False, ): interpolation = 
{"bicubic": Image.BICUBIC, "bilinear": Image.BILINEAR}[ interpolation @@ -385,11 +341,10 @@ def get_pytorch_train_loader( " for PyTorch data loader." ) - if typical_loader: - transforms_list.append(transforms.ToTensor()) - transforms_list.append( - transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) - ) + transforms_list.append(transforms.ToTensor()) + transforms_list.append( + transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ) train_dataset = datasets.ImageFolder( traindir, transforms.Compose(transforms_list) ) @@ -409,14 +364,19 @@ def get_pytorch_train_loader( num_workers=workers, worker_init_fn=_worker_init_fn, pin_memory=True, - collate_fn=partial(fast_collate, memory_format, typical_loader), + collate_fn=partial(fast_collate, memory_format), drop_last=True, persistent_workers=True, prefetch_factor=prefetch_factor, ) return ( - PrefetchedWrapper(train_loader, start_epoch, num_classes, one_hot, typical_loader), + PrefetchedWrapper( + train_loader, + start_epoch, + num_classes, + one_hot, + ), len(train_loader), ) @@ -433,7 +393,6 @@ def get_pytorch_val_loader( crop_padding=32, memory_format=torch.contiguous_format, prefetch_factor=2, - typical_loader=False, ): interpolation = {"bicubic": Image.BICUBIC, "bilinear": Image.BILINEAR}[ interpolation @@ -445,11 +404,211 @@ def get_pytorch_val_loader( ), transforms.CenterCrop(image_size), ] - if typical_loader: - transforms_list.append(transforms.ToTensor()) - transforms_list.append( - transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + transforms_list.append(transforms.ToTensor()) + transforms_list.append( + transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) + ) + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose(transforms_list), + ) + + if torch.distributed.is_initialized(): + val_sampler = torch.utils.data.distributed.DistributedSampler( + val_dataset, shuffle=False + ) + else: + val_sampler = None + + val_loader = 
torch.utils.data.DataLoader( + val_dataset, + sampler=val_sampler, + batch_size=batch_size, + shuffle=(val_sampler is None), + num_workers=workers, + worker_init_fn=_worker_init_fn, + pin_memory=True, + collate_fn=partial(fast_collate, memory_format), + drop_last=False, + persistent_workers=True, + prefetch_factor=prefetch_factor, + ) + + return PrefetchedWrapper(val_loader, 0, num_classes, one_hot), len( + val_loader + ) + + +def fast_optimized_collate(memory_format, batch): + imgs = [img[0] for img in batch] + targets = torch.tensor([target[1] for target in batch], dtype=torch.int64) + w = imgs[0].size[0] + h = imgs[0].size[1] + + tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8).contiguous( + memory_format=memory_format + ) + for i, img in enumerate(imgs): + nump_array = np.asarray(img, dtype=np.uint8) + if nump_array.ndim < 3: + nump_array = np.expand_dims(nump_array, axis=-1) + nump_array = np.rollaxis(nump_array, 2) + + tensor[i] += torch.from_numpy(nump_array.copy()) + + return tensor, targets + + +class PrefetchedOptimizedWrapper(object): + def prefetched_loader(loader, num_classes, one_hot): + mean = ( + torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255]) + .cuda() + .view(1, 3, 1, 1) ) + std = ( + torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255]) + .cuda() + .view(1, 3, 1, 1) + ) + + stream = torch.cuda.Stream() + first = True + + for next_input, next_target in loader: + with torch.cuda.stream(stream): + next_input = next_input.cuda(non_blocking=True) + next_target = next_target.cuda(non_blocking=True) + next_input = next_input.float() + if one_hot: + next_target = expand(num_classes, torch.float, next_target) + + next_input = next_input.sub_(mean).div_(std) + + if not first: + yield input, target + else: + first = False + + torch.cuda.current_stream().wait_stream(stream) + input = next_input + target = next_target + + yield input, target + + def __init__(self, dataloader, start_epoch, num_classes, one_hot): + self.dataloader = 
dataloader + self.epoch = start_epoch + self.one_hot = one_hot + self.num_classes = num_classes + + def __iter__(self): + if self.dataloader.sampler is not None and isinstance( + self.dataloader.sampler, + torch.utils.data.distributed.DistributedSampler, + ): + + self.dataloader.sampler.set_epoch(self.epoch) + self.epoch += 1 + return PrefetchedOptimizedWrapper.prefetched_loader( + self.dataloader, self.num_classes, self.one_hot + ) + + def __len__(self): + return len(self.dataloader) + + +def get_pytorch_optimized_train_loader( + data_path, + image_size, + batch_size, + num_classes, + one_hot, + interpolation="bilinear", + augmentation=None, + start_epoch=0, + workers=5, + _worker_init_fn=None, + prefetch_factor=2, + memory_format=torch.contiguous_format, +): + interpolation = {"bicubic": Image.BICUBIC, "bilinear": Image.BILINEAR}[ + interpolation + ] + traindir = os.path.join(data_path, "train") + transforms_list = [ + transforms.RandomResizedCrop(image_size, interpolation=interpolation), + transforms.RandomHorizontalFlip(), + ] + if augmentation == "disabled": + pass + elif augmentation == "autoaugment": + transforms_list.append(AutoaugmentImageNetPolicy()) + else: + raise NotImplementedError( + f"Automatic augmentation: '{augmentation}' is not supported" + " for PyTorch data loader." 
+ ) + + train_dataset = datasets.ImageFolder( + traindir, transforms.Compose(transforms_list) + ) + + if torch.distributed.is_initialized(): + train_sampler = torch.utils.data.distributed.DistributedSampler( + train_dataset, shuffle=True + ) + else: + train_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, + sampler=train_sampler, + batch_size=batch_size, + shuffle=(train_sampler is None), + num_workers=workers, + worker_init_fn=_worker_init_fn, + pin_memory=True, + collate_fn=partial(fast_optimized_collate, memory_format), + drop_last=True, + persistent_workers=True, + prefetch_factor=prefetch_factor, + ) + + return ( + PrefetchedOptimizedWrapper( + train_loader, + start_epoch, + num_classes, + one_hot, + ), + len(train_loader), + ) + + +def get_pytorch_optimize_val_loader( + data_path, + image_size, + batch_size, + num_classes, + one_hot, + interpolation="bilinear", + workers=5, + _worker_init_fn=None, + crop_padding=32, + memory_format=torch.contiguous_format, + prefetch_factor=2, +): + interpolation = {"bicubic": Image.BICUBIC, "bilinear": Image.BILINEAR}[ + interpolation + ] + valdir = os.path.join(data_path, "val") + transforms_list = [ + transforms.Resize( + image_size + crop_padding, interpolation=interpolation + ), + transforms.CenterCrop(image_size), + ] val_dataset = datasets.ImageFolder( valdir, transforms.Compose(transforms_list), @@ -470,13 +629,13 @@ def get_pytorch_val_loader( num_workers=workers, worker_init_fn=_worker_init_fn, pin_memory=True, - collate_fn=partial(fast_collate, memory_format, typical_loader), + collate_fn=partial(fast_optimized_collate, memory_format), drop_last=False, persistent_workers=True, prefetch_factor=prefetch_factor, ) - return PrefetchedWrapper(val_loader, 0, num_classes, one_hot, typical_loader), len( + return PrefetchedOptimizedWrapper(val_loader, 0, num_classes, one_hot), len( val_loader ) diff --git a/docs/examples/use_cases/pytorch/efficientnet/main.py 
b/docs/examples/use_cases/pytorch/efficientnet/main.py index e7677b38388..43c6eacfa66 100644 --- a/docs/examples/use_cases/pytorch/efficientnet/main.py +++ b/docs/examples/use_cases/pytorch/efficientnet/main.py @@ -29,9 +29,9 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import os -os.environ[ - "KMP_AFFINITY" -] = "disabled" # We need to do this before importing anything else as a workaround for this bug: https://github.com/pytorch/pytorch/issues/28389 +os.environ["KMP_AFFINITY"] = ( + "disabled" # We need to do this before importing anything else as a workaround for this bug: https://github.com/pytorch/pytorch/issues/28389 +) import argparse import random @@ -78,11 +78,6 @@ def add_parser_arguments(parser, skip_arch=False): + " | ".join(DATA_BACKEND_CHOICES) + " (default: dali)", ) - parser.add_argument( - "--typical_loader", - action="store_true", - help="Skip advanced PyTorch data loader optimizations.", - ) parser.add_argument( "--interpolation", metavar="INTERPOLATION", @@ -108,8 +103,10 @@ def add_parser_arguments(parser, skip_arch=False): default=4, type=int, metavar="N", - help=("number of data loading workers (default: 4)." - " The number of workers for PyTorch loader is doubled."), + help=( + "number of data loading workers (default: 4)." + " The number of workers for PyTorch loader is doubled." 
+ ), ) parser.add_argument( "--prefetch", @@ -123,7 +120,9 @@ def add_parser_arguments(parser, skip_arch=False): default="gpu", type=str, choices=["cpu", "gpu"], - help=("The placement of DALI decode and random resized crop operations (default: gpu)"), + help=( + "The placement of DALI decode and random resized crop operations (default: gpu)" + ), ) parser.add_argument( "--epochs", @@ -180,13 +179,19 @@ def add_parser_arguments(parser, skip_arch=False): type=str, metavar="SCHEDULE", choices=["step", "linear", "cosine"], - help="Type of LR schedule: {}, {}, {}".format("step", "linear", "cosine"), + help="Type of LR schedule: {}, {}, {}".format( + "step", "linear", "cosine" + ), ) parser.add_argument("--end-lr", default=0, type=float) parser.add_argument( - "--warmup", default=16, type=int, metavar="E", help="number of warmup epochs" + "--warmup", + default=16, + type=int, + metavar="E", + help="number of warmup epochs", ) parser.add_argument( @@ -260,7 +265,11 @@ def add_parser_arguments(parser, skip_arch=False): help="Static loss scale, positive power of 2 values can improve amp convergence.", ) parser.add_argument( - "--prof", type=int, default=-1, metavar="N", help="Run only N iterations" + "--prof", + type=int, + default=-1, + metavar="N", + help="Run only N iterations", ) parser.add_argument( "--amp", @@ -269,7 +278,10 @@ def add_parser_arguments(parser, skip_arch=False): ) parser.add_argument( - "--seed", default=None, type=int, help="random seed used for numpy and pytorch" + "--seed", + default=None, + type=int, + help="random seed used for numpy and pytorch", ) parser.add_argument( @@ -293,7 +305,9 @@ def add_parser_arguments(parser, skip_arch=False): parser.add_argument( "--evaluate", action="store_true", help="evaluate checkpoint/model" ) - parser.add_argument("--training-only", action="store_true", help="do not evaluate") + parser.add_argument( + "--training-only", action="store_true", help="do not evaluate" + ) parser.add_argument( "--no-checkpoints", @@ 
-309,7 +323,9 @@ def add_parser_arguments(parser, skip_arch=False): help="no -> do not use torch.jit; script -> use torch.jit.script", ) - parser.add_argument("--checkpoint-filename", default="checkpoint.pth.tar", type=str) + parser.add_argument( + "--checkpoint-filename", default="checkpoint.pth.tar", type=str + ) parser.add_argument( "--workspace", @@ -380,7 +396,9 @@ def prepare_for_training(args, model_args, model_arch): def _worker_init_fn(id): # Worker process should inherit its affinity from parent affinity = os.sched_getaffinity(0) - print(f"Process {args.local_rank} Worker {id} set affinity to: {affinity}") + print( + f"Process {args.local_rank} Worker {id} set affinity to: {affinity}" + ) np.random.seed(seed=args.seed + args.local_rank + id) random.seed(args.seed + args.local_rank + id) @@ -390,11 +408,15 @@ def _worker_init_fn(id): def _worker_init_fn(id): # Worker process should inherit its affinity from parent affinity = os.sched_getaffinity(0) - print(f"Process {args.local_rank} Worker {id} set affinity to: {affinity}") + print( + f"Process {args.local_rank} Worker {id} set affinity to: {affinity}" + ) if args.static_loss_scale != 1.0: if not args.amp: - print("Warning: if --amp is not used, static_loss_scale will be ignored.") + print( + "Warning: if --amp is not used, static_loss_scale will be ignored." 
+ ) if args.optimizer_batch_size < 0: batch_size_multiplier = 1 @@ -416,7 +438,8 @@ def _worker_init_fn(id): if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load( - args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu) + args.resume, + map_location=lambda storage, loc: storage.cuda(args.gpu), ) start_epoch = checkpoint["epoch"] best_prec1 = checkpoint["best_prec1"] @@ -451,13 +474,17 @@ def _worker_init_fn(id): loss = lambda: LabelSmoothing(args.label_smoothing) memory_format = ( - torch.channels_last if args.memory_format == "nhwc" else torch.contiguous_format + torch.channels_last + if args.memory_format == "nhwc" + else torch.contiguous_format ) model = model_arch( **{ - k: v - if k != "pretrained" - else v and (not args.distributed or dist.get_rank() == 0) + k: ( + v + if k != "pretrained" + else v and (not args.distributed or dist.get_rank() == 0) + ) for k, v in model_args.__dict__.items() } ) @@ -492,6 +519,10 @@ def _worker_init_fn(id): args.workers = args.workers * 2 get_train_loader = get_pytorch_train_loader get_val_loader = get_pytorch_val_loader + elif args.data_backend == "pytorch_optimized": + args.workers = args.workers * 2 + get_train_loader = get_pytorch_optimized_train_loader + get_val_loader = get_pytorch_optimized_val_loader elif args.data_backend == "dali": get_train_loader = get_dali_train_loader(dali_device=args.dali_device) get_val_loader = get_dali_val_loader() @@ -515,7 +546,6 @@ def _worker_init_fn(id): _worker_init_fn=_worker_init_fn, memory_format=memory_format, prefetch_factor=args.prefetch, - typical_loader=args.typical_loader, ) if args.mixup != 0.0: train_loader = MixUpWrapper(args.mixup, train_loader) @@ -531,10 +561,12 @@ def _worker_init_fn(id): _worker_init_fn=_worker_init_fn, memory_format=memory_format, prefetch_factor=args.prefetch, - typical_loader=args.typical_loader, ) - if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
+ if ( + not torch.distributed.is_initialized() + or torch.distributed.get_rank() == 0 + ): logger = log.Logger( args.print_freq, [ @@ -624,9 +656,11 @@ def main(args, model_args, model_arch): val_loader, logger, start_epoch=start_epoch, - end_epoch=min((start_epoch + args.run_epochs), args.epochs) - if args.run_epochs != -1 - else args.epochs, + end_epoch=( + min((start_epoch + args.run_epochs), args.epochs) + if args.run_epochs != -1 + else args.epochs + ), early_stopping_patience=args.early_stopping_patience, best_prec1=best_prec1, prof=args.prof, @@ -639,7 +673,10 @@ def main(args, model_args, model_arch): topk=args.topk, ) exp_duration = time.time() - exp_start_time - if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0: + if ( + not torch.distributed.is_initialized() + or torch.distributed.get_rank() == 0 + ): logger.end() print("Experiment ended") diff --git a/qa/TL3_EfficientNet_benchmark/test_pytorch.sh b/qa/TL3_EfficientNet_benchmark/test_pytorch.sh index abd7cbf4e54..c47f3a7c26c 100644 --- a/qa/TL3_EfficientNet_benchmark/test_pytorch.sh +++ b/qa/TL3_EfficientNet_benchmark/test_pytorch.sh @@ -68,16 +68,16 @@ python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 - python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 13 --epochs 3 --no-checkpoints --training-only --data-backend dali --automatic-augmentation trivialaugment --workspace $RESULT_WORKSPACE --report-file bench_report_dali_ta.json $PATH_TO_IMAGENET # PyTorch without automatic augmentations -python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --typical_loader --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --report-file bench_report_pytorch.json $PATH_TO_IMAGENET +python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 
--workers 10 --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --report-file bench_report_pytorch.json $PATH_TO_IMAGENET # PyTorch with AutoAugment: -python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --typical_loader --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation autoaugment --workspace $RESULT_WORKSPACE --report-file bench_report_pytorch_aa.json $PATH_TO_IMAGENET +python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation autoaugment --workspace $RESULT_WORKSPACE --report-file bench_report_pytorch_aa.json $PATH_TO_IMAGENET # Optimized PyTorch without automatic augmentations -python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --report-file bench_report_optimized_pytorch.json $PATH_TO_IMAGENET +python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --epochs 3 --no-checkpoints --training-only --data-backend pytorch_optimized --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --report-file bench_report_optimized_pytorch.json $PATH_TO_IMAGENET # Optimized PyTorch with AutoAugment: -python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation autoaugment --workspace $RESULT_WORKSPACE --report-file bench_report_optimized_pytorch_aa.json $PATH_TO_IMAGENET +python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 
--epochs 3 --no-checkpoints --training-only --data-backend pytorch_optimized --automatic-augmentation autoaugment --workspace $RESULT_WORKSPACE --report-file bench_report_optimized_pytorch_aa.json $PATH_TO_IMAGENET # The line below finds the lines with `train.total_ips`, takes the last one (with the result we From df52ad42a42a071ff74595b72e4a2c36720e914e Mon Sep 17 00:00:00 2001 From: Janusz Lisiecki Date: Mon, 23 Dec 2024 16:09:00 +0100 Subject: [PATCH 3/3] Adjust thresholds Signed-off-by: Janusz Lisiecki --- qa/TL3_EfficientNet_benchmark/test_pytorch.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/qa/TL3_EfficientNet_benchmark/test_pytorch.sh b/qa/TL3_EfficientNet_benchmark/test_pytorch.sh index c47f3a7c26c..a09db046393 100644 --- a/qa/TL3_EfficientNet_benchmark/test_pytorch.sh +++ b/qa/TL3_EfficientNet_benchmark/test_pytorch.sh @@ -85,13 +85,13 @@ python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 - # as JSON using Python. We can now parse the values or directly evaluate the thresholds. # grep "train.total_ips" .json | tail -1 | cut -c 5- | python3 -c "import sys, json; print(json.load(sys.stdin))" -# Actual results are about 500 samples/s more -SYNTH_THRESHOLD=32000 -DALI_NONE_THRESHOLD=27000 -DALI_AA_THRESHOLD=26000 -DALI_TA_THRESHOLD=26000 -PYTORCH_NONE_THRESHOLD=23000 -PYTORCH_AA_THRESHOLD=22000 +# Actual results are about 10% more samples/s +SYNTH_THRESHOLD=38000 +DALI_NONE_THRESHOLD=32000 +DALI_AA_THRESHOLD=32000 +DALI_TA_THRESHOLD=32000 +PYTORCH_NONE_THRESHOLD=32000 +PYTORCH_AA_THRESHOLD=32000 function CHECK_PERF_THRESHOLD { FILENAME=$1