diff --git a/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py b/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py
index 064d441e04..e97ca7d776 100644
--- a/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py
+++ b/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py
@@ -35,7 +35,7 @@
 from image_classification.autoaugment import AutoaugmentImageNetPolicy
 
 
-DATA_BACKEND_CHOICES = ["pytorch", "synthetic"]
+DATA_BACKEND_CHOICES = ["pytorch", "pytorch_optimized", "synthetic"]
 try:
     from nvidia.dali.plugin.pytorch import DALIClassificationIterator
     import nvidia.dali.types as types
@@ -248,15 +248,11 @@ def gdvl(
     return gdvl
 
 
-def fast_collate(memory_format, typical_loader, batch):
+def fast_collate(memory_format, batch):
     imgs = [img[0] for img in batch]
     targets = torch.tensor([target[1] for target in batch], dtype=torch.int64)
-    if typical_loader:
-        w = imgs[0].size()[1]
-        h = imgs[0].size()[2]
-    else:
-        w = imgs[0].size[0]
-        h = imgs[0].size[1]
+    w = imgs[0].size()[1]
+    h = imgs[0].size()[2]
 
     tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8).contiguous(
         memory_format=memory_format
@@ -265,8 +261,6 @@ def fast_collate(memory_format, typical_loader, batch):
         nump_array = np.asarray(img, dtype=np.uint8)
         if nump_array.ndim < 3:
             nump_array = np.expand_dims(nump_array, axis=-1)
-        if typical_loader is False:
-            nump_array = np.rollaxis(nump_array, 2)
 
         tensor[i] += torch.from_numpy(nump_array.copy())
 
@@ -282,59 +276,22 @@ def expand(num_classes, dtype, tensor):
 
 
 class PrefetchedWrapper(object):
-    def prefetched_loader(loader, num_classes, one_hot, typical_loader ):
-        if typical_loader:
-            stream = torch.cuda.Stream()
-            for next_input, next_target in loader:
-                with torch.cuda.stream(stream):
-                    next_input = next_input.to(device="cuda")
-                    next_target = next_target.to(device="cuda")
-                    next_input = next_input.float()
-                    if one_hot:
-                        next_target = expand(num_classes, torch.float, next_target)
-                yield next_input, next_target
-        else:
-            mean = (
-                torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255])
-                .cuda()
-                .view(1, 3, 1, 1)
-            )
-            std = (
-                torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255])
-                .cuda()
-                .view(1, 3, 1, 1)
-            )
-
-            stream = torch.cuda.Stream()
-            first = True
-
-            for next_input, next_target in loader:
-                with torch.cuda.stream(stream):
-                    next_input = next_input.cuda(non_blocking=True)
-                    next_target = next_target.cuda(non_blocking=True)
-                    next_input = next_input.float()
-                    if one_hot:
-                        next_target = expand(num_classes, torch.float, next_target)
-
-                    # next_input = next_input.sub_(mean).div_(std)
-
-                if not first:
-                    yield input, target
-                else:
-                    first = False
-
-                torch.cuda.current_stream().wait_stream(stream)
-                input = next_input
-                target = next_target
-
-            yield input, target
-
-    def __init__(self, dataloader, start_epoch, num_classes, one_hot, typical_loader):
+    def prefetched_loader(loader, num_classes, one_hot):
+        stream = torch.cuda.Stream()
+        for next_input, next_target in loader:
+            with torch.cuda.stream(stream):
+                next_input = next_input.to(device="cuda")
+                next_target = next_target.to(device="cuda")
+                next_input = next_input.float()
+                if one_hot:
+                    next_target = expand(num_classes, torch.float, next_target)
+            yield next_input, next_target
+
+    def __init__(self, dataloader, start_epoch, num_classes, one_hot):
         self.dataloader = dataloader
         self.epoch = start_epoch
         self.one_hot = one_hot
         self.num_classes = num_classes
-        self.typical_loader = typical_loader
 
     def __iter__(self):
         if self.dataloader.sampler is not None and isinstance(
@@ -345,7 +302,7 @@ def __iter__(self):
             self.dataloader.sampler.set_epoch(self.epoch)
         self.epoch += 1
         return PrefetchedWrapper.prefetched_loader(
-            self.dataloader, self.num_classes, self.one_hot, self.typical_loader
+            self.dataloader, self.num_classes, self.one_hot
         )
 
     def __len__(self):
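The hunks above collapse fast_collate and PrefetchedWrapper down to the plain PyTorch path: samples are already converted and normalized by ToTensor/Normalize on the CPU workers, so the wrapper only has to move each batch to the GPU and cast it to float on a side CUDA stream. A minimal standalone sketch of that pattern, not part of the patch, assuming `loader` yields (input, target) CPU tensor pairs and a CUDA device is available:

import torch

def simple_prefetched_loader(loader):
    # Side stream so host-to-device copies are issued off the default stream.
    stream = torch.cuda.Stream()
    for next_input, next_target in loader:
        with torch.cuda.stream(stream):
            next_input = next_input.to(device="cuda")
            next_target = next_target.to(device="cuda")
            next_input = next_input.float()
        # Unlike the optimized wrapper introduced later in this patch, this
        # simple variant does not call wait_stream() before handing data out.
        yield next_input, next_target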
@@ -365,7 +322,6 @@ def get_pytorch_train_loader(
     _worker_init_fn=None,
     prefetch_factor=2,
     memory_format=torch.contiguous_format,
-    typical_loader=False,
 ):
     interpolation = {"bicubic": Image.BICUBIC, "bilinear": Image.BILINEAR}[
         interpolation
@@ -385,11 +341,10 @@ def get_pytorch_train_loader(
             " for PyTorch data loader."
         )
 
-    if typical_loader:
-        transforms_list.append(transforms.ToTensor())
-        transforms_list.append(
-            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
-        )
+    transforms_list.append(transforms.ToTensor())
+    transforms_list.append(
+        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    )
     train_dataset = datasets.ImageFolder(
         traindir, transforms.Compose(transforms_list)
     )
@@ -409,14 +364,19 @@ def get_pytorch_train_loader(
         num_workers=workers,
         worker_init_fn=_worker_init_fn,
         pin_memory=True,
-        collate_fn=partial(fast_collate, memory_format, typical_loader),
+        collate_fn=partial(fast_collate, memory_format),
         drop_last=True,
         persistent_workers=True,
         prefetch_factor=prefetch_factor,
     )
 
     return (
-        PrefetchedWrapper(train_loader, start_epoch, num_classes, one_hot, typical_loader),
+        PrefetchedWrapper(
+            train_loader,
+            start_epoch,
+            num_classes,
+            one_hot,
+        ),
         len(train_loader),
     )
 
@@ -433,7 +393,6 @@ def get_pytorch_val_loader(
     crop_padding=32,
     memory_format=torch.contiguous_format,
     prefetch_factor=2,
-    typical_loader=False,
 ):
     interpolation = {"bicubic": Image.BICUBIC, "bilinear": Image.BILINEAR}[
         interpolation
@@ -445,11 +404,211 @@ def get_pytorch_val_loader(
         ),
         transforms.CenterCrop(image_size),
     ]
-    if typical_loader:
-        transforms_list.append(transforms.ToTensor())
-        transforms_list.append(
-            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    transforms_list.append(transforms.ToTensor())
+    transforms_list.append(
+        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
+    )
+    val_dataset = datasets.ImageFolder(
+        valdir,
+        transforms.Compose(transforms_list),
+    )
+
+    if torch.distributed.is_initialized():
+        val_sampler = torch.utils.data.distributed.DistributedSampler(
+            val_dataset, shuffle=False
+        )
+    else:
+        val_sampler = None
+
+    val_loader = torch.utils.data.DataLoader(
+        val_dataset,
+        sampler=val_sampler,
+        batch_size=batch_size,
+        shuffle=(val_sampler is None),
+        num_workers=workers,
+        worker_init_fn=_worker_init_fn,
+        pin_memory=True,
+        collate_fn=partial(fast_collate, memory_format),
+        drop_last=False,
+        persistent_workers=True,
+        prefetch_factor=prefetch_factor,
+    )
+
+    return PrefetchedWrapper(val_loader, 0, num_classes, one_hot), len(
+        val_loader
+    )
+
+
+def fast_optimized_collate(memory_format, batch):
+    imgs = [img[0] for img in batch]
+    targets = torch.tensor([target[1] for target in batch], dtype=torch.int64)
+    w = imgs[0].size[0]
+    h = imgs[0].size[1]
+
+    tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8).contiguous(
+        memory_format=memory_format
+    )
+    for i, img in enumerate(imgs):
+        nump_array = np.asarray(img, dtype=np.uint8)
+        if nump_array.ndim < 3:
+            nump_array = np.expand_dims(nump_array, axis=-1)
+        nump_array = np.rollaxis(nump_array, 2)
+
+        tensor[i] += torch.from_numpy(nump_array.copy())
+
+    return tensor, targets
+
+
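fast_optimized_collate expects something different from the dataset than fast_collate does: with the optimized backend the transform list stops before ToTensor, so each sample is still a PIL image in HWC layout, and the collate packs the raw uint8 pixels into an NCHW batch while normalization is deferred to the GPU. A tiny standalone illustration of that HWC-to-CHW step (hypothetical sizes, not part of the patch):

import numpy as np
import torch
from PIL import Image

img = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))  # stand-in PIL sample
arr = np.asarray(img, dtype=np.uint8)   # HWC uint8, exactly what the collate sees
arr = np.rollaxis(arr, 2)               # CHW uint8; normalization happens later on the GPU
batch_entry = torch.from_numpy(arr.copy())
print(batch_entry.shape)                # torch.Size([3, 224, 224])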
+class PrefetchedOptimizedWrapper(object):
+    def prefetched_loader(loader, num_classes, one_hot):
+        mean = (
+            torch.tensor([0.485 * 255, 0.456 * 255, 0.406 * 255])
+            .cuda()
+            .view(1, 3, 1, 1)
         )
+        std = (
+            torch.tensor([0.229 * 255, 0.224 * 255, 0.225 * 255])
+            .cuda()
+            .view(1, 3, 1, 1)
+        )
+
+        stream = torch.cuda.Stream()
+        first = True
+
+        for next_input, next_target in loader:
+            with torch.cuda.stream(stream):
+                next_input = next_input.cuda(non_blocking=True)
+                next_target = next_target.cuda(non_blocking=True)
+                next_input = next_input.float()
+                if one_hot:
+                    next_target = expand(num_classes, torch.float, next_target)
+
+                next_input = next_input.sub_(mean).div_(std)
+
+            if not first:
+                yield input, target
+            else:
+                first = False
+
+            torch.cuda.current_stream().wait_stream(stream)
+            input = next_input
+            target = next_target
+
+        yield input, target
+
+    def __init__(self, dataloader, start_epoch, num_classes, one_hot):
+        self.dataloader = dataloader
+        self.epoch = start_epoch
+        self.one_hot = one_hot
+        self.num_classes = num_classes
+
+    def __iter__(self):
+        if self.dataloader.sampler is not None and isinstance(
+            self.dataloader.sampler,
+            torch.utils.data.distributed.DistributedSampler,
+        ):
+
+            self.dataloader.sampler.set_epoch(self.epoch)
+        self.epoch += 1
+        return PrefetchedOptimizedWrapper.prefetched_loader(
+            self.dataloader, self.num_classes, self.one_hot
+        )
+
+    def __len__(self):
+        return len(self.dataloader)
+
+
+def get_pytorch_optimized_train_loader(
+    data_path,
+    image_size,
+    batch_size,
+    num_classes,
+    one_hot,
+    interpolation="bilinear",
+    augmentation=None,
+    start_epoch=0,
+    workers=5,
+    _worker_init_fn=None,
+    prefetch_factor=2,
+    memory_format=torch.contiguous_format,
+):
+    interpolation = {"bicubic": Image.BICUBIC, "bilinear": Image.BILINEAR}[
+        interpolation
+    ]
+    traindir = os.path.join(data_path, "train")
+    transforms_list = [
+        transforms.RandomResizedCrop(image_size, interpolation=interpolation),
+        transforms.RandomHorizontalFlip(),
+    ]
+    if augmentation == "disabled":
+        pass
+    elif augmentation == "autoaugment":
+        transforms_list.append(AutoaugmentImageNetPolicy())
+    else:
+        raise NotImplementedError(
+            f"Automatic augmentation: '{augmentation}' is not supported"
+            " for PyTorch data loader."
+        )
+
+    train_dataset = datasets.ImageFolder(
+        traindir, transforms.Compose(transforms_list)
+    )
+
+    if torch.distributed.is_initialized():
+        train_sampler = torch.utils.data.distributed.DistributedSampler(
+            train_dataset, shuffle=True
+        )
+    else:
+        train_sampler = None
+
+    train_loader = torch.utils.data.DataLoader(
+        train_dataset,
+        sampler=train_sampler,
+        batch_size=batch_size,
+        shuffle=(train_sampler is None),
+        num_workers=workers,
+        worker_init_fn=_worker_init_fn,
+        pin_memory=True,
+        collate_fn=partial(fast_optimized_collate, memory_format),
+        drop_last=True,
+        persistent_workers=True,
+        prefetch_factor=prefetch_factor,
+    )
+
+    return (
+        PrefetchedOptimizedWrapper(
+            train_loader,
+            start_epoch,
+            num_classes,
+            one_hot,
+        ),
+        len(train_loader),
+    )
+
+
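PrefetchedOptimizedWrapper.prefetched_loader is where this backend earns its name: batch N is yielded to the training step while batch N+1 is already being copied to the GPU, cast to float, and normalized (mean/std expressed in 0-255 units) on a side stream, with wait_stream fencing the default stream before the prefetched tensors are reused. A condensed sketch of that copy/compute overlap, not part of the patch, assuming `mean` and `std` are (1, 3, 1, 1) CUDA tensors and the loader yields pinned uint8 batches:

import torch

def overlap_prefetch(loader, mean, std):
    stream = torch.cuda.Stream()
    prev = None
    for cpu_input, cpu_target in loader:
        with torch.cuda.stream(stream):
            nxt_in = cpu_input.cuda(non_blocking=True).float()
            nxt_in = nxt_in.sub_(mean).div_(std)   # normalize on the GPU
            nxt_tgt = cpu_target.cuda(non_blocking=True)
        if prev is not None:
            yield prev                             # consumer works on batch N ...
        torch.cuda.current_stream().wait_stream(stream)  # ... while batch N+1 finishes copying
        prev = (nxt_in, nxt_tgt)
    if prev is not None:
        yield prev                                 # flush the last buffered batch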
+def get_pytorch_optimize_val_loader(
+    data_path,
+    image_size,
+    batch_size,
+    num_classes,
+    one_hot,
+    interpolation="bilinear",
+    workers=5,
+    _worker_init_fn=None,
+    crop_padding=32,
+    memory_format=torch.contiguous_format,
+    prefetch_factor=2,
+):
+    interpolation = {"bicubic": Image.BICUBIC, "bilinear": Image.BILINEAR}[
+        interpolation
+    ]
+    valdir = os.path.join(data_path, "val")
+    transforms_list = [
+        transforms.Resize(
+            image_size + crop_padding, interpolation=interpolation
+        ),
+        transforms.CenterCrop(image_size),
+    ]
     val_dataset = datasets.ImageFolder(
         valdir,
         transforms.Compose(transforms_list),
@@ -470,13 +629,13 @@ def get_pytorch_val_loader(
         num_workers=workers,
         worker_init_fn=_worker_init_fn,
         pin_memory=True,
-        collate_fn=partial(fast_collate, memory_format, typical_loader),
+        collate_fn=partial(fast_optimized_collate, memory_format),
         drop_last=False,
         persistent_workers=True,
         prefetch_factor=prefetch_factor,
     )
 
-    return PrefetchedWrapper(val_loader, 0, num_classes, one_hot, typical_loader), len(
+    return PrefetchedOptimizedWrapper(val_loader, 0, num_classes, one_hot), len(
         val_loader
     )
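That closes out dataloaders.py: the two new factories mirror get_pytorch_train_loader/get_pytorch_val_loader and return the same (prefetched iterator, number of steps) pair. A hypothetical usage sketch, not part of the patch — the dataset path, image size, and batch size are made up, the script is assumed to run from the example's root directory, and a CUDA device is needed to iterate:

import torch
from image_classification.dataloaders import (
    get_pytorch_optimized_train_loader,
    get_pytorch_optimize_val_loader,
)

train_loader, train_steps = get_pytorch_optimized_train_loader(
    "/data/imagenet",               # assumed ImageNet root containing train/ and val/
    image_size=224,
    batch_size=256,
    num_classes=1000,
    one_hot=False,
    augmentation="disabled",
    memory_format=torch.channels_last,
)
val_loader, val_steps = get_pytorch_optimize_val_loader(
    "/data/imagenet",
    image_size=224,
    batch_size=256,
    num_classes=1000,
    one_hot=False,
    memory_format=torch.channels_last,
)

for images, labels in train_loader:     # images arrive on the GPU, float and normalized
    break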
diff --git a/docs/examples/use_cases/pytorch/efficientnet/main.py b/docs/examples/use_cases/pytorch/efficientnet/main.py
index e7677b3838..43c6eacfa6 100644
--- a/docs/examples/use_cases/pytorch/efficientnet/main.py
+++ b/docs/examples/use_cases/pytorch/efficientnet/main.py
@@ -29,9 +29,9 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import os
-os.environ[
-    "KMP_AFFINITY"
-] = "disabled"  # We need to do this before importing anything else as a workaround for this bug: https://github.com/pytorch/pytorch/issues/28389
+os.environ["KMP_AFFINITY"] = (
+    "disabled"  # We need to do this before importing anything else as a workaround for this bug: https://github.com/pytorch/pytorch/issues/28389
+)
 
 import argparse
 import random
@@ -78,11 +78,6 @@ def add_parser_arguments(parser, skip_arch=False):
         + " | ".join(DATA_BACKEND_CHOICES)
         + " (default: dali)",
     )
-    parser.add_argument(
-        "--typical_loader",
-        action="store_true",
-        help="Skip advanced PyTorch data loader optimizations.",
-    )
     parser.add_argument(
         "--interpolation",
         metavar="INTERPOLATION",
@@ -108,8 +103,10 @@ def add_parser_arguments(parser, skip_arch=False):
         default=4,
         type=int,
         metavar="N",
-        help=("number of data loading workers (default: 4)."
-              " The number of workers for PyTorch loader is doubled."),
+        help=(
+            "number of data loading workers (default: 4)."
+            " The number of workers for PyTorch loader is doubled."
+        ),
     )
     parser.add_argument(
         "--prefetch",
@@ -123,7 +120,9 @@ def add_parser_arguments(parser, skip_arch=False):
         default="gpu",
         type=str,
         choices=["cpu", "gpu"],
-        help=("The placement of DALI decode and random resized crop operations (default: gpu)"),
+        help=(
+            "The placement of DALI decode and random resized crop operations (default: gpu)"
+        ),
     )
     parser.add_argument(
         "--epochs",
@@ -180,13 +179,19 @@ def add_parser_arguments(parser, skip_arch=False):
         type=str,
         metavar="SCHEDULE",
         choices=["step", "linear", "cosine"],
-        help="Type of LR schedule: {}, {}, {}".format("step", "linear", "cosine"),
+        help="Type of LR schedule: {}, {}, {}".format(
+            "step", "linear", "cosine"
+        ),
     )
 
     parser.add_argument("--end-lr", default=0, type=float)
 
     parser.add_argument(
-        "--warmup", default=16, type=int, metavar="E", help="number of warmup epochs"
+        "--warmup",
+        default=16,
+        type=int,
+        metavar="E",
+        help="number of warmup epochs",
     )
 
     parser.add_argument(
@@ -260,7 +265,11 @@ def add_parser_arguments(parser, skip_arch=False):
         help="Static loss scale, positive power of 2 values can improve amp convergence.",
     )
     parser.add_argument(
-        "--prof", type=int, default=-1, metavar="N", help="Run only N iterations"
+        "--prof",
+        type=int,
+        default=-1,
+        metavar="N",
+        help="Run only N iterations",
     )
     parser.add_argument(
         "--amp",
@@ -269,7 +278,10 @@ def add_parser_arguments(parser, skip_arch=False):
     )
 
     parser.add_argument(
-        "--seed", default=None, type=int, help="random seed used for numpy and pytorch"
+        "--seed",
+        default=None,
+        type=int,
+        help="random seed used for numpy and pytorch",
     )
 
     parser.add_argument(
@@ -293,7 +305,9 @@ def add_parser_arguments(parser, skip_arch=False):
     parser.add_argument(
         "--evaluate", action="store_true", help="evaluate checkpoint/model"
     )
-    parser.add_argument("--training-only", action="store_true", help="do not evaluate")
+    parser.add_argument(
+        "--training-only", action="store_true", help="do not evaluate"
+    )
 
     parser.add_argument(
         "--no-checkpoints",
@@ -309,7 +323,9 @@ def add_parser_arguments(parser, skip_arch=False):
         help="no -> do not use torch.jit; script -> use torch.jit.script",
     )
 
-    parser.add_argument("--checkpoint-filename", default="checkpoint.pth.tar", type=str)
+    parser.add_argument(
+        "--checkpoint-filename", default="checkpoint.pth.tar", type=str
+    )
 
     parser.add_argument(
         "--workspace",
@@ -380,7 +396,9 @@ def prepare_for_training(args, model_args, model_arch):
         def _worker_init_fn(id):
             # Worker process should inherit its affinity from parent
             affinity = os.sched_getaffinity(0)
-            print(f"Process {args.local_rank} Worker {id} set affinity to: {affinity}")
+            print(
+                f"Process {args.local_rank} Worker {id} set affinity to: {affinity}"
+            )
 
             np.random.seed(seed=args.seed + args.local_rank + id)
             random.seed(args.seed + args.local_rank + id)
@@ -390,11 +408,15 @@ def _worker_init_fn(id):
         def _worker_init_fn(id):
             # Worker process should inherit its affinity from parent
             affinity = os.sched_getaffinity(0)
-            print(f"Process {args.local_rank} Worker {id} set affinity to: {affinity}")
+            print(
+                f"Process {args.local_rank} Worker {id} set affinity to: {affinity}"
+            )
 
     if args.static_loss_scale != 1.0:
         if not args.amp:
-            print("Warning: if --amp is not used, static_loss_scale will be ignored.")
+            print(
+                "Warning: if --amp is not used, static_loss_scale will be ignored."
+            )
 
     if args.optimizer_batch_size < 0:
         batch_size_multiplier = 1
@@ -416,7 +438,8 @@ def _worker_init_fn(id):
         if os.path.isfile(args.resume):
             print("=> loading checkpoint '{}'".format(args.resume))
             checkpoint = torch.load(
-                args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu)
+                args.resume,
+                map_location=lambda storage, loc: storage.cuda(args.gpu),
             )
             start_epoch = checkpoint["epoch"]
             best_prec1 = checkpoint["best_prec1"]
@@ -451,13 +474,17 @@ def _worker_init_fn(id):
         loss = lambda: LabelSmoothing(args.label_smoothing)
 
     memory_format = (
-        torch.channels_last if args.memory_format == "nhwc" else torch.contiguous_format
+        torch.channels_last
+        if args.memory_format == "nhwc"
+        else torch.contiguous_format
     )
     model = model_arch(
         **{
-            k: v
-            if k != "pretrained"
-            else v and (not args.distributed or dist.get_rank() == 0)
+            k: (
+                v
+                if k != "pretrained"
+                else v and (not args.distributed or dist.get_rank() == 0)
+            )
             for k, v in model_args.__dict__.items()
         }
     )
@@ -492,6 +519,10 @@ def _worker_init_fn(id):
         args.workers = args.workers * 2
         get_train_loader = get_pytorch_train_loader
        get_val_loader = get_pytorch_val_loader
+    elif args.data_backend == "pytorch_optimized":
+        args.workers = args.workers * 2
+        get_train_loader = get_pytorch_optimized_train_loader
+        get_val_loader = get_pytorch_optimize_val_loader
     elif args.data_backend == "dali":
         get_train_loader = get_dali_train_loader(dali_device=args.dali_device)
         get_val_loader = get_dali_val_loader()
@@ -515,7 +546,6 @@ def _worker_init_fn(id):
         _worker_init_fn=_worker_init_fn,
         memory_format=memory_format,
         prefetch_factor=args.prefetch,
-        typical_loader=args.typical_loader,
     )
     if args.mixup != 0.0:
         train_loader = MixUpWrapper(args.mixup, train_loader)
@@ -531,10 +561,12 @@ def _worker_init_fn(id):
         _worker_init_fn=_worker_init_fn,
         memory_format=memory_format,
         prefetch_factor=args.prefetch,
-        typical_loader=args.typical_loader,
     )
 
-    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
+    if (
+        not torch.distributed.is_initialized()
+        or torch.distributed.get_rank() == 0
+    ):
         logger = log.Logger(
             args.print_freq,
             [
@@ -624,9 +656,11 @@ def main(args, model_args, model_arch):
         val_loader,
         logger,
         start_epoch=start_epoch,
-        end_epoch=min((start_epoch + args.run_epochs), args.epochs)
-        if args.run_epochs != -1
-        else args.epochs,
+        end_epoch=(
+            min((start_epoch + args.run_epochs), args.epochs)
+            if args.run_epochs != -1
+            else args.epochs
+        ),
         early_stopping_patience=args.early_stopping_patience,
         best_prec1=best_prec1,
         prof=args.prof,
@@ -639,7 +673,10 @@ def main(args, model_args, model_arch):
         topk=args.topk,
     )
     exp_duration = time.time() - exp_start_time
-    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
+    if (
+        not torch.distributed.is_initialized()
+        or torch.distributed.get_rank() == 0
+    ):
         logger.end()
 
     print("Experiment ended")
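With the backend registered above, --data-backend pytorch_optimized selects the new factories and, like the plain pytorch backend, doubles --workers to compensate for the per-sample CPU work. A small hypothetical helper, not part of the patch, for spot-checking whichever backend ends up selected:

def describe_batches(loader, steps=2):
    # Print shape, dtype and mean of the first few batches a backend produces.
    for i, (images, labels) in enumerate(loader):
        print(images.shape, images.dtype, round(images.mean().item(), 3), labels.shape)
        if i + 1 >= steps:
            break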
diff --git a/qa/TL3_EfficientNet_benchmark/test_pytorch.sh b/qa/TL3_EfficientNet_benchmark/test_pytorch.sh
index abd7cbf4e5..c47f3a7c26 100644
--- a/qa/TL3_EfficientNet_benchmark/test_pytorch.sh
+++ b/qa/TL3_EfficientNet_benchmark/test_pytorch.sh
@@ -68,16 +68,16 @@ python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 -
 python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 13 --epochs 3 --no-checkpoints --training-only --data-backend dali --automatic-augmentation trivialaugment --workspace $RESULT_WORKSPACE --report-file bench_report_dali_ta.json $PATH_TO_IMAGENET
 
 # PyTorch without automatic augmentations
-python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --typical_loader --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --report-file bench_report_pytorch.json $PATH_TO_IMAGENET
+python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --report-file bench_report_pytorch.json $PATH_TO_IMAGENET
 
 # PyTorch with AutoAugment:
-python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --typical_loader --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation autoaugment --workspace $RESULT_WORKSPACE --report-file bench_report_pytorch_aa.json $PATH_TO_IMAGENET
+python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation autoaugment --workspace $RESULT_WORKSPACE --report-file bench_report_pytorch_aa.json $PATH_TO_IMAGENET
 
 # Optimized PyTorch without automatic augmentations
-python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --report-file bench_report_optimized_pytorch.json $PATH_TO_IMAGENET
+python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --epochs 3 --no-checkpoints --training-only --data-backend pytorch_optimized --automatic-augmentation disabled --workspace $RESULT_WORKSPACE --report-file bench_report_optimized_pytorch.json $PATH_TO_IMAGENET
 
 # Optimized PyTorch with AutoAugment:
-python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --epochs 3 --no-checkpoints --training-only --data-backend pytorch --automatic-augmentation autoaugment --workspace $RESULT_WORKSPACE --report-file bench_report_optimized_pytorch_aa.json $PATH_TO_IMAGENET
+python multiproc.py --nproc_per_node 8 ./main.py --amp --static-loss-scale 128 --batch-size 512 --workers 10 --epochs 3 --no-checkpoints --training-only --data-backend pytorch_optimized --automatic-augmentation autoaugment --workspace $RESULT_WORKSPACE --report-file bench_report_optimized_pytorch_aa.json $PATH_TO_IMAGENET
 
 # The line below finds the lines with `train.total_ips`, takes the last one (with the result we