aten/src/ATen/native/cpu/UpSampleKernel.cpp

#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Context.h>
#include <ATen/Dispatch.h>
#include <ATen/Parallel.h>
#include <ATen/TensorIterator.h>
#include <ATen/cpu/vec/vec.h>
#include <ATen/native/UpSample.h>
#include <ATen/native/cpu/utils.h>
#include <c10/util/irange.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/empty.h>
#include <ATen/ops/empty_native.h>
#include <ATen/ops/ones.h>
#endif

namespace at::native {
namespace {

using scale_t = std::vector<c10::optional<double>>;

// Helper structs and methods for cpu_upsample_linear
//
// Interpolation methods that used below are separable, and as such we can compute the interpolation
// independently per dimension in a recursive way. Please, refer to #10482 for more context.
//
// Linear Interpolation structure to compute output value in n-dimensional case.
// - recursively compute interpolated output for each dimension
// - we rely a lot on compiler's code optimization such that implemented operations
//   can be automatically factorized and vectorized using SSE and AVX2
template <int n, typename scalar_t, typename index_t, int interp_size>
struct Interpolate {
    static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) {
      index_t ids = *(index_t*)&data[0][i * strides[0]];
      scalar_t wts = *(scalar_t*)&data[1][i * strides[1]];
      scalar_t t = Interpolate<n - 1, scalar_t, index_t, interp_size>::eval(src + ids, &data[2 * interp_size], &strides[2 * interp_size], i);
      scalar_t output = t * wts;
      for (const auto j : c10::irange(1, interp_size)) {
        ids = *(index_t*)&data[2 * j + 0][i * strides[2 * j + 0]];
        wts = *(scalar_t*)&data[2 * j + 1][i * strides[2 * j + 1]];
        t = Interpolate<n - 1, scalar_t, index_t, interp_size>::eval(src + ids, &data[2 * interp_size], &strides[2 * interp_size], i);
        output += t * wts;
      }
      return output;
  }
};

template <typename scalar_t, typename index_t, int interp_size>
struct Interpolate<1, scalar_t, index_t, interp_size> {
    static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) {
      index_t ids = *(index_t*)&data[0][i * strides[0]];
      scalar_t wts = *(scalar_t*)&data[1][i * strides[1]];
      scalar_t t = *(scalar_t *)&src[ids];
      scalar_t output = t * wts;
      for (const auto j : c10::irange(1, interp_size)) {
        ids = *(index_t*)&data[2 * j + 0][i * strides[2 * j + 0]];
        wts = *(scalar_t*)&data[2 * j + 1][i * strides[2 * j + 1]];
        t = *(scalar_t *)&src[ids];
        output += t * wts;
      }
      return output;
    }
};

template <int n, typename scalar_t, typename index_t>
struct Interpolate<n, scalar_t, index_t, 1> {
    static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) {
      index_t ids = *(index_t*)&data[0][i * strides[0]];
      return Interpolate<n - 1, scalar_t, index_t, 1>::eval(src + ids, &data[2], &strides[2], i);
  }
};

template <typename scalar_t, typename index_t>
struct Interpolate<1, scalar_t, index_t, 1> {
    static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) {
      index_t ids = *(index_t*)&data[0][i * strides[0]];
      return *(scalar_t *)&src[ids];
    }
};

// There is an unexpected 2x slowdown for upsample_trilinear3d channels_first
// for both 1 and 6 threads. We have to specialize this case as below:
// Once the issue is fixed we can keep generic implementation and remove:
// struct Interpolate<n, scalar_t, index_t, 2> and
// struct Interpolate<1, scalar_t, index_t, 2>
template <int n, typename scalar_t, typename index_t>
struct Interpolate<n, scalar_t, index_t, 2> {
    static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) {
        index_t i0 = *(index_t*)&data[0][i * strides[0]];
        index_t i1 = *(index_t*)&data[2][i * strides[2]];
        scalar_t w0 = *(scalar_t *)&data[1][i * strides[1]];
        scalar_t w1 = *(scalar_t *)&data[3][i * strides[3]];

        scalar_t t0 = Interpolate<n - 1, scalar_t, index_t, 2>::eval(src + i0, &data[4], &strides[4], i);
        scalar_t t1 = Interpolate<n - 1, scalar_t, index_t, 2>::eval(src + i1, &data[4], &strides[4], i);

        return t0 * w0 + t1 * w1;
  }
};

template <typename scalar_t, typename index_t>
struct Interpolate<1, scalar_t, index_t, 2> {
    static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) {
        index_t i0 = *(index_t*)&data[0][i * strides[0]];
        index_t i1 = *(index_t*)&data[2][i * strides[2]];
        scalar_t w0 = *(scalar_t *)&data[1][i * strides[1]];
        scalar_t w1 = *(scalar_t *)&data[3][i * strides[3]];
        scalar_t t0 = *(scalar_t *)&src[i0];
        scalar_t t1 = *(scalar_t *)&src[i1];
        return t0 * w0 + t1 * w1;
    }
};

template <int n, typename scalar_t, typename index_t, int interp_size>
static inline scalar_t interpolate(char* src, char** data, const int64_t* strides, int64_t i) {
  return Interpolate<n, scalar_t, index_t, interp_size>::eval(src, data, strides, i);
}

template <typename scalar_t, typename index_t>
static inline scalar_t interpolate_aa_single_dim_zero_strides(
    char* src,
    char** data,
    const index_t ids_stride) {
  const index_t ids_min = *(index_t*)&data[0][0];
  const index_t ids_size = *(index_t*)&data[1][0];

  char* src_min = src + ids_min;

  scalar_t t = *(scalar_t*)&src_min[0];
  index_t wts_idx = *(index_t*)&data[4][0];
  scalar_t* wts_ptr = (scalar_t*)&data[3][wts_idx];
  scalar_t wts = wts_ptr[0];

  scalar_t output = t * wts;
  for (const auto j : c10::irange(1, ids_size)) {
    wts = wts_ptr[j];
    t = *(scalar_t*)&src_min[j * ids_stride];
    output += t * wts;
  }
  return output;
}

template <typename scalar_t, typename index_t>
static inline scalar_t interpolate_aa_single_dim(
    char* src,
    char** data,
    const int64_t* strides,
    int64_t i,
    const index_t ids_stride) {
  index_t ids_min = *(index_t*)&data[0][i * strides[0]];
  index_t ids_size = *(index_t*)&data[1][i * strides[1]];

  char* src_min = src + ids_min;

  scalar_t t = *(scalar_t*)&src_min[0];
  index_t wts_idx = *(index_t*)&data[4][i * strides[4]];
  scalar_t* wts_ptr = (scalar_t*)&data[3][wts_idx];
  scalar_t wts = wts_ptr[0];

  scalar_t output = t * wts;
  for (const auto j : c10::irange(1, ids_size)) {
    wts = wts_ptr[j];
    t = *(scalar_t*)&src_min[j * ids_stride];
    output += t * wts;
  }
  return output;
}

template<int m>
static inline bool is_zero_stride(const int64_t* strides) {
  bool output = strides[0] == 0;
  for (const auto i : c10::irange(1, m)) {
    output &= (strides[i] == 0);
  }
  return output;
}

template <typename scalar_t, typename index_t, int interp_size>
static inline bool is_contiguous_stride(const int64_t* strides) {
  bool output = (strides[0] == sizeof(index_t)) && (strides[1] == sizeof(scalar_t));
  for (int i=2; i<2 * interp_size; i+=2) {
    output &= (strides[i] == sizeof(index_t)) && (strides[i + 1] == sizeof(scalar_t));
  }
  return output;
}

// Helper class to recursively check if all input strides corresponding to interpolated dimensions
// are equal zero except on a single dimension.
//
// Inputs: array of strides of size N, non_zero_stride_dim which can be -1, 0, 1, 2, ...
//   if non_zero_stride_dim, we check that all strides are equal zero, otherwise
//   4 strides corresponding to the strides for index_0, weight_0, index_1 and weight_1 for non_zero_stride_dim
//   dimension should be non zero.
//
// Unit check of the recursion is to verify whether 4 strides for one interpolated dimension are either zero,
// see method is_zero_stride, or (sizeof(index_t), sizeof(scalar_t), sizeof(index_t), sizeof(scalar_t)), see
// method is_contiguous_stride.
//
// In practice, we have the following cases:
// - for ND, float32, channel first, strides are
//         dimN-1,              dim1,           dim0
//         i0, w0, i1, w1, ..., i0, w0, i1, w1, i0, w0, i1, w1
// strides=(0,  0,  0,  0, ...,  0,  0,  0,  0,  4,  4,  4,  4)
//
// if size dim0 is 1 then its strides are 0 and dim1 strides are equal 4
//
// - for ND, float32, channel last, strides are
//         dimN-1,         dimN-2,             dim0
//         i0, w0, i1, w1, i0, w0, i1, w1, ... i0, w0, i1, w1
// strides=(0,  0,  0,  0,  0,  0,  0,  0, ..., 0,  0,  0,  0)
//
// Using these methods we can hint the compiler to factorize constant indices and weights
// in cpu_upsample_linear method
template <int N, int non_zero_stride_dim, typename scalar_t, typename index_t, int interp_size>
struct CheckAlmostAllZeroStrides {
  static inline bool eval(const int64_t* strides) {
    // N is dim index: N -> dim0, N-1 -> dim1, ...
    // non_zero_stride_dim should be out_dims - dim
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    bool output;
    if (N == non_zero_stride_dim) {
      output = is_contiguous_stride<scalar_t, index_t, interp_size>(strides);
    } else {
      output = is_zero_stride<2 * interp_size>(strides);
    }
    return output &&
      CheckAlmostAllZeroStrides<N - 1, non_zero_stride_dim, scalar_t, index_t, interp_size>::eval(
        &strides[2 * interp_size]);
  }
};

template <int non_zero_stride_dim, typename scalar_t, typename index_t, int interp_size>
struct CheckAlmostAllZeroStrides<0, non_zero_stride_dim, scalar_t, index_t, interp_size> {
  static inline bool eval(const int64_t* /*strides*/) {
    return true;
  }
};

template <int n, int s, typename scalar_t, typename index_t, int interp_size>
static inline bool check_almost_all_zero_stride(const int64_t* strides) {
  return CheckAlmostAllZeroStrides<n, s, scalar_t, index_t, interp_size>::eval(strides);
}

// Helper method to compute interpolation for nearest, linear, cubic modes
template <typename scalar_t, typename index_t, int out_ndims, int interp_size>
static inline void basic_loop(char** data, const int64_t* strides, int64_t n) {
  char* dst = data[0];
  char* src = data[1];
  for (const auto i : c10::irange(n)) {
    *(scalar_t*)&dst[i * strides[0]] = interpolate<out_ndims, scalar_t, index_t, interp_size>(
        src + i * strides[1], &data[2], &strides[2], i);
  }
}

template <typename scalar_t, typename index_t>
static inline void basic_loop_aa_single_dim_zero_strides(
    char** data,
    const int64_t* strides,
    int64_t n) {
  char* dst = data[0];
  char* src = data[1];
  // index stride is constant for the given dimension
  const index_t ids_stride = *(index_t*)&data[2 + 2][0];

  for (const auto i : c10::irange(n)) {
    *(scalar_t*)&dst[i * strides[0]] =
        interpolate_aa_single_dim_zero_strides<scalar_t, index_t>(
            src + i * strides[1], &data[2], ids_stride);
  }
}

template <typename scalar_t, typename index_t>
static inline void basic_loop_aa_single_dim_nonzero_strides(
    char** data,
    const int64_t* strides,
    int64_t n) {
  char* dst = data[0];
  char* src = data[1];
  // index stride is constant for the given dimension
  const index_t ids_stride = *(index_t*)&data[2 + 2][0];

  if (strides[1] == 0) {
    for (const auto i : c10::irange(n)) {
      *(scalar_t*)&dst[i * strides[0]] =
          interpolate_aa_single_dim<scalar_t, index_t>(
              src, &data[2], &strides[2], i, ids_stride);
    }
  } else {
    for (const auto i : c10::irange(n)) {
      *(scalar_t*)&dst[i * strides[0]] =
          interpolate_aa_single_dim<scalar_t, index_t>(
              src + i * strides[1], &data[2], &strides[2], i, ids_stride);
    }
  }
}

// Generic upsampling computation method using TensorIterator for Nd case.
// Supports: nearest, linear, cubic modes with interp_size template argument: 1, 2, 4
//
// Single loop function for 1d, 2d and 3d cases and modes
// For N dimensions, output value up to Di dimension can be computed as
//
// output_i[a] = interpolate(output_{i+1}[a], w_{i+1}[a], output_{i+1}[a+1], w_{i+1}[a+1], ...)
// with
// output_DN[a] = interpolate(input_DN[a], w_DN[a], input_DN[a+1], w_DN[a+1], ...)
// and i - dimension index and a - linear index for spatial coordinates
//
// The recursive call is implemented with InterpLinear struct using template for
// the loop unrolling on compile time.
template <typename scalar_t, int out_ndims, int interp_size>
void cpu_upsample_generic(at::TensorIterator& iter)
{
  auto loop = [&](char** data, const int64_t* strides, int64_t n) {
    // special-cases to let the compiler apply compile-time input-specific optimizations
    if ((strides[0] == sizeof(scalar_t) && (strides[1] == 0) &&
        // NOLINTNEXTLINE(bugprone-branch-clone)
        check_almost_all_zero_stride<out_ndims, 1, scalar_t, int64_t, interp_size>(&strides[2]))) {
      // contiguous channels-first case
      basic_loop<scalar_t, int64_t, out_ndims, interp_size>(data, strides, n);
    } else if ((strides[0] == sizeof(scalar_t) && (strides[1] == sizeof(scalar_t)) &&
               check_almost_all_zero_stride<out_ndims, -1, scalar_t, int64_t, interp_size>(&strides[2]))) {
      // contiguous channels-last case
      basic_loop<scalar_t, int64_t, out_ndims, interp_size>(data, strides, n);
    } else {
      // fallback
      basic_loop<scalar_t, int64_t, out_ndims, interp_size>(data, strides, n);
    }
  };
  iter.for_each(loop);
}

template <typename scalar_t, typename scale_type, nearest_idx_fn_t nearest_idx_fn>
void cpu_upsample_nearest_channels_last(
    const Tensor& output_,
    const Tensor& input_,
    const scale_type& scales) {
  TORCH_CHECK(input_.dtype() == output_.dtype(), "expected dtype ", input_.dtype(),
              " for `output` but got dtype ", output_.dtype());

  auto input_sizes = input_.sizes().vec();
  auto output_sizes = output_.sizes().vec();
  auto ndim = input_sizes.size();
  TORCH_CHECK(ndim >=4 && ndim <= 5, "Upsample with NHWC format supports tensors with 4 or 5 dims.")

  auto channels_last_memory_format = ndim == 4 ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::ChannelsLast3d;
  auto input = input_.contiguous(channels_last_memory_format);
  auto output = output_.contiguous(channels_last_memory_format);

  auto input_data = input.data_ptr<scalar_t>();
  auto output_data = output.data_ptr<scalar_t>();

  int64_t num_batches =  input_sizes[0];
  int64_t channels =  input_sizes[1];
  int64_t input_depth = (ndim == 5) ? input_sizes[2] : 1;
  int64_t output_depth = (ndim == 5) ? output_sizes[2] : 1;
  int64_t input_height = (ndim >= 4) ? input_sizes[ndim - 2] : 1;
  int64_t output_height = (ndim >= 4) ? output_sizes[ndim - 2] : 1;
  int64_t input_width = input_sizes[ndim - 1];
  int64_t output_width = output_sizes[ndim - 1];
  int64_t numel = output.numel();

  TORCH_CHECK(channels > 0, "expected input and output channels greater than 0 but got ", channels);

  using Vec = vec::Vectorized<scalar_t>;
  auto copy = [](scalar_t* out, scalar_t* in, int64_t size) {
    int64_t d = 0;
    for (; d < size - (size % Vec::size()); d += Vec::size()) {
      Vec out_vec = Vec::loadu(in + d);
      out_vec.store(out + d);
    }
    for (; d < size; d++) {
      out[d] = in[d];
    }
  };

  auto loop2d = [&](int64_t begin, int64_t end) {
    int64_t n = 0;
    int64_t oh = 0;
    int64_t ow = 0;
    data_index_init(begin, n, num_batches, oh, output_height, ow, output_width);

    for (const auto i : c10::irange(begin, end)) {
      int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[0]);
      int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[1]);
      scalar_t* output_ptr = output_data + i * channels;
      scalar_t* input_ptr = input_data + n * input_height * input_width * channels +
          ih * input_width * channels + iw * channels;
      copy(output_ptr, input_ptr, channels);
      data_index_step(n, num_batches, oh, output_height, ow, output_width);
    }
  };

  auto loop3d = [&](int64_t begin, int64_t end) {
    int64_t n = 0;
    int64_t od = 0;
    int64_t oh = 0;
    int64_t ow = 0;
    data_index_init(begin, n, num_batches, od, output_depth, oh, output_height, ow, output_width);

    for (const auto i : c10::irange(begin, end)) {
      int64_t id = nearest_idx_fn(od, input_depth, output_depth, scales[0]);
      int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[1]);
      int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[2]);
      scalar_t* output_ptr = output_data + i * channels;
      scalar_t* input_ptr = input_data + n * input_depth * input_height * input_width * channels +
          id * input_height * input_width * channels +
          ih * input_width * channels + iw * channels;
      copy(output_ptr, input_ptr, channels);
      data_index_step(n, num_batches, od, output_depth, oh, output_height, ow, output_width);
    }
  };

  if (ndim == 4) {
    // upsample nearest 2d
    at::parallel_for(0, numel / channels, at::internal::GRAIN_SIZE / channels, loop2d);
  } else {
    // upsample nearest 3d
    TORCH_INTERNAL_ASSERT(ndim == 5);
    at::parallel_for(0, numel / channels, at::internal::GRAIN_SIZE / channels, loop3d);
  }

  if (!output_.is_contiguous(channels_last_memory_format)) {
    output_.copy_(output);
  }
}

template <typename scalar_t, typename accscalar_t>
inline VecType<scalar_t> interpolate(const scalar_t* t, accscalar_t w) {
  return VecType<scalar_t>::loadu(t) * VecType<scalar_t>(w);
}

template <typename scalar_t, typename accscalar_t, typename... Args>
inline VecType<scalar_t> interpolate(const scalar_t* t, accscalar_t w, Args... args) {
  return VecType<scalar_t>::loadu(t) * VecType<scalar_t>(w) + interpolate(args...);
}

template <typename scalar_t, typename scale_type>
void cpu_upsample_linear_channels_last(
    const Tensor& output_,
    const Tensor& input_,
    bool align_corners,
    const scale_type& scales) {
  TORCH_CHECK(input_.dtype() == output_.dtype(), "expected dtype ", input_.dtype(),
              " for `output` but got dtype ", output_.dtype());

  auto input_sizes = input_.sizes().vec();
  auto output_sizes = output_.sizes().vec();
  auto ndim = input_sizes.size();
  TORCH_CHECK(ndim >=4 && ndim <= 5, "Upsample with NHWC format supports tensors with 4 or 5 dims.")

  auto channels_last_memory_format = ndim == 4 ? at::MemoryFormat::ChannelsLast : at::MemoryFormat::ChannelsLast3d;
  auto input = input_.contiguous(channels_last_memory_format);
  auto output = output_.contiguous(channels_last_memory_format);

  auto input_data = input.data_ptr<scalar_t>();
  auto output_data = output.data_ptr<scalar_t>();

  int64_t num_batches =  input_sizes[0];
  int64_t channels =  input_sizes[1];
  int64_t input_depth = (ndim == 5) ? input_sizes[2] : 1;
  int64_t output_depth = (ndim == 5) ? output_sizes[2] : 1;
  int64_t input_height = (ndim >= 4) ? input_sizes[ndim - 2] : 1;
  int64_t output_height = (ndim >= 4) ? output_sizes[ndim - 2] : 1;
  int64_t input_width = input_sizes[ndim - 1];
  int64_t output_width = output_sizes[ndim - 1];

  TORCH_CHECK(channels > 0, "expected input and output channels greater than 0 but got ", channels);
  int64_t output_slice_size = output_depth * output_height * output_width * channels;

  using opmath_t = at::opmath_type<scalar_t>;
  using Vec = vec::Vectorized<scalar_t>;
  auto loop2d = [&](int64_t begin, int64_t end) {
    const auto height_scale = area_pixel_compute_scale<opmath_t>(
        input_height, output_height, align_corners, scales[0]);
    const auto width_scale = area_pixel_compute_scale<opmath_t>(
        input_width, output_width, align_corners, scales[1]);

    auto input_indexr = [=](int64_t n, int64_t h, int64_t w) {
      return input_data + n * input_height * input_width * channels +
          h * input_width * channels + w * channels;
    };

    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    int64_t ih0, ih1, iw0, iw1;
    opmath_t h0lambda, h1lambda, w0lambda, w1lambda;
    for (const auto n : c10::irange(begin, end)) {
      for (const auto oh : c10::irange(output_height)) {
        compute_source_index_and_lambda(
            ih0, ih1, h0lambda, h1lambda, height_scale, oh, input_height, output_height, align_corners);
        for (const auto ow : c10::irange(output_width)) {
          compute_source_index_and_lambda(
              iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners);

          scalar_t* out = output_data + n * output_slice_size +
              oh * output_width * channels + ow * channels;
          scalar_t* i00 = input_indexr(n, ih0, iw0);
          scalar_t* i01 = input_indexr(n, ih0, iw1);
          scalar_t* i10 = input_indexr(n, ih1, iw0);
          scalar_t* i11 = input_indexr(n, ih1, iw1);
          opmath_t w00 = h0lambda * w0lambda;
          opmath_t w01 = h0lambda * w1lambda;
          opmath_t w10 = h1lambda * w0lambda;
          opmath_t w11 = h1lambda * w1lambda;

          int64_t size = channels;
          int64_t d = 0;
          for (; d < size - (size % Vec::size()); d += Vec::size()) {
            auto out_vec = interpolate(i00 + d, w00, i01 + d, w01, i10 + d, w10, i11 + d, w11);
            out_vec.store(out + d);
          }
          for (; d < size; d++) {
            out[d] = i00[d] * w00 + i01[d] * w01 + i10[d] * w10 + i11[d] * w11;
          }
        }
      }
    }
  };

  auto loop3d = [&](int64_t begin, int64_t end) {
    const auto depth_scale = area_pixel_compute_scale<opmath_t>(
        input_depth, output_depth, align_corners, scales[0]);
    const auto height_scale = area_pixel_compute_scale<opmath_t>(
        input_height, output_height, align_corners, scales[1]);
    const auto width_scale = area_pixel_compute_scale<opmath_t>(
        input_width, output_width, align_corners, scales[2]);

    auto input_indexr = [=](int64_t n, int64_t d, int64_t h, int64_t w) {
      return input_data + n * input_depth * input_height * input_width * channels +
          d * input_height * input_width * channels +
          h * input_width * channels + w * channels;
    };

    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    int64_t id0, id1, ih0, ih1, iw0, iw1;
    opmath_t d0lambda, d1lambda, h0lambda, h1lambda, w0lambda, w1lambda;
    for (const auto n : c10::irange(begin, end)) {
      for (const auto od : c10::irange(output_depth)) {
        compute_source_index_and_lambda(
            id0, id1, d0lambda, d1lambda, depth_scale, od, input_depth, output_depth, align_corners);
        for (const auto oh : c10::irange(output_height)) {
          compute_source_index_and_lambda(
              ih0, ih1, h0lambda, h1lambda, height_scale, oh, input_height, output_height, align_corners);
          for (const auto ow : c10::irange(output_width)) {
            compute_source_index_and_lambda(
                iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners);

            scalar_t* out = output_data + n * output_slice_size +
                od * output_height * output_width * channels +
                oh * output_width * channels + ow * channels;
            scalar_t* i000 = input_indexr(n, id0, ih0, iw0);
            scalar_t* i001 = input_indexr(n, id0, ih0, iw1);
            scalar_t* i010 = input_indexr(n, id0, ih1, iw0);
            scalar_t* i011 = input_indexr(n, id0, ih1, iw1);
            scalar_t* i100 = input_indexr(n, id1, ih0, iw0);
            scalar_t* i101 = input_indexr(n, id1, ih0, iw1);
            scalar_t* i110 = input_indexr(n, id1, ih1, iw0);
            scalar_t* i111 = input_indexr(n, id1, ih1, iw1);
            opmath_t w000 = d0lambda * h0lambda * w0lambda;
            opmath_t w001 = d0lambda * h0lambda * w1lambda;
            opmath_t w010 = d0lambda * h1lambda * w0lambda;
            opmath_t w011 = d0lambda * h1lambda * w1lambda;
            opmath_t w100 = d1lambda * h0lambda * w0lambda;
            opmath_t w101 = d1lambda * h0lambda * w1lambda;
            opmath_t w110 = d1lambda * h1lambda * w0lambda;
            opmath_t w111 = d1lambda * h1lambda * w1lambda;

            int64_t size = channels;
            int64_t d = 0;
            for (; d < size - (size % Vec::size()); d += Vec::size()) {
              auto out_vec = interpolate(
                  i000 + d, w000, i001 + d, w001, i010 + d, w010, i011 + d, w011,
                  i100 + d, w100, i101 + d, w101, i110 + d, w110, i111 + d, w111);
              out_vec.store(out + d);
            }
            for (; d < size; d++) {
              out[d] =
                  i000[d] * w000 + i001[d] * w001 + i010[d] * w010 + i011[d] * w011 +
                  i100[d] * w100 + i101[d] * w101 + i110[d] * w110 + i111[d] * w111;
            }
          }
        }
      }
    }
  };

  if (ndim == 4) {
    // upsample nearest 2d
    at::parallel_for(0, num_batches, at::internal::GRAIN_SIZE / output_slice_size / 4, loop2d);
  } else {
    // upsample nearest 3d
    TORCH_INTERNAL_ASSERT(ndim == 5);
    at::parallel_for(0, num_batches, at::internal::GRAIN_SIZE / output_slice_size / 8, loop3d);
  }

  if (!output_.is_contiguous(channels_last_memory_format)) {
    output_.copy_(output);
  }
}

// Helper structs to use with upsample_generic_Nd_kernel_impl
struct HelperInterpBase {

  static inline void init_indices_weights(
    at::ScalarType output_type,
    std::vector<Tensor> & output, int64_t output_size, int64_t ndims,
    int64_t reshape_dim, int interp_size
  ) {

    auto new_shape = std::vector<int64_t>(ndims, 1);
    new_shape[reshape_dim] = output_size;

    for (const auto j C10_UNUSED : c10::irange(interp_size)) {
      output.emplace_back(empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>())));
      output.emplace_back(empty(new_shape, CPU(output_type)));
    }
  }

  template <typename scalar_t, typename aa_filter_fn_t>
  static inline void _compute_weights_aa(
    const int64_t i, const int64_t input_size, const scalar_t scale, const scalar_t support,
    scalar_t* wt_ptr, const int64_t interp_size, aa_filter_fn_t filter_fn,
    int64_t& xmin, int64_t& xsize
  ) {

    scalar_t center = scale * (i + 0.5);
    scalar_t total_w = 0.0;
    scalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0;
    xmin = std::max(
        static_cast<int64_t>(center - support + 0.5), static_cast<int64_t>(0));
    xsize = std::min(static_cast<int64_t>(center + support + 0.5), input_size) -
        xmin;

    int64_t j = 0;
    for (; j < xsize; j++) {
      scalar_t w = filter_fn((j + xmin - center + 0.5) * invscale);
      wt_ptr[j] = w;
      total_w += w;
    }
    for (j = 0; j < xsize; j++) {
      if (total_w != 0.0) {
        wt_ptr[j] /= total_w;
      }
    }
    for (; j < interp_size; j++) {
      wt_ptr[j] = static_cast<scalar_t>(0.0);
    }
  }

  template <typename scalar_t, typename aa_filter_fn_t>
  static inline std::vector<Tensor> _compute_indices_weights_aa(
    int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims,
    int64_t reshape_dim, scalar_t scale,
    int interp_size, aa_filter_fn_t aa_filter_fn
  ) {

    std::vector<Tensor> output;

    scalar_t support =
        (scale >= 1.0) ? (interp_size * 0.5) * scale : interp_size * 0.5;
    interp_size = (int)ceilf(support) * 2 + 1;

    auto new_shape = std::vector<int64_t>(ndims, 1);
    new_shape[reshape_dim] = output_size;

    // Bounds approach as in PIL: xmin/xmax
    output.emplace_back(
        empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>())));
    output.emplace_back(
        empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>())));
    output.emplace_back(
        empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>())));

    {
      // Weights
      new_shape[reshape_dim] = output_size * interp_size;
      auto wts = empty(new_shape, CPU(c10::CppTypeToScalarType<scalar_t>()));
      auto strides = wts.strides().vec();
      strides[reshape_dim] = 0;
      new_shape[reshape_dim] = output_size;
      wts = wts.as_strided(new_shape, strides);
      output.emplace_back(wts);
      // Weights indices
      output.emplace_back(
          empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>())));
    }

    int64_t* idx_ptr_xmin = output[0].data_ptr<int64_t>();
    int64_t* idx_ptr_size = output[1].data_ptr<int64_t>();
    int64_t* idx_ptr_stride = output[2].data_ptr<int64_t>();
    scalar_t* wt_ptr = output[3].data_ptr<scalar_t>();
    int64_t* wt_idx_ptr = output[4].data_ptr<int64_t>();

    int64_t xmin, xmax;

    for (const auto i : c10::irange(output_size)) {
      HelperInterpBase::_compute_weights_aa(
          i,
          input_size,
          scale,
          support,
          wt_ptr + i * interp_size,
          interp_size,
          aa_filter_fn,
          xmin,
          xmax);

      idx_ptr_xmin[i] = xmin * stride;
      idx_ptr_size[i] = xmax;
      idx_ptr_stride[i] = stride;
      wt_idx_ptr[i] = i * interp_size * sizeof(scalar_t);
    }
    return output;
  }

};

struct HelperInterpNearest : public HelperInterpBase {
  // This structure implements outdated and buggy method to compute indices
  // for nearest neighbours interpolation
  // We keep this structure for BC and consider as deprecated.
  // See HelperInterpNearestExact as replacement

  static const int interp_size = 1;

  static inline void init_indices_weights(
    at::ScalarType output_type,
    std::vector<Tensor> & output, int64_t output_size, int64_t ndims,
    int64_t reshape_dim, int interp_size
  ) {
    auto new_shape = std::vector<int64_t>(ndims, 1);
    new_shape[reshape_dim] = output_size;

    for (const auto j C10_UNUSED : c10::irange(interp_size)) {
      output.emplace_back(empty(new_shape, CPU(c10::CppTypeToScalarType<int64_t>())));
      // Defines weights for consistency, but not used
      output.emplace_back(at::ones(new_shape, CPU(output_type)));
    }
  }

  // Compute nearest mode indices and weights for each interpolated dimension
  // indices_weights = {
  //      {indices_0, 1.0, },  // dim -n
  //      {indices_0, 1.0, },  // dim -(n-1)
  //      ...
  //      {indices_0, 1.0, },  // dim -1
  // }
  // Indices and weights are reshaped as (1, 1, ..., N, ..., 1, 1) to
  // fit input/output tensors.
  // Indices are already containing the strides to optimize the computations
  static inline std::vector<Tensor> compute_indices_weights(
    at::ScalarType scalar_type,
    int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims,
    int64_t reshape_dim, bool align_corners, const c10::optional<double> opt_scale
  ) {

    TORCH_INTERNAL_ASSERT(!align_corners);
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    std::vector<Tensor> output;
    HelperInterpNearest::init_indices_weights(
      scalar_type, output, output_size, ndims, reshape_dim, HelperInterpNearest::interp_size);

    AT_DISPATCH_FLOATING_TYPES_AND(
      ScalarType::BFloat16, scalar_type, "compute_indices_weights_nearest", [&] {
        scalar_t scale = area_pixel_compute_scale<scalar_t>(input_size, output_size, align_corners, opt_scale);

        auto input_index_ptr = output[0].data_ptr<int64_t>();
        int64_t input_index;

        // Indices are computed as following:
        // scale = 1.0 * isize / osize
        // index_f32 = (output_index) * scale
        // input_index = floor(index_f32)
        // Same as OpenCV INTER_NEAREST
        using opmath_t = at::opmath_type<scalar_t>;
        for (const auto i : c10::irange(output_size)) {
          const auto real_input_index =
              area_pixel_compute_source_index<opmath_t>(
                  scale, i, /*align_corners=*/true, /*cubic=*/false);
          input_index = static_cast<int64_t>(floorf(real_input_index));
          input_index_ptr[i] = static_cast<int64_t>(std::min(input_index, input_size - 1)) * stride;
        }
      }
    );
    return output;
  }

};

struct HelperInterpNearestExact : public HelperInterpNearest {

  // Compute nearest mode indices and weights for each interpolated dimension
  // indices_weights = {
  //      {indices_0, 1.0, },  // dim -n
  //      {indices_0, 1.0, },  // dim -(n-1)
  //      ...
  //      {indices_0, 1.0, },  // dim -1
  // }
  // Indices and weights are reshaped as (1, 1, ..., N, ..., 1, 1) to
  // fit input/output tensors.
  // Indices are already containing the strides to optimize the computations
  static inline std::vector<Tensor> compute_indices_weights(
    at::ScalarType scalar_type,
    int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims,
    int64_t reshape_dim, bool align_corners, const c10::optional<double> opt_scale
  ) {

    TORCH_INTERNAL_ASSERT(!align_corners);
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    std::vector<Tensor> output;
    HelperInterpNearest::init_indices_weights(
      scalar_type, output, output_size, ndims, reshape_dim, HelperInterpNearest::interp_size);

    AT_DISPATCH_FLOATING_TYPES(
      scalar_type, "compute_indices_weights_nearest", [&] {
        scalar_t scale = area_pixel_compute_scale<scalar_t>(input_size, output_size, align_corners, opt_scale);

        auto input_index_ptr = output[0].data_ptr<int64_t>();
        int64_t input_index;

        // Indices should be computed as following:
        // scale = 1.0 * isize / osize
        // index_f32 = (output_index + 0.5) * scale - 0.5
        // input_index = round(index_f32)
        // Same as Pillow and Scikit-Image/Scipy ndi.zoom
        using opmath_t = at::opmath_type<scalar_t>;
        for (const auto i : c10::irange(output_size)) {
          const auto real_input_index =
              area_pixel_compute_source_index<opmath_t>(
                  scale, i, /*align_corners=*/align_corners, /*cubic=*/false);
          input_index = static_cast<int64_t>(floorf(real_input_index + 0.5));
          input_index_ptr[i] = static_cast<int64_t>(std::min(input_index, input_size - 1)) * stride;
        }
      }
    );
    return output;
  }
};

struct HelperInterpLinear : public HelperInterpBase {

  static const int interp_size = 2;

  // Compute indices and weights for each interpolated dimension
  // indices_weights = {
  //      {indices_0, weights_0, indices_1, weights_1},  // dim -n
  //      {indices_0, weights_0, indices_1, weights_1},  // dim -(n-1)
  //      ...
  //      {indices_0, weights_0, indices_1, weights_1},  // dim -1
  // }
  // Indices and weights are reshaped as (1, 1, ..., N, ..., 1, 1) to
  // fit input/output tensors.
  // Indices are already containing the strides to optimize the computations
  static inline std::vector<Tensor> compute_indices_weights(
    at::ScalarType scalar_type,
    int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, int64_t reshape_dim,
    bool align_corners, const c10::optional<double> opt_scale
  ) {
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    std::vector<Tensor> output;
    HelperInterpLinear::init_indices_weights(
      scalar_type, output, output_size, ndims, reshape_dim, HelperInterpLinear::interp_size);
    AT_DISPATCH_FLOATING_TYPES_AND(
      ScalarType::BFloat16, scalar_type, "compute_indices_weights_linear", [&] {
        scalar_t scale = area_pixel_compute_scale<scalar_t>(input_size, output_size, align_corners, opt_scale);

        auto input_index0_ptr = output[0].data_ptr<int64_t>();
        auto lambda0_ptr = output[1].data_ptr<scalar_t>();
        auto input_index1_ptr = output[2].data_ptr<int64_t>();
        auto lambda1_ptr = output[3].data_ptr<scalar_t>();

        for (const auto i : c10::irange(output_size)) {

          compute_source_index_and_lambda<scalar_t>(
            input_index0_ptr[i], input_index1_ptr[i],
            lambda0_ptr[i], lambda1_ptr[i],
            scale, i, input_size, output_size, align_corners
          );
          // put stride into indices
          // index values correspond to input indices (0, 1, 2, 3, ...)
          // when multiplied by input stride, maximum possible value
          // input_size[dim-1] * input_size[dim-2] * ... for the given dimension.
          input_index0_ptr[i] *= stride;
          input_index1_ptr[i] *= stride;
        }
      }
    );
    return output;
  }

  // taken from
  // https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/
  // src/libImaging/Resample.c#L20-L29
  template<typename scalar_t>
  static inline scalar_t aa_filter(scalar_t x) {
    if (x < 0.0) {
      x = -x;
    }
    if (x < 1.0) {
      return 1.0 - x;
    }
    return 0.0;
  }

  static inline std::vector<Tensor> compute_indices_weights_aa(
    at::ScalarType scalar_type,
    int64_t input_size,
    int64_t output_size,
    int64_t stride,
    int64_t ndims,
    int64_t reshape_dim,
    bool align_corners,
    const c10::optional<double> opt_scale
  ) {

    std::vector<Tensor> indices_weights;
    AT_DISPATCH_FLOATING_TYPES(
      scalar_type, "compute_indices_weights_aa", [&] {

        scalar_t scale = area_pixel_compute_scale<scalar_t>(
            input_size, output_size, align_corners, opt_scale);

        auto interp_size = HelperInterpLinear::interp_size;

        indices_weights = HelperInterpLinear::_compute_indices_weights_aa<scalar_t>(
            input_size,
            output_size,
            stride,
            ndims,
            reshape_dim,
            scale,
            interp_size,
            &HelperInterpLinear::aa_filter<scalar_t>);
      }
    );
    return indices_weights;
  }
};

struct HelperInterpCubic : public HelperInterpBase {

  static const int interp_size = 4;

  // Compute indices and weights for each interpolated dimension
  // indices_weights = {
  //      {indices_0, weights_0, indices_1, weights_1, ..., indices_3, weights_3},  // dim -n
  //      {indices_0, weights_0, indices_1, weights_1, ..., indices_3, weights_3},  // dim -(n-1)
  //      ...
  //      {indices_0, weights_0, indices_1, weights_1, ..., indices_3, weights_3},  // dim -1
  // }
  // Indices and weights are reshaped as (1, 1, ..., N, ..., 1, 1) to
  // fit input/output tensors.
  // Indices are already containing the strides to optimize the computations
  static inline std::vector<Tensor> compute_indices_weights(
    at::ScalarType scalar_type,
    int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, int64_t reshape_dim,
    bool align_corners, const c10::optional<double> opt_scale
  ) {
    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    std::vector<Tensor> output;
    HelperInterpCubic::init_indices_weights(
      scalar_type, output, output_size, ndims, reshape_dim, HelperInterpCubic::interp_size);

    AT_DISPATCH_FLOATING_TYPES_AND(
      ScalarType::BFloat16, scalar_type, "compute_indices_weights_cubic", [&] {
        scalar_t scale = area_pixel_compute_scale<scalar_t>(input_size, output_size, align_corners, opt_scale);

        int64_t input_index;
        int64_t zero = static_cast<int64_t>(0);
        // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
        scalar_t coeffs[4];

        int64_t * idx_ptr;
        scalar_t * wt_ptr;
        using opmath_t = at::opmath_type<scalar_t>;
        for (const auto i : c10::irange(output_size)) {
          const auto real_input_index =
              area_pixel_compute_source_index<opmath_t>(
                  scale, i, align_corners, /*cubic=*/true);
          input_index = static_cast<int64_t>(floorf(real_input_index));
          get_cubic_upsample_coefficients<scalar_t>(coeffs, real_input_index - input_index);

          for (const auto j : c10::irange(interp_size)) {
            idx_ptr = output[2 * j + 0].data_ptr<int64_t>();
            idx_ptr[i] = static_cast<int64_t>(std::max(std::min(input_index + j - 1, input_size - 1), zero)) * stride;
            wt_ptr = output[2 * j + 1].data_ptr<scalar_t>();
            wt_ptr[i] = coeffs[j];
          }
        }
      }
    );
    return output;
  }

  // taken from
  // https://github.com/python-pillow/Pillow/blob/6812205f18ca4ef54372e87e1a13ce4a859434df/
  // src/libImaging/Resample.c#L46-L62
  template<typename scalar_t>
  static inline scalar_t aa_filter(scalar_t x) {
    // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
#define a -0.5
    if (x < 0.0) {
      x = -x;
    }
    if (x < 1.0) {
      return ((a + 2.0) * x - (a + 3.0)) * x * x + 1;
    }
    if (x < 2.0) {
      return (((x - 5) * x + 8) * x - 4) * a;
    }
    return 0.0;
#undef a
  }

  static inline std::vector<Tensor> compute_indices_weights_aa(
    at::ScalarType scalar_type,
    int64_t input_size,
    int64_t output_size,
    int64_t stride,
    int64_t ndims,
    int64_t reshape_dim,
    bool align_corners,
    const c10::optional<double> opt_scale
  ) {

    std::vector<Tensor> indices_weights;
    AT_DISPATCH_FLOATING_TYPES(
      scalar_type, "compute_indices_weights_aa", [&] {

        scalar_t scale = area_pixel_compute_scale<scalar_t>(
            input_size, output_size, align_corners, opt_scale);

        auto interp_size = HelperInterpCubic::interp_size;

        indices_weights = HelperInterpCubic::_compute_indices_weights_aa<scalar_t>(
            input_size,
            output_size,
            stride,
            ndims,
            reshape_dim,
            scale,
            interp_size,
            &HelperInterpCubic::aa_filter<scalar_t>);
      }
    );
    return indices_weights;
  }
};

// Generic upsampling interpolation kernel for N-d case.
// Input is assumed to be like NCHW, NCL, NCKHW - interpolated spatial dimension
// are those from the end up to batch size N and number of channels C.
//
// Internally, it uses TensorIterator to optimize the computations.
// - out_ndims is the number of interpolated dims: 1, 2, 3
// - scale_type is template type for scales, typically c10::optional<double>
// - template<typename> class F is one of the above structs to compute indices and weights
template <int out_ndims, typename scale_type, class F>
void upsample_generic_Nd_kernel_impl(
    const Tensor& output,
    const Tensor& input,
    bool align_corners,
    const scale_type& scales) {

  // input can be NCHW, NCL or NCKHW
  auto shape = input.sizes().vec();
  auto strides = input.strides().vec();
  auto oshape = output.sizes();

  TORCH_INTERNAL_ASSERT(
    shape.size() == oshape.size() && shape.size() == 2 + out_ndims
  );
  TORCH_INTERNAL_ASSERT(strides.size() == 2 + out_ndims);

  for (const auto i : c10::irange(out_ndims)) {
    shape[i + 2] = oshape[i + 2];
    strides[i + 2] = 0;
  }
  auto restrided_input = input.as_strided(shape, strides);

  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  std::vector<std::vector<Tensor>> indices_weights;

  constexpr int interp_size = F::interp_size;
  auto input_scalar_type = input.scalar_type();
  if ((interp_size == 1 && input_scalar_type == at::ScalarType::Byte)) {
    // nearest also supports uint8 tensor, but we have to use float
    // with compute_indices_weights
    input_scalar_type = at::ScalarType::Float;
  }

  for (const auto i : c10::irange(out_ndims)) {
    // NOLINTNEXTLINE(performance-inefficient-vector-operation)
    indices_weights.emplace_back(
      F::compute_indices_weights(
        input_scalar_type, input.size(i + 2), oshape[i + 2],
        input.stride(i + 2) * input.element_size(),
        input.dim(), i + 2, align_corners, scales[i]
      )
    );
  }

  TensorIteratorConfig config;
  config.check_all_same_dtype(false)
    .declare_static_dtype_and_device(input.scalar_type(), input.device())
    .add_output(output)
    .add_input(restrided_input);

  for (auto & idx_weight: indices_weights) {
    for (auto& tensor : idx_weight) {
      config.add_input(tensor);
    }
  }

  auto iter = config.build();

  if (interp_size > 1) {
    // Nearest also supports uint8 tensor, so need to handle it separately
    AT_DISPATCH_FLOATING_TYPES_AND(
        at::ScalarType::BFloat16, iter.dtype(), "upsample_generic_Nd", [&] {
        // MSVC can not catch constexpr int interp_size here
        constexpr int mode = F::interp_size;
        cpu_upsample_generic<scalar_t, out_ndims, mode>(iter);
    });
  } else {
    AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16,
        iter.dtype(), "upsample_generic_Nd", [&] {
        constexpr int mode = F::interp_size;
        cpu_upsample_generic<scalar_t, out_ndims, mode>(iter);
    });
  }
}

template <typename scalar_t>
void cpu_upsample_generic_aa(at::TensorIterator& iter) {

  auto loop = [&](char** data, const int64_t* strides, int64_t n) {
    if ((strides[0] == sizeof(scalar_t)) && (strides[1] == sizeof(scalar_t)) &&
        is_zero_stride<3 + 2>(&strides[2])) {
      basic_loop_aa_single_dim_zero_strides<scalar_t, int64_t>(
          data, strides, n);
    } else {
      basic_loop_aa_single_dim_nonzero_strides<scalar_t, int64_t>(
          data, strides, n);
    }
  };

  iter.for_each(loop);
}

// Generic separable upsampling interpolation kernels for N-d case with anti-aliasing
template <int out_ndims, typename scale_type, class F>
void _separable_upsample_generic_Nd_kernel_impl_single_dim(
    const Tensor& output,
    const Tensor& input,
    int interp_dim,
    bool align_corners,
    const scale_type& scales) {

  // input can be NCHW, NCL or NCKHW
  auto shape = input.sizes().vec();
  auto strides = input.strides().vec();
  auto oshape = output.sizes();

  TORCH_INTERNAL_ASSERT(
      shape.size() == oshape.size() && shape.size() == 2 + out_ndims);
  TORCH_INTERNAL_ASSERT(strides.size() == 2 + out_ndims);

  for (const auto i : c10::irange(out_ndims)) {
    shape[i + 2] = oshape[i + 2];
  }
  strides[interp_dim] = 0;
  auto restrided_input = input.as_strided(shape, strides);

  std::vector<std::vector<Tensor>> indices_weights;

  int interp_size = F::interp_size;
  auto input_scalar_type = input.scalar_type();
  if (interp_size == 1 && input_scalar_type == at::ScalarType::Byte) {
    // nearest also supports uint8 tensor, but we have to use float
    // with compute_indices_weights
    input_scalar_type = at::ScalarType::Float;
  }

  indices_weights.emplace_back(
      F::compute_indices_weights_aa(
        input_scalar_type, input.size(interp_dim), oshape[interp_dim],
        input.stride(interp_dim) * input.element_size(),
        input.dim(), interp_dim, align_corners, scales[interp_dim - 2]));

  TensorIteratorConfig config;
  config.check_all_same_dtype(false)
      .declare_static_dtype_and_device(input.scalar_type(), input.device())
      .add_output(output)
      .add_input(restrided_input);

  for (auto& idx_weight : indices_weights) {
    for (auto& tensor : idx_weight) {
      config.add_input(tensor);
    }
  }

  auto iter = config.build();

  if (interp_size > 1) {
    // Nearest also supports uint8 tensor, so need to handle it separately
    AT_DISPATCH_FLOATING_TYPES(iter.dtype(), "upsample_generic_Nd_aa", [&] {
      cpu_upsample_generic_aa<scalar_t>(iter);
    });
  } else {
    AT_DISPATCH_FLOATING_TYPES_AND(
        at::ScalarType::Byte, iter.dtype(), "upsample_generic_Nd_aa", [&] {
          cpu_upsample_generic_aa<scalar_t>(iter);
        });
  }
}

template <int out_ndims, typename scale_type, class F>
void separable_upsample_generic_Nd_kernel_impl(
    const Tensor& output,
    const Tensor& input,
    bool align_corners,
    const scale_type& scales) {

  auto temp_oshape = input.sizes().vec();
  at::Tensor temp_output, temp_input = input;
  for (const auto i : c10::irange(out_ndims - 1)) {
    int interp_dim = 2 + out_ndims - 1 - i;
    temp_oshape[interp_dim] = output.sizes()[interp_dim];
    temp_output = at::empty(temp_oshape, input.options().memory_format(input.suggest_memory_format()));
    _separable_upsample_generic_Nd_kernel_impl_single_dim<
        out_ndims,
        scale_t,
        F>(
        temp_output, temp_input, interp_dim, align_corners, scales);
    temp_input = temp_output;
  }
  _separable_upsample_generic_Nd_kernel_impl_single_dim<
      out_ndims,
      scale_t,
      F>(output, temp_input, 2, align_corners, scales);
}

void upsample_nearest1d_kernel_impl(
    const Tensor& output,
    const Tensor& input,
    c10::optional<double> scales_w) {
  upsample_generic_Nd_kernel_impl<1, scale_t, HelperInterpNearest>(
      output, input, false, {scales_w});
}

void _upsample_nearest_exact1d_kernel_impl(
    const Tensor& output,
    const Tensor& input,
    c10::optional<double> scales_w) {
  upsample_generic_Nd_kernel_impl<1, scale_t, HelperInterpNearestExact>(
    output, input, false, {scales_w});
}

int _use_vectorized_kernel_cond_2d(
    const Tensor& output,
    const Tensor& input) {
      // This condition is used to know whether we should dispatch to a vectorized
      // kernel, or to the more general upsample_generic_Nd_kernel_impl(). For now,
      // the vectorized kernels are only optimized for channels_last and when C >= 4
      // (shape = NCHW). For a very wide range of use-cases (typically image or mask
      // resizing where we have C < 4), using upsample_generic_Nd_kernel_impl() is
      // actually faster. On top of that, benchmarks showed that this also depends on
      // the *output* size (output_H + output_W), for both upsampling and
      // downsampling. The current 128 threshold was determined through benchmarks.
      return ((input.is_contiguous(at::MemoryFormat::ChannelsLast)) && (input.size(1) > 3)) || ((output.size(-2) + output.size(-1)) <= 128);
}

int _use_vectorized_kernel_cond_3d(
    // Similar to _use_vectorized_kernel_cond_2d() but for 3d resampling (e.g. videos)
    // Note that unlike the 2d case, this is not subject to small output size
    // overhead - hence the absence of the 128 threshold in the condition.
    const Tensor& output,
    const Tensor& input) {
      return ((input.is_contiguous(at::MemoryFormat::ChannelsLast3d)) && (input.size(1) > 3));
}


void upsample_nearest2d_kernel_impl(
    const Tensor& output,
    const Tensor& input,
    c10::optional<double> scales_h,
    c10::optional<double> scales_w) {
  if (_use_vectorized_kernel_cond_2d(output, input)) {
    AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16,
        input.scalar_type(), "upsample_nearest2d_channels_last", [&] {
      cpu_upsample_nearest_channels_last<scalar_t, scale_t, nearest_idx>(output, input, {scales_h, scales_w});
    });
  } else {
    upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpNearest>(
      output, input, false, {scales_h, scales_w});
  }
}

void _upsample_nearest_exact2d_kernel_impl(
    const Tensor& output,
    const Tensor& input,
    c10::optional<double> scales_h,
    c10::optional<double> scales_w) {
  if (_use_vectorized_kernel_cond_2d(output, input)) {
    AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Byte, input.scalar_type(), "upsample_nearest2d_channels_last", [&] {
      cpu_upsample_nearest_channels_last<scalar_t, scale_t, nearest_exact_idx>(output, input, {scales_h, scales_w});
    });
  } else {
    upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpNearestExact>(
      output, input, false, {scales_h, scales_w});
  }
}

void upsample_nearest3d_kernel_impl(
    const Tensor& output,
    const Tensor& input,
    c10::optional<double> scales_d,
    c10::optional<double> scales_h,
    c10::optional<double> scales_w) {
  if (_use_vectorized_kernel_cond_3d(output, input)) {
    AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16,
        input.scalar_type(), "upsample_nearest3d_channels_last", [&] {
      cpu_upsample_nearest_channels_last<scalar_t, scale_t, nearest_idx>(output, input, {scales_d, scales_h, scales_w});
    });
  } else {
    upsample_generic_Nd_kernel_impl<3, scale_t, HelperInterpNearest>(
      output, input, false, {scales_d, scales_h, scales_w});
  }
}

void _upsample_nearest_exact3d_kernel_impl(
    const Tensor& output,
    const Tensor& input,
    c10::optional<double> scales_d,
    c10::optional<double> scales_h,
    c10::optional<double> scales_w) {
  if (_use_vectorized_kernel_cond_3d(output, input)) {
    AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Byte, input.scalar_type(), "upsample_nearest3d_channels_last", [&] {
      cpu_upsample_nearest_channels_last<scalar_t, scale_t, nearest_exact_idx>(output, input, {scales_d, scales_h, scales_w});
    });
  } else {
    upsample_generic_Nd_kernel_impl<3, scale_t, HelperInterpNearestExact>(
      output, input, false, {scales_d, scales_h, scales_w});
  }
}

void upsample_linear1d_kernel_impl(
    const Tensor& output,
    const Tensor& input,
    bool align_corners,
    c10::optional<double> scales_w) {
  upsample_generic_Nd_kernel_impl<1, scale_t, HelperInterpLinear>(
    output, input, align_corners, {scales_w});
}

void upsample_bilinear2d_kernel_impl(
    const Tensor& output,
    const Tensor& input,
    bool align_corners,
    c10::optional<double> scales_h,
    c10::optional<double> scales_w) {

  // See note above about _use_vectorized_kernel_cond_2d(output, input). The extra cond is present
  // because benchmarks showed that with only 1 thread, images (C == 3) were
  // slightly faster with the vectorized kernel than with the generic one.
  // That's not the case for masks though (C == 1), which strongly benefit from
  // using the generic kernel.
  if ((_use_vectorized_kernel_cond_2d(output, input)) || (at::get_num_threads() == 1 && input.size(1) == 3)) {
    AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, input.scalar_type(), "upsample_bilinear2d_channels_last", [&] {
      cpu_upsample_linear_channels_last<scalar_t, scale_t>(output, input, align_corners, {scales_h, scales_w});
    });
  } else {
    upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
      output, input, align_corners, {scales_h, scales_w});
  }
}

void upsample_bilinear2d_aa_kernel_impl(
    const Tensor& output,
    const Tensor& input,
    bool align_corners,
    c10::optional<double> scales_h,
    c10::optional<double> scales_w) {

  separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpLinear>(
      output, input, align_corners, {scales_h, scales_w});
}

void upsample_trilinear3d_kernel_impl(
    const Tensor& output,
    const Tensor& input,
    bool align_corners,
    c10::optional<double> scales_d,
    c10::optional<double> scales_h,
    c10::optional<double> scales_w) {
  if ((_use_vectorized_kernel_cond_3d(output, input))) {
    AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, input.scalar_type(), "upsample_trilinear3d_channels_last", [&] {
      cpu_upsample_linear_channels_last<scalar_t, scale_t>(output, input, align_corners, {scales_d, scales_h, scales_w});
    });
  } else {
    upsample_generic_Nd_kernel_impl<3, scale_t, HelperInterpLinear>(
      output, input, align_corners, {scales_d, scales_h, scales_w});
  }
}

void upsample_bicubic2d_kernel_impl(
    const Tensor& output,
    const Tensor& input,
    bool align_corners,
    c10::optional<double> scales_h,
    c10::optional<double> scales_w) {
  upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpCubic>(
    output, input, align_corners, {scales_h, scales_w});
}

void upsample_bicubic2d_aa_kernel_impl(
    const Tensor& output,
    const Tensor& input,
    bool align_corners,
    c10::optional<double> scales_h,
    c10::optional<double> scales_w) {

  separable_upsample_generic_Nd_kernel_impl<2, scale_t, HelperInterpCubic>(
      output, input, align_corners, {scales_h, scales_w});
}

template <
    typename scalar_t,
    typename scale_type,
    class F>
void cpu_upsample_genNd_backward_aa(
    const Tensor& grad_input_,
    const Tensor& grad_output_,
    bool align_corners,
    const scale_type& scales) {
  TORCH_CHECK(grad_input_.dtype() == grad_output_.dtype(), "expected dtype ", grad_output_.dtype(),
              " for `grad_input` but got dtype ", grad_input_.dtype());

  auto grad_output = grad_output_.contiguous();
  auto grad_input = grad_input_.contiguous();

  auto grad_output_data = grad_output.data_ptr<scalar_t>();
  auto grad_input_data = grad_input.data_ptr<scalar_t>();
  auto input_sizes = grad_input.sizes().vec();
  auto output_sizes = grad_output.sizes().vec();
  auto ndim = input_sizes.size();

  // treat nbatch and channels as one dimension
  int64_t channels = input_sizes[0] * input_sizes[1];
  int64_t output_depth = (ndim == 5) ? output_sizes[2] : 1;
  int64_t input_height = (ndim >= 4) ? input_sizes[ndim - 2] : 1;
  int64_t output_height = (ndim >= 4) ? output_sizes[ndim - 2] : 1;
  int64_t input_width = input_sizes[ndim - 1];
  int64_t output_width = output_sizes[ndim - 1];

  int64_t output_slice_size = output_depth * output_height * output_width;
  int interp_size = F::interp_size;

  auto loop2d = [&](int64_t begin, int64_t end) {
    const scalar_t height_scale = area_pixel_compute_scale<scalar_t>(
        input_height, output_height, align_corners, scales[0]);
    const scalar_t width_scale = area_pixel_compute_scale<scalar_t>(
        input_width, output_width, align_corners, scales[1]);

    auto input_indexr = [=](int64_t c, int64_t h, int64_t w) {
      return grad_input_data + c * input_height * input_width +
          h * input_width + w;
    };

    const scalar_t support_h = (height_scale >= 1.0)
        ? (interp_size * 0.5) * height_scale
        : interp_size * 0.5;
    const scalar_t support_w = (width_scale >= 1.0)
        ? (interp_size * 0.5) * width_scale
        : interp_size * 0.5;

    const int interp_height = (int)ceilf(support_h) * 2 + 1;
    const int interp_width = (int)ceilf(support_w) * 2 + 1;

    std::vector<scalar_t> wx(interp_width, 0.0);
    std::vector<scalar_t> wy(interp_height, 0.0);

    // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
    int64_t xmin, ymin;
    int64_t xsize, ysize;

    typedef scalar_t (*aa_filter_fn_t)(scalar_t);
    aa_filter_fn_t filter_fn = &F::aa_filter;

    for (const auto oh : c10::irange(output_height)) {
      F::_compute_weights_aa(
          oh,
          input_height,
          height_scale,
          support_h,
          wy.data(),
          interp_height,
          filter_fn,
          ymin,
          ysize);

      for (const auto ow : c10::irange(output_width)) {
        F::_compute_weights_aa(
            ow,
            input_width,
            width_scale,
            support_w,
            wx.data(),
            interp_width,
            filter_fn,
            xmin,
            xsize);

        for (const auto c : c10::irange(begin, end)) {
          scalar_t grad_output_value =
              grad_output_data[c * output_slice_size + oh * output_width + ow];

          for (const auto y : c10::irange(ysize)) {
            for (const auto x : c10::irange(xsize)) {
              *input_indexr(c, ymin + y, xmin + x) +=
                  wx[x] * wy[y] * grad_output_value;
            }
          }
        }
      }
    }
  };

  if (ndim == 4) {
    // upsample bilinear 2d
    at::parallel_for(
        0, channels, at::internal::GRAIN_SIZE / output_slice_size / 4, loop2d);
  } else {
    TORCH_CHECK(false, "Unsupported tensor ndim");
  }

  if (!grad_input_.is_contiguous()) {
    grad_input_.copy_(grad_input);
  }
}

void upsample_bilinear2d_aa_backward_kernel_impl(
    const Tensor& grad_input,
    const Tensor& grad_output,
    bool align_corners,
    c10::optional<double> scales_h,
    c10::optional<double> scales_w) {
  AT_DISPATCH_FLOATING_TYPES(
      grad_output.scalar_type(), "upsample_bilinear2d_aa_backward_cpu", [&] {
        cpu_upsample_genNd_backward_aa<scalar_t, scale_t, HelperInterpLinear>(
            grad_input, grad_output, align_corners, {scales_h, scales_w});
      });
}

void upsample_bicubic2d_aa_backward_kernel_impl(
    const Tensor& grad_input,
    const Tensor& grad_output,
    bool align_corners,
    c10::optional<double> scales_h,
    c10::optional<double> scales_w) {
  AT_DISPATCH_FLOATING_TYPES(
      grad_output.scalar_type(), "upsample_bicubic2d_aa_backward_cpu", [&] {
        cpu_upsample_genNd_backward_aa<scalar_t, scale_t, HelperInterpCubic>(
            grad_input, grad_output, align_corners, {scales_h, scales_w});
      });
}

} // anonymous namespace

REGISTER_DISPATCH(upsample_nearest1d_kernel, &upsample_nearest1d_kernel_impl);
REGISTER_DISPATCH(_upsample_nearest_exact1d_kernel, &_upsample_nearest_exact1d_kernel_impl);
REGISTER_DISPATCH(upsample_nearest2d_kernel, &upsample_nearest2d_kernel_impl);
REGISTER_DISPATCH(_upsample_nearest_exact2d_kernel, &_upsample_nearest_exact2d_kernel_impl);
REGISTER_DISPATCH(upsample_nearest3d_kernel, &upsample_nearest3d_kernel_impl);
REGISTER_DISPATCH(_upsample_nearest_exact3d_kernel, &_upsample_nearest_exact3d_kernel_impl);

REGISTER_DISPATCH(upsample_linear1d_kernel, &upsample_linear1d_kernel_impl);
REGISTER_DISPATCH(upsample_bilinear2d_kernel, &upsample_bilinear2d_kernel_impl);
REGISTER_DISPATCH(_upsample_bilinear2d_aa_kernel, &upsample_bilinear2d_aa_kernel_impl);
REGISTER_DISPATCH(_upsample_bilinear2d_aa_backward_kernel, &upsample_bilinear2d_aa_backward_kernel_impl);
REGISTER_DISPATCH(upsample_trilinear3d_kernel, &upsample_trilinear3d_kernel_impl);

REGISTER_DISPATCH(upsample_bicubic2d_kernel, &upsample_bicubic2d_kernel_impl);
REGISTER_DISPATCH(_upsample_bicubic2d_aa_kernel, &upsample_bicubic2d_aa_kernel_impl);
REGISTER_DISPATCH(_upsample_bicubic2d_aa_backward_kernel, &upsample_bicubic2d_aa_backward_kernel_impl);
} // namespace at::native