From b5b11f3b29fe5c28174073e1d66facb63a3c77db Mon Sep 17 00:00:00 2001 From: Chenhu Wang Date: Thu, 20 Jul 2023 21:40:37 +0800 Subject: [PATCH 1/6] [CPU]Shape agnostic jit kernel for MVN (#17988) --- .../intel_cpu/src/nodes/executors/mvn.hpp | 1 - src/plugins/intel_cpu/src/nodes/mvn.cpp | 1679 ++++++++++++----- src/plugins/intel_cpu/src/nodes/mvn.h | 24 +- 3 files changed, 1193 insertions(+), 511 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/executors/mvn.hpp b/src/plugins/intel_cpu/src/nodes/executors/mvn.hpp index 6b294f4b690a48..0a1dbb6904cf16 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/mvn.hpp +++ b/src/plugins/intel_cpu/src/nodes/executors/mvn.hpp @@ -25,7 +25,6 @@ enum MVNEpsMode { struct MVNAttrs { MVNLayoutType layout = mvn_planar; - std::tuple shape5D = std::make_tuple(0u, 0u, 0u, 0u, 0u); bool initAcrossChannels_ = false; bool execAcrossChannels_ = false; bool normalizeVariance_ = false; diff --git a/src/plugins/intel_cpu/src/nodes/mvn.cpp b/src/plugins/intel_cpu/src/nodes/mvn.cpp index 298d1351bf36a1..9ef9427958bd6b 100644 --- a/src/plugins/intel_cpu/src/nodes/mvn.cpp +++ b/src/plugins/intel_cpu/src/nodes/mvn.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include "fake_quantize.h" #include "eltwise.h" @@ -53,12 +54,6 @@ size_t MVNKey::hash() const { using namespace dnnl::impl::primitive_hashing; size_t seed = 0; - - seed = hash_combine(seed, std::get<0>(mvnAttrs.shape5D)); - seed = hash_combine(seed, std::get<1>(mvnAttrs.shape5D)); - seed = hash_combine(seed, std::get<2>(mvnAttrs.shape5D)); - seed = hash_combine(seed, std::get<3>(mvnAttrs.shape5D)); - seed = hash_combine(seed, std::get<4>(mvnAttrs.shape5D)); seed = hash_combine(seed, mvnAttrs.initAcrossChannels_); seed = hash_combine(seed, mvnAttrs.execAcrossChannels_); seed = hash_combine(seed, mvnAttrs.normalizeVariance_); @@ -73,7 +68,7 @@ size_t MVNKey::hash() const { bool MVNKey::operator==(const MVNKey& rhs) const { bool retVal = true; - retVal = retVal && 
mvnAttrs.shape5D == rhs.mvnAttrs.shape5D && + retVal = retVal && mvnAttrs.initAcrossChannels_ == rhs.mvnAttrs.initAcrossChannels_ && mvnAttrs.execAcrossChannels_ == rhs.mvnAttrs.execAcrossChannels_ && mvnAttrs.normalizeVariance_ == rhs.mvnAttrs.normalizeVariance_ && @@ -94,6 +89,8 @@ static inline bool isFloatCompatible(Precision prc) { return one_of(prc, Precision::FP32, Precision::BF16, Precision::FP16); } +static const int kTileNum = 3; + // normalize_variance = false : src->mean // normalize_variance = true : src+mean->variance:sqr(x-mean) template @@ -108,14 +105,15 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k } void generate() override { - tail_step = jcp_.layout == MVNLayoutType::mvn_planar ? (jcp_.D * jcp_.H * jcp_.W) - ((jcp_.D * jcp_.H * jcp_.W) / vector_step) * vector_step : - jcp_.C - (jcp_.C / vector_step) * vector_step; - Precision dst_prc = isFloatCompatible(jcp_.src_prc) ? Precision::FP32 : Precision::I32; - load_vector_emitter.reset(new jit_load_emitter(this, isa, jcp_.src_prc, dst_prc, vector_step)); - load_tail_emitter.reset(new jit_load_emitter(this, isa, jcp_.src_prc, dst_prc, tail_step)); - load_tail_with_fill_emitter.reset(new jit_load_emitter(this, isa, jcp_.src_prc, dst_prc, tail_step, Precision::FP32, true, "zero")); - load_scalar_with_fill_emitter.reset(new jit_load_emitter(this, isa, jcp_.src_prc, dst_prc, scalar_step, Precision::FP32, true, "zero")); + load_emitter[VECTOR] = std::unique_ptr(new jit_load_emitter(this, isa, jcp_.src_prc, dst_prc, vector_step)); + load_emitter[TAIL8] = std::unique_ptr(new jit_load_emitter(this, isa, jcp_.src_prc, dst_prc, 8)); + load_emitter[TAIL4] = std::unique_ptr(new jit_load_emitter(this, isa, jcp_.src_prc, dst_prc, 4)); + load_emitter[TAIL2] = std::unique_ptr(new jit_load_emitter(this, isa, jcp_.src_prc, dst_prc, 2)); + load_emitter[TAIL1] = std::unique_ptr(new jit_load_emitter(this, isa, jcp_.src_prc, dst_prc, 1)); + load_emitter[TAIL8_FILL] = 
std::unique_ptr(new jit_load_emitter(this, isa, jcp_.src_prc, dst_prc, 8, Precision::FP32, true, "zero")); + load_emitter[TAIL4_FILL] = std::unique_ptr(new jit_load_emitter(this, isa, jcp_.src_prc, dst_prc, 4, Precision::FP32, true, "zero")); + load_emitter[TAIL1_FILL] = std::unique_ptr(new jit_load_emitter(this, isa, jcp_.src_prc, dst_prc, 1, Precision::FP32, true, "zero")); this->preamble(); mov(reg_table, l_table); @@ -129,8 +127,7 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k uni_vpxor(vmm_sum, vmm_sum, vmm_sum); } mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); - mov(reg_stride, ptr[reg_params + GET_OFF(src_stride)]); - mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]); + mov(reg_rt_shape, ptr[reg_params + GET_OFF(rt_shape_size)]); if (jcp_.normalize_variance) { if (jcp_.layout == MVNLayoutType::mvn_planar || jcp_.across_channels) { @@ -140,14 +137,19 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k } } + size_t data_step = (isa == cpu::x64::sse41 && jcp_.layout == MVNLayoutType::mvn_block) ? vector_step * 2 : vector_step; + src_stride = data_step * jcp_.src_data_size; + load_pool_gpr_idxs = {static_cast(reg_load_store_mask.getIdx()), static_cast(reg_load_table.getIdx())}; if (jcp_.layout == MVNLayoutType::mvn_planar) { - worker_unroll(); - if (tail_step != 0) { - worker_partial(false, true); - } - + worker_vector_unroll(); + // for tails. [0-15] for avx512, [0-7] for avx2, [0-3] for sse + auto tails_func = [&](int tile_size) { + worker_block(tile_size, true); + add(reg_src, tile_size * jcp_.src_data_size); + }; + worker_tails(reg_rt_shape, tails_func); // hsum+store if (!jcp_.normalize_variance && !isFloatCompatible(jcp_.src_prc)) uni_vcvtdq2ps(vmm_sum, vmm_sum); @@ -159,92 +161,13 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k else nspc_pc_ker(); } else { - // blk - int repeats = (isa == cpu::x64::sse41) ? 
2 : 1; // block size is also 8 on cpu::x64::sse41 with two step process - int sse42_step = 4; - for (int i = 0; i < repeats; i++) { - int offset_sse42 = i * sse42_step; - if (i > 0) { - mov(reg_src, ptr[reg_params + GET_OFF(src)]); - mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); - - add(reg_src, offset_sse42 * jcp_.src_data_size); - - if (jcp_.normalize_variance) { - // mean and vaiance for variance kernel - if (!jcp_.across_channels) { - // mean is bc when across_channel, no need shift - add(reg_mean, offset_sse42 * sizeof(float)); - uni_vmovups(vmm_mean, ptr[reg_mean]); - } - add(reg_variance, offset_sse42 * sizeof(float)); - uni_vpxor(vmm_variance, vmm_variance, vmm_variance); - } else { - // sum for mean kernel - add(reg_sum, offset_sse42 * sizeof(float)); - uni_vpxor(vmm_sum, vmm_sum, vmm_sum); - } - add(reg_oc_off, offset_sse42 * sizeof(float)); - } - - Xbyak::Label label_empty_2half_sse42; - if (tail_step == 0) { - cmp(reg_oc_off, static_cast(jcp_.C * sizeof(float))); - jae(label_empty_2half_sse42, T_NEAR); - - worker_unroll(); - } else { - // maybe tail blk - cmp(reg_oc_off, static_cast(jcp_.C * sizeof(float))); - jae(label_empty_2half_sse42, T_NEAR); - - Xbyak::Label label_full_size; - Xbyak::Label label_size_end; - cmp(reg_oc_off, static_cast((jcp_.C - vector_step) * sizeof(float))); - jle(label_full_size, T_NEAR); - - // no need care and fill rest - // for per_channel, do not use tail mean(variance), do not store computed tail values. - // for across_channel, partial sum for tail one time out of kernel from perf. 
- worker_unroll(true); - - jmp(label_size_end, T_NEAR); - L(label_full_size); - { - worker_unroll(); - } - L(label_size_end); - } - - // add input_base value and store for per_channel - // store for across_channels - if (jcp_.normalize_variance) { - if (!jcp_.across_channels) { - uni_vmovups(vmm_val, ptr[reg_variance]); - uni_vaddps(vmm_variance, vmm_variance, vmm_val); - } - uni_vmovups(ptr[reg_variance], vmm_variance); - } else { - if (!isFloatCompatible(jcp_.src_prc)) // add with int for int-family data type, other compute go with float - uni_vcvtdq2ps(vmm_sum, vmm_sum); - - if (!jcp_.across_channels) { - uni_vmovups(vmm_val, ptr[reg_sum]); - uni_vaddps(vmm_sum, vmm_sum, vmm_val); - } - uni_vmovups(ptr[reg_sum], vmm_sum); - } - - L(label_empty_2half_sse42); - } + block_ker(); } this->postamble(); - load_vector_emitter->emit_data(); - load_tail_emitter->emit_data(); - load_tail_with_fill_emitter->emit_data(); - load_scalar_with_fill_emitter->emit_data(); + for (size_t i = 0; i < LOAD_EMITTERS_NUM; i++) + load_emitter[i]->emit_data(); prepare_table(); } @@ -255,22 +178,19 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k const int vlen = cpu_isa_traits::vlen; const int vector_step = vlen / sizeof(float); - int tail_step = 0; - int scalar_step = 1; Xbyak::Reg64 reg_src = r8; Xbyak::Reg64 reg_mean = r9; Xbyak::Reg64 reg_variance = r10; Xbyak::Reg64 reg_work_amount = r11; - Xbyak::Reg64 reg_stride = r12; Xbyak::Reg64 reg_sum = reg_mean; Xbyak::Reg64 reg_params = abi_param1; Xbyak::Reg64 reg_load_table = r13; Xbyak::Reg64 reg_load_store_mask = r14; Xbyak::Reg64 reg_aux = r15; - Xbyak::Reg64 reg_oc_off = rax; - Xbyak::Reg64 reg_table = rdx; + Xbyak::Reg64 reg_rt_shape = rbx; + Xbyak::Reg64 reg_table = rsi; Xbyak::Label l_table; Vmm vmm_val = Vmm(1); @@ -285,19 +205,21 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k Xbyak::Opmask k_mask = Xbyak::Opmask(7); - std::unique_ptr load_vector_emitter = 
nullptr; - std::unique_ptr load_tail_emitter = nullptr; - std::unique_ptr load_tail_with_fill_emitter = nullptr; - std::unique_ptr load_scalar_with_fill_emitter = nullptr; + size_t src_stride = 0; + enum { VECTOR, TAIL8, TAIL4, TAIL2, TAIL1, TAIL8_FILL, TAIL4_FILL, TAIL1_FILL, LOAD_EMITTERS_NUM }; + std::unique_ptr load_emitter[LOAD_EMITTERS_NUM]; std::vector load_pool_gpr_idxs; + // used for tails process(except nspc&&per_channel) + Label tail_start[kTileNum]; + Label tail_exit[kTileNum]; + const int tile_size[kTileNum] = {8, 4, 1}; + // nspc across channel inline void nspc_ac_ker() { Xbyak::Label loop_label; Xbyak::Label loop_end_label; - Xbyak::Label scalar_loop_label; - Xbyak::Label scalar_loop_end_label; L(loop_label); { cmp(reg_work_amount, vector_step); @@ -311,18 +233,11 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k } L(loop_end_label); - L(scalar_loop_label); - { - cmp(reg_work_amount, 1); - jl(scalar_loop_end_label, T_NEAR); - - worker_partial(true, true); - add(reg_src, scalar_step * jcp_.src_data_size); - - sub(reg_work_amount, scalar_step); - jmp(scalar_loop_label, T_NEAR); - } - L(scalar_loop_end_label); + auto tails_func = [&](int tile_size) { + worker_block(tile_size, true); + add(reg_src, tile_size * jcp_.src_data_size); + }; + worker_tails(reg_work_amount, tails_func); if (!jcp_.normalize_variance && !isFloatCompatible(jcp_.src_prc)) uni_vcvtdq2ps(vmm_sum, vmm_sum); @@ -333,33 +248,130 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k // nspc per channel with unroll inline void nspc_pc_ker() { // 4 unroll vector - size_t unroll_size = 4; - size_t vec_num = div_up(jcp_.C, vector_step); - unroll_size = vec_num >= unroll_size ? 
unroll_size : vec_num; - size_t unroll_number = div_up(vec_num, unroll_size); + // r12, rax, rdx, rbp, r15, rcx and rdi is available + // r13 is available as no fill need for this layout + // reg_rt_shape is C + Xbyak::Reg64 reg_unroll_size = r12; + Xbyak::Reg64 reg_unroll_num = rax; + Xbyak::Reg64 reg_vector_num = rbp; + Xbyak::Reg64 reg_tail_num = r13; + // size_t unroll_size = 4; + mov(reg_unroll_size, 4); + // size_t vec_num = C / vector_step + mov(rax, reg_rt_shape); + mov(reg_vector_num, vector_step); + xor_(rdx, rdx); + div(reg_vector_num); // reg_rt_shape / vector_step, rax is result, rdx is tails(remainder) + mov(reg_vector_num, rax); + mov(reg_tail_num, rdx); + + Xbyak::Reg64 reg_src_aux = rdx; + Xbyak::Reg64 reg_work_amount_bk = r15; + mov(reg_work_amount_bk, reg_work_amount); // should before tail jmp + + Xbyak::Label tail_label; + cmp(reg_vector_num, 0); + je(tail_label, T_NEAR); + + // unroll_size = vec_num >= unroll_size ? unroll_size : vec_num; + Xbyak::Label label_reset_unroll_size_end; + cmp(reg_unroll_size, reg_vector_num); + jle(label_reset_unroll_size_end, T_NEAR); + mov(reg_unroll_size, reg_vector_num); + L(label_reset_unroll_size_end); + + // last unroll_size + Xbyak::Label label_reset_last_unroll_size; + Xbyak::Label label_reset_last_unroll_size_end; + Xbyak::Reg64 last_unroll_size = rcx; + mov(rax, reg_vector_num); + xor_(rdx, rdx); + div(reg_unroll_size); // rdx + cmp(rdx, 0); + je(label_reset_last_unroll_size, T_NEAR); + mov(last_unroll_size, rdx); + jmp(label_reset_last_unroll_size_end); + L(label_reset_last_unroll_size); + { + mov(last_unroll_size, reg_unroll_size); + } + L(label_reset_last_unroll_size_end); + + // size_t unroll_number = div_up(vec_num, unroll_size); --> (vec_num + unroll_size - 1) / unroll_size; + mov(rdi, reg_vector_num); + add(rdi, reg_unroll_size); + sub(rdi, 1); + mov(rax, rdi); + xor_(rdx, rdx); + div(reg_unroll_size); // result is in rax, that is reg_unroll_num, no mov need. + // 4-15 for unroll. 
4-7 for src, 8-11 for m/v sum, 12-15 for mean, 4 vector for 4 unroll int ur_base = 4; - Xbyak::Reg64 reg_src_aux = reg_stride; - Xbyak::Reg64 reg_work_amount_bk = rbx; - mov(reg_work_amount_bk, reg_work_amount); - for (size_t ur_num = 0; ur_num < unroll_number; ur_num++) { - // 4-15 for unroll. 4-7 for src, 8-11 for m/v sum, 12-15 for mean - int ur_offset_elt = ur_num * unroll_size * vector_step; - int ur_offset = ur_offset_elt * sizeof(float); - size_t unroll_size_rt = std::min(vec_num - ur_num * unroll_size, unroll_size); - size_t elt_num = std::min(jcp_.C - ur_num * unroll_size * vector_step, unroll_size * vector_step); - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - uni_vpxor(Vmm(ur_base + 4 + ur_size), Vmm(ur_base + 4 + ur_size), Vmm(ur_base + 4 + ur_size)); - } + auto init = [&](int vmm_id) { + uni_vpxor(Vmm(ur_base + 4 + vmm_id), Vmm(ur_base + 4 + vmm_id), Vmm(ur_base + 4 + vmm_id)); + if (jcp_.normalize_variance) + uni_vmovups(Vmm(ur_base + 8 + vmm_id), ptr[reg_mean + vmm_id * vlen]); + }; + auto load_src = [&](int vmm_id) { + load_emitter[VECTOR]->emit_code({static_cast(reg_src_aux.getIdx())}, {static_cast(ur_base + vmm_id)}, {}, {load_pool_gpr_idxs}); + add(reg_src_aux, vector_step * jcp_.src_data_size); + }; + auto mv = [&](int vmm_id) { if (jcp_.normalize_variance) { - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - uni_vmovups(Vmm(ur_base + 8 + ur_size), ptr[reg_mean + ur_offset + ur_size * vector_step * sizeof(float)]); + if (!isFloatCompatible(jcp_.src_prc)) { + uni_vcvtdq2ps(Vmm(ur_base + vmm_id), Vmm(ur_base + vmm_id)); } + uni_vsubps(Vmm(ur_base + vmm_id), Vmm(ur_base + vmm_id), Vmm(ur_base + 8 + vmm_id)); + uni_vfmadd231ps(Vmm(ur_base + 4 + vmm_id), Vmm(ur_base + vmm_id), Vmm(ur_base + vmm_id)); + } else { + if (!isFloatCompatible(jcp_.src_prc)) + uni_vpaddd(Vmm(ur_base + 4 + vmm_id), Vmm(ur_base + 4 + vmm_id), Vmm(ur_base + vmm_id)); + else + uni_vaddps(Vmm(ur_base + 4 + vmm_id), Vmm(ur_base + 4 + vmm_id), 
Vmm(ur_base + vmm_id)); } + }; + auto store = [&](int vmm_id) { + if (jcp_.normalize_variance) { + uni_vmovups(ptr[reg_variance + vmm_id * vector_step * sizeof(float)], Vmm(ur_base + 4 + vmm_id)); + } else { + if (!isFloatCompatible(jcp_.src_prc)) + uni_vcvtdq2ps(Vmm(ur_base + 4 + vmm_id), Vmm(ur_base + 4 + vmm_id)); + uni_vmovups(ptr[reg_sum + vmm_id * vector_step * sizeof(float)], Vmm(ur_base + 4 + vmm_id)); + } + }; + + auto vector_worker = [&](std::function func) { + Xbyak::Label label_end; + func(0); + cmp(reg_unroll_size, 1); + jle(label_end, T_NEAR); + func(1); + cmp(reg_unroll_size, 2); + jle(label_end, T_NEAR); + func(2); + cmp(reg_unroll_size, 3); + jle(label_end, T_NEAR); + func(3); + L(label_end); + }; + + Xbyak::Label label_unroll_num; + Xbyak::Label label_unroll_num_end; + L(label_unroll_num); + { + cmp(reg_unroll_num, 0); + jle(label_unroll_num_end, T_NEAR); + + Xbyak::Label label_not_last; + cmp(reg_unroll_num, 1); + jne(label_not_last, T_NEAR); + mov(reg_unroll_size, last_unroll_size); + L(label_not_last); + + vector_worker(init); mov(reg_src_aux, reg_src); mov(reg_work_amount, reg_work_amount_bk); - Xbyak::Label loop_label; Xbyak::Label loop_end_label; L(loop_label); @@ -367,63 +379,288 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k cmp(reg_work_amount, 0); jle(loop_end_label, T_NEAR); - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - bool is_tails = ur_offset_elt + ur_size * vector_step + vector_step > static_cast(jcp_.C); - if (is_tails) { - load_tail_emitter->emit_code({static_cast(reg_src_aux.getIdx())}, - {static_cast(ur_base + ur_size)}, {}, {load_pool_gpr_idxs}); - add(reg_src_aux, tail_step * jcp_.src_data_size); - } else { - load_vector_emitter->emit_code({static_cast(reg_src_aux.getIdx())}, - {static_cast(ur_base + ur_size)}, {}, {load_pool_gpr_idxs}); - add(reg_src_aux, vector_step * jcp_.src_data_size); - } - } - add(reg_src_aux, (jcp_.C - elt_num) * jcp_.src_data_size); + // load 
unroll + vector_worker(load_src); + + // advance src and prefetch + mov(rdi, reg_unroll_size); + imul(rdi, rdi, vector_step * jcp_.src_data_size); + sub(reg_src_aux, rdi); + mov(rdi, reg_rt_shape); + imul(rdi, rdi, jcp_.src_data_size); + add(reg_src_aux, rdi); prefetcht0(ptr[reg_src_aux]); - if (jcp_.normalize_variance) { - if (!isFloatCompatible(jcp_.src_prc)) { - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - uni_vcvtdq2ps(Vmm(ur_base + ur_size), Vmm(ur_base + ur_size)); - } - } - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - uni_vsubps(Vmm(ur_base + ur_size), Vmm(ur_base + ur_size), Vmm(ur_base + 8 + ur_size)); - } - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - uni_vfmadd231ps(Vmm(ur_base + 4 + ur_size), Vmm(ur_base + ur_size), Vmm(ur_base + ur_size)); - } - } else { - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - if (!isFloatCompatible(jcp_.src_prc)) - uni_vpaddd(Vmm(ur_base + 4 + ur_size), Vmm(ur_base + 4 + ur_size), Vmm(ur_base + ur_size)); - else - uni_vaddps(Vmm(ur_base + 4 + ur_size), Vmm(ur_base + 4 + ur_size), Vmm(ur_base + ur_size)); - } - } + // mv compute + vector_worker(mv); sub(reg_work_amount, 1); jmp(loop_label, T_NEAR); } L(loop_end_label); - // store sum/variance - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - if (jcp_.normalize_variance) { - uni_vmovups(ptr[reg_variance + ur_offset + ur_size * vector_step * sizeof(float)], Vmm(ur_base + 4 + ur_size)); - } else { - if (!isFloatCompatible(jcp_.src_prc)) - uni_vcvtdq2ps(Vmm(ur_base + 4 + ur_size), Vmm(ur_base + 4 + ur_size)); - uni_vmovups(ptr[reg_sum + ur_offset + ur_size * vector_step * sizeof(float)], Vmm(ur_base + 4 + ur_size)); + // store mv vector to memory + vector_worker(store); + + // src advance + mov(rdi, reg_unroll_size); + imul(rdi, rdi, vector_step * jcp_.src_data_size); + add(reg_src, rdi); + // m/v advance + mov(rdi, reg_unroll_size); + imul(rdi, rdi, vlen); + if 
(jcp_.normalize_variance) { + add(reg_mean, rdi); + add(reg_variance, rdi); + } else { + add(reg_sum, rdi); + } + sub(reg_unroll_num, 1); + jmp(label_unroll_num, T_NEAR); + } + L(label_unroll_num_end); + + // tails + L(tail_label); + + Xbyak::Label label_exit; + cmp(reg_tail_num, 0); + je(label_exit, T_NEAR); + + Xbyak::Reg64 reg_tails_num_active = reg_unroll_size; + mov(reg_src_aux, reg_src); + mov(reg_work_amount, reg_work_amount_bk); + + // 4-7 for src, 8-11 for sum, 12-15 for mean. 4 vector for 8/4/2/1 tiles + auto init_tails = [&](int vmm_id, int step) { + uni_vpxor(Vmm(ur_base + 4 + vmm_id), Vmm(ur_base + 4 + vmm_id), Vmm(ur_base + 4 + vmm_id)); + if (jcp_.normalize_variance) { + uni_vmovups(Vmm(ur_base + 8 + vmm_id), ptr[reg_mean]); + add(reg_mean, step * sizeof(float)); + } + }; + auto load_src_tails = [&](int vmm_id, int step) { + int emitter_id = 4; + if (step == 8) { + emitter_id = 1; + } else if (step == 4) { + emitter_id = 2; + } else if (step == 2) { + emitter_id = 3; + } + load_emitter[emitter_id]->emit_code({static_cast(reg_src_aux.getIdx())}, {static_cast(ur_base + vmm_id)}, + {}, {load_pool_gpr_idxs}); + add(reg_src_aux, step * jcp_.src_data_size); + }; + auto mv_tails = [&](int vmm_id, int step) { + if (jcp_.normalize_variance) { + if (!isFloatCompatible(jcp_.src_prc)) { + uni_vcvtdq2ps(Vmm(ur_base + vmm_id), Vmm(ur_base + vmm_id)); } + uni_vsubps(Vmm(ur_base + vmm_id), Vmm(ur_base + vmm_id), Vmm(ur_base + 8 + vmm_id)); + uni_vfmadd231ps(Vmm(ur_base + 4 + vmm_id), Vmm(ur_base + vmm_id), Vmm(ur_base + vmm_id)); + } else { + if (!isFloatCompatible(jcp_.src_prc)) + uni_vpaddd(Vmm(ur_base + 4 + vmm_id), Vmm(ur_base + 4 + vmm_id), Vmm(ur_base + vmm_id)); + else + uni_vaddps(Vmm(ur_base + 4 + vmm_id), Vmm(ur_base + 4 + vmm_id), Vmm(ur_base + vmm_id)); } + }; + auto store_tails = [&](int vmm_id, size_t step) { + if (jcp_.normalize_variance) { + uni_vmovups(ptr[reg_variance], Vmm(ur_base + 4 + vmm_id)); + add(reg_variance, step * sizeof(float)); + } else 
{ + if (!isFloatCompatible(jcp_.src_prc)) + uni_vcvtdq2ps(Vmm(ur_base + 4 + vmm_id), Vmm(ur_base + 4 + vmm_id)); + uni_vmovups(ptr[reg_sum], Vmm(ur_base + 4 + vmm_id)); + add(reg_sum, step * sizeof(float)); + } + }; + + auto tails_worker = [&](std::function func) { + Label tail_blk8_exit_label; + Label tail_blk4_exit_label; + Label tail_blk2_exit_label; + Label tail_blk1_exit_label; + cmp(reg_tails_num_active, 8); + jl(tail_blk8_exit_label, T_NEAR); + func(0, 8); + sub(reg_tails_num_active, 8); + L(tail_blk8_exit_label); + cmp(reg_tails_num_active, 4); + jl(tail_blk4_exit_label, T_NEAR); + func(1, 4); + sub(reg_tails_num_active, 4); + L(tail_blk4_exit_label); + cmp(reg_tails_num_active, 2); + jl(tail_blk2_exit_label, T_NEAR); + func(2, 2); + sub(reg_tails_num_active, 2); + L(tail_blk2_exit_label); + cmp(reg_tails_num_active, 1); + jl(tail_blk1_exit_label, T_NEAR); + func(3, 1); + sub(reg_tails_num_active, 1); + L(tail_blk1_exit_label); + }; + + // init + mov(reg_tails_num_active, reg_tail_num); + tails_worker(init_tails); - add(reg_src, unroll_size_rt * vector_step * jcp_.src_data_size); + Xbyak::Label loop_tail_label; + Xbyak::Label label_tails_end; + + L(loop_tail_label); + { + cmp(reg_work_amount, 0); + jle(label_tails_end, T_NEAR); + + // load src + mov(reg_tails_num_active, reg_tail_num); + tails_worker(load_src_tails); + + // m/v compute + mov(reg_tails_num_active, reg_tail_num); + tails_worker(mv_tails); + + mov(rdi, reg_vector_num); + imul(rdi, rdi, vector_step * jcp_.src_data_size); + add(reg_src_aux, rdi); + sub(reg_work_amount, 1); + jmp(loop_tail_label, T_NEAR); + } + L(label_tails_end); + + // store tails + mov(reg_tails_num_active, reg_tail_num); + tails_worker(store_tails); + + L(label_exit); + } + + inline void block_ker() { + // safe to use abi reg now. + Xbyak::Reg64 reg_src_bk = rcx; + Xbyak::Reg64 reg_work_amount_bk = rdi; + mov(reg_src_bk, reg_src); + mov(reg_work_amount_bk, reg_work_amount); + int repeats = (isa == cpu::x64::sse41) ? 
2 : 1; // block size is also 8 on cpu::x64::sse41 with two step process + + auto reset_with_offset = [&](int offset) { + add(reg_src_bk, offset * jcp_.src_data_size); + mov(reg_src, reg_src_bk); + mov(reg_work_amount, reg_work_amount_bk); + if (jcp_.normalize_variance) { + // mean and vaiance for variance kernel + if (!jcp_.across_channels) { + // mean is bc when across_channel, no need shift + add(reg_mean, offset * sizeof(float)); + uni_vmovups(vmm_mean, ptr[reg_mean]); + } + add(reg_variance, offset * sizeof(float)); + uni_vpxor(vmm_variance, vmm_variance, vmm_variance); + } else { + // sum for mean kernel + add(reg_sum, offset * sizeof(float)); + uni_vpxor(vmm_sum, vmm_sum, vmm_sum); + } + }; + + auto save_result = [&]() { + // add input_base value and store for per_channel + // store for across_channels + if (jcp_.normalize_variance) { + if (!jcp_.across_channels) { + uni_vmovups(vmm_val, ptr[reg_variance]); + uni_vaddps(vmm_variance, vmm_variance, vmm_val); + } + uni_vmovups(ptr[reg_variance], vmm_variance); + } else { + if (!isFloatCompatible(jcp_.src_prc)) // add with int for int-family data type, other compute go with float + uni_vcvtdq2ps(vmm_sum, vmm_sum); + + if (!jcp_.across_channels) { + uni_vmovups(vmm_val, ptr[reg_sum]); + uni_vaddps(vmm_sum, vmm_sum, vmm_val); + } + uni_vmovups(ptr[reg_sum], vmm_sum); + } + }; + + auto worker_tails_unroll = [&]() { + auto unroll_w = [&](int block_num) { + Xbyak::Label loop_label; + Xbyak::Label loop_end_label; + L(loop_label); + { + cmp(reg_work_amount, 0); + jle(loop_end_label, T_NEAR); + + worker_block(block_num, true); + + add(reg_src, src_stride); + sub(reg_work_amount, 1); + + jmp(loop_label, T_NEAR); + } + L(loop_end_label); + }; + auto tails_func = [&](int tile_size) { + unroll_w(tile_size); + save_result(); + reset_with_offset(tile_size); + }; + worker_tails(reg_rt_shape, tails_func); + }; + + // cover vector and tails on avx512, avx2 + // cover on sse, 2 part vector, first part vector and second part 
tails, first part tails + for (int i = 0; i < repeats; i++) { + if (i > 0) { + reset_with_offset(4); + } + + Xbyak::Label label_tails; + Xbyak::Label label_end; + cmp(reg_rt_shape, 0); + jne(label_tails, T_NEAR); + + worker_vector_unroll(); + save_result(); + jmp(label_end, T_NEAR); + + L(label_tails); + { + if (i > 0) { + // empty second half on sse + cmp(reg_rt_shape, 0); + jbe(label_end); + } + + Xbyak::Label label_sse_full_size; + if (isa == cpu::x64::sse41) { + // on sse, first 4 could be done with vector manner + cmp(reg_rt_shape, 4); + jae(label_sse_full_size, T_NEAR); + } + + worker_tails_unroll(); + jmp(label_end, T_NEAR); + + L(label_sse_full_size); + { + worker_vector_unroll(); + save_result(); + sub(reg_rt_shape, 4); + } + } + L(label_end); } } - inline void worker_unroll(bool is_tail = false) { + inline void worker_vector_unroll() { // if mean(sum) for continous data, then fast pass for major part if (!jcp_.normalize_variance && jcp_.layout == MVNLayoutType::mvn_planar) { Vmm vmm_one = Vmm(15); @@ -479,13 +716,9 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k cmp(reg_work_amount, 0); jle(loop_end_label, T_NEAR); - if (jcp_.layout != MVNLayoutType::mvn_planar && is_tail) { - worker_partial(false, false); - } else { - worker_full_size(); - } + worker_full_size(); - add(reg_src, reg_stride); + add(reg_src, src_stride); sub(reg_work_amount, 1); jmp(loop_label, T_NEAR); @@ -494,7 +727,7 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k } inline void worker_full_size() { - load_vector_emitter->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, + load_emitter[VECTOR]->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, {}, {load_pool_gpr_idxs}); if (jcp_.normalize_variance) { @@ -513,37 +746,78 @@ struct jit_uni_mvn_mean_variance_kernel_f32 : public jit_uni_mvn_mean_variance_k } } + inline void worker_tails(Xbyak::Reg64& reg_tail_num, 
std::function func) { + for (int i = 0; i < kTileNum; i++) { + L(tail_start[i]); + { + cmp(reg_tail_num, tile_size[i]); + jl(tail_exit[i], T_NEAR); + + func(tile_size[i]); + + sub(reg_tail_num, tile_size[i]); + jmp(tail_start[i], T_NEAR); + } + L(tail_exit[i]); + } + } + // needed and supported case: 1. scalar with zero pad. 2. tails w/ or w/o zero pad - inline void worker_partial(bool is_scalar, bool is_zero_pad) { - if (is_scalar) { - load_scalar_with_fill_emitter->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, - {}, {load_pool_gpr_idxs}); + inline void worker_block(int block_num, bool is_zero_pad) { + if (is_zero_pad) { + switch (block_num) { + case 8: + load_emitter[TAIL8_FILL]->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, + {}, {load_pool_gpr_idxs}); + break; + case 4: + load_emitter[TAIL4_FILL]->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, + {}, {load_pool_gpr_idxs}); + break; + case 1: + load_emitter[TAIL1_FILL]->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, + {}, {load_pool_gpr_idxs}); + break; + default: + assert(!"MVN layer tails is processed only with 8/4/1 blocks."); + break; + } } else { - if (is_zero_pad) - load_tail_with_fill_emitter->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, - {}, {load_pool_gpr_idxs}); - else - load_tail_emitter->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, - {}, {load_pool_gpr_idxs}); + switch (block_num) { + case 8: + load_emitter[TAIL8]->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, + {}, {load_pool_gpr_idxs}); + break; + case 4: + load_emitter[TAIL4]->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, + {}, {load_pool_gpr_idxs}); + break; + case 1: + load_emitter[TAIL1]->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, + {}, {load_pool_gpr_idxs}); + break; + 
default: + assert(!"MVN layer tails is processed only with 8/4/1 blocks."); + break; + } } if (jcp_.normalize_variance) { if (!isFloatCompatible(jcp_.src_prc)) uni_vcvtdq2ps(vmm_val, vmm_val); uni_vsubps(vmm_val, vmm_val, vmm_mean); if (is_zero_pad) { - int elt_num = is_scalar ? 1 : tail_step; uni_vpxor(vmm_zero, vmm_zero, vmm_zero); if (isa == cpu::x64::sse41) { uint8 imm = 1; - imm = ~((imm << elt_num) - imm); + imm = ~((imm << block_num) - imm); blendps(vmm_val, vmm_zero, imm); } else if (isa == cpu::x64::avx2) { uint8 imm = 1; - imm = ~((imm << elt_num) - imm); + imm = ~((imm << block_num) - imm); vblendps(vmm_val, vmm_val, vmm_zero, imm); } else if (isa == cpu::x64::avx512_core) { uint64_t tail_mask = 1; - tail_mask = ~((tail_mask << elt_num) - tail_mask); + tail_mask = ~((tail_mask << block_num) - tail_mask); mov(reg_aux, tail_mask); kmovq(k_mask, reg_aux); vblendmps(vmm_val | k_mask, vmm_val, vmm_zero); @@ -658,13 +932,16 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator } } - tail_step = jcp_.layout == MVNLayoutType::mvn_planar ? 
(jcp_.D * jcp_.H * jcp_.W) - ((jcp_.D * jcp_.H * jcp_.W) / vector_step) * vector_step : - jcp_.C - (jcp_.C / vector_step) * vector_step; - - load_vector_emitter.reset(new jit_load_emitter(this, isa, jcp_.src_prc, Precision::FP32, vector_step)); - load_tail_emitter.reset(new jit_load_emitter(this, isa, jcp_.src_prc, Precision::FP32, tail_step)); - store_vector_emitter.reset(new jit_store_emitter(this, isa, Precision::FP32, jcp_.dst_prc, vector_step)); - store_tail_emitter.reset(new jit_store_emitter(this, isa, Precision::FP32, jcp_.dst_prc, tail_step)); + load_emitter[VECTOR] = std::unique_ptr(new jit_load_emitter(this, isa, jcp_.src_prc, Precision::FP32, vector_step)); + load_emitter[TAIL8] = std::unique_ptr(new jit_load_emitter(this, isa, jcp_.src_prc, Precision::FP32, 8)); + load_emitter[TAIL4] = std::unique_ptr(new jit_load_emitter(this, isa, jcp_.src_prc, Precision::FP32, 4)); + load_emitter[TAIL2] = std::unique_ptr(new jit_load_emitter(this, isa, jcp_.src_prc, Precision::FP32, 2)); + load_emitter[TAIL1] = std::unique_ptr(new jit_load_emitter(this, isa, jcp_.src_prc, Precision::FP32, 1)); + store_emitter[VECTOR] = std::unique_ptr(new jit_store_emitter(this, isa, Precision::FP32, jcp_.dst_prc, vector_step)); + store_emitter[TAIL8] = std::unique_ptr(new jit_store_emitter(this, isa, Precision::FP32, jcp_.dst_prc, 8)); + store_emitter[TAIL4] = std::unique_ptr(new jit_store_emitter(this, isa, Precision::FP32, jcp_.dst_prc, 4)); + store_emitter[TAIL2] = std::unique_ptr(new jit_store_emitter(this, isa, Precision::FP32, jcp_.dst_prc, 2)); + store_emitter[TAIL1] = std::unique_ptr(new jit_store_emitter(this, isa, Precision::FP32, jcp_.dst_prc, 1)); this->preamble(); @@ -675,10 +952,13 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator mov(reg_variance_inv, ptr[reg_params + GET_OFF(variance)]); mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); - mov(reg_src_stride, ptr[reg_params + 
GET_OFF(src_stride)]); - mov(reg_dst_stride, ptr[reg_params + GET_OFF(dst_stride)]); + mov(reg_rt_shape, ptr[reg_params + GET_OFF(rt_shape_size)]); mov(reg_oc_off, ptr[reg_params + GET_OFF(oc_off)]); + size_t data_step = (isa == cpu::x64::sse41 && jcp_.layout == MVNLayoutType::mvn_block) ? vector_step * 2 : vector_step; + src_stride = data_step * jcp_.src_data_size; + dst_stride = data_step * jcp_.dst_data_size; + if (jcp_.layout == MVNLayoutType::mvn_planar || jcp_.across_channels) { uni_vbroadcastss(vmm_mean, ptr[reg_mean]); if (jcp_.normalize_variance) @@ -696,74 +976,29 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator store_pool_vec_idxs = {static_cast(vmm_zero.getIdx()), static_cast(vmm_val.getIdx())}; if (jcp_.layout == MVNLayoutType::mvn_planar) { - worker_mvn_unroll(); - if (tail_step != 0) { - worker_mvn(true); - } + worker_mvn_vector_unroll(reg_work_amount); + // tails + auto tails_func = [&](int tile_size) { + worker_mvn_block(tile_size); + add(reg_src, tile_size * jcp_.src_data_size); + add(reg_dst, tile_size * jcp_.dst_data_size); + }; + worker_mvn_tails(reg_rt_shape, tails_func); } else if (jcp_.layout == MVNLayoutType::mvn_by_channel) { if (jcp_.across_channels) norm_nspc_ac_ker(); else norm_nspc_pc_ker(); } else { - // blk - int repeats = (isa == cpu::x64::sse41) ? 
2 : 1; // block size is also 8 on cpu::x64::sse41 - for (int i = 0; i < repeats; i++) { - int offset_sse42 = i * 4; - if (i > 0) { - // reset modified input - mov(reg_src, ptr[reg_params + GET_OFF(src)]); - mov(reg_dst, ptr[reg_params + GET_OFF(dst)]); - mov(reg_work_amount, ptr[reg_params + GET_OFF(work_amount)]); - - add(reg_src, offset_sse42 * jcp_.src_data_size); - add(reg_dst, offset_sse42 * jcp_.dst_data_size); - add(reg_oc_off, offset_sse42 * sizeof(float)); - - if (!jcp_.across_channels) { - add(reg_mean, offset_sse42 * sizeof(float)); - uni_vmovups(vmm_mean, ptr[reg_mean]); - if (jcp_.normalize_variance) { - add(reg_variance_inv, offset_sse42 * sizeof(float)); - uni_vmovups(vmm_variance_inv, ptr[reg_variance_inv]); - } - } - } - - Xbyak::Label label_empty_2half_sse42; - if (tail_step == 0) { - cmp(reg_oc_off, static_cast(jcp_.C * sizeof(float))); - jae(label_empty_2half_sse42, T_NEAR); - worker_mvn_unroll(); - } else { - cmp(reg_oc_off, static_cast(jcp_.C * sizeof(float))); - jae(label_empty_2half_sse42, T_NEAR); - - Xbyak::Label label_full_size_block; - Xbyak::Label label_size_end; - - cmp(reg_oc_off, static_cast((jcp_.C - vector_step) * sizeof(float))); - jle(label_full_size_block, T_NEAR); - - worker_mvn_unroll(true); - jmp(label_size_end, T_NEAR); - - L(label_full_size_block); - { - worker_mvn_unroll(); - } - L(label_size_end); - } - L(label_empty_2half_sse42); - } + norm_block_ker(); } this->postamble(); - load_vector_emitter->emit_data(); - load_tail_emitter->emit_data(); - store_vector_emitter->emit_data(); - store_tail_emitter->emit_data(); + for (size_t i = 0; i < EMITTERS_NUM; i++) + load_emitter[i]->emit_data(); + for (size_t i = 0; i < EMITTERS_NUM; i++) + store_emitter[i]->emit_data(); for (auto& inj : eltwise_injectors) inj->prepare_table(); @@ -775,25 +1010,26 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator const int vlen = cpu_isa_traits::vlen; const int vector_step = vlen / sizeof(float); - int tail_step 
= 0; Xbyak::Reg64 reg_src = r8; Xbyak::Reg64 reg_mean = r9; Xbyak::Reg64 reg_variance_inv = r10; Xbyak::Reg64 reg_dst = r11; Xbyak::Reg64 reg_work_amount = r12; - Xbyak::Reg64 reg_src_stride = r13; - Xbyak::Reg64 reg_dst_stride = r14; Xbyak::Reg64 reg_params = abi_param1; - Xbyak::Reg64 reg_oc_off = rax; + Xbyak::Reg64 reg_oc_off = r13; Xbyak::Reg64 reg_d_weights = rbx; - Xbyak::Reg64 reg_d_bias = rdx; + Xbyak::Reg64 reg_d_bias = r14; Xbyak::Reg64 reg_post_ops_data = rsi; - Xbyak::Reg64 reg_load_table = r15; + Xbyak::Reg64 reg_rt_shape = r15; + Xbyak::Reg64 reg_load_table = r15; // fill not needed, dummy Xbyak::Reg64 reg_load_store_mask = rbp; + size_t src_stride = 0; + size_t dst_stride = 0; + Vmm vmm_val = Vmm(3); Vmm vmm_mean = Vmm(4); Vmm vmm_variance_inv = Vmm(5); @@ -802,72 +1038,289 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator Vmm vmm_d_weights = Vmm(0); Vmm vmm_d_bias = Vmm(1); - std::unique_ptr load_vector_emitter = nullptr; - std::unique_ptr load_tail_emitter = nullptr; - std::unique_ptr store_vector_emitter = nullptr; - std::unique_ptr store_tail_emitter = nullptr; + enum { VECTOR, TAIL8, TAIL4, TAIL2, TAIL1, EMITTERS_NUM }; + std::unique_ptr load_emitter[EMITTERS_NUM]; + std::unique_ptr store_emitter[EMITTERS_NUM]; + std::vector store_pool_gpr_idxs; + std::vector store_pool_vec_idxs; + std::vector load_pool_gpr_idxs; + + // used for tails process(except nspc&&per_channel) + Label tail_start[kTileNum]; + Label tail_exit[kTileNum]; + const int tile_size[kTileNum] = {8, 4, 1}; std::vector>> eltwise_injectors; std::vector>> depthwise_injectors; std::vector>> quantization_injectors; - std::vector store_pool_gpr_idxs; - std::vector store_pool_vec_idxs; - std::vector load_pool_gpr_idxs; + inline void norm_block_ker() { + Xbyak::Reg64 reg_src_bk = rax; + Xbyak::Reg64 reg_dst_bk = rdx; + Xbyak::Reg64 reg_work_amount_bk = rdi; + mov(reg_src_bk, reg_src); + mov(reg_dst_bk, reg_dst); + mov(reg_work_amount_bk, reg_work_amount); 
+ + auto reset_with_offset = [&](int offset) { + add(reg_src_bk, offset * jcp_.src_data_size); + add(reg_dst_bk, offset * jcp_.dst_data_size); + add(reg_oc_off, offset * sizeof(float)); // for post ops + mov(reg_src, reg_src_bk); + mov(reg_dst, reg_dst_bk); + mov(reg_work_amount, reg_work_amount_bk); + if (!jcp_.across_channels) { + add(reg_mean, offset * sizeof(float)); + uni_vmovups(vmm_mean, ptr[reg_mean]); + if (jcp_.normalize_variance) { + add(reg_variance_inv, offset * sizeof(float)); + uni_vmovups(vmm_variance_inv, ptr[reg_variance_inv]); + } + } + }; + + // unroll for block layout, w/o zero pading + auto worker_tails_unroll = [&]() { + auto unroll_w = [&](int block_num) { + Xbyak::Label loop_label; + Xbyak::Label loop_end_label; + L(loop_label); + { + cmp(reg_work_amount, 0); + jle(loop_end_label, T_NEAR); + + worker_mvn_block(block_num); + + add(reg_src, src_stride); + add(reg_dst, dst_stride); + sub(reg_work_amount, 1); + + jmp(loop_label, T_NEAR); + } + L(loop_end_label); + }; + auto tails_func = [&](int tile_size) { + unroll_w(tile_size); + reset_with_offset(tile_size); + }; + worker_mvn_tails(reg_rt_shape, tails_func); + }; + + // cover vector and tails on avx512, avx2 + // cover on sse, 2 part vector, first part vector and second part tails, first part tails + int repeats = (isa == cpu::x64::sse41) ? 
2 : 1; + for (int i = 0; i < repeats; i++) { + if (i > 0) { + reset_with_offset(4); + } + + Xbyak::Label label_tails; + Xbyak::Label label_end; + cmp(reg_rt_shape, 0); + jne(label_tails, T_NEAR); + + worker_mvn_vector_unroll(reg_work_amount); + jmp(label_end, T_NEAR); + + L(label_tails); + { + if (i > 0) { + // empty second half on sse + cmp(reg_rt_shape, 0); + jbe(label_end); + } + + Xbyak::Label label_sse_full_size; + if (isa == cpu::x64::sse41) { + // on sse, first 4 could be done with vector manner + cmp(reg_rt_shape, 4); + jae(label_sse_full_size, T_NEAR); + } + + worker_tails_unroll(); + jmp(label_end, T_NEAR); + + L(label_sse_full_size); + { + worker_mvn_vector_unroll(reg_work_amount); + sub(reg_rt_shape, 4); + } + } + L(label_end); + } + } // nspc norm per channel with unroll inline void norm_nspc_pc_ker() { - // 4 unroll vector - size_t unroll_size = 4; - size_t vec_num = div_up(jcp_.C, vector_step); - unroll_size = vec_num >= unroll_size ? unroll_size : vec_num; - size_t unroll_number = div_up(vec_num, unroll_size); + // stack used as no more GPR. 
+ const int gpr_size = 8; + sub(rsp, 7 * gpr_size); + const Xbyak::Address addr_unroll_size = qword[rsp]; + const Xbyak::Address addr_unroll_num = qword[rsp + 8]; + const Xbyak::Address addr_vector_num = qword[rsp + 16]; + const Xbyak::Address addr_tail_num = qword[rsp + 24]; + const Xbyak::Address addr_last_unroll_size = qword[rsp + 32]; + const Xbyak::Address addr_work_amount_bk = qword[rsp + 40]; + const Xbyak::Address addr_oc_off_bk = qword[rsp + 48]; + + // size_t vec_num = C / vector_step + mov(rax, reg_rt_shape); + mov(addr_vector_num, vector_step); + xor_(rdx, rdx); + div(addr_vector_num); // reg_rt_shape / vector_step, rax is result, rdx is tails + mov(addr_vector_num, rax); + mov(addr_tail_num, rdx); + + // should before tail jmp + Xbyak::Reg64 reg_src_aux = rcx; + Xbyak::Reg64 reg_dst_aux = rdi; + mov(addr_work_amount_bk, reg_work_amount); + mov(addr_oc_off_bk, reg_oc_off); + + Xbyak::Label tail_label; + cmp(addr_vector_num, 0); + je(tail_label, T_NEAR); + + // unroll_size = vec_num >= unroll_size ? 
unroll_size : vec_num; + mov(addr_unroll_size, 4); // default is 4 for addr_unroll_size + mov(rax, addr_unroll_size); + Xbyak::Label label_reset_unroll_size_end; + cmp(rax, addr_vector_num); + jle(label_reset_unroll_size_end, T_NEAR); + mov(rax, addr_vector_num); + mov(addr_unroll_size, rax); + L(label_reset_unroll_size_end); + + // last unroll_size: vector_num % unroll_size + Xbyak::Label label_reset_last_unroll_size; + Xbyak::Label label_reset_last_unroll_size_end; + mov(rax, addr_vector_num); + xor_(rdx, rdx); + div(addr_unroll_size); // rdx + cmp(rdx, 0); + je(label_reset_last_unroll_size, T_NEAR); + mov(addr_last_unroll_size, rdx); + jmp(label_reset_last_unroll_size_end); + L(label_reset_last_unroll_size); + { + mov(rax, addr_unroll_size); + mov(addr_last_unroll_size, rax); + } + L(label_reset_last_unroll_size_end); + + // unroll_number = div_up(vec_num, unroll_size) --> (vec_num + unroll_size - 1) / unroll_size; + mov(rax, addr_vector_num); + add(rax, addr_unroll_size); + sub(rax, 1); + xor_(rdx, rdx); + div(addr_unroll_size); + mov(addr_unroll_num, rax); int ur_base = 4; - Xbyak::Reg64 reg_src_aux = reg_src_stride; - Xbyak::Reg64 reg_dst_aux = reg_dst_stride; - // 2 abi - Xbyak::Reg64 reg_work_amount_bk = rcx; - Xbyak::Reg64 reg_oc_off_bk = rdi; - mov(reg_oc_off_bk, reg_oc_off); - mov(reg_work_amount_bk, reg_work_amount); - for (size_t ur_num = 0; ur_num < unroll_number; ur_num++) { - // 4-15 for unroll. 
4-7 for src, 8-11 for m, 12-15 for v - int ur_offset_elt = ur_num * unroll_size * vector_step; - int ur_offset = ur_offset_elt * sizeof(float); - size_t unroll_size_rt = std::min(vec_num - ur_num * unroll_size, unroll_size); - size_t elt_num = std::min(jcp_.C - ur_num * unroll_size * vector_step, unroll_size * vector_step); - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - uni_vmovups(Vmm(ur_base + 4 + ur_size), ptr[reg_mean + ur_offset + ur_size * vector_step * sizeof(float)]); + auto load_mv = [&](int vmm_id, int step) { + uni_vmovups(Vmm(ur_base + 4 + vmm_id), ptr[reg_mean]); + add(reg_mean, step * sizeof(float)); + if (jcp_.normalize_variance) { + uni_vmovups(Vmm(ur_base + 8 + vmm_id), ptr[reg_variance_inv]); + add(reg_variance_inv, step * sizeof(float)); } + }; + + // optimized scaleshift fusion data init + int ss_repeat_id = 0; + auto load_weight_bias = [&](int vmm_id, int step) { + uni_vmovups(Vmm(16 + ss_repeat_id * 4 + vmm_id), ptr[reg_d_weights]); + add(reg_d_weights, step * sizeof(float)); + uni_vmovups(Vmm(24 + ss_repeat_id * 4 + vmm_id), ptr[reg_d_bias]); + add(reg_d_bias, step * sizeof(float)); + }; + + auto load_src = [&](int vmm_id, int step) { + load_emitter[VECTOR]->emit_code({static_cast(reg_src_aux.getIdx())}, + {static_cast(ur_base + vmm_id)}, {}, {load_pool_gpr_idxs}); + add(reg_src_aux, step * jcp_.src_data_size); + }; + + auto norm = [&](int vmm_id, int step) { + uni_vsubps(Vmm(ur_base + vmm_id), Vmm(ur_base + vmm_id), Vmm(ur_base + 4 + vmm_id)); if (jcp_.normalize_variance) { - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - uni_vmovups(Vmm(ur_base + 8 + ur_size), ptr[reg_variance_inv + ur_offset + ur_size * vector_step * sizeof(float)]); - } + uni_vmulps(Vmm(ur_base + vmm_id), Vmm(ur_base + vmm_id), Vmm(ur_base + 8 + vmm_id)); } - // optimized scaleshift + }; + + // optimized scaleshift fusion + auto optimized_ss = [&](int vmm_id, int step) { + uni_vfmadd132ps(Vmm(ur_base + vmm_id), Vmm(24 + ss_repeat_id 
* 4 + vmm_id), Vmm(16 + ss_repeat_id * 4 + vmm_id)); + }; + + auto post_ops = [&](int vmm_id, int step) { + apply_post_ops(jcp_.dst_prc, ur_base + vmm_id, false); + add(reg_oc_off, step * sizeof(float)); + }; + + auto store_dst = [&](int vmm_id, int step) { + store_emitter[VECTOR]->emit_code({static_cast(ur_base + vmm_id)}, {static_cast(reg_dst_aux.getIdx())}, + {store_pool_vec_idxs}, {store_pool_gpr_idxs}); + add(reg_dst_aux, step * jcp_.dst_data_size); + }; + + auto vector_worker = [&](std::function func) { + Xbyak::Label label_end; + func(0, vector_step); + cmp(addr_unroll_size, 1); + jle(label_end, T_NEAR); + func(1, vector_step); + cmp(addr_unroll_size, 2); + jle(label_end, T_NEAR); + func(2, vector_step); + cmp(addr_unroll_size, 3); + jle(label_end, T_NEAR); + func(3, vector_step); + L(label_end); + }; + + Xbyak::Label label_unroll_num; + Xbyak::Label label_unroll_num_end; + L(label_unroll_num); + { + cmp(addr_unroll_num, 0); + jle(label_unroll_num_end, T_NEAR); + + Xbyak::Label label_not_last; + cmp(addr_unroll_num, 1); + jne(label_not_last, T_NEAR); + mov(rax, addr_last_unroll_size); + mov(addr_unroll_size, rax); + L(label_not_last); + + mov(reg_src_aux, reg_src); + mov(reg_dst_aux, reg_dst); + mov(reg_work_amount, addr_work_amount_bk); + + // 4-15 for unroll. 4-7 for src, 8-11 for m, 12-15 for v + // load m/v + vector_worker(load_mv); + + // optimized scaleshift fusion arg init. 16-23 for weight, 24-31 for bias. 
+ // reg_post_ops_data[0]:----w0---- ----b0---- reg_post_ops_data[1]:----w1---- ----b1---- + mov(reg_oc_off, addr_oc_off_bk); size_t post_ops_data_offset = 0; + ss_repeat_id = 0; for (int i = 0; i < optimized_scaleshift_num; i++) { mov(reg_d_weights, ptr[reg_post_ops_data + post_ops_data_offset]); - add(reg_d_weights, ur_offset); - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - uni_vmovups(Vmm(16 + i * 4 + ur_size), ptr[reg_d_weights]); - add(reg_d_weights, vector_step * sizeof(float)); - } - mov(reg_d_bias, ptr[reg_post_ops_data + post_ops_data_offset]); - add(reg_d_bias, ur_offset + jcp_.C * sizeof(float)); - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - uni_vmovups(Vmm(24 + i * 4 + ur_size), ptr[reg_d_bias]); - add(reg_d_bias, vector_step * sizeof(float)); - } + add(reg_d_weights, reg_oc_off); + // bias = weight + C + mov(reg_d_bias, reg_d_weights); + mov(rax, reg_rt_shape); + imul(rax, rax, sizeof(float)); + add(reg_d_bias, rax); + + vector_worker(load_weight_bias); + post_ops_data_offset += sizeof(float*); + ss_repeat_id++; } - mov(reg_src_aux, reg_src); - mov(reg_dst_aux, reg_dst); - mov(reg_work_amount, reg_work_amount_bk); - mov(reg_oc_off, reg_oc_off_bk); - Xbyak::Label loop_label; Xbyak::Label loop_end_label; L(loop_label); @@ -875,81 +1328,220 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator cmp(reg_work_amount, 0); jle(loop_end_label, T_NEAR); - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - bool is_tails = ur_offset_elt + ur_size * vector_step + vector_step > static_cast(jcp_.C); - if (is_tails) { - load_tail_emitter->emit_code({static_cast(reg_src_aux.getIdx())}, - {static_cast(ur_base + ur_size)}, {}, {load_pool_gpr_idxs}); - add(reg_src_aux, tail_step * jcp_.src_data_size); - } else { - load_vector_emitter->emit_code({static_cast(reg_src_aux.getIdx())}, - {static_cast(ur_base + ur_size)}, {}, {load_pool_gpr_idxs}); - add(reg_src_aux, vector_step * 
jcp_.src_data_size); - } - } - add(reg_src_aux, (jcp_.C - elt_num) * jcp_.src_data_size); + // load + vector_worker(load_src); + + // to next iteration(next work_amount) + mov(rax, addr_unroll_size); + imul(rax, rax, vector_step * jcp_.src_data_size); + sub(reg_src_aux, rax); + mov(rax, reg_rt_shape); + imul(rax, rax, jcp_.src_data_size); + add(reg_src_aux, rax); prefetcht0(ptr[reg_src_aux]); - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - uni_vsubps(Vmm(ur_base + ur_size), Vmm(ur_base + ur_size), Vmm(ur_base + 4 + ur_size)); - } - if (jcp_.normalize_variance) { - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - uni_vmulps(Vmm(ur_base + ur_size), Vmm(ur_base + ur_size), Vmm(ur_base + 8 + ur_size)); - } - } + // norm + vector_worker(norm); + // optimized ss fusion + ss_repeat_id = 0; for (int i = 0; i < optimized_scaleshift_num; i++) { - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - uni_vfmadd132ps(Vmm(ur_base + ur_size), Vmm(24 + i * 4 + ur_size), Vmm(16 + i * 4 + ur_size)); - } + vector_worker(optimized_ss); + ss_repeat_id++; } + // post-ops if (attr_.post_ops_.len() != 0) { - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - apply_post_ops(jcp_.dst_prc, ur_base + ur_size, false); - bool is_tails = ur_offset_elt + ur_size * vector_step + vector_step > static_cast(jcp_.C); - if (is_tails) - add(reg_oc_off, tail_step * sizeof(float)); - else - add(reg_oc_off, vector_step * sizeof(float)); - } + vector_worker(post_ops); } - for (size_t ur_size = 0; ur_size < unroll_size_rt; ur_size++) { - bool is_tails = ur_offset_elt + ur_size * vector_step + vector_step > static_cast(jcp_.C); - if (is_tails) { - store_tail_emitter->emit_code({static_cast(ur_base + ur_size)}, {static_cast(reg_dst_aux.getIdx())}, - {store_pool_vec_idxs}, {store_pool_gpr_idxs}); - add(reg_dst_aux, tail_step * jcp_.dst_data_size); - } else { - store_vector_emitter->emit_code({static_cast(ur_base + ur_size)}, 
{static_cast(reg_dst_aux.getIdx())}, - {store_pool_vec_idxs}, {store_pool_gpr_idxs}); - add(reg_dst_aux, vector_step * jcp_.dst_data_size); - } - } + // store + vector_worker(store_dst); + + // dst advance + mov(rax, addr_unroll_size); + imul(rax, rax, vector_step * jcp_.dst_data_size); + sub(reg_dst_aux, rax); + mov(rax, reg_rt_shape); + imul(rax, rax, jcp_.dst_data_size); + add(reg_dst_aux, rax); + prefetcht0(ptr[reg_dst_aux]); + + // reg_oc_off reset + mov(rax, addr_unroll_size); + imul(rax, rax, vector_step * sizeof(float)); + sub(reg_oc_off, rax); - add(reg_dst_aux, (jcp_.C - elt_num) * jcp_.dst_data_size); - sub(reg_oc_off, elt_num * sizeof(float)); sub(reg_work_amount, 1); jmp(loop_label, T_NEAR); } L(loop_end_label); - add(reg_src, unroll_size_rt * vector_step * jcp_.src_data_size); - add(reg_dst, unroll_size_rt * vector_step * jcp_.dst_data_size); - add(reg_oc_off_bk, unroll_size_rt * vector_step * sizeof(float)); + // src/dst advance + mov(rax, addr_unroll_size); + imul(rdx, rax, vector_step * jcp_.src_data_size); + add(reg_src, rdx); + imul(rdx, rax, vector_step * jcp_.dst_data_size); + add(reg_dst, rdx); + imul(rdx, rax, vector_step * sizeof(float)); + add(addr_oc_off_bk, rdx); + + sub(addr_unroll_num, 1); + jmp(label_unroll_num, T_NEAR); + } + L(label_unroll_num_end); + + // tails + L(tail_label); + + Xbyak::Label label_exit; + cmp(addr_tail_num, 0); + je(label_exit, T_NEAR); + + mov(reg_src_aux, reg_src); + mov(reg_dst_aux, reg_dst); + mov(reg_work_amount, addr_work_amount_bk); + Xbyak::Reg64 reg_tails_num_active = rdx; + mov(reg_tails_num_active, addr_tail_num); + + auto get_tile_emitter_id = [&](const int& step) -> int { + int emitter_id = 4; + if (step == 8) { + emitter_id = 1; + } else if (step == 4) { + emitter_id = 2; + } else if (step == 2) { + emitter_id = 3; + } + return emitter_id; + }; + auto load_src_tails = [&](int vmm_id, int step) { + int emitter_id = get_tile_emitter_id(step); + 
load_emitter[emitter_id]->emit_code({static_cast(reg_src_aux.getIdx())}, + {static_cast(ur_base + vmm_id)}, {}, {load_pool_gpr_idxs}); + add(reg_src_aux, step * jcp_.src_data_size); + }; + auto store_tails = [&](int vmm_id, int step) { + int emitter_id = get_tile_emitter_id(step); + store_emitter[emitter_id]->emit_code({static_cast(ur_base + vmm_id)}, {static_cast(reg_dst_aux.getIdx())}, + {store_pool_vec_idxs}, {store_pool_gpr_idxs}); + add(reg_dst_aux, step * jcp_.dst_data_size); + }; + auto tails_worker = [&](std::function func) { + Label tail_blk8_exit_label; + Label tail_blk4_exit_label; + Label tail_blk2_exit_label; + Label tail_blk1_exit_label; + cmp(reg_tails_num_active, 8); + jl(tail_blk8_exit_label, T_NEAR); + func(0, 8); + sub(reg_tails_num_active, 8); + L(tail_blk8_exit_label); + cmp(reg_tails_num_active, 4); + jl(tail_blk4_exit_label, T_NEAR); + func(1, 4); + sub(reg_tails_num_active, 4); + L(tail_blk4_exit_label); + cmp(reg_tails_num_active, 2); + jl(tail_blk2_exit_label, T_NEAR); + func(2, 2); + sub(reg_tails_num_active, 2); + L(tail_blk2_exit_label); + cmp(reg_tails_num_active, 1); + jl(tail_blk1_exit_label, T_NEAR); + func(3, 1); + sub(reg_tails_num_active, 1); + L(tail_blk1_exit_label); + }; + + // load m/v m:8-11, v:12-15 + tails_worker(load_mv); + + // optimized scaleshift. 16-23 for weight, 24-31 for bias. 
+ // reg_post_ops_data[0]:----w0---- ----b0---- reg_post_ops_data[1]:----w1---- ----b1---- + mov(reg_oc_off, addr_oc_off_bk); + size_t post_ops_data_offset = 0; + ss_repeat_id = 0; + for (int i = 0; i < optimized_scaleshift_num; i++) { + mov(reg_tails_num_active, addr_tail_num); + mov(reg_d_weights, ptr[reg_post_ops_data + post_ops_data_offset]); + add(reg_d_weights, reg_oc_off); + // bias = weight + C + mov(reg_d_bias, reg_d_weights); + mov(rax, reg_rt_shape); + imul(rax, rax, sizeof(float)); + add(reg_d_bias, rax); + + tails_worker(load_weight_bias); + + post_ops_data_offset += sizeof(float*); + ss_repeat_id++; + } + + Xbyak::Label loop_tails_label; + Xbyak::Label loop_tails_end_label; + L(loop_tails_label); + { + cmp(reg_work_amount, 0); + jle(loop_tails_end_label, T_NEAR); + mov(reg_tails_num_active, addr_tail_num); + + tails_worker(load_src_tails); + + // to next iteration(next work_amount) + mov(rax, addr_vector_num); + imul(rax, rax, vector_step * jcp_.src_data_size); + add(reg_src_aux, rax); + + // norm + mov(reg_tails_num_active, addr_tail_num); + tails_worker(norm); + + // optimized scaleShift + ss_repeat_id = 0; + for (int i = 0; i < optimized_scaleshift_num; i++) { + mov(reg_tails_num_active, addr_tail_num); + tails_worker(optimized_ss); + ss_repeat_id++; + } + + // post-ops + if (attr_.post_ops_.len() != 0) { + mov(reg_tails_num_active, addr_tail_num); + tails_worker(post_ops); + } + + // store + mov(reg_tails_num_active, addr_tail_num); + tails_worker(store_tails); + + // dst advance + mov(rax, reg_rt_shape); + sub(rax, addr_tail_num); + imul(rax, rax, jcp_.dst_data_size); + add(reg_dst_aux, rax); + + // reg_oc_off reset + mov(rax, addr_tail_num); + imul(rax, rax, sizeof(float)); + sub(reg_oc_off, rax); + + sub(reg_work_amount, 1); + jmp(loop_tails_label, T_NEAR); } + L(loop_tails_end_label); + L(label_exit); + add(rsp, 7 * gpr_size); } inline void norm_nspc_ac_ker() { - Xbyak::Reg64 reg_oc_off_bk = reg_src_stride; + Xbyak::Reg64 reg_rt_shape_bk = 
rdx; + Xbyak::Reg64 reg_oc_off_bk = rax; + mov(reg_rt_shape_bk, reg_rt_shape); if (attr_.post_ops_.len() != 0) { mov(reg_oc_off_bk, reg_oc_off); } - size_t vec_num = div_up(jcp_.C, vector_step); - Xbyak::Label loop_label; Xbyak::Label loop_end_label; L(loop_label); @@ -957,25 +1549,20 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator cmp(reg_work_amount, 0); jle(loop_end_label, T_NEAR); + mov(reg_rt_shape, reg_rt_shape_bk); if (attr_.post_ops_.len() != 0) { mov(reg_oc_off, reg_oc_off_bk); } - for (size_t v_num = 0; v_num < vec_num; v_num++) { - bool is_tail = (v_num * vector_step + vector_step > static_cast(jcp_.C)) ? true : false; - worker_mvn(is_tail); - if (is_tail) { - add(reg_src, tail_step * jcp_.src_data_size); - add(reg_dst, tail_step * jcp_.dst_data_size); - if (attr_.post_ops_.len() != 0) - add(reg_oc_off, tail_step * sizeof(float)); - } else { - add(reg_src, vector_step * jcp_.src_data_size); - add(reg_dst, vector_step * jcp_.dst_data_size); - if (attr_.post_ops_.len() != 0) - add(reg_oc_off, vector_step * sizeof(float)); - } - } + worker_mvn_vector_unroll(reg_rt_shape); + auto tails_func = [&](int tile_size) { + worker_mvn_block(tile_size); + add(reg_src, tile_size * jcp_.src_data_size); + add(reg_dst, tile_size * jcp_.dst_data_size); + if (attr_.post_ops_.len() != 0) + add(reg_oc_off, tile_size * sizeof(float)); + }; + worker_mvn_tails(reg_rt_shape, tails_func); sub(reg_work_amount, 1); jmp(loop_label, T_NEAR); @@ -983,11 +1570,34 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator L(loop_end_label); } - inline void worker_mvn(bool is_tail) { - const auto& load_emitter = is_tail ? load_tail_emitter : load_vector_emitter; - const auto& store_emitter = is_tail ? 
store_tail_emitter : store_vector_emitter; + inline void worker_mvn_vector_unroll(Xbyak::Reg64& reg_work_amount) { + Xbyak::Label mvn_loop_label; + Xbyak::Label mvn_loop_end_label; + + int step_sub = jcp_.layout == MVNLayoutType::mvn_by_channel ? vector_step : 1; + int step_left = jcp_.layout == MVNLayoutType::mvn_by_channel ? vector_step : 0; + + L(mvn_loop_label); + { + cmp(reg_work_amount, step_left); + jle(mvn_loop_end_label, T_NEAR); + + worker_mvn_vector(); - load_emitter->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, + add(reg_src, src_stride); + add(reg_dst, dst_stride); + if (jcp_.layout == MVNLayoutType::mvn_by_channel && attr_.post_ops_.len() != 0) + add(reg_oc_off, vector_step * sizeof(float)); + + sub(reg_work_amount, step_sub); + + jmp(mvn_loop_label, T_NEAR); + } + L(mvn_loop_end_label); + } + + inline void worker_mvn_vector() { + load_emitter[VECTOR]->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, {}, {load_pool_gpr_idxs}); uni_vsubps(vmm_val, vmm_val, vmm_mean); @@ -996,28 +1606,68 @@ struct jit_uni_mvn_kernel_f32 : public jit_uni_mvn_kernel, public jit_generator apply_post_ops(jcp_.dst_prc, vmm_val.getIdx(), jcp_.layout == MVNLayoutType::mvn_planar); - store_emitter->emit_code({static_cast(vmm_val.getIdx())}, {static_cast(reg_dst.getIdx())}, + store_emitter[VECTOR]->emit_code({static_cast(vmm_val.getIdx())}, {static_cast(reg_dst.getIdx())}, {store_pool_vec_idxs}, {store_pool_gpr_idxs}); } - inline void worker_mvn_unroll(bool is_tail = false) { - Xbyak::Label mvn_loop_label; - Xbyak::Label mvn_loop_end_label; + inline void worker_mvn_tails(Xbyak::Reg64& reg_tail_num, std::function func) { + for (int i = 0; i < kTileNum; i++) { + L(tail_start[i]); + { + cmp(reg_tail_num, tile_size[i]); + jl(tail_exit[i], T_NEAR); - L(mvn_loop_label); - { - cmp(reg_work_amount, 0); - jle(mvn_loop_end_label, T_NEAR); + func(tile_size[i]); - worker_mvn(is_tail); + sub(reg_tail_num, tile_size[i]); + 
jmp(tail_start[i], T_NEAR); + } + L(tail_exit[i]); + } + } - add(reg_src, reg_src_stride); - add(reg_dst, reg_dst_stride); - sub(reg_work_amount, 1); + inline void worker_mvn_block(int block_num) { + switch (block_num) { + case 8: + load_emitter[TAIL8]->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, + {}, {load_pool_gpr_idxs}); + break; + case 4: + load_emitter[TAIL4]->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, + {}, {load_pool_gpr_idxs}); + break; + case 1: + load_emitter[TAIL1]->emit_code({static_cast(reg_src.getIdx())}, {static_cast(vmm_val.getIdx())}, + {}, {load_pool_gpr_idxs}); + break; + default: + assert(!"MVN layer tails is processed only with 8/4/1 blocks."); + break; + } - jmp(mvn_loop_label, T_NEAR); + uni_vsubps(vmm_val, vmm_val, vmm_mean); + if (jcp_.normalize_variance) + uni_vmulps(vmm_val, vmm_val, vmm_variance_inv); + + apply_post_ops(jcp_.dst_prc, vmm_val.getIdx(), jcp_.layout == MVNLayoutType::mvn_planar); + + switch (block_num) { + case 8: + store_emitter[TAIL8]->emit_code({static_cast(vmm_val.getIdx())}, {static_cast(reg_dst.getIdx())}, + {store_pool_vec_idxs}, {store_pool_gpr_idxs}); + break; + case 4: + store_emitter[TAIL4]->emit_code({static_cast(vmm_val.getIdx())}, {static_cast(reg_dst.getIdx())}, + {store_pool_vec_idxs}, {store_pool_gpr_idxs}); + break; + case 1: + store_emitter[TAIL1]->emit_code({static_cast(vmm_val.getIdx())}, {static_cast(reg_dst.getIdx())}, + {store_pool_vec_idxs}, {store_pool_gpr_idxs}); + break; + default: + assert(!"MVN layer tails is processed only with 8/4/1 blocks."); + break; } - L(mvn_loop_end_label); } void apply_post_ops(InferenceEngine::Precision dst_prc, size_t vmm_idx, bool is_broadcast) { @@ -1167,6 +1817,25 @@ MVN::MVN(const std::shared_ptr& op, const GraphContext::CPtr conte void MVN::getSupportedDescriptors() {} +static inline bool isUnaryEltwise(const NodePtr& node) { + return one_of(node->getAlgorithm(), Algorithm::EltwiseRelu, + 
Algorithm::EltwiseGeluErf, + Algorithm::EltwiseGeluTanh, + Algorithm::EltwiseElu, + Algorithm::EltwiseSigmoid, + Algorithm::EltwiseClamp, + Algorithm::EltwiseTanh, + Algorithm::EltwiseSwish, + Algorithm::EltwiseHswish, + Algorithm::EltwiseMish, + Algorithm::EltwiseHsigmoid, + Algorithm::EltwiseRoundHalfToEven, + Algorithm::EltwiseRoundHalfAwayFromZero, + Algorithm::EltwiseAbs, + Algorithm::EltwiseSqrt, + Algorithm::EltwiseSoftRelu); +} + void MVN::initSupportedPrimitiveDescriptors() { if (!supportedPrimitiveDescriptors.empty()) return; @@ -1180,6 +1849,15 @@ void MVN::initSupportedPrimitiveDescriptors() { if (!fusedWith.empty()) { outputPrecision = fusedWith[fusedWith.size() - 1]->getOriginalOutputPrecisionAtPort(0); + onlyUnaryPostOps = true; + for (auto &node : fusedWith) { + if (isUnaryEltwise(node)) { + continue; + } else { + onlyUnaryPostOps = false; + break; + } + } } // ref with float planar and no fusion @@ -1288,8 +1966,6 @@ MVN::MVNJitExecutor::MVNJitExecutor(const MVNAttrs& mvnAttrs, jcp.layout = mvnAttrs.layout; jcp.normalize_variance = mvnAttrs.normalizeVariance_; jcp.across_channels = mvnAttrs.execAcrossChannels_; - int N = 0; - std::tie(N, jcp.C, jcp.D, jcp.H, jcp.W) = mvnAttrs.shape5D; #if defined(OPENVINO_ARCH_X86_64) if (mayiuse(cpu::x64::avx512_core)) { mvn_kernel.reset(new jit_uni_mvn_kernel_f32(jcp, *attr.get())); @@ -1327,23 +2003,23 @@ MVN::MVNJitExecutor::MVNJitExecutor(const MVNAttrs& mvnAttrs, mvn_variance_kernel->create_ker(); } -void MVN::MVNJitExecutor::exec(const uint8_t *src_data, uint8_t *dst_data, const void *post_ops_data_) { +void MVN::MVNJitExecutor::exec(const uint8_t *src_data, uint8_t *dst_data, const void *post_ops_data_, const VectorDims& shape5d) { if (!mvn_mean_kernel || (mvnAttrs.normalizeVariance_ && !mvn_variance_kernel) || !mvn_kernel) { IE_THROW() << "MVN layer doesn't create kernel to execute on sse41 above platform."; } if (mvnAttrs.layout == MVNLayoutType::mvn_planar) { - mvn_pln(src_data, dst_data, 
post_ops_data_); + mvn_pln(src_data, dst_data, post_ops_data_, shape5d); } else if (mvnAttrs.layout == MVNLayoutType::mvn_by_channel) { - mvn_nspc(src_data, dst_data, post_ops_data_); + mvn_nspc(src_data, dst_data, post_ops_data_, shape5d); } else { - mvn_blk(src_data, dst_data, post_ops_data_); + mvn_blk(src_data, dst_data, post_ops_data_, shape5d); } } MVN::MVNRefExecutor::MVNRefExecutor(const MVNAttrs& mvnAttrs):MVNExecutorBase(mvnAttrs) {} -void MVN::MVNRefExecutor::exec(const uint8_t *src_data, uint8_t *dst_data, const void *post_ops_data_) { - mvn_ref(src_data, dst_data); +void MVN::MVNRefExecutor::exec(const uint8_t *src_data, uint8_t *dst_data, const void *post_ops_data_, const VectorDims& shape5d) { + mvn_ref(src_data, dst_data, shape5d); } void MVN::prepareParams() { @@ -1356,9 +2032,20 @@ void MVN::prepareParams() { if (getSelectedPrimitiveDescriptor() == nullptr) IE_THROW() << "Preferable primitive descriptor is not set."; - const SizeVector in_dims = srcMemPtr->getStaticDims(); + const VectorDims in_dims = srcMemPtr->getStaticDims(); transformTo5DCase(in_dims); +#if defined(OPENVINO_ARCH_X86_64) + // New shape5D always need prepare via transformTo5DCase(), which is need in exec(). + // MVN itself and unary post ops is totally shape agnostic, execPtr can be reused directly w/o recompilation and setPostOps when shape is changed. + // As key have not shape, if shape changes and new post ops attr is also the same, execPtr can still hit. + // If new shape(channel changes) impact post ops attr, such as entry.quantization.offset, entry.depthwise.offset, entry.quantization.per_channel, + // which is participate in compilation, even postOpsData is passed in runtime, still need recompilation. 
+ if (execPtr != nullptr && (fusedWith.empty() || onlyUnaryPostOps)) { + return; + } +#endif + auto selectedPD = getSelectedPrimitiveDescriptor(); mvnAttrs.src_prc = selectedPD->getConfig().inConfs[0].getMemDesc()->getPrecision(); mvnAttrs.dst_prc = selectedPD->getConfig().outConfs[0].getMemDesc()->getPrecision(); @@ -1403,40 +2090,38 @@ void MVN::prepareParams() { execPtr = result.first; } -void MVN::transformTo5DCase(const SizeVector& shape) { - switch (shape.size()) { - // for 1 and 2 rank, if initAcrossChannels_ is true, adjust shape to fully vectorize under unified 5d procedure. - // otherwise there are not enough data in spatial dimension to process in one kernel. +void MVN::transformTo5DCase(const VectorDims& shape) { + size_t rank = shape.size(); + // for 1 and 2 rank, if initAcrossChannels_ is true, adjust shape to fully vectorize under unified 5d procedure. + // otherwise there are not enough data in spatial dimension to process in one kernel. + switch (rank) { case 1 : // C if (mvnAttrs.initAcrossChannels_) { - mvnAttrs.shape5D = std::make_tuple(1, 1, 1, 1, shape[0]); + shape5D = {1, 1, 1, 1, shape[0]}; mvnAttrs.execAcrossChannels_ = false; break; } else { - mvnAttrs.shape5D = std::make_tuple(1, shape[0], 1, 1, 1); + shape5D = {1, shape[0], 1, 1, 1}; break; } case 2 : // NC if (mvnAttrs.initAcrossChannels_) { - mvnAttrs.shape5D = std::make_tuple(1, shape[0], 1, shape[1], 1); + shape5D = {1, shape[0], 1, shape[1], 1}; mvnAttrs.execAcrossChannels_ = false; break; } else { - mvnAttrs.shape5D = std::make_tuple(shape[0], shape[1], 1, 1, 1); + shape5D = {shape[0], shape[1], 1, 1, 1}; break; } - case 3 : { mvnAttrs.shape5D = std::make_tuple(shape[0], shape[1], 1, shape[2], 1); break; } - case 4 : { mvnAttrs.shape5D = std::make_tuple(shape[0], shape[1], 1, shape[2], shape[3]); break; } - case 5 : { mvnAttrs.shape5D = std::make_tuple(shape[0], shape[1], shape[2], shape[3], shape[4]); break; } + case 3 : { shape5D = {shape[0], shape[1], 1, shape[2], 1}; break; } + 
case 4 : { shape5D = {shape[0], shape[1], 1, shape[2], shape[3]}; break; } + case 5 : { shape5D = {shape[0], shape[1], shape[2], shape[3], shape[4]}; break; } default : { IE_THROW() << "MVN layer with name '" << getName() << "' doesn't support planar layout with rank: " << shape.size(); } } } void MVN::setPostOps(dnnl::primitive_attr &attr, bool initWeights) { dnnl::post_ops ops; - VectorDims postOpDims(5); - std::tie(postOpDims[0], postOpDims[1], postOpDims[2], postOpDims[3], postOpDims[4]) = mvnAttrs.shape5D; - postOpsDataPtrs.clear(); for (auto &node : fusedWith) { auto* fakeQuantizeNode = dynamic_cast(node.get()); @@ -1447,7 +2132,7 @@ void MVN::setPostOps(dnnl::primitive_attr &attr, bool initWeights) { auto* eltwiseNode = dynamic_cast(node.get()); if (eltwiseNode) { - eltwiseNode->appendPostOps(ops, postOpDims, postOpsDataPtrs); + eltwiseNode->appendPostOps(ops, shape5D, postOpsDataPtrs); continue; } IE_THROW() << "Fusing of " << NameFromType(node->getType()) << " operation to " << NameFromType(this->getType()) << " node is not implemented"; @@ -1466,7 +2151,7 @@ void MVN::execute(dnnl::stream strm) { if (execPtr) { uint8_t *dst_data = reinterpret_cast(dstMemPtr->getData()); uint8_t *src_data = reinterpret_cast(srcMemPtr->getData()); - execPtr->exec(src_data, dst_data, postOpsDataPtrs.data()); + execPtr->exec(src_data, dst_data, postOpsDataPtrs.data(), shape5D); } else if (aclExecPtr) { aclExecPtr->exec({srcMemPtr}, {dstMemPtr}, postOpsDataPtrs.data()); } else { @@ -1474,7 +2159,7 @@ void MVN::execute(dnnl::stream strm) { } } -void MVN::MVNJitExecutor::mvn_pln(const uint8_t* src_data, uint8_t* dst_data, const void *post_ops_data_) { +void MVN::MVNJitExecutor::mvn_pln(const uint8_t* src_data, uint8_t* dst_data, const void *post_ops_data_, const VectorDims& shape5d) { size_t blk_size = 1; // blk size in vmm if (mayiuse(cpu::x64::avx512_core)) { blk_size = 16; @@ -1484,16 +2169,16 @@ void MVN::MVNJitExecutor::mvn_pln(const uint8_t* src_data, uint8_t* dst_data, co 
blk_size = 4; } - size_t N = 0; size_t C = 0; size_t D = 0; size_t H = 0; size_t W = 0; - std::tie(N, C, D, H, W) = mvnAttrs.shape5D; + const size_t N = shape5d[0]; + const size_t C = shape5d[1]; + const size_t D = shape5d[2]; + const size_t H = shape5d[3]; + const size_t W = shape5d[4]; size_t C1 = H * W; size_t C2 = C1 * D; size_t C3 = C2 * C; - size_t src_stride_size = static_cast(blk_size * src_data_size); - size_t dst_stride_size = static_cast(blk_size * dst_data_size); - if (mvnAttrs.execAcrossChannels_) { parallel_for(N, [&](int b) { size_t cb = b * C3; @@ -1507,8 +2192,8 @@ void MVN::MVNJitExecutor::mvn_pln(const uint8_t* src_data, uint8_t* dst_data, co auto arg = jit_mvn_call_args(); arg.src = src_data + cc * src_data_size; arg.sum = static_cast(&mean_internal); - arg.src_stride = src_stride_size; arg.work_amount = static_cast(C2 / blk_size); // for vector part + arg.rt_shape_size = static_cast(C2 % blk_size); arg.post_op_data = post_ops_data_; (*mvn_mean_kernel)(&arg); return mean_internal; @@ -1527,8 +2212,8 @@ void MVN::MVNJitExecutor::mvn_pln(const uint8_t* src_data, uint8_t* dst_data, co arg.src = src_data + cc * src_data_size; arg.mean = static_cast(&mean); arg.variance = static_cast(&variance_internal); - arg.src_stride = src_stride_size; arg.work_amount = static_cast(C2 / blk_size); // vector part + arg.rt_shape_size = static_cast(C2 % blk_size); // for tails arg.post_op_data = post_ops_data_; (*mvn_variance_kernel)(&arg); return variance_internal; @@ -1548,9 +2233,8 @@ void MVN::MVNJitExecutor::mvn_pln(const uint8_t* src_data, uint8_t* dst_data, co arg.dst = dst_data + cc * dst_data_size; arg.mean = static_cast(&mean); arg.variance = static_cast(&variance); - arg.src_stride = src_stride_size; - arg.dst_stride = dst_stride_size; arg.work_amount = static_cast(C2 / blk_size); // work amount for vector part + arg.rt_shape_size = static_cast(C2 % blk_size); // for tails arg.oc_off = sizeof(float) * c; arg.post_op_data = post_ops_data_; 
(*mvn_kernel)(&arg); @@ -1563,9 +2247,8 @@ void MVN::MVNJitExecutor::mvn_pln(const uint8_t* src_data, uint8_t* dst_data, co arg.src = src_data + cc * src_data_size; arg.dst = dst_data + cc * dst_data_size; arg.mean = static_cast(&mean); - arg.src_stride = src_stride_size; - arg.dst_stride = dst_stride_size; arg.work_amount = static_cast(C2 / blk_size); + arg.rt_shape_size = static_cast(C2 % blk_size); // for tails arg.oc_off = sizeof(float) * c; arg.post_op_data = post_ops_data_; (*mvn_kernel)(&arg); @@ -1585,9 +2268,8 @@ void MVN::MVNJitExecutor::mvn_pln(const uint8_t* src_data, uint8_t* dst_data, co arg.src = src_data + cc * src_data_size; arg.dst = dst_data + cc * dst_data_size; arg.sum = static_cast(&mean); - arg.src_stride = src_stride_size; - arg.dst_stride = dst_stride_size; arg.work_amount = static_cast(C2 / blk_size); + arg.rt_shape_size = static_cast(C2 % blk_size); arg.oc_off = static_cast(c * sizeof(float)); arg.post_op_data = post_ops_data_; (*mvn_mean_kernel)(&arg); @@ -1617,11 +2299,14 @@ void MVN::MVNJitExecutor::mvn_pln(const uint8_t* src_data, uint8_t* dst_data, co } } -void MVN::MVNRefExecutor::mvn_ref(const uint8_t* src_data, uint8_t* dst_data) { +void MVN::MVNRefExecutor::mvn_ref(const uint8_t* src_data, uint8_t* dst_data, const VectorDims& shape5d) { const float *src_data_ptr = reinterpret_cast(src_data); float *dst_data_ptr = reinterpret_cast(dst_data); - size_t N = 0; size_t C = 0; size_t D = 0; size_t H = 0; size_t W = 0; - std::tie(N, C, D, H, W) = mvnAttrs.shape5D; + const size_t N = shape5d[0]; + const size_t C = shape5d[1]; + const size_t D = shape5d[2]; + const size_t H = shape5d[3]; + const size_t W = shape5d[4]; size_t C1 = H * W; size_t C2 = C1 * D; @@ -1715,7 +2400,7 @@ void MVN::MVNRefExecutor::mvn_ref(const uint8_t* src_data, uint8_t* dst_data) { }); } -void MVN::MVNJitExecutor::mvn_nspc(const uint8_t* src_data, uint8_t* dst_data, const void *post_ops_data_) { +void MVN::MVNJitExecutor::mvn_nspc(const uint8_t* src_data, uint8_t* 
dst_data, const void *post_ops_data_, const VectorDims& shape5d) { size_t blk_size = 1; // channel blk for memory layout if (mayiuse(cpu::x64::avx512_core)) { blk_size = 16; @@ -1725,16 +2410,19 @@ void MVN::MVNJitExecutor::mvn_nspc(const uint8_t* src_data, uint8_t* dst_data, c blk_size = 4; } - size_t N = 1; size_t C = 1; size_t D = 1; size_t H = 1; size_t W = 1; - std::tie(N, C, D, H, W) = mvnAttrs.shape5D; + const size_t N = shape5d[0]; + const size_t C = shape5d[1]; + const size_t D = shape5d[2]; + const size_t H = shape5d[3]; + const size_t W = shape5d[4]; size_t threads_num = parallel_get_num_threads(); - size_t aux_buffer_size = mvnAttrs.execAcrossChannels_ ? 1 : rnd_up(C, blk_size); + size_t aux_buffer_size = mvnAttrs.execAcrossChannels_ ? 1 : rnd_up(C, blk_size) + blk_size; parallel_for(N, [&](size_t b) { - std::vector mean_buffer(aux_buffer_size * threads_num); + std::vector mean_buffer(aux_buffer_size * threads_num, 0.f); std::vector variance_buffer; if (mvnAttrs.normalizeVariance_) { - variance_buffer.resize(aux_buffer_size * threads_num); + variance_buffer.resize(aux_buffer_size * threads_num, 0.f); } size_t b_offset = b * C * D * H * W; @@ -1759,7 +2447,17 @@ void MVN::MVNJitExecutor::mvn_nspc(const uint8_t* src_data, uint8_t* dst_data, c arg.oc_off = 0; arg.post_op_data = post_ops_data_; } - arg.work_amount = (across_channel && kernel_type != 2) ? 
(end - start) * C : (end - start); + if (across_channel) { + if (kernel_type == 2) { + arg.work_amount = end - start; + arg.rt_shape_size = C; + } else { + arg.work_amount = (end - start) * C; + } + } else { + arg.work_amount = (end - start); + arg.rt_shape_size = C; + } if (0 == kernel_type) { (*mvn_mean_kernel)(&arg); @@ -1816,7 +2514,7 @@ void MVN::MVNJitExecutor::mvn_nspc(const uint8_t* src_data, uint8_t* dst_data, c }); } -void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, const void *post_ops_data_) { +void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, const void *post_ops_data_, const VectorDims& shape5d) { size_t blk_size = 1; // channel blk for memory layout if (mayiuse(cpu::x64::avx512_core)) { blk_size = 16; @@ -1824,8 +2522,11 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co blk_size = 8; } - size_t N = 1; size_t C = 1; size_t D = 1; size_t H = 1; size_t W = 1; - std::tie(N, C, D, H, W) = mvnAttrs.shape5D; + const size_t N = shape5d[0]; + const size_t C = shape5d[1]; + const size_t D = shape5d[2]; + const size_t H = shape5d[3]; + const size_t W = shape5d[4]; size_t CB = div_up(C, blk_size); @@ -1837,12 +2538,10 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co size_t threads_num = parallel_get_num_threads(); size_t aux_buffer_size = mvnAttrs.execAcrossChannels_ ? 
blk_size : rnd_up(C, blk_size); + aux_buffer_size += blk_size; std::vector mean_buffer(aux_buffer_size * threads_num); std::vector variance_buffer(aux_buffer_size * threads_num); - size_t src_stride_size = static_cast(blk_size * src_data_size); - size_t dst_stride_size = static_cast(blk_size * dst_data_size); - for (size_t b = 0lu; b < N; b++) { size_t b_offset = b * C3; if (mvnAttrs.execAcrossChannels_) { @@ -1862,15 +2561,16 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co // // | // // \|/ ///////////////////////////////// - auto mean_buffer_ptr = &mean_buffer[blk_size * parallel_get_thread_num()]; + auto mean_buffer_ptr = &mean_buffer[aux_buffer_size * parallel_get_thread_num()]; for (size_t i = 0; i < blk_size; i++) mean_buffer_ptr[i] = 0.f; auto arg = jit_mvn_call_args(); arg.src = src_data + src_offset * src_data_size; arg.sum = mean_buffer_ptr; - arg.src_stride = src_stride_size; arg.work_amount = static_cast(W); + // real tail number or tail is 0(for full vector block). + arg.rt_shape_size = (C - cb * blk_size) < blk_size ? 
static_cast(C % blk_size) : 0; arg.oc_off = static_cast(cb * blk_size * sizeof(float)); // for tail process (*mvn_mean_kernel)(&arg); // for W * blk @@ -1888,7 +2588,7 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co size_t src_offset = b_offset + cb * C2 + d * C1 + h * C0; float variance_internal = 0.0f; - auto variance_buffer_ptr = &variance_buffer[blk_size * parallel_get_thread_num()]; + auto variance_buffer_ptr = &variance_buffer[aux_buffer_size * parallel_get_thread_num()]; for (size_t i = 0; i < blk_size; i++) variance_buffer_ptr[i] = 0.f; @@ -1896,8 +2596,8 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co arg.src = src_data + src_offset * src_data_size; arg.mean = static_cast(&mean); arg.variance = variance_buffer_ptr; - arg.src_stride = src_stride_size; arg.work_amount = static_cast(W); + arg.rt_shape_size = (C - cb * blk_size) < blk_size ? static_cast(C % blk_size) : 0; arg.oc_off = cb * blk_size * sizeof(float); arg.post_op_data = post_ops_data_; (*mvn_variance_kernel)(&arg); @@ -1922,9 +2622,8 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co arg.dst = dst_data + src_offset * dst_data_size; arg.mean = static_cast(&mean); arg.variance = static_cast(&variance); - arg.src_stride = src_stride_size; - arg.dst_stride = dst_stride_size; arg.work_amount = static_cast(W); + arg.rt_shape_size = (C - cb * blk_size) < blk_size ? static_cast(C % blk_size) : 0; arg.oc_off = cb * blk_size * sizeof(float); arg.post_op_data = post_ops_data_; (*mvn_kernel)(&arg); @@ -1937,9 +2636,8 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co arg.src = src_data + src_offset * src_data_size; arg.dst = dst_data + src_offset * dst_data_size; arg.mean = static_cast(&mean); - arg.src_stride = src_stride_size; - arg.dst_stride = dst_stride_size; arg.work_amount = static_cast(W); + arg.rt_shape_size = (C - cb * blk_size) < blk_size ? 
static_cast(C % blk_size) : 0; arg.oc_off = cb * blk_size * sizeof(float); arg.post_op_data = post_ops_data_; (*mvn_kernel)(&arg); @@ -1960,8 +2658,8 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co auto arg = jit_mvn_call_args(); arg.src = src_data + src_offset * src_data_size; arg.sum = mean_buffer_ptr; - arg.src_stride = src_stride_size; arg.work_amount = static_cast(W); + arg.rt_shape_size = (C - cb * blk_size) < blk_size ? static_cast(C % blk_size) : 0; arg.oc_off = cb * blk_size * sizeof(float); arg.post_op_data = post_ops_data_; (*mvn_mean_kernel)(&arg); @@ -1989,8 +2687,8 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co arg.src = src_data + src_offset * src_data_size; arg.mean = mean_buffer_ptr; arg.variance = variance_buffer_ptr; - arg.src_stride = src_stride_size; arg.work_amount = static_cast(W); + arg.rt_shape_size = (C - cb * blk_size) < blk_size ? static_cast(C % blk_size) : 0; arg.oc_off = cb * blk_size * sizeof(float); arg.post_op_data = post_ops_data_; (*mvn_variance_kernel)(&arg); @@ -2018,9 +2716,8 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co arg.dst = dst_data + src_offset * dst_data_size; arg.mean = mean_buffer_ptr; arg.variance = variance_buffer_ptr; - arg.src_stride = src_stride_size; - arg.dst_stride = dst_stride_size; arg.work_amount = static_cast(W); + arg.rt_shape_size = (C - cb * blk_size) < blk_size ? static_cast(C % blk_size) : 0; arg.oc_off = cb * blk_size * sizeof(float); arg.post_op_data = post_ops_data_; (*mvn_kernel)(&arg); @@ -2037,9 +2734,8 @@ void MVN::MVNJitExecutor::mvn_blk(const uint8_t* src_data, uint8_t* dst_data, co arg.src = src_data + src_offset * src_data_size; arg.dst = dst_data + src_offset * dst_data_size; arg.mean = mean_buffer_ptr; - arg.src_stride = src_stride_size; - arg.dst_stride = dst_stride_size; arg.work_amount = static_cast(W); + arg.rt_shape_size = (C - cb * blk_size) < blk_size ? 
static_cast(C % blk_size) : 0; arg.oc_off = cb * blk_size * sizeof(float); arg.post_op_data = post_ops_data_; (*mvn_kernel)(&arg); @@ -2057,22 +2753,7 @@ bool MVN::canFuse(const NodePtr& node) const { // limit post ops to unary when shape transformed on channel // 1D only fused with unary int inputRank = getInputShapeAtPort(0).getRank(); - bool unaryEltwise = one_of(node->getAlgorithm(), Algorithm::EltwiseRelu, - Algorithm::EltwiseGeluErf, - Algorithm::EltwiseGeluTanh, - Algorithm::EltwiseElu, - Algorithm::EltwiseSigmoid, - Algorithm::EltwiseClamp, - Algorithm::EltwiseTanh, - Algorithm::EltwiseSwish, - Algorithm::EltwiseHswish, - Algorithm::EltwiseMish, - Algorithm::EltwiseHsigmoid, - Algorithm::EltwiseRoundHalfToEven, - Algorithm::EltwiseRoundHalfAwayFromZero, - Algorithm::EltwiseAbs, - Algorithm::EltwiseSqrt, - Algorithm::EltwiseSoftRelu); + bool unaryEltwise = isUnaryEltwise(node); if ((inputRank == 1 && !unaryEltwise) || (inputRank == 2 && !unaryEltwise && mvnAttrs.initAcrossChannels_)) { return false; diff --git a/src/plugins/intel_cpu/src/nodes/mvn.h b/src/plugins/intel_cpu/src/nodes/mvn.h index 9d862820cebc0c..20668784c76559 100644 --- a/src/plugins/intel_cpu/src/nodes/mvn.h +++ b/src/plugins/intel_cpu/src/nodes/mvn.h @@ -23,7 +23,6 @@ struct jit_mvn_config_params { InferenceEngine::Precision dst_prc; int src_data_size; int dst_data_size; - int C, D, H, W; }; struct jit_mvn_call_args { @@ -32,10 +31,11 @@ struct jit_mvn_call_args { float *sum; float *mean; float *variance; - size_t src_stride; - size_t dst_stride; size_t work_amount; size_t oc_off; + // shape need for shape agnostic kernel passed with each infer. + // OC for block layout and nspc per channel, tails for ncsp and nspc across channel. 
+ size_t rt_shape_size; const void* post_op_data; }; @@ -101,16 +101,18 @@ class MVN : public Node { private: void setPostOps(dnnl::primitive_attr &attr, bool initWeights = false); - void transformTo5DCase(const InferenceEngine::SizeVector& shape); + void transformTo5DCase(const VectorDims& shape); std::vector postOpsDataPtrs; MVNAttrs mvnAttrs; + VectorDims shape5D = {0, 0, 0, 0, 0}; + bool onlyUnaryPostOps = true; class MVNExecutorBase { public: MVNExecutorBase(const MVNAttrs& mvnAttrs); - virtual void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) = 0; + virtual void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, const VectorDims& shape5d) = 0; virtual ~MVNExecutorBase() = default; protected: @@ -128,12 +130,12 @@ class MVN : public Node { MVNJitExecutor(const MVNAttrs& mvnAttrs, const dnnl::primitive_attr &attr); - void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) override; + void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, const VectorDims& shape5d) override; private: - void mvn_pln(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_); - void mvn_blk(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_); - void mvn_nspc(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_); + void mvn_pln(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, const VectorDims& shape5d); + void mvn_blk(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, const VectorDims& shape5d); + void mvn_nspc(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, const VectorDims& shape5d); std::shared_ptr mvn_mean_kernel; std::shared_ptr mvn_variance_kernel; @@ -144,10 +146,10 @@ class MVN : public Node { public: MVNRefExecutor(const MVNAttrs& mvnAttrs); - void exec(const uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_) override; + void exec(const 
uint8_t *in_ptr_, uint8_t *out_ptr_, const void *post_ops_data_, const VectorDims& shape5d) override; private: - void mvn_ref(const uint8_t *in_ptr_, uint8_t *out_ptr_); + void mvn_ref(const uint8_t *in_ptr_, uint8_t *out_ptr_, const VectorDims& shape5d); }; }; From 4df6ef3a2607bce11bbecf248c343318b5e081ec Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Thu, 20 Jul 2023 17:46:16 +0400 Subject: [PATCH 2/6] Removed ie, ngraph and core_tools cpack components (#18636) * Removed ie, ngraph cpack components * Introduced new components --- .../packaging/common-libraries.cmake | 16 ++-------------- .../packaging/debian/debian.cmake | 13 ++----------- cmake/developer_package/packaging/nsis.cmake | 5 ++--- .../packaging/packaging.cmake | 14 +++++--------- .../developer_package/packaging/rpm/rpm.cmake | 14 ++------------ src/bindings/python/CMakeLists.txt | 2 +- .../src/compatibility/openvino/CMakeLists.txt | 2 +- .../openvino/inference_engine/CMakeLists.txt | 6 +++--- .../src/compatibility/pyngraph/CMakeLists.txt | 10 +++++----- src/bindings/python/wheel/setup.py | 18 ++---------------- tools/benchmark_tool/CMakeLists.txt | 6 +++--- tools/ovc/CMakeLists.txt | 12 ++++++------ 12 files changed, 34 insertions(+), 84 deletions(-) diff --git a/cmake/developer_package/packaging/common-libraries.cmake b/cmake/developer_package/packaging/common-libraries.cmake index 1efab8576c42ed..f00995872335d8 100644 --- a/cmake/developer_package/packaging/common-libraries.cmake +++ b/cmake/developer_package/packaging/common-libraries.cmake @@ -67,17 +67,6 @@ macro(ov_override_component_names) # merge C++ and C runtimes set(OV_CPACK_COMP_CORE_C "${OV_CPACK_COMP_CORE}") set(OV_CPACK_COMP_CORE_C_DEV "${OV_CPACK_COMP_CORE_DEV}") - # merge all pythons into a single component - set(OV_CPACK_COMP_PYTHON_OPENVINO "pyopenvino") - set(OV_CPACK_COMP_PYTHON_IE_API "${OV_CPACK_COMP_PYTHON_OPENVINO}") - set(OV_CPACK_COMP_PYTHON_NGRAPH "${OV_CPACK_COMP_PYTHON_OPENVINO}") - # merge all C / C++ samples as a 
single samples component - set(OV_CPACK_COMP_CPP_SAMPLES "samples") - set(OV_CPACK_COMP_C_SAMPLES "${OV_CPACK_COMP_CPP_SAMPLES}") - # move requirements.txt to core-dev - # set(OV_CPACK_COMP_OPENVINO_DEV_REQ_FILES "${OV_CPACK_COMP_CORE_DEV}") - # move core_tools to core-dev - # set(OV_CPACK_COMP_CORE_TOOLS "${OV_CPACK_COMP_CORE_DEV}") endmacro() ov_override_component_names() @@ -102,15 +91,14 @@ macro(ov_define_component_include_rules) set(OV_CPACK_COMP_PYTHON_SAMPLES_EXCLUDE_ALL EXCLUDE_FROM_ALL) # python set(OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL EXCLUDE_FROM_ALL) - set(OV_CPACK_COMP_PYTHON_IE_API_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) - set(OV_CPACK_COMP_PYTHON_NGRAPH_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) + set(OV_CPACK_COMP_PYTHON_BENCHMARK_APP_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) + set(OV_CPACK_COMP_PYTHON_OVC_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) # we don't pack artifacts of setup.py install, because it's called explicitly in conda / brew # or not used at all like in cases with conan / vcpkg set(OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) # we don't need wheels in package, it's used installed only in open source distribution set(OV_CPACK_COMP_PYTHON_WHEELS_EXCLUDE_ALL EXCLUDE_FROM_ALL) # tools - set(OV_CPACK_COMP_CORE_TOOLS_EXCLUDE_ALL EXCLUDE_FROM_ALL) set(OV_CPACK_COMP_OPENVINO_DEV_REQ_FILES_EXCLUDE_ALL EXCLUDE_FROM_ALL) set(OV_CPACK_COMP_DEPLOYMENT_MANAGER_EXCLUDE_ALL EXCLUDE_FROM_ALL) # scripts diff --git a/cmake/developer_package/packaging/debian/debian.cmake b/cmake/developer_package/packaging/debian/debian.cmake index 31d21a7d8f1ad0..ab317b5eaca9f2 100644 --- a/cmake/developer_package/packaging/debian/debian.cmake +++ b/cmake/developer_package/packaging/debian/debian.cmake @@ -61,17 +61,9 @@ macro(ov_override_component_names) # merge C++ and C runtimes set(OV_CPACK_COMP_CORE_C "${OV_CPACK_COMP_CORE}") 
set(OV_CPACK_COMP_CORE_C_DEV "${OV_CPACK_COMP_CORE_DEV}") - # merge all pythons into a single component - set(OV_CPACK_COMP_PYTHON_OPENVINO "pyopenvino") - set(OV_CPACK_COMP_PYTHON_IE_API "${OV_CPACK_COMP_PYTHON_OPENVINO}") - set(OV_CPACK_COMP_PYTHON_NGRAPH "${OV_CPACK_COMP_PYTHON_OPENVINO}") # merge all C / C++ samples as a single samples component set(OV_CPACK_COMP_CPP_SAMPLES "samples") set(OV_CPACK_COMP_C_SAMPLES "${OV_CPACK_COMP_CPP_SAMPLES}") - # move requirements.txt to core-dev - # set(OV_CPACK_COMP_OPENVINO_DEV_REQ_FILES "${OV_CPACK_COMP_CORE_DEV}") - # move core_tools to core-dev - # set(OV_CPACK_COMP_CORE_TOOLS "${OV_CPACK_COMP_CORE_DEV}") endmacro() ov_override_component_names() @@ -105,14 +97,13 @@ macro(ov_define_component_include_rules) else() set(OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE_EXCLUDE_ALL EXCLUDE_FROM_ALL) endif() + set(OV_CPACK_COMP_PYTHON_BENCHMARK_APP_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE_EXCLUDE_ALL}) + set(OV_CPACK_COMP_PYTHON_OVC_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE_EXCLUDE_ALL}) # we don't pack python components itself, we pack artifacts of setup.py install set(OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL EXCLUDE_FROM_ALL) - set(OV_CPACK_COMP_PYTHON_IE_API_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) - set(OV_CPACK_COMP_PYTHON_NGRAPH_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) # we don't need wheels in Debian packages set(OV_CPACK_COMP_PYTHON_WHEELS_EXCLUDE_ALL EXCLUDE_FROM_ALL) # tools - set(OV_CPACK_COMP_CORE_TOOLS_EXCLUDE_ALL EXCLUDE_FROM_ALL) set(OV_CPACK_COMP_OPENVINO_DEV_REQ_FILES_EXCLUDE_ALL EXCLUDE_FROM_ALL) set(OV_CPACK_COMP_DEPLOYMENT_MANAGER_EXCLUDE_ALL EXCLUDE_FROM_ALL) # scripts diff --git a/cmake/developer_package/packaging/nsis.cmake b/cmake/developer_package/packaging/nsis.cmake index 77239fee00b084..dc2c4be32bc3c4 100644 --- a/cmake/developer_package/packaging/nsis.cmake +++ b/cmake/developer_package/packaging/nsis.cmake @@ -62,13 +62,12 @@ 
macro(ov_define_component_include_rules) unset(OV_CPACK_COMP_C_SAMPLES_EXCLUDE_ALL) unset(OV_CPACK_COMP_PYTHON_SAMPLES_EXCLUDE_ALL) # python - unset(OV_CPACK_COMP_PYTHON_IE_API_EXCLUDE_ALL) - unset(OV_CPACK_COMP_PYTHON_NGRAPH_EXCLUDE_ALL) unset(OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL) + set(OV_CPACK_COMP_PYTHON_BENCHMARK_APP_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) + set(OV_CPACK_COMP_PYTHON_OVC_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) set(OV_CPACK_COMP_PYTHON_WHEELS_EXCLUDE_ALL EXCLUDE_FROM_ALL) set(OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE_EXCLUDE_ALL EXCLUDE_FROM_ALL) # tools - unset(OV_CPACK_COMP_CORE_TOOLS_EXCLUDE_ALL) unset(OV_CPACK_COMP_OPENVINO_DEV_REQ_FILES_EXCLUDE_ALL) unset(OV_CPACK_COMP_DEPLOYMENT_MANAGER_EXCLUDE_ALL) # scripts diff --git a/cmake/developer_package/packaging/packaging.cmake b/cmake/developer_package/packaging/packaging.cmake index b8c961a4f3ffab..28601404c43765 100644 --- a/cmake/developer_package/packaging/packaging.cmake +++ b/cmake/developer_package/packaging/packaging.cmake @@ -154,13 +154,12 @@ macro(ov_define_component_names) set(OV_CPACK_COMP_C_SAMPLES "c_samples") set(OV_CPACK_COMP_PYTHON_SAMPLES "python_samples") # python - set(OV_CPACK_COMP_PYTHON_IE_API "pyie") - set(OV_CPACK_COMP_PYTHON_NGRAPH "pyngraph") set(OV_CPACK_COMP_PYTHON_OPENVINO "pyopenvino") + set(OV_CPACK_COMP_PYTHON_BENCHMARK_APP "benchmark_app") + set(OV_CPACK_COMP_PYTHON_OVC "ovc") set(OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE "pyopenvino_package") set(OV_CPACK_COMP_PYTHON_WHEELS "python_wheels") # tools - set(OV_CPACK_COMP_CORE_TOOLS "core_tools") set(OV_CPACK_COMP_OPENVINO_DEV_REQ_FILES "openvino_dev_req_files") set(OV_CPACK_COMP_DEPLOYMENT_MANAGER "deployment_manager") # scripts @@ -184,15 +183,12 @@ macro(ov_define_component_include_rules) unset(OV_CPACK_COMP_C_SAMPLES_EXCLUDE_ALL) unset(OV_CPACK_COMP_PYTHON_SAMPLES_EXCLUDE_ALL) # python - unset(OV_CPACK_COMP_PYTHON_IE_API_EXCLUDE_ALL) - 
unset(OV_CPACK_COMP_PYTHON_NGRAPH_EXCLUDE_ALL) unset(OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL) - unset(OV_CPACK_COMP_PYTHON_WHEELS_EXCLUDE_ALL) - # TODO: think about python entry points - # maybe we can create entry points without python interpreter and use it in debian / rpm as well? + unset(OV_CPACK_COMP_PYTHON_BENCHMARK_APP_EXCLUDE_ALL) + unset(OV_CPACK_COMP_PYTHON_OVC_EXCLUDE_ALL) set(OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE_EXCLUDE_ALL EXCLUDE_FROM_ALL) + unset(OV_CPACK_COMP_PYTHON_WHEELS_EXCLUDE_ALL) # tools - unset(OV_CPACK_COMP_CORE_TOOLS_EXCLUDE_ALL) set(OV_CPACK_COMP_OPENVINO_DEV_REQ_FILES_EXCLUDE_ALL EXCLUDE_FROM_ALL) unset(OV_CPACK_COMP_DEPLOYMENT_MANAGER_EXCLUDE_ALL) # scripts diff --git a/cmake/developer_package/packaging/rpm/rpm.cmake b/cmake/developer_package/packaging/rpm/rpm.cmake index 4c5f745fc1025e..22d9c17f6445ee 100644 --- a/cmake/developer_package/packaging/rpm/rpm.cmake +++ b/cmake/developer_package/packaging/rpm/rpm.cmake @@ -56,18 +56,9 @@ macro(ov_override_component_names) # merge C++ and C runtimes set(OV_CPACK_COMP_CORE_C "${OV_CPACK_COMP_CORE}") set(OV_CPACK_COMP_CORE_C_DEV "${OV_CPACK_COMP_CORE_DEV}") - # merge all pythons into a single component - set(OV_CPACK_COMP_PYTHON_OPENVINO "pyopenvino") - set(OV_CPACK_COMP_PYTHON_IE_API "${OV_CPACK_COMP_PYTHON_OPENVINO}") - set(OV_CPACK_COMP_PYTHON_NGRAPH "${OV_CPACK_COMP_PYTHON_OPENVINO}") # merge all C / C++ samples as a single samples component set(OV_CPACK_COMP_CPP_SAMPLES "samples") set(OV_CPACK_COMP_C_SAMPLES "${OV_CPACK_COMP_CPP_SAMPLES}") - # set(OV_CPACK_COMP_PYTHON_SAMPLES "${OV_CPACK_COMP_CPP_SAMPLES}") - # move requirements.txt to core-dev - # set(OV_CPACK_COMP_OPENVINO_DEV_REQ_FILES "${OV_CPACK_COMP_CORE_DEV}") - # move core_tools to core-dev - # set(OV_CPACK_COMP_CORE_TOOLS "${OV_CPACK_COMP_CORE_DEV}") endmacro() ov_override_component_names() @@ -101,14 +92,13 @@ macro(ov_define_component_include_rules) else() set(OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE_EXCLUDE_ALL 
EXCLUDE_FROM_ALL) endif() + set(OV_CPACK_COMP_PYTHON_BENCHMARK_APP_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE_EXCLUDE_ALL}) + set(OV_CPACK_COMP_PYTHON_OVC_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_PACKAGE_EXCLUDE_ALL}) # we don't pack python components itself, we pack artifacts of setup.py install set(OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL EXCLUDE_FROM_ALL) - set(OV_CPACK_COMP_PYTHON_IE_API_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) - set(OV_CPACK_COMP_PYTHON_NGRAPH_EXCLUDE_ALL ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) # we don't need wheels in RPM packages set(OV_CPACK_COMP_PYTHON_WHEELS_EXCLUDE_ALL EXCLUDE_FROM_ALL) # tools - set(OV_CPACK_COMP_CORE_TOOLS_EXCLUDE_ALL EXCLUDE_FROM_ALL) set(OV_CPACK_COMP_OPENVINO_DEV_REQ_FILES_EXCLUDE_ALL EXCLUDE_FROM_ALL) set(OV_CPACK_COMP_DEPLOYMENT_MANAGER_EXCLUDE_ALL EXCLUDE_FROM_ALL) # scripts diff --git a/src/bindings/python/CMakeLists.txt b/src/bindings/python/CMakeLists.txt index fd351fdef96324..f75454f393f637 100644 --- a/src/bindings/python/CMakeLists.txt +++ b/src/bindings/python/CMakeLists.txt @@ -341,7 +341,7 @@ if(ENABLE_PYTHON_PACKAGING) add_custom_target(_python_api_package ALL DEPENDS ${meta_info_file}) # install python package, which will be later packed into DEB | RPM - ov_cpack_add_component(${OV_CPACK_COMP_PYTHON_NGRAPH}_package_${pyversion} HIDDEN) + ov_cpack_add_component(${OV_CPACK_COMP_PYTHON_OPENVINO}_package_${pyversion} HIDDEN) install(DIRECTORY ${python_package_prefix}/ DESTINATION ${CMAKE_INSTALL_PREFIX} diff --git a/src/bindings/python/src/compatibility/openvino/CMakeLists.txt b/src/bindings/python/src/compatibility/openvino/CMakeLists.txt index c6da4b6882658d..bdc95bc4cc9ef6 100644 --- a/src/bindings/python/src/compatibility/openvino/CMakeLists.txt +++ b/src/bindings/python/src/compatibility/openvino/CMakeLists.txt @@ -30,7 +30,7 @@ endif() set(pyversion python${PYTHON_VERSION_MAJOR}.${PYTHON_VERSION_MINOR}) -set(PYTHON_COMPONENT 
${OV_CPACK_COMP_PYTHON_IE_API}_${pyversion}) +set(PYTHON_COMPONENT ${OV_CPACK_COMP_PYTHON_OPENVINO}_${pyversion}) if(OV_GENERATOR_MULTI_CONFIG) set(PYTHON_BRIDGE_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/$/python/openvino) else() diff --git a/src/bindings/python/src/compatibility/openvino/inference_engine/CMakeLists.txt b/src/bindings/python/src/compatibility/openvino/inference_engine/CMakeLists.txt index 69a5d8f47667bc..fe628c9c70ab61 100644 --- a/src/bindings/python/src/compatibility/openvino/inference_engine/CMakeLists.txt +++ b/src/bindings/python/src/compatibility/openvino/inference_engine/CMakeLists.txt @@ -84,14 +84,14 @@ add_custom_command(TARGET ${TARGET_NAME} install(TARGETS ${INSTALLED_TARGETS} RUNTIME DESTINATION ${OV_CPACK_PYTHONDIR}/openvino/inference_engine - COMPONENT ${PYTHON_COMPONENT} ${OV_CPACK_COMP_PYTHON_IE_API_EXCLUDE_ALL} + COMPONENT ${PYTHON_COMPONENT} ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL} LIBRARY DESTINATION ${OV_CPACK_PYTHONDIR}/openvino/inference_engine - COMPONENT ${PYTHON_COMPONENT} ${OV_CPACK_COMP_PYTHON_IE_API_EXCLUDE_ALL}) + COMPONENT ${PYTHON_COMPONENT} ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) install(PROGRAMS __init__.py DESTINATION ${OV_CPACK_PYTHONDIR}/openvino/inference_engine COMPONENT ${PYTHON_COMPONENT} - ${OV_CPACK_COMP_PYTHON_IE_API_EXCLUDE_ALL}) + ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) add_clang_format_target(${TARGET_NAME}_clang FOR_TARGETS ${TARGET_NAME} EXCLUDE_PATTERNS ".*\\.cxx;.*\\.pxd;.*\\.pyx") diff --git a/src/bindings/python/src/compatibility/pyngraph/CMakeLists.txt b/src/bindings/python/src/compatibility/pyngraph/CMakeLists.txt index 7fbbc8ca09c23c..ea17ddc9680247 100644 --- a/src/bindings/python/src/compatibility/pyngraph/CMakeLists.txt +++ b/src/bindings/python/src/compatibility/pyngraph/CMakeLists.txt @@ -74,17 +74,17 @@ if(OpenVINO_SOURCE_DIR OR OpenVINODeveloperPackage_FOUND) ie_python_minimal_api(_${PROJECT_NAME}) add_clang_format_target(_${PROJECT_NAME}_clang FOR_TARGETS 
_${PROJECT_NAME}) - ov_cpack_add_component(${OV_CPACK_COMP_PYTHON_NGRAPH}_${pyversion} HIDDEN) + ov_cpack_add_component(${OV_CPACK_COMP_PYTHON_OPENVINO}_${pyversion} HIDDEN) install(TARGETS _${PROJECT_NAME} DESTINATION ${OV_CPACK_PYTHONDIR} - COMPONENT ${OV_CPACK_COMP_PYTHON_NGRAPH}_${pyversion} - ${OV_CPACK_COMP_PYTHON_NGRAPH_EXCLUDE_ALL}) + COMPONENT ${OV_CPACK_COMP_PYTHON_OPENVINO}_${pyversion} + ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL}) install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/../ngraph DESTINATION ${OV_CPACK_PYTHONDIR} - COMPONENT ${OV_CPACK_COMP_PYTHON_NGRAPH}_${pyversion} - ${OV_CPACK_COMP_PYTHON_NGRAPH_EXCLUDE_ALL} + COMPONENT ${OV_CPACK_COMP_PYTHON_OPENVINO}_${pyversion} + ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL} USE_SOURCE_PERMISSIONS) install(DIRECTORY ${OpenVINOPython_SOURCE_DIR}/tests_compatibility diff --git a/src/bindings/python/wheel/setup.py b/src/bindings/python/wheel/setup.py index 28f4afb604a8bb..ee0de58d68842b 100644 --- a/src/bindings/python/wheel/setup.py +++ b/src/bindings/python/wheel/setup.py @@ -157,20 +157,6 @@ } PY_INSTALL_CFG = { - "pyie": { - "name": f"pyie_{PYTHON_VERSION}", - "prefix": f"{BUILD_BASE}/site-packages", - "source_dir": f"{OPENVINO_SOURCE_DIR}/src/bindings/python", - "install_dir": PY_PACKAGES_DIR, - "binary_dir": OPENVINO_PYTHON_BINARY_DIR, - }, - "pyngraph": { - "name": f"pyngraph_{PYTHON_VERSION}", - "prefix": f"{BUILD_BASE}/site-packages", - "source_dir": f"{OPENVINO_SOURCE_DIR}/src/bindings/python", - "install_dir": PY_PACKAGES_DIR, - "binary_dir": OPENVINO_PYTHON_BINARY_DIR, - }, "pyopenvino": { "name": f"pyopenvino_{PYTHON_VERSION}", "prefix": f"{BUILD_BASE}/site-packages", @@ -184,7 +170,7 @@ "ovc = openvino.tools.ovc.main:main", ], }, - "name": f"pyopenvino_{PYTHON_VERSION}", + "name": "ovc", "prefix": f"{BUILD_BASE}/site-packages", "source_dir": f"{OPENVINO_SOURCE_DIR}/tools/ovc", "install_dir": PY_PACKAGES_DIR, @@ -196,7 +182,7 @@ "benchmark_app = openvino.tools.benchmark.main:main", ], }, - "name": 
f"pyopenvino_{PYTHON_VERSION}", + "name": "benchmark_app", "prefix": f"{BUILD_BASE}/site-packages", "source_dir": f"{OPENVINO_SOURCE_DIR}/tools/benchmark_tool", "install_dir": PY_PACKAGES_DIR, diff --git a/tools/benchmark_tool/CMakeLists.txt b/tools/benchmark_tool/CMakeLists.txt index bd258ff79c5b12..b41aca487997f6 100644 --- a/tools/benchmark_tool/CMakeLists.txt +++ b/tools/benchmark_tool/CMakeLists.txt @@ -26,11 +26,11 @@ endif() # ov_get_pyversion(pyversion) -ov_cpack_add_component(${OV_CPACK_COMP_PYTHON_OPENVINO}_${pyversion} +ov_cpack_add_component(${OV_CPACK_COMP_PYTHON_BENCHMARK_APP} HIDDEN) install(DIRECTORY ${OpenVINOBenchmarkTool_SOURCE_DIR}/openvino DESTINATION ${OV_CPACK_PYTHONDIR} - COMPONENT ${OV_CPACK_COMP_PYTHON_OPENVINO}_${pyversion} - ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL} + COMPONENT ${OV_CPACK_COMP_PYTHON_BENCHMARK_APP} + ${OV_CPACK_COMP_PYTHON_BENCHMARK_APP_EXCLUDE_ALL} USE_SOURCE_PERMISSIONS) diff --git a/tools/ovc/CMakeLists.txt b/tools/ovc/CMakeLists.txt index 0d7f396c0ba026..300526a9cc2e38 100644 --- a/tools/ovc/CMakeLists.txt +++ b/tools/ovc/CMakeLists.txt @@ -16,9 +16,9 @@ endif() if(NOT IEDevScripts_FOUND) find_package(IEDevScripts REQUIRED - PATHS "${OpenVINO_SOURCE_DIR}/cmake/developer_package" - NO_CMAKE_FIND_ROOT_PATH - NO_DEFAULT_PATH) + PATHS "${OpenVINO_SOURCE_DIR}/cmake/developer_package" + NO_CMAKE_FIND_ROOT_PATH + NO_DEFAULT_PATH) endif() # @@ -26,11 +26,11 @@ endif() # ov_get_pyversion(pyversion) -ov_cpack_add_component(${OV_CPACK_COMP_PYTHON_OPENVINO}_${pyversion} +ov_cpack_add_component(${OV_CPACK_COMP_PYTHON_OVC} HIDDEN) install(DIRECTORY ${OpenVINOConverter_SOURCE_DIR}/openvino DESTINATION ${OV_CPACK_PYTHONDIR} - COMPONENT ${OV_CPACK_COMP_PYTHON_OPENVINO}_${pyversion} - ${OV_CPACK_COMP_PYTHON_OPENVINO_EXCLUDE_ALL} + COMPONENT ${OV_CPACK_COMP_PYTHON_OVC} + ${OV_CPACK_COMP_PYTHON_OVC_EXCLUDE_ALL} USE_SOURCE_PERMISSIONS) From dd358fc95a93cf330b7fc6b1988ef37af6ca6354 Mon Sep 17 00:00:00 2001 From: bstankix Date: Thu, 20 
Jul 2023 15:48:44 +0200 Subject: [PATCH 3/6] [DOCS] Features update (#18676) * Update newsletter to work with new footer * Update footer * Add carousel in homepage banner * Update banner carousel --- docs/_static/css/custom.css | 41 ++++++------------- docs/_static/css/homepage_style.css | 39 +++++++++++++++++- docs/_static/html/newsletter.html | 2 +- docs/_static/js/custom.js | 25 +++++++++++ docs/_static/js/newsletter.js | 19 ++++++--- docs/_templates/layout.html | 3 ++ docs/home.rst | 16 +++++++- .../templates/footer.html | 3 +- 8 files changed, 109 insertions(+), 39 deletions(-) diff --git a/docs/_static/css/custom.css b/docs/_static/css/custom.css index dfc99f005ec542..be803a07a0e14a 100644 --- a/docs/_static/css/custom.css +++ b/docs/_static/css/custom.css @@ -111,28 +111,6 @@ ul#navbar-main-elements > li:hover { } -/* Footer */ -/* =================================================== */ -footer iframe { - background-color: white; - width: 100%; - border: none; - height: 70px; -} - -@media screen and (min-width: 860px) { - footer iframe { - height: 60px; - } -} - -@media screen and (max-width: 596px) { - footer iframe { - height: 85px; - } -} - - /* Doc version dropdown formatting override */ /* =================================================== */ [aria-labelledby="version-selector"] { @@ -1025,8 +1003,8 @@ table#model-accuracy-and-perf-int8-fp32-table td.data { box-shadow: 0 0 5px 2px #a8a8a8; } -.newsletter-btn, -.newsletter-btn:focus { +.newsletter-submit-btn, +.newsletter-submit-btn:focus { background: #cdedff; color: rgba(0, 104, 181, 1); border: 0; @@ -1043,19 +1021,19 @@ table#model-accuracy-and-perf-int8-fp32-table td.data { outline: none; } -.newsletter-btn:hover, -.newsletter-btn:active { +.newsletter-submit-btn:hover, +.newsletter-submit-btn:active { background: #00A3F6; color: white; outline: none; } -.newsletter-btn:disabled { +.newsletter-submit-btn:disabled { background: #a8a8a8; color: white; } -.newsletter-btn:before { 
+.newsletter-submit-btn:before { font-family: "Font Awesome 5 Free"; content: "\f0e0\00a0"; font-size: 1rem; @@ -1140,3 +1118,10 @@ table#model-accuracy-and-perf-int8-fp32-table td.data { input:-webkit-autofill { -webkit-box-shadow: 0 0 0px 1000px white inset; } + + +/* Splide carousel */ +.splide__slide { + margin-right: 2rem; + overflow: hidden; +} diff --git a/docs/_static/css/homepage_style.css b/docs/_static/css/homepage_style.css index 6b9e81c3f3b305..e76b61374d2fed 100644 --- a/docs/_static/css/homepage_style.css +++ b/docs/_static/css/homepage_style.css @@ -3,6 +3,7 @@ #openvino-documentation > h1 { display: none; } + h1 { /*font-size: var(--pst-font-size-h2);*/ /*margin-bottom: 3rem;*/ @@ -13,12 +14,14 @@ h1 { #ov-homepage-banner, .openvino-diagram, .ov-homepage-higlight-grid { margin-bottom: 90px!important; } + #ov-homepage-banner { padding: 2rem; background-color: #76CEFF; background-image: linear-gradient(346deg, #728EFA 0%, #76CEFF 50%, #BBE8BD 100%); border-bottom: 5px solid #0068b5; } + #ov-homepage-banner p:first-of-type { margin-top: 0; margin-bottom: 1rem; @@ -30,64 +33,92 @@ h1 { line-height: 1em; text-align: left; } + #ov-homepage-banner .line-block { + line-height: 1.5; text-align: left; color: #000000; } + .ov-homepage-banner-btn { transition: 0.7s; font-weight: bold; background-color: #0068b5; color: #ffffff !important; } + .ov-homepage-banner-btn:hover { background-color: white!important; color: var(--sd-color-primary)!important; } + #ov-homepage-banner > p:nth-child(3) { margin-bottom: 0; } + +#ov-homepage-banner a, +#ov-homepage-banner a:visited { + text-decoration: none; + color: #00A3F6; + transition: .7s; + font-weight: 600; +} + +#ov-homepage-banner a:hover { + color: #653171; +} + .openvino-diagram { width: 65%; margin-bottom: 3rem; } + @media (max-width: 720px) { .openvino-diagram { width: 90%; } } + .ov-homepage-higlight-grid { padding: 0; } + .ov-homepage-higlight-grid > div { justify-content:space-evenly; row-gap: 20px; } + 
.ov-homepage-higlight-grid > div > div.sd-col { width: 230px; min-height: 300px; padding: 0; margin-inline: 5px; } + .ov-homepage-higlight-grid .sd-card { box-shadow: 0 0 20px 5px #f3f3f3!important; transition: 0.5s; overflow: hidden; } + .ov-homepage-higlight-grid .sd-card-hover:hover { border-color: var(--sd-color-card-border)!important; transform: scale(1.00)!important; } + .ov-homepage-higlight-grid .sd-shadow-sm:hover { box-shadow: 0 0 10px 2px rgba(108,36,240,0.3) !important; } + .ov-homepage-higlight-grid .sd-card-title { height: 52.781px; margin-bottom: 2rem; } + .ov-homepage-higlight-grid .sd-card-text { font-size: 0.9rem; } + .ov-homepage-higlight-grid .sd-card::after { align-self: flex-end; display: block; @@ -101,19 +132,24 @@ h1 { height: 3rem; background-color: #CDEDFF; } + .ov-homepage-feature-grid .sd-col { padding: 0; max-width: 48%; } + .ov-homepage-feature-grid .sd-card { border: none; box-shadow: 0 0 20px 2px #f3f3f3!important; /* box-shadow: none!important; */ } + .ov-homepage-feature-grid .sd-row { gap: 1rem; justify-content: center; } + + /* =================================================================== */ /* @media screen and (min-width: 720px) { main.col-xl-7.bd-content { @@ -121,6 +157,7 @@ h1 { max-width: 75%!important; } }*/ + @media screen and (max-width: 535px) { .ov-homepage-feature-grid .sd-row { flex-direction: column; @@ -129,4 +166,4 @@ h1 { .ov-homepage-feature-grid .sd-col { max-width: 100%; } -} \ No newline at end of file +} diff --git a/docs/_static/html/newsletter.html b/docs/_static/html/newsletter.html index 48f379fe098c4b..2bfdfde3d14eee 100644 --- a/docs/_static/html/newsletter.html +++ b/docs/_static/html/newsletter.html @@ -264,7 +264,7 @@
- +
diff --git a/docs/_static/js/custom.js b/docs/_static/js/custom.js index 63dde7d186c441..83282ab3d7f96f 100644 --- a/docs/_static/js/custom.js +++ b/docs/_static/js/custom.js @@ -34,6 +34,7 @@ function addLegalNotice() { } $(document).ready(function () { + addFooter(); createVersions(); updateTitleTag(); updateLanguageSelector(); @@ -46,6 +47,7 @@ $(document).ready(function () { initBenchmarkPickers(); // included with the new benchmarks page initCollapsibleHeaders(); // included with the new benchmarks page createSphinxTabSets(); + initSplide(); }); // Determine where we'd go if clicking on a version selector option @@ -253,3 +255,26 @@ function initBenchmarkPickers() { $('#performance-information-frequently-asked-questions section p, #performance-information-frequently-asked-questions section table').hide(); } } + +function addFooter() { + const footerAnchor = $('.footer'); + + fetch('../footer.html').then((response) => response.text()).then((text) => { + const footerContent = $(text); + footerAnchor.append(footerContent); + }); +} + +function initSplide() { + const slides = $('.splide__slide'); + const height = (slides.length > 4) ? 
96 + ((slides.length - 4) * 16) : 96 + var splide = new Splide('.splide', { + direction : 'ttb', + type : 'loop', + height : `${height}px`, + perPage : 1, + autoplay : true, + arrows : false, + }); + splide.mount(); +} diff --git a/docs/_static/js/newsletter.js b/docs/_static/js/newsletter.js index 6cdc31a436238c..1e65a1d1d578a8 100644 --- a/docs/_static/js/newsletter.js +++ b/docs/_static/js/newsletter.js @@ -5,12 +5,21 @@ const eloquaUrl = 'https://s334284386.t.eloqua.com/e/f2' $(document).ready(function () { - // trigger without iframe - // $('#newsletterTrigger').on('click', showForm); + const waitForElement = async selector => { + while (document.querySelector(selector) === null) { + await new Promise(resolve => requestAnimationFrame(resolve)) + } + return document.querySelector(selector); + }; + + waitForElement('#newsletterTrigger').then((trigger) => { + $(trigger).on('click', showForm); + }) - $('iframe').on('load', function() { - $('iframe').contents().find('#newsletterTrigger').on('click', showForm); - }); + // trigger with iframe + // $('iframe').on('load', function() { + // $('iframe').contents().find('#newsletterTrigger').on('click', showForm); + // }); function showForm() { fetch('_static/html/newsletter.html').then((response) => response.text()).then((text) => { diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html index ed3c4c79fcfa4d..6793f261ca5cdd 100644 --- a/docs/_templates/layout.html +++ b/docs/_templates/layout.html @@ -2,6 +2,9 @@ {% block css %} {{ super() }} + + + diff --git a/docs/home.rst b/docs/home.rst index 59eb41cbf55d56..b2ba43180b744d 100644 --- a/docs/home.rst +++ b/docs/home.rst @@ -16,8 +16,20 @@ OpenVINO 2023.0 OpenVINO 2023.0 - | An open-source toolkit for optimizing and deploying deep learning models. - | Boost your AI deep-learning inference performance! + .. raw:: html + +
+
+
+
    +
  • An open-source toolkit for optimizing and deploying deep learning models.
    Boost your AI deep-learning inference performance!
  • +
  • Even more integrations in 2023.0!
    Load TensorFlow, TensorFlow Lite, and PyTorch models directly, without manual conversion.
    See the supported model formats...
  • +
  • CPU inference has become even better. ARM processors are supported and thread scheduling is available on 12th gen Intel® Core and up.
    See how to run OpenVINO on various devices...
  • +
  • Post-training optimization and quantization-aware training now in one tool!
    See the new NNCF capabilities...
  • +
+
+
+
.. button-ref:: get_started :ref-type: doc diff --git a/docs/openvino_sphinx_theme/openvino_sphinx_theme/templates/footer.html b/docs/openvino_sphinx_theme/openvino_sphinx_theme/templates/footer.html index 3b342007179080..02e2ff2b894acd 100644 --- a/docs/openvino_sphinx_theme/openvino_sphinx_theme/templates/footer.html +++ b/docs/openvino_sphinx_theme/openvino_sphinx_theme/templates/footer.html @@ -1,3 +1,2 @@
- -
\ No newline at end of file + From 1911b09a75997d842cdd2036a131583b24a937ec Mon Sep 17 00:00:00 2001 From: bstankix Date: Thu, 20 Jul 2023 16:44:26 +0200 Subject: [PATCH 4/6] [DOCS] Bugfix newsletter validation --- docs/_static/js/newsletter.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/_static/js/newsletter.js b/docs/_static/js/newsletter.js index 1e65a1d1d578a8..54699e65b840d5 100644 --- a/docs/_static/js/newsletter.js +++ b/docs/_static/js/newsletter.js @@ -75,11 +75,11 @@ $(document).ready(function () { const emailPattern = /^[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}$/; if (emailPattern.test(value)) { $('#newsletterEmail').removeClass('failed'); - $('.newsletter-btn').prop('disabled', false); + $('.newsletter-submit-btn').prop('disabled', false); } else { $('#newsletterEmail').addClass('failed'); - $('.newsletter-btn').prop('disabled', true); + $('.newsletter-submit-btn').prop('disabled', true); } } From 2dfb537bcba4554d2d148534875fc4636d21ce38 Mon Sep 17 00:00:00 2001 From: Sun Xiaoxia Date: Thu, 20 Jul 2023 15:34:21 +0000 Subject: [PATCH 5/6] Xiaoxia/add get_socket_id interface based on threading2.0 (#18264) * add streams_info_table init in the constructor of config * add refresh _proc_type_table * add get_org_proc_type_table * add numa_node per stream in reserve_available_cpus() * fix warning * remove log * fix code style * fix gpu test build failed issue, modify debug info * fix code style * fix build failed on macos * fix code style * select socket in reserve cpu on 2 sockets platform * fix build failed on macos * modify numa node selecting in reserve_cpu_by_streams_info * add test case * fix code style * modify test case * fix core dump * fix core dumped on pin=NUMA * fix test failed on macos * fix reserve cpu is wrong when streams_info_table=[1 1 36] proc_type_table=[36 18 0 18] * add test case in LinuxCpuReserve * modify test case * add test case in cpu_reserve_test * add cpu_stream_info_test * modify enum * fix test 
failed * change int to size_t * remove invalid code, fix running failed on macos * modify LinuxCpuStreamType test case, move ie_cpu_streams_info.hpp to openvino/runtime/threading/ * fix code sytle * modify enum name * add comments in test case * fix build issue * change IE_ASSERT to OPENVINO_ASSERT * fix test failed on macos and windows * updated test cases due to the cpu mapping is changed * enable numa_node_id and socket_id in streams_info_table * fix code style issue * fix document issue * add get socket id interface * fix segment fault on machine enabled socket_id=1 with numactl command * fix numactl failed on four numa nodes machine * remove compile warning * fix numa_node_id=-1 * fix test case failed on macos * fix test failed on macos * fix numa_node_id=0 on macos * Solve conflicts with master branch * separate test cases for Linux and Mac/Windows * update code style for windows compiler * fix comments * fix code style * fix code style * remove _plugin_mutex, fix comments * fix code style * fix code style * add get_num_sockets * fix cpu reserve issue in latency mode,ANY core on RPL machine * add cpu reserve test case --------- Co-authored-by: Wanglei Shen --- src/inference/dev_api/ie_system_conf.h | 35 +- .../dev_api/openvino/runtime/system_conf.hpp | 39 +- .../threading/cpu_streams_executor.hpp | 2 + .../cpu_streams_executor_internal.hpp | 74 ++ .../runtime/threading/cpu_streams_info.hpp} | 6 +- .../runtime/threading/istreams_executor.hpp | 29 +- .../threading/ie_cpu_streams_executor.hpp | 2 + .../threading/ie_istreams_executor.hpp | 19 +- .../dev/threading/cpu_streams_executor.cpp | 98 +- .../cpu_streams_executor_internal.cpp | 189 ++++ .../src/dev/threading/istreams_executor.cpp | 27 +- src/inference/src/os/cpu_map_info.hpp | 3 +- src/inference/src/os/lin/lin_system_conf.cpp | 49 +- src/inference/src/os/mac/mac_system_conf.cpp | 2 +- src/inference/src/os/win/win_system_conf.cpp | 2 +- src/inference/src/system_conf.cpp | 153 ++-- 
.../src/threading/ie_cpu_streams_executor.cpp | 4 + .../src/threading/ie_executor_manager.cpp | 4 + .../src/threading/ie_istreams_executor.cpp | 4 + src/inference/tests/unit/cpu_reserve_test.cpp | 842 ++++++++++++++++++ .../tests/unit/cpu_stream_info_test.cpp | 535 +++++++++++ .../intel_cpu/src/cpu_map_scheduling.cpp | 8 +- .../intel_cpu/src/cpu_map_scheduling.hpp | 3 + .../intel_cpu/src/cpu_streams_calculation.cpp | 84 +- .../intel_cpu/src/cpu_streams_calculation.hpp | 24 +- src/plugins/intel_cpu/src/plugin.cpp | 4 +- .../unit/streams_info/streams_e2e_test.cpp | 164 +++- 27 files changed, 2170 insertions(+), 235 deletions(-) create mode 100644 src/inference/dev_api/openvino/runtime/threading/cpu_streams_executor_internal.hpp rename src/inference/dev_api/{threading/ie_cpu_streams_info.hpp => openvino/runtime/threading/cpu_streams_info.hpp} (95%) create mode 100644 src/inference/src/dev/threading/cpu_streams_executor_internal.cpp create mode 100644 src/inference/tests/unit/cpu_reserve_test.cpp create mode 100644 src/inference/tests/unit/cpu_stream_info_test.cpp diff --git a/src/inference/dev_api/ie_system_conf.h b/src/inference/dev_api/ie_system_conf.h index 70196645715d86..7d17a69753f290 100644 --- a/src/inference/dev_api/ie_system_conf.h +++ b/src/inference/dev_api/ie_system_conf.h @@ -13,6 +13,7 @@ #include #include "openvino/runtime/system_conf.hpp" +#include "openvino/runtime/threading/cpu_streams_info.hpp" namespace InferenceEngine { @@ -164,12 +165,11 @@ using ov::is_cpu_map_available; using ov::get_num_numa_nodes; /** - * @brief Set flag bit 'Used' of CPU + * @brief Get number of sockets * @ingroup ie_dev_api_system_conf - * @param[in] cpu_ids cpus in cup_mapping. 
- * @param[in] used flag bit + * @return Number of sockets */ -using ov::set_cpu_used; +using ov::get_num_sockets; /** * @brief Returns number of CPU cores on Linux/Windows @@ -179,14 +179,39 @@ using ov::set_cpu_used; */ using ov::get_proc_type_table; +/** + * @brief Returns original number of CPU cores on Linux/Windows + * @ingroup ie_dev_api_system_conf + * @param[in] plugin_task plugin task. + * @return Number of original CPU cores with core_type. + */ +using ov::get_org_proc_type_table; + /** * @brief Get and reserve available cpu ids * @ingroup ie_dev_api_system_conf * @param[in] streams_info_table streams information table. - * @return Array of available cpu ids. + * @param[in] stream_processors processors grouped in stream + * @param[in] cpu_status set cpu status */ using ov::reserve_available_cpus; +/** + * @brief Set flag bit 'Used' of CPU + * @ingroup ie_dev_api_system_conf + * @param[in] cpu_ids cpus in cup_mapping. + * @param[in] used flag bit + */ +using ov::set_cpu_used; + +/** + * @brief Get socket id by current numa node id + * @ingroup ie_dev_api_system_conf + * @param[in] numa_node_id numa node id + * @return socket id + */ +using ov::get_socket_by_numa_node; + /** * @brief This enum contains definition of each columns in processor type table which bases on cpu core types. Will * extend to support other CPU core type like ARM. 
diff --git a/src/inference/dev_api/openvino/runtime/system_conf.hpp b/src/inference/dev_api/openvino/runtime/system_conf.hpp index a72b5f4bcb150c..2a53c4cac88dfb 100644 --- a/src/inference/dev_api/openvino/runtime/system_conf.hpp +++ b/src/inference/dev_api/openvino/runtime/system_conf.hpp @@ -152,6 +152,13 @@ OPENVINO_RUNTIME_API bool is_cpu_map_available(); */ OPENVINO_RUNTIME_API int get_num_numa_nodes(); +/** + * @brief Get number of sockets + * @ingroup ie_dev_api_system_conf + * @return Number of sockets + */ +OPENVINO_RUNTIME_API int get_num_sockets(); + /** * @brief Returns a table of number of processor types on Linux/Windows * @ingroup ie_dev_api_system_conf @@ -169,6 +176,16 @@ OPENVINO_RUNTIME_API int get_num_numa_nodes(); */ OPENVINO_RUNTIME_API std::vector> get_proc_type_table(); +/** + * @brief Returns a table of original number of processor types without filtering other plugins occupying CPU + * resources. The difference from get_proc_type_table: This is used to get the configuration of current machine. For + * example, GPU plugin occupies all Pcores, there is only one type core in proc_type_table from get_proc_type_table(). + * If user wants to get the real configuration of this machine which should be got from get_org_proc_type_table. + * @ingroup ie_dev_api_system_conf + * @return A table about number of CPU cores of different types defined with ColumnOfProcessorTypeTable + */ +OPENVINO_RUNTIME_API std::vector> get_org_proc_type_table(); + /** * @enum ColumnOfProcessorTypeTable * @brief This enum contains definition of each columns in processor type table which bases on cpu core types. Will @@ -204,20 +221,20 @@ enum ColumnOfProcessorTypeTable { * @brief Definition of CPU_MAP_USED_FLAG column in CPU mapping table. */ enum ProcessorUseStatus { - NOT_USED = -1, //!< Processor is not bound to thread - CPU_USED = 1, //!< CPU is in using - PLUGIN_USED_START = 100 //!< Plugin other than CPU needs to use. 
If more GPUs use CPUs, the CPU_MAP_USED_FLAG is - //!< accumulated from PLUGIN_USED_START. For example: GPU.0:100, GPU.1:101 + NOT_USED = -1, //!< Processor is not bound to thread + CPU_USED = 1, //!< CPU is in using }; /** * @brief Get and reserve available cpu ids * @ingroup ie_dev_api_system_conf * @param[in] streams_info_table streams information table. - * @return Array of available cpu ids. + * @param[in] stream_processors processors grouped in stream which is used in core binding in cpu streams executor + * @param[in] cpu_status set cpu status */ -OPENVINO_RUNTIME_API std::vector> reserve_available_cpus( - const std::vector> streams_info_table); +OPENVINO_RUNTIME_API void reserve_available_cpus(const std::vector> streams_info_table, + std::vector>& stream_processors, + const int cpu_status = NOT_USED); /** * @brief Set CPU_MAP_USED_FLAG of cpu_mapping @@ -227,6 +244,14 @@ OPENVINO_RUNTIME_API std::vector> reserve_available_cpus( */ OPENVINO_RUNTIME_API void set_cpu_used(const std::vector& cpu_ids, const int used); +/** + * @brief Get socket id by current numa node id + * @ingroup ie_dev_api_system_conf + * @param[in] numa_node_id numa node id + * @return socket id + */ +OPENVINO_RUNTIME_API int get_socket_by_numa_node(int numa_node_id); + /** * @enum ColumnOfCPUMappingTable * @brief This enum contains definition of each columns in CPU mapping table which use processor id as index. 
diff --git a/src/inference/dev_api/openvino/runtime/threading/cpu_streams_executor.hpp b/src/inference/dev_api/openvino/runtime/threading/cpu_streams_executor.hpp index a59986665a6524..0faf37fc6cfb8b 100644 --- a/src/inference/dev_api/openvino/runtime/threading/cpu_streams_executor.hpp +++ b/src/inference/dev_api/openvino/runtime/threading/cpu_streams_executor.hpp @@ -46,6 +46,8 @@ class OPENVINO_RUNTIME_API CPUStreamsExecutor : public IStreamsExecutor { int get_numa_node_id() override; + int get_socket_id() override; + private: struct Impl; std::unique_ptr _impl; diff --git a/src/inference/dev_api/openvino/runtime/threading/cpu_streams_executor_internal.hpp b/src/inference/dev_api/openvino/runtime/threading/cpu_streams_executor_internal.hpp new file mode 100644 index 00000000000000..e03388d34d5705 --- /dev/null +++ b/src/inference/dev_api/openvino/runtime/threading/cpu_streams_executor_internal.hpp @@ -0,0 +1,74 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +/** + * @file openvino/runtime/threading/cpu_streams_executor_internal.hpp + * @brief A header file for OpenVINO Streams-based Executor Interface + */ + +#pragma once + +#include +#include + +namespace ov { +namespace threading { + +enum StreamCreateType { + STREAM_WITHOUT_PARAM = 0, // new task_arena with no parameters, no threads binding + STREAM_WITH_CORE_TYPE, // new task_arena with core type, threads binding with core type + STREAM_WITH_NUMA_ID, // new task_arena with numa node id, threads binding with numa node id + STREAM_WITH_OBSERVE // new task_arena with no parameters, threads binding with observe +}; + +/** + * @brief Get current stream information + * @param[in] stream_id stream id + * @param[in] cpu_reservation cpu reservation + * @param[in] org_proc_type_table available processors in the platform + * @param[in] streams_info_table streams information table + * @param[out] stream_type stream create type + * @param[out] concurrency the number of 
threads created at the same time + * @param[out] core_type core type + * @param[out] numa_node_id numa node id + */ +void get_cur_stream_info(const int stream_id, + const bool cpu_reservation, + const std::vector> org_proc_type_table, + const std::vector> streams_info_table, + StreamCreateType& stream_type, + int& concurrency, + int& core_type, + int& numa_node_id); + +/** + * @brief Reserve cpu resource by streams info + * @param[in] _streams_info_table streams info table + * @param[in] _numa_nodes number of numa nodes + * @param[out] _cpu_mapping_table CPU mapping table for each processor + * @param[out] _proc_type_table summary table of number of processors per type + * @param[out] _stream_processors processors grouped in stream which is used in core binding in cpu streams executor + * @param[in] _cpu_status set cpu status + * @return + */ +void reserve_cpu_by_streams_info(const std::vector> _streams_info_table, + const int _numa_nodes, + std::vector>& _cpu_mapping_table, + std::vector>& _proc_type_table, + std::vector>& _stream_processors, + const int _cpu_status); + +/** + * @brief Update proc_type_table + * @param[in] _cpu_mapping_table CPU mapping table for each processor + * @param[in] _numa_nodes total number for nodes in system + * @param[out] _proc_type_table summary table of number of processors per type + * @return + */ +void update_proc_type_table(const std::vector> _cpu_mapping_table, + const int _numa_nodes, + std::vector>& _proc_type_table); + +} // namespace threading +} // namespace ov \ No newline at end of file diff --git a/src/inference/dev_api/threading/ie_cpu_streams_info.hpp b/src/inference/dev_api/openvino/runtime/threading/cpu_streams_info.hpp similarity index 95% rename from src/inference/dev_api/threading/ie_cpu_streams_info.hpp rename to src/inference/dev_api/openvino/runtime/threading/cpu_streams_info.hpp index 587669456759d2..f49f2a2bad51dc 100644 --- a/src/inference/dev_api/threading/ie_cpu_streams_info.hpp +++ 
b/src/inference/dev_api/openvino/runtime/threading/cpu_streams_info.hpp @@ -3,13 +3,13 @@ // /** - * @file ie_cpu_streams_info.hpp + * @file cpu_streams_info.hpp * @brief A header file for Inference Engine CPU streams info table implementation. */ #pragma once -namespace InferenceEngine { +namespace ov { /** * @enum ColumnOfCpuStreamsInfoTable @@ -45,4 +45,4 @@ enum ColumnOfCpuStreamsInfoTable { CPU_STREAMS_TABLE_SIZE = 5 //!< Size of streams info table }; -} // namespace InferenceEngine \ No newline at end of file +} // namespace ov diff --git a/src/inference/dev_api/openvino/runtime/threading/istreams_executor.hpp b/src/inference/dev_api/openvino/runtime/threading/istreams_executor.hpp index 249d8378dfaa97..4167da60da00de 100644 --- a/src/inference/dev_api/openvino/runtime/threading/istreams_executor.hpp +++ b/src/inference/dev_api/openvino/runtime/threading/istreams_executor.hpp @@ -83,6 +83,7 @@ class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor { const bool enable_hyper_thread = true); // no network specifics considered (only CPU's caps); static int get_hybrid_num_streams(std::map& config, const int stream_mode); static void update_hybrid_custom_threads(Config& config); + static Config reserve_cpu_threads(const Config& initial); std::string _name; //!< Used by `ITT` to name executor threads int _streams = 1; //!< Number of streams. 
@@ -102,13 +103,6 @@ class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor { int _small_core_offset = 0; //!< Calculate small core start offset when binding cpu cores bool _enable_hyper_thread = true; //!< enable hyper thread int _plugin_task = NOT_USED; - std::vector> _orig_proc_type_table; - std::vector> _proc_type_table; - std::vector> _streams_info_table; - std::vector> _stream_core_ids; - std::vector _stream_ids; - bool _cpu_pinning = false; - bool _streams_changed = false; enum StreamMode { DEFAULT, AGGRESSIVE, LESSAGGRESSIVE }; enum PreferredCoreType { ANY, @@ -119,6 +113,11 @@ class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor { } _threadPreferredCoreType = PreferredCoreType::ANY; //!< In case of @ref HYBRID_AWARE hints the TBB to affinitize + std::vector> _streams_info_table = {}; + std::vector> _stream_processor_ids; + bool _cpu_reservation = false; + bool _streams_changed = false; + /** * @brief A constructor with arguments * @@ -138,7 +137,9 @@ class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor { int threadBindingStep = 1, int threadBindingOffset = 0, int threads = 0, - PreferredCoreType threadPreferredCoreType = PreferredCoreType::ANY) + PreferredCoreType threadPreferredCoreType = PreferredCoreType::ANY, + std::vector> streamsInfoTable = {}, + bool cpuReservation = false) : _name{name}, _streams{streams}, _threadsPerStream{threadsPerStream}, @@ -146,7 +147,9 @@ class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor { _threadBindingStep{threadBindingStep}, _threadBindingOffset{threadBindingOffset}, _threads{threads}, - _threadPreferredCoreType(threadPreferredCoreType) {} + _threadPreferredCoreType(threadPreferredCoreType), + _streams_info_table{streamsInfoTable}, + _cpu_reservation{cpuReservation} {} }; /** @@ -162,10 +165,18 @@ class OPENVINO_RUNTIME_API IStreamsExecutor : virtual public ITaskExecutor { /** * @brief Return the id of current NUMA Node + * Return 0 
when current stream cross some NUMA Nodes * @return `ID` of current NUMA Node, or throws exceptions if called not from stream thread */ virtual int get_numa_node_id() = 0; + /** + * @brief Return the id of current socket + * Return 0 when current stream cross some sockets + * @return `ID` of current socket, or throws exceptions if called not from stream thread + */ + virtual int get_socket_id() = 0; + /** * @brief Execute the task in the current thread using streams executor configuration and constraints * @param task A task to start diff --git a/src/inference/dev_api/threading/ie_cpu_streams_executor.hpp b/src/inference/dev_api/threading/ie_cpu_streams_executor.hpp index 312963fd45a8bd..b86145c70a2c49 100644 --- a/src/inference/dev_api/threading/ie_cpu_streams_executor.hpp +++ b/src/inference/dev_api/threading/ie_cpu_streams_executor.hpp @@ -48,6 +48,8 @@ class INFERENCE_ENGINE_API_CLASS(CPUStreamsExecutor) : public IStreamsExecutor { int GetNumaNodeId() override; + int GetSocketId() override; + private: struct Impl; std::unique_ptr _impl; diff --git a/src/inference/dev_api/threading/ie_istreams_executor.hpp b/src/inference/dev_api/threading/ie_istreams_executor.hpp index bb2bbeca0b70d2..55593583960148 100644 --- a/src/inference/dev_api/threading/ie_istreams_executor.hpp +++ b/src/inference/dev_api/threading/ie_istreams_executor.hpp @@ -74,6 +74,7 @@ class INFERENCE_ENGINE_API_CLASS(IStreamsExecutor) : public ITaskExecutor, publi const bool enable_hyper_thread = true); // no network specifics considered (only CPU's caps); static int GetHybridNumStreams(std::map& config, const int stream_mode); static void UpdateHybridCustomThreads(Config& config); + static Config ReserveCpuThreads(const Config& initial); /** * @brief A constructor with arguments @@ -94,7 +95,9 @@ class INFERENCE_ENGINE_API_CLASS(IStreamsExecutor) : public ITaskExecutor, publi int threadBindingStep = 1, int threadBindingOffset = 0, int threads = 0, - PreferredCoreType threadPreferredCoreType = 
PreferredCoreType::ANY) + PreferredCoreType threadPreferredCoreType = PreferredCoreType::ANY, + std::vector> streamsInfoTable = {}, + bool cpuReservation = false) : ov::threading::IStreamsExecutor::Config(name, streams, threadsPerStream, @@ -102,7 +105,9 @@ class INFERENCE_ENGINE_API_CLASS(IStreamsExecutor) : public ITaskExecutor, publi threadBindingStep, threadBindingOffset, threads, - threadPreferredCoreType) {} + threadPreferredCoreType, + streamsInfoTable, + cpuReservation) {} Config(const ov::threading::IStreamsExecutor::Config& config) : ov::threading::IStreamsExecutor::Config(config) {} @@ -125,6 +130,12 @@ class INFERENCE_ENGINE_API_CLASS(IStreamsExecutor) : public ITaskExecutor, publi */ virtual int GetNumaNodeId() = 0; + /** + * @brief Return the id of current socket + * @return `ID` of current socket, or throws exceptions if called not from stream thread + */ + virtual int GetSocketId() = 0; + /** * @brief Execute the task in the current thread using streams executor configuration and constraints * @param task A task to start @@ -139,6 +150,10 @@ class INFERENCE_ENGINE_API_CLASS(IStreamsExecutor) : public ITaskExecutor, publi return GetNumaNodeId(); } + int get_socket_id() override { + return GetSocketId(); + } + void execute(Task task) override { Execute(task); } diff --git a/src/inference/src/dev/threading/cpu_streams_executor.cpp b/src/inference/src/dev/threading/cpu_streams_executor.cpp index 1f689be5623468..0b22e913628ba7 100644 --- a/src/inference/src/dev/threading/cpu_streams_executor.cpp +++ b/src/inference/src/dev/threading/cpu_streams_executor.cpp @@ -15,11 +15,9 @@ #include "dev/threading/thread_affinity.hpp" #include "openvino/itt.hpp" #include "openvino/runtime/system_conf.hpp" +#include "openvino/runtime/threading/cpu_streams_executor_internal.hpp" #include "openvino/runtime/threading/executor_manager.hpp" #include "openvino/runtime/threading/thread_local.hpp" -#include "threading/ie_cpu_streams_info.hpp" - -using namespace InferenceEngine; 
namespace ov { namespace threading { @@ -120,6 +118,9 @@ struct CPUStreamsExecutor::Impl { _impl->_streamIdQueue.push(_streamId); } #if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO + if (_impl->_config._name.find("StreamsExecutor") == std::string::npos) { + set_cpu_used(_cpu_ids, NOT_USED); + } if (nullptr != _observer) { _observer->observe(false); } @@ -127,51 +128,29 @@ struct CPUStreamsExecutor::Impl { } #if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO - void init_stream() { - std::lock_guard lock{_impl->_cpumap_mutex}; - const auto stream_id = _streamId >= _impl->_config._streams ? _impl->_config._streams - 1 : _streamId; - const auto concurrency = - (_impl->_config._streams_info_table.size() > 0 && _impl->_config._stream_ids.size() > 0) - ? _impl->_config._streams_info_table[_impl->_config._stream_ids[stream_id]][THREADS_PER_STREAM] - : 0; - const auto cpu_core_type = - (_impl->_config._streams_info_table.size() > 0 && _impl->_config._stream_ids.size() > 0) - ? static_cast( - _impl->_config._streams_info_table[_impl->_config._stream_ids[stream_id]][PROC_TYPE]) - : static_cast(0); - if (concurrency <= 0) { - return; - } - if (_impl->_config._orig_proc_type_table[0][EFFICIENT_CORE_PROC] > 0) { - const auto selected_core_type = - (cpu_core_type == MAIN_CORE_PROC || cpu_core_type == HYPER_THREADING_PROC) - ? 
custom::info::core_types().back() - : custom::info::core_types().front(); - if (_impl->_config._cpu_pinning) { -# if defined(_WIN32) || defined(__APPLE__) - _taskArena.reset(new custom::task_arena{custom::task_arena::constraints{} - .set_core_type(selected_core_type) - .set_max_concurrency(concurrency)}); -# else - _taskArena.reset(new custom::task_arena{concurrency}); -# endif - } else { - if (cpu_core_type == ALL_PROC) { - _taskArena.reset(new custom::task_arena{concurrency}); - } else { - _taskArena.reset(new custom::task_arena{custom::task_arena::constraints{} - .set_core_type(selected_core_type) - .set_max_concurrency(concurrency)}); - } - } - } else if (_impl->_config._proc_type_table.size() > 1 && !_impl->_config._cpu_pinning) { + void create_tbb_task_arena(const int stream_id, + const StreamCreateType stream_type, + const int concurrency, + const int core_type, + const int numa_node_id) { + _numaNodeId = (_impl->_usedNumaNodes.size() == 1 && _impl->_usedNumaNodes.at(0) == -1) + ? -1 // macOS + : std::max(0, numa_node_id); + _socketId = get_socket_by_numa_node(_numaNodeId); + if (stream_type == STREAM_WITHOUT_PARAM) { + _taskArena.reset(new custom::task_arena{concurrency}); + } else if (stream_type == STREAM_WITH_NUMA_ID) { _taskArena.reset(new custom::task_arena{custom::task_arena::constraints{_numaNodeId, concurrency}}); + } else if (stream_type == STREAM_WITH_CORE_TYPE) { + const auto real_core_type = (core_type == MAIN_CORE_PROC || core_type == HYPER_THREADING_PROC) + ? custom::info::core_types().back() + : custom::info::core_types().front(); + _taskArena.reset(new custom::task_arena{ + custom::task_arena::constraints{}.set_core_type(real_core_type).set_max_concurrency(concurrency)}); } else { _taskArena.reset(new custom::task_arena{concurrency}); - } - if (_impl->_config._cpu_pinning) { - _cpu_ids = static_cast(_impl->_config._stream_core_ids.size()) == _impl->_config._streams - ? 
_impl->_config._stream_core_ids[stream_id] + _cpu_ids = static_cast(_impl->_config._stream_processor_ids.size()) == _impl->_config._streams + ? _impl->_config._stream_processor_ids[stream_id] : _cpu_ids; if (_cpu_ids.size() > 0) { CpuSet processMask; @@ -192,6 +171,27 @@ struct CPUStreamsExecutor::Impl { } } } + void init_stream() { + int concurrency; + int cpu_core_type; + int numa_node_id; + StreamCreateType stream_type; + const auto org_proc_type_table = get_org_proc_type_table(); + const auto stream_id = _streamId >= _impl->_config._streams ? _impl->_config._streams - 1 : _streamId; + + get_cur_stream_info(stream_id, + _impl->_config._cpu_reservation, + org_proc_type_table, + _impl->_config._streams_info_table, + stream_type, + concurrency, + cpu_core_type, + numa_node_id); + if (concurrency <= 0) { + return; + } + create_tbb_task_arena(stream_id, stream_type, concurrency, cpu_core_type, numa_node_id); + } void init_stream_legacy() { const auto concurrency = (0 == _impl->_config._threadsPerStream) ? custom::task_arena::automatic @@ -303,6 +303,7 @@ struct CPUStreamsExecutor::Impl { Impl* _impl = nullptr; int _streamId = 0; int _numaNodeId = 0; + int _socketId = 0; bool _execute = false; std::queue _taskQueue; #if OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO @@ -326,7 +327,6 @@ struct CPUStreamsExecutor::Impl { } else { _usedNumaNodes = numaNodes; } - _config._streams = _config._streams == 0 ? 
1 : _config._streams; #if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) if (!is_cpu_map_available() && ThreadBindingType::HYBRID_AWARE == config._threadBindingType) { const auto core_types = custom::info::core_types(); @@ -423,7 +423,6 @@ struct CPUStreamsExecutor::Impl { std::queue _streamIdQueue; std::vector _threads; std::mutex _mutex; - std::mutex _cpumap_mutex; std::condition_variable _queueCondVar; std::queue _taskQueue; bool _isStopped = false; @@ -451,6 +450,11 @@ int CPUStreamsExecutor::get_numa_node_id() { return stream->_numaNodeId; } +int CPUStreamsExecutor::get_socket_id() { + auto stream = _impl->_streams.local(); + return stream->_socketId; +} + CPUStreamsExecutor::CPUStreamsExecutor(const IStreamsExecutor::Config& config) : _impl{new Impl{config}} {} CPUStreamsExecutor::~CPUStreamsExecutor() { diff --git a/src/inference/src/dev/threading/cpu_streams_executor_internal.cpp b/src/inference/src/dev/threading/cpu_streams_executor_internal.cpp new file mode 100644 index 00000000000000..44e2df4c53fccc --- /dev/null +++ b/src/inference/src/dev/threading/cpu_streams_executor_internal.cpp @@ -0,0 +1,189 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/runtime/threading/cpu_streams_executor_internal.hpp" + +#include +#include + +#include "openvino/runtime/system_conf.hpp" +#include "openvino/runtime/threading/cpu_streams_info.hpp" + +namespace ov { +namespace threading { + +void get_cur_stream_info(const int stream_id, + const bool cpu_reservation, + const std::vector> proc_type_table, + const std::vector> streams_info_table, + StreamCreateType& stream_type, + int& concurrency, + int& core_type, + int& numa_node_id) { + int stream_total = 0; + size_t stream_info_id = 0; + bool cpu_reserve = cpu_reservation; + for (size_t i = 0; i < streams_info_table.size(); i++) { + stream_total += streams_info_table[i][NUMBER_OF_STREAMS]; + if (stream_id < stream_total) { + stream_info_id = 
i; + break; + } + } + concurrency = streams_info_table[stream_info_id][THREADS_PER_STREAM]; + core_type = streams_info_table[stream_info_id][PROC_TYPE]; + numa_node_id = streams_info_table[stream_info_id][STREAM_NUMA_NODE_ID]; + +#if defined(_WIN32) || defined(__APPLE__) + cpu_reserve = false; +#endif + if (cpu_reserve) { + stream_type = STREAM_WITH_OBSERVE; + } else { + stream_type = STREAM_WITHOUT_PARAM; + if (proc_type_table[0][EFFICIENT_CORE_PROC] > 0 && core_type != ALL_PROC) { + stream_type = STREAM_WITH_CORE_TYPE; + } else if (proc_type_table.size() > 1 && numa_node_id >= 0) { + stream_type = STREAM_WITH_NUMA_ID; + } + } +} + +void reserve_cpu_by_streams_info(const std::vector> _streams_info_table, + const int _numa_nodes, + std::vector>& _cpu_mapping_table, + std::vector>& _proc_type_table, + std::vector>& _stream_processors, + const int _cpu_status) { + std::vector> streams_info_table = _streams_info_table; + std::vector> stream_conditions; + std::vector stream_pos; + std::vector stream_num; + int num_streams = 0; + + stream_pos.assign(_streams_info_table.size(), 0); + stream_num.assign(_streams_info_table.size(), 0); + for (size_t i = 0; i < _streams_info_table.size(); i++) { + stream_pos[i] = num_streams; + num_streams += _streams_info_table[i][NUMBER_OF_STREAMS]; + } + _stream_processors.assign(num_streams, std::vector()); + stream_conditions.assign(_streams_info_table.size(), std::vector()); + for (size_t i = 0; i < _streams_info_table.size(); i++) { + std::vector proc_types; + std::vector numa_nodes; + std::vector sockets; + if (_streams_info_table[i][PROC_TYPE] > ALL_PROC && _streams_info_table[i][NUMBER_OF_STREAMS] > 0) { + proc_types.push_back(std::to_string(_streams_info_table[i][PROC_TYPE])); + } + if (num_streams == 1 && _streams_info_table[0][PROC_TYPE] == MAIN_CORE_PROC && + _streams_info_table[0][THREADS_PER_STREAM] > _proc_type_table[0][MAIN_CORE_PROC]) { + proc_types.push_back(std::to_string(HYPER_THREADING_PROC)); + } + if 
(_streams_info_table[i][STREAM_NUMA_NODE_ID] < 0) { + for (int j = 0; j < _numa_nodes; j++) { + numa_nodes.push_back(std::to_string(j)); + } + } else { + numa_nodes.push_back(std::to_string(_streams_info_table[i][STREAM_NUMA_NODE_ID])); + } + if (_streams_info_table[i][STREAM_SOCKET_ID] < 0) { + for (int j = 0; j < _numa_nodes; j++) { + sockets.push_back(std::to_string(j)); + } + } else { + sockets.push_back(std::to_string(_streams_info_table[i][STREAM_SOCKET_ID])); + } + for (auto t : proc_types) { + for (auto n : numa_nodes) { + for (auto s : sockets) { + stream_conditions[i].push_back(t + n + s); + } + } + } + } + + for (size_t i = 0; i < _cpu_mapping_table.size(); i++) { + std::string cpu_string = std::to_string(_cpu_mapping_table[i][CPU_MAP_CORE_TYPE]) + + std::to_string(_cpu_mapping_table[i][CPU_MAP_NUMA_NODE_ID]) + + std::to_string(_cpu_mapping_table[i][CPU_MAP_SOCKET_ID]); + for (size_t j = 0; j < stream_conditions.size(); j++) { + if (std::find(stream_conditions[j].begin(), stream_conditions[j].end(), cpu_string) != + stream_conditions[j].end()) { + _stream_processors[stream_pos[j]].push_back(_cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID]); + _cpu_mapping_table[i][CPU_MAP_USED_FLAG] = _cpu_status; + if (static_cast(_stream_processors[stream_pos[j]].size()) == + _streams_info_table[j][THREADS_PER_STREAM]) { + stream_pos[j]++; + stream_num[j]++; + } + if (stream_num[j] >= _streams_info_table[j][NUMBER_OF_STREAMS]) { + stream_conditions[j].clear(); + } + break; + } + } + } + + if (_cpu_status > NOT_USED) { + update_proc_type_table(_cpu_mapping_table, _numa_nodes, _proc_type_table); + } +} + +void update_proc_type_table(const std::vector> _cpu_mapping_table, + const int _numa_nodes, + std::vector>& _proc_type_table) { + std::vector all_table; + std::map numa_node_map; + + _proc_type_table.assign((_numa_nodes == 1) ? 
1 : _numa_nodes + 1, std::vector({0, 0, 0, 0, -1, -1})); + if (_numa_nodes > 1) { + for (int i = 0; i < _numa_nodes; i++) { + _proc_type_table[i + 1][PROC_NUMA_NODE_ID] = i; + } + } else { + _proc_type_table[0][PROC_NUMA_NODE_ID] = 0; + } + if (_numa_nodes > 1) { + for (int i = 1; i < static_cast(_proc_type_table.size()); i++) { + numa_node_map.insert(std::pair(_proc_type_table[i][PROC_NUMA_NODE_ID], i)); + } + } else { + numa_node_map.insert(std::pair(_proc_type_table[0][PROC_NUMA_NODE_ID], 0)); + } + all_table = {0, 0, 0, 0, -1, -1}; + for (size_t i = 0; i < _cpu_mapping_table.size(); i++) { + if (_cpu_mapping_table[i][CPU_MAP_USED_FLAG] == NOT_USED && _cpu_mapping_table[i][CPU_MAP_NUMA_NODE_ID] >= 0 && + _cpu_mapping_table[i][CPU_MAP_CORE_TYPE] >= ALL_PROC) { + _proc_type_table[numa_node_map.at(_cpu_mapping_table[i][CPU_MAP_NUMA_NODE_ID])] + [_cpu_mapping_table[i][CPU_MAP_CORE_TYPE]]++; + _proc_type_table[numa_node_map.at(_cpu_mapping_table[i][CPU_MAP_NUMA_NODE_ID])][ALL_PROC]++; + _proc_type_table[numa_node_map.at(_cpu_mapping_table[i][CPU_MAP_NUMA_NODE_ID])][PROC_SOCKET_ID] = + _cpu_mapping_table[i][CPU_MAP_SOCKET_ID]; + all_table[_cpu_mapping_table[i][CPU_MAP_CORE_TYPE]]++; + all_table[ALL_PROC]++; + } + } + if (_numa_nodes > 1) { + _proc_type_table[0] = all_table; + } + + if (_proc_type_table.size() > 1) { + size_t n = _proc_type_table.size(); + + while (n > 0) { + if (0 == _proc_type_table[n - 1][ALL_PROC]) { + _proc_type_table.erase(_proc_type_table.begin() + n - 1); + } + n--; + } + + if ((_proc_type_table.size() > 1) && (_proc_type_table[0][ALL_PROC] == _proc_type_table[1][ALL_PROC])) { + _proc_type_table.erase(_proc_type_table.begin()); + } + } +} + +} // namespace threading +} // namespace ov diff --git a/src/inference/src/dev/threading/istreams_executor.cpp b/src/inference/src/dev/threading/istreams_executor.cpp index 12d6cd281b3428..b7c95cb5ec5973 100644 --- a/src/inference/src/dev/threading/istreams_executor.cpp +++ 
b/src/inference/src/dev/threading/istreams_executor.cpp @@ -13,7 +13,7 @@ #include "ie_plugin_config.hpp" #include "openvino/core/parallel.hpp" #include "openvino/runtime/properties.hpp" -#include "openvino/runtime/system_conf.hpp" +#include "openvino/runtime/threading/cpu_streams_info.hpp" #include "openvino/util/log.hpp" #include "threading/ie_parallel_custom_arena.hpp" @@ -500,5 +500,30 @@ IStreamsExecutor::Config IStreamsExecutor::Config::make_default_multi_threaded(c return streamExecutorConfig; } +IStreamsExecutor::Config IStreamsExecutor::Config::reserve_cpu_threads(const IStreamsExecutor::Config& initial) { + auto config = initial; + int status = config._name.find("StreamsExecutor") != std::string::npos ? NOT_USED : CPU_USED; + + if (config._streams_info_table.size() == 0 || (status == CPU_USED && !config._cpu_reservation)) { + return config; + } + + reserve_available_cpus(config._streams_info_table, config._stream_processor_ids, status); + + config._streams = 0; + config._threads = 0; + for (size_t i = 0; i < config._streams_info_table.size(); i++) { + if (config._streams_info_table[i][NUMBER_OF_STREAMS] > 0) { + config._streams += config._streams_info_table[i][NUMBER_OF_STREAMS]; + config._threads += + config._streams_info_table[i][NUMBER_OF_STREAMS] * config._streams_info_table[i][THREADS_PER_STREAM]; + } + } + OPENVINO_DEBUG << "[ threading ] " << config._name << " reserve_cpu_threads " << config._streams << "(" + << config._threads << ")"; + + return config; +} + } // namespace threading } // namespace ov diff --git a/src/inference/src/os/cpu_map_info.hpp b/src/inference/src/os/cpu_map_info.hpp index 638b77e2904af7..f175900dd42a7b 100644 --- a/src/inference/src/os/cpu_map_info.hpp +++ b/src/inference/src/os/cpu_map_info.hpp @@ -24,12 +24,11 @@ class CPU { int _numa_nodes = 0; int _sockets = 0; int _cores = 0; + std::vector> _org_proc_type_table; std::vector> _proc_type_table; std::vector> _cpu_mapping_table; std::mutex _cpu_mutex; - int _plugin_status 
= PLUGIN_USED_START; int _socket_idx = 0; - int _num_threads = 0; }; CPU& cpu_info(); diff --git a/src/inference/src/os/lin/lin_system_conf.cpp b/src/inference/src/os/lin/lin_system_conf.cpp index d1934867b70043..02fbe263e15902 100644 --- a/src/inference/src/os/lin/lin_system_conf.cpp +++ b/src/inference/src/os/lin/lin_system_conf.cpp @@ -23,7 +23,6 @@ CPU::CPU() { std::vector> system_info_table; std::vector node_info_table; - _num_threads = parallel_get_max_threads(); auto get_cache_info_linux = [&]() { int cpu_index = 0; int cache_index = 0; @@ -124,14 +123,56 @@ CPU::CPU() { } std::vector phy_core_list; + std::vector socket_list; + std::vector> numa_node_list; std::vector> valid_cpu_mapping_table; + numa_node_list.assign(_sockets, std::vector()); for (int i = 0; i < _processors; i++) { if (CPU_ISSET(i, &mask)) { valid_cpu_mapping_table.emplace_back(_cpu_mapping_table[i]); if (_cpu_mapping_table[i][CPU_MAP_CORE_TYPE] == MAIN_CORE_PROC) { phy_core_list.emplace_back(_cpu_mapping_table[i][CPU_MAP_CORE_ID]); } + if (_sockets > 1) { + if (std::find(socket_list.begin(), socket_list.end(), _cpu_mapping_table[i][CPU_MAP_SOCKET_ID]) == + socket_list.end()) { + socket_list.push_back(_cpu_mapping_table[i][CPU_MAP_SOCKET_ID]); + } + if (std::find(numa_node_list[_cpu_mapping_table[i][CPU_MAP_SOCKET_ID]].begin(), + numa_node_list[_cpu_mapping_table[i][CPU_MAP_SOCKET_ID]].end(), + _cpu_mapping_table[i][CPU_MAP_NUMA_NODE_ID]) == + numa_node_list[_cpu_mapping_table[i][CPU_MAP_SOCKET_ID]].end()) { + numa_node_list[_cpu_mapping_table[i][CPU_MAP_SOCKET_ID]].push_back( + _cpu_mapping_table[i][CPU_MAP_NUMA_NODE_ID]); + } + } + } + } + if (_sockets > 1) { + std::sort(socket_list.begin(), socket_list.end()); + for (int n = _sockets - 1; n >= 0; n--) { + if (numa_node_list[n].size() == 0) { + numa_node_list.erase(numa_node_list.begin() + n); + } else { + std::sort(numa_node_list[n].begin(), numa_node_list[n].end()); + } + } + std::map sockets_map; + std::map numa_node_map; + for (int i = 
0; i < static_cast(socket_list.size()); i++) { + sockets_map.insert(std::pair(socket_list[i], i)); + } + for (int i = 0; i < static_cast(numa_node_list.size()); i++) { + for (int j = 0; j < static_cast(numa_node_list[i].size()); j++) { + numa_node_map.insert(std::pair(numa_node_list[i][j], i * _numa_nodes / _sockets + j)); + } + } + for (size_t i = 0; i < valid_cpu_mapping_table.size(); i++) { + valid_cpu_mapping_table[i][CPU_MAP_NUMA_NODE_ID] = + numa_node_map.at(valid_cpu_mapping_table[i][CPU_MAP_NUMA_NODE_ID]); + valid_cpu_mapping_table[i][CPU_MAP_SOCKET_ID] = + sockets_map.at(valid_cpu_mapping_table[i][CPU_MAP_SOCKET_ID]); } } @@ -220,7 +261,7 @@ CPU::CPU() { _cores); } } - + _org_proc_type_table = _proc_type_table; std::vector>().swap(system_info_table); if (check_valid_cpu() < 0) { @@ -655,8 +696,8 @@ void update_valid_processor_linux(const std::vector phy_core_list, _proc_type_table[0][ALL_PROC]++; _proc_type_table[0][row[CPU_MAP_CORE_TYPE]]++; if (_proc_type_table.size() > 1) { - _proc_type_table[row[CPU_MAP_SOCKET_ID] + 1][ALL_PROC]++; - _proc_type_table[row[CPU_MAP_SOCKET_ID] + 1][row[CPU_MAP_CORE_TYPE]]++; + _proc_type_table[row[CPU_MAP_NUMA_NODE_ID] + 1][ALL_PROC]++; + _proc_type_table[row[CPU_MAP_NUMA_NODE_ID] + 1][row[CPU_MAP_CORE_TYPE]]++; } } diff --git a/src/inference/src/os/mac/mac_system_conf.cpp b/src/inference/src/os/mac/mac_system_conf.cpp index f0fad3434f003f..f0bc545be416d9 100644 --- a/src/inference/src/os/mac/mac_system_conf.cpp +++ b/src/inference/src/os/mac/mac_system_conf.cpp @@ -13,8 +13,8 @@ namespace ov { CPU::CPU() { - _num_threads = parallel_get_max_threads(); parse_processor_info_macos(_processors, _numa_nodes, _cores, _proc_type_table); + _org_proc_type_table = _proc_type_table; } int parse_processor_info_macos(int& _processors, diff --git a/src/inference/src/os/win/win_system_conf.cpp b/src/inference/src/os/win/win_system_conf.cpp index 2c7aaec3366008..c678525b66edfe 100644 --- a/src/inference/src/os/win/win_system_conf.cpp +++ 
b/src/inference/src/os/win/win_system_conf.cpp @@ -19,7 +19,6 @@ namespace ov { CPU::CPU() { DWORD len = 0; - _num_threads = parallel_get_max_threads(); if (GetLogicalProcessorInformationEx(RelationAll, nullptr, &len) || GetLastError() != ERROR_INSUFFICIENT_BUFFER) { return; } @@ -38,6 +37,7 @@ CPU::CPU() { _cores, _proc_type_table, _cpu_mapping_table); + _org_proc_type_table = _proc_type_table; } void parse_processor_info_win(const char* base_ptr, diff --git a/src/inference/src/system_conf.cpp b/src/inference/src/system_conf.cpp index 11cf2704eb3cfa..344fbeb3be9df9 100644 --- a/src/inference/src/system_conf.cpp +++ b/src/inference/src/system_conf.cpp @@ -14,10 +14,12 @@ #include #include "dev/threading/parallel_custom_arena.hpp" -#include "ie_common.h" +#include "openvino/core/except.hpp" #include "openvino/core/visibility.hpp" +#include "openvino/runtime/threading/cpu_streams_executor_internal.hpp" +#include "openvino/runtime/threading/cpu_streams_info.hpp" +#include "openvino/util/log.hpp" #include "os/cpu_map_info.hpp" -#include "threading/ie_cpu_streams_info.hpp" #ifdef __APPLE__ # include @@ -30,8 +32,6 @@ # include #endif -using namespace InferenceEngine; - namespace ov { #if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) @@ -187,17 +187,27 @@ int get_number_of_logical_cpu_cores(bool) { std::vector> get_proc_type_table() { return {{-1}}; } +std::vector> get_org_proc_type_table() { + return {{-1}}; +} bool is_cpu_map_available() { return false; } int get_num_numa_nodes() { return -1; } -std::vector> reserve_available_cpus(const std::vector> streams_info_table) { - return {{-1}}; +int get_num_sockets() { + return -1; } +void reserve_available_cpus(const std::vector> streams_info_table, + std::vector>& stream_processors, + const int cpu_status) {} void set_cpu_used(const std::vector& cpu_ids, const int used) {} +int get_socket_by_numa_node(int numa_node_id) { + return -1; +}; + #elif defined(__APPLE__) // for Linux and Windows the 
getNumberOfCPUCores (that accounts only for physical cores) implementation is OS-specific // (see cpp files in corresponding folders), for __APPLE__ it is default : @@ -224,14 +234,26 @@ std::vector> get_proc_type_table() { return cpu._proc_type_table; } +std::vector> get_org_proc_type_table() { + CPU& cpu = cpu_info(); + return cpu._org_proc_type_table; +} + int get_num_numa_nodes() { return cpu_info()._numa_nodes; } -std::vector> reserve_available_cpus(const std::vector> streams_info_table) { - return {{-1}}; +int get_num_sockets() { + return -1; } +void reserve_available_cpus(const std::vector> streams_info_table, + std::vector>& stream_processors, + const int cpu_status) {} void set_cpu_used(const std::vector& cpu_ids, const int used) {} +int get_socket_by_numa_node(int numa_node_id) { + return -1; +}; + #else # ifndef _WIN32 @@ -239,7 +261,7 @@ int get_number_of_cpu_cores(bool bigCoresOnly) { CPU& cpu = cpu_info(); unsigned numberOfProcessors = cpu._processors; unsigned totalNumberOfCpuCores = cpu._cores; - IE_ASSERT(totalNumberOfCpuCores != 0); + OPENVINO_ASSERT(totalNumberOfCpuCores != 0, "Total number of cpu cores can not be 0."); cpu_set_t usedCoreSet, currentCoreSet, currentCpuSet; CPU_ZERO(¤tCpuSet); CPU_ZERO(&usedCoreSet); @@ -283,6 +305,11 @@ std::vector> get_proc_type_table() { return cpu._proc_type_table; } +std::vector> get_org_proc_type_table() { + CPU& cpu = cpu_info(); + return cpu._org_proc_type_table; +} + bool is_cpu_map_available() { CPU& cpu = cpu_info(); return cpu._cpu_mapping_table.size() > 0; @@ -292,76 +319,70 @@ int get_num_numa_nodes() { return cpu_info()._numa_nodes; } -std::vector> reserve_available_cpus(const std::vector> streams_info_table) { - std::vector cpu_ids; - int info_table_size = static_cast(streams_info_table.size()); - std::vector> stream_ids; - std::vector>> res_stream_ids; +int get_num_sockets() { + return cpu_info()._sockets; +} + +void reserve_available_cpus(const std::vector> streams_info_table, + std::vector>& 
stream_processors, + const int cpu_status) { CPU& cpu = cpu_info(); - stream_ids.assign(info_table_size, std::vector()); - res_stream_ids.assign(info_table_size, std::vector>()); + std::lock_guard lock{cpu._cpu_mutex}; - for (int i = 0; i < cpu._processors; i++) { - for (int j = 0; j < info_table_size; j++) { - if (static_cast(res_stream_ids[j].size()) < streams_info_table[j][NUMBER_OF_STREAMS]) { - std::lock_guard lock{cpu._cpu_mutex}; - if (cpu._cpu_mapping_table[i][CPU_MAP_CORE_TYPE] == streams_info_table[j][PROC_TYPE] && - cpu._cpu_mapping_table[i][CPU_MAP_USED_FLAG] == NOT_USED) { - stream_ids[j].push_back(cpu._cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID]); - cpu_ids.push_back(cpu._cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID]); - } - if (static_cast(stream_ids[j].size()) == streams_info_table[j][THREADS_PER_STREAM]) { - std::vector stream_group(stream_ids[j].begin(), stream_ids[j].end()); - res_stream_ids[j].push_back(stream_group); - stream_ids[j].clear(); - } - } - } + ov::threading::reserve_cpu_by_streams_info(streams_info_table, + cpu._numa_nodes, + cpu._cpu_mapping_table, + cpu._proc_type_table, + stream_processors, + cpu_status); + + OPENVINO_DEBUG << "[ threading ] cpu_mapping_table:"; + for (size_t i = 0; i < cpu._cpu_mapping_table.size(); i++) { + OPENVINO_DEBUG << cpu._cpu_mapping_table[i][CPU_MAP_PROCESSOR_ID] << " " + << cpu._cpu_mapping_table[i][CPU_MAP_NUMA_NODE_ID] << " " + << cpu._cpu_mapping_table[i][CPU_MAP_SOCKET_ID] << " " + << cpu._cpu_mapping_table[i][CPU_MAP_CORE_ID] << " " + << cpu._cpu_mapping_table[i][CPU_MAP_CORE_TYPE] << " " + << cpu._cpu_mapping_table[i][CPU_MAP_GROUP_ID] << " " + << cpu._cpu_mapping_table[i][CPU_MAP_USED_FLAG]; + } + OPENVINO_DEBUG << "[ threading ] proc_type_table:"; + for (size_t i = 0; i < cpu._proc_type_table.size(); i++) { + OPENVINO_DEBUG << cpu._proc_type_table[i][ALL_PROC] << " " << cpu._proc_type_table[i][MAIN_CORE_PROC] << " " + << cpu._proc_type_table[i][EFFICIENT_CORE_PROC] << " " + << 
cpu._proc_type_table[i][HYPER_THREADING_PROC] << " " + << cpu._proc_type_table[i][PROC_NUMA_NODE_ID] << " " << cpu._proc_type_table[i][PROC_SOCKET_ID]; + } + OPENVINO_DEBUG << "[ threading ] streams_info_table:"; + for (size_t i = 0; i < streams_info_table.size(); i++) { + OPENVINO_DEBUG << streams_info_table[i][NUMBER_OF_STREAMS] << " " << streams_info_table[i][PROC_TYPE] << " " + << streams_info_table[i][THREADS_PER_STREAM] << " " << streams_info_table[i][STREAM_NUMA_NODE_ID] + << " " << streams_info_table[i][STREAM_SOCKET_ID]; } - auto flatten_stream_ids = - std::accumulate(res_stream_ids.begin(), - res_stream_ids.end(), - decltype(res_stream_ids)::value_type{}, - [](std::vector>& pre, std::vector>& cur) { - pre.insert(pre.end(), cur.begin(), cur.end()); - return pre; - }); - - return flatten_stream_ids; } void set_cpu_used(const std::vector& cpu_ids, const int used) { CPU& cpu = cpu_info(); std::lock_guard lock{cpu._cpu_mutex}; const auto cpu_size = static_cast(cpu_ids.size()); - for (int i = 0; i < cpu_size; i++) { - if (cpu_ids[i] < cpu._processors) { - cpu._cpu_mapping_table[cpu_ids[i]][CPU_MAP_USED_FLAG] = used; + if (cpu_size > 0) { + for (int i = 0; i < cpu_size; i++) { + if (cpu_ids[i] < cpu._processors) { + cpu._cpu_mapping_table[cpu_ids[i]][CPU_MAP_USED_FLAG] = used; + } } + ov::threading::update_proc_type_table(cpu._cpu_mapping_table, cpu._numa_nodes, cpu._proc_type_table); } - // update _proc_type_table - if (used == NOT_USED || used >= PLUGIN_USED_START) { - std::vector all_table; - int start = cpu._numa_nodes > 1 ? 
1 : 0; - if (cpu._proc_type_table.size() > 0 && cpu._num_threads == cpu._proc_type_table[0][ALL_PROC]) { - cpu._proc_type_table.assign(cpu._proc_type_table.size(), std::vector(PROC_TYPE_TABLE_SIZE, 0)); - all_table.resize(PROC_TYPE_TABLE_SIZE, 0); - for (int i = 0; i < cpu._processors; i++) { - if (cpu._cpu_mapping_table[i][CPU_MAP_USED_FLAG] < PLUGIN_USED_START && - cpu._cpu_mapping_table[i][CPU_MAP_NUMA_NODE_ID] >= 0 && - cpu._cpu_mapping_table[i][CPU_MAP_CORE_TYPE] >= ALL_PROC) { - cpu._proc_type_table[cpu._cpu_mapping_table[i][CPU_MAP_NUMA_NODE_ID] + start] - [cpu._cpu_mapping_table[i][CPU_MAP_CORE_TYPE]]++; - cpu._proc_type_table[cpu._cpu_mapping_table[i][CPU_MAP_NUMA_NODE_ID] + start][ALL_PROC]++; - all_table[cpu._cpu_mapping_table[i][CPU_MAP_CORE_TYPE]]++; - all_table[ALL_PROC]++; - } - } - if (cpu._numa_nodes > 1) { - cpu._proc_type_table[0] = all_table; - } +} + +int get_socket_by_numa_node(int numa_node_id) { + CPU& cpu = cpu_info(); + for (int i = 0; i < cpu._processors; i++) { + if (cpu._cpu_mapping_table[i][CPU_MAP_NUMA_NODE_ID] == numa_node_id) { + return cpu._cpu_mapping_table[i][CPU_MAP_SOCKET_ID]; } } + return -1; } int get_number_of_logical_cpu_cores(bool bigCoresOnly) { diff --git a/src/inference/src/threading/ie_cpu_streams_executor.cpp b/src/inference/src/threading/ie_cpu_streams_executor.cpp index a4a06529393f01..41541064d4e1b8 100644 --- a/src/inference/src/threading/ie_cpu_streams_executor.cpp +++ b/src/inference/src/threading/ie_cpu_streams_executor.cpp @@ -39,6 +39,10 @@ int CPUStreamsExecutor::GetNumaNodeId() { return _impl->get_numa_node_id(); } +int CPUStreamsExecutor::GetSocketId() { + return _impl->get_socket_id(); +} + CPUStreamsExecutor::CPUStreamsExecutor(const IStreamsExecutor::Config& config) : _impl{new Impl(config)} {} CPUStreamsExecutor::~CPUStreamsExecutor() {} diff --git a/src/inference/src/threading/ie_executor_manager.cpp b/src/inference/src/threading/ie_executor_manager.cpp index 82a1e126ae5dae..3fcba5a1a7ca62 100644 --- 
a/src/inference/src/threading/ie_executor_manager.cpp +++ b/src/inference/src/threading/ie_executor_manager.cpp @@ -78,6 +78,10 @@ class StreamsExecutorWrapper : public IStreamsExecutor { return m_executor->get_numa_node_id(); } + int GetSocketId() override { + return m_executor->get_socket_id(); + } + void Execute(Task task) override { m_executor->execute(task); } diff --git a/src/inference/src/threading/ie_istreams_executor.cpp b/src/inference/src/threading/ie_istreams_executor.cpp index e78cc8cb0fae4e..3d3bdebe8138ee 100644 --- a/src/inference/src/threading/ie_istreams_executor.cpp +++ b/src/inference/src/threading/ie_istreams_executor.cpp @@ -50,4 +50,8 @@ IStreamsExecutor::Config IStreamsExecutor::Config::MakeDefaultMultiThreaded(cons return make_default_multi_threaded(initial); } +IStreamsExecutor::Config IStreamsExecutor::Config::ReserveCpuThreads(const Config& initial) { + return reserve_cpu_threads(initial); +} + } // namespace InferenceEngine diff --git a/src/inference/tests/unit/cpu_reserve_test.cpp b/src/inference/tests/unit/cpu_reserve_test.cpp new file mode 100644 index 00000000000000..c2efa532e8eb9b --- /dev/null +++ b/src/inference/tests/unit/cpu_reserve_test.cpp @@ -0,0 +1,842 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include + +#include "ie_system_conf.h" +#include "openvino/runtime/threading/cpu_streams_executor_internal.hpp" + +using namespace testing; +using namespace ov; +using namespace threading; + +namespace { + +#if defined(__linux__) || defined(_WIN32) + +struct LinuxCpuReserveTestCase { + int _processors; + int _sockets; + std::vector> _proc_type_table; + std::vector> _cpu_mapping_table; + std::vector> _streams_info_table; + std::vector> _stream_processors; + int _cpu_status; +}; + +class LinuxCpuReserveTests : public CommonTestUtils::TestsCommon, + public testing::WithParamInterface> { +public: + void SetUp() override { + auto test_data = std::get<0>(GetParam()); + + 
std::vector> test_processors; + std::vector test_numa_node_ids; + + ov::threading::reserve_cpu_by_streams_info(test_data._streams_info_table, + test_data._sockets, + test_data._cpu_mapping_table, + test_data._proc_type_table, + test_processors, + test_data._cpu_status); + + ASSERT_EQ(test_data._stream_processors, test_processors); + } +}; + +LinuxCpuReserveTestCase _2sockets_72cores_hyper_36streams = { + 72, // param[in]: the number of logical processors + 2, // param[in]: the number of numa nodes + // param[in]: proc_type_table, {total processors, number of physical processors, number of Efficient processors, + // number of hyper threading processors} + {{72, 36, 0, 36, -1, -1}, // total number of processors: 72, the number of physical/hyper threading processors is + // 36 + {36, 18, 0, 18, 0, 0}, // the number of MAIN_CORE_PROC/HYPER_THREADING_PROC cores on each numa node 0 is 18. + {36, 18, 0, 18, 1, 1}}, // the number of MAIN_CORE_PROC/HYPER_THREADING_PROC cores on each numa node 1 is 18. 
+ // param[in]: cpu_mapping_table, {PROCESSOR_ID, SOCKET_ID, CORE_ID, CORE_TYPE, GROUP_ID, Used} + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {10, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {11, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {12, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {13, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {14, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {15, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {16, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {17, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {18, 1, 1, 18, HYPER_THREADING_PROC, 18, -1}, {19, 1, 1, 19, HYPER_THREADING_PROC, 19, -1}, + {20, 1, 1, 20, HYPER_THREADING_PROC, 20, -1}, {21, 1, 1, 21, HYPER_THREADING_PROC, 21, -1}, + {22, 1, 1, 22, HYPER_THREADING_PROC, 22, -1}, {23, 1, 1, 23, HYPER_THREADING_PROC, 23, -1}, + {24, 1, 1, 24, HYPER_THREADING_PROC, 24, -1}, {25, 1, 1, 25, HYPER_THREADING_PROC, 25, -1}, + {26, 1, 1, 26, HYPER_THREADING_PROC, 26, -1}, {27, 1, 1, 27, HYPER_THREADING_PROC, 27, -1}, + {28, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {29, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {30, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {31, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {32, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {33, 1, 1, 33, HYPER_THREADING_PROC, 33, -1}, + {34, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {35, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {36, 0, 0, 36, MAIN_CORE_PROC, 36, -1}, {37, 0, 0, 37, MAIN_CORE_PROC, 37, -1}, + {38, 0, 0, 38, MAIN_CORE_PROC, 38, -1}, {39, 0, 0, 39, MAIN_CORE_PROC, 39, -1}, + {40, 0, 0, 40, MAIN_CORE_PROC, 40, -1}, {41, 0, 0, 41, MAIN_CORE_PROC, 
41, -1}, + {42, 0, 0, 42, MAIN_CORE_PROC, 42, -1}, {43, 0, 0, 43, MAIN_CORE_PROC, 43, -1}, + {44, 0, 0, 44, MAIN_CORE_PROC, 44, -1}, {45, 0, 0, 45, MAIN_CORE_PROC, 45, -1}, + {46, 0, 0, 46, MAIN_CORE_PROC, 46, -1}, {47, 0, 0, 47, MAIN_CORE_PROC, 47, -1}, + {48, 0, 0, 48, MAIN_CORE_PROC, 48, -1}, {49, 0, 0, 49, MAIN_CORE_PROC, 49, -1}, + {50, 0, 0, 50, MAIN_CORE_PROC, 50, -1}, {51, 0, 0, 51, MAIN_CORE_PROC, 51, -1}, + {52, 0, 0, 52, MAIN_CORE_PROC, 52, -1}, {53, 0, 0, 53, MAIN_CORE_PROC, 53, -1}, + {54, 1, 1, 54, MAIN_CORE_PROC, 54, -1}, {55, 1, 1, 55, MAIN_CORE_PROC, 55, -1}, + {56, 1, 1, 56, MAIN_CORE_PROC, 56, -1}, {57, 1, 1, 57, MAIN_CORE_PROC, 57, -1}, + {58, 1, 1, 58, MAIN_CORE_PROC, 58, -1}, {59, 1, 1, 59, MAIN_CORE_PROC, 59, -1}, + {60, 1, 1, 60, MAIN_CORE_PROC, 60, -1}, {61, 1, 1, 61, MAIN_CORE_PROC, 61, -1}, + {62, 1, 1, 62, MAIN_CORE_PROC, 62, -1}, {63, 1, 1, 63, MAIN_CORE_PROC, 63, -1}, + {64, 1, 1, 64, MAIN_CORE_PROC, 64, -1}, {65, 1, 1, 65, MAIN_CORE_PROC, 65, -1}, + {66, 1, 1, 66, MAIN_CORE_PROC, 66, -1}, {67, 1, 1, 67, MAIN_CORE_PROC, 67, -1}, + {68, 1, 1, 68, MAIN_CORE_PROC, 68, -1}, {69, 1, 1, 69, MAIN_CORE_PROC, 69, -1}, + {70, 1, 1, 70, MAIN_CORE_PROC, 70, -1}, {71, 1, 1, 71, MAIN_CORE_PROC, 71, -1}, + }, + // param[in]: streams_info_table, {NUMBER_OF_STREAMS, PROC_TYPE, THREADS_PER_STREAM} + {{18, MAIN_CORE_PROC, 1, 0, 0}, + {18, MAIN_CORE_PROC, 1, 1, 1}}, // 36 streams on physical processors, the number of threads per stream is 1. + // param[out]: stream_processors, the list of processor ids on each stream. 
+ { + {36}, {37}, {38}, {39}, {40}, {41}, {42}, {43}, {44}, {45}, {46}, {47}, {48}, {49}, {50}, {51}, {52}, {53}, + {54}, {55}, {56}, {57}, {58}, {59}, {60}, {61}, {62}, {63}, {64}, {65}, {66}, {67}, {68}, {69}, {70}, {71}, + }, + // param[in]: cpu_status, CPU does not change CPU status + CPU_USED, +}; +LinuxCpuReserveTestCase _2sockets_72cores_hyper_2streams = { + 72, + 2, + {{72, 36, 0, 36, -1, -1}, {36, 18, 0, 18, 0, 0}, {36, 18, 0, 18, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {10, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {11, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {12, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {13, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {14, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {15, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {16, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {17, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {18, 1, 1, 18, HYPER_THREADING_PROC, 18, -1}, {19, 1, 1, 19, HYPER_THREADING_PROC, 19, -1}, + {20, 1, 1, 20, HYPER_THREADING_PROC, 20, -1}, {21, 1, 1, 21, HYPER_THREADING_PROC, 21, -1}, + {22, 1, 1, 22, HYPER_THREADING_PROC, 22, -1}, {23, 1, 1, 23, HYPER_THREADING_PROC, 23, -1}, + {24, 1, 1, 24, HYPER_THREADING_PROC, 24, -1}, {25, 1, 1, 25, HYPER_THREADING_PROC, 25, -1}, + {26, 1, 1, 26, HYPER_THREADING_PROC, 26, -1}, {27, 1, 1, 27, HYPER_THREADING_PROC, 27, -1}, + {28, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {29, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {30, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {31, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {32, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {33, 1, 1, 33, 
HYPER_THREADING_PROC, 33, -1}, + {34, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {35, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {36, 0, 0, 36, MAIN_CORE_PROC, 36, -1}, {37, 0, 0, 37, MAIN_CORE_PROC, 37, -1}, + {38, 0, 0, 38, MAIN_CORE_PROC, 38, -1}, {39, 0, 0, 39, MAIN_CORE_PROC, 39, -1}, + {40, 0, 0, 40, MAIN_CORE_PROC, 40, -1}, {41, 0, 0, 41, MAIN_CORE_PROC, 41, -1}, + {42, 0, 0, 42, MAIN_CORE_PROC, 42, -1}, {43, 0, 0, 43, MAIN_CORE_PROC, 43, -1}, + {44, 0, 0, 44, MAIN_CORE_PROC, 44, -1}, {45, 0, 0, 45, MAIN_CORE_PROC, 45, -1}, + {46, 0, 0, 46, MAIN_CORE_PROC, 46, -1}, {47, 0, 0, 47, MAIN_CORE_PROC, 47, -1}, + {48, 0, 0, 48, MAIN_CORE_PROC, 48, -1}, {49, 0, 0, 49, MAIN_CORE_PROC, 49, -1}, + {50, 0, 0, 50, MAIN_CORE_PROC, 50, -1}, {51, 0, 0, 51, MAIN_CORE_PROC, 51, -1}, + {52, 0, 0, 52, MAIN_CORE_PROC, 52, -1}, {53, 0, 0, 53, MAIN_CORE_PROC, 53, -1}, + {54, 1, 1, 54, MAIN_CORE_PROC, 54, -1}, {55, 1, 1, 55, MAIN_CORE_PROC, 55, -1}, + {56, 1, 1, 56, MAIN_CORE_PROC, 56, -1}, {57, 1, 1, 57, MAIN_CORE_PROC, 57, -1}, + {58, 1, 1, 58, MAIN_CORE_PROC, 58, -1}, {59, 1, 1, 59, MAIN_CORE_PROC, 59, -1}, + {60, 1, 1, 60, MAIN_CORE_PROC, 60, -1}, {61, 1, 1, 61, MAIN_CORE_PROC, 61, -1}, + {62, 1, 1, 62, MAIN_CORE_PROC, 62, -1}, {63, 1, 1, 63, MAIN_CORE_PROC, 63, -1}, + {64, 1, 1, 64, MAIN_CORE_PROC, 64, -1}, {65, 1, 1, 65, MAIN_CORE_PROC, 65, -1}, + {66, 1, 1, 66, MAIN_CORE_PROC, 66, -1}, {67, 1, 1, 67, MAIN_CORE_PROC, 67, -1}, + {68, 1, 1, 68, MAIN_CORE_PROC, 68, -1}, {69, 1, 1, 69, MAIN_CORE_PROC, 69, -1}, + {70, 1, 1, 70, MAIN_CORE_PROC, 70, -1}, {71, 1, 1, 71, MAIN_CORE_PROC, 71, -1}, + }, + {{1, MAIN_CORE_PROC, 18, 0, 0}, {1, MAIN_CORE_PROC, 18, 1, 1}}, + { + {36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53}, + {54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71}, + }, + NOT_USED, +}; +// The first 3 streams are assigned on numa_node=0, the next 3 streams are assigned on numa_node=1, +// the last stream is assigned across the 
numa_node because there are not enough processors on each numa node +LinuxCpuReserveTestCase _2sockets_72cores_hyper_7streams = { + 72, + 2, + {{72, 36, 0, 36, -1, -1}, {36, 18, 0, 18, 0, 0}, {36, 18, 0, 18, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {10, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {11, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {12, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {13, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {14, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {15, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {16, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {17, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {18, 1, 1, 18, HYPER_THREADING_PROC, 18, -1}, {19, 1, 1, 19, HYPER_THREADING_PROC, 19, -1}, + {20, 1, 1, 20, HYPER_THREADING_PROC, 20, -1}, {21, 1, 1, 21, HYPER_THREADING_PROC, 21, -1}, + {22, 1, 1, 22, HYPER_THREADING_PROC, 22, -1}, {23, 1, 1, 23, HYPER_THREADING_PROC, 23, -1}, + {24, 1, 1, 24, HYPER_THREADING_PROC, 24, -1}, {25, 1, 1, 25, HYPER_THREADING_PROC, 25, -1}, + {26, 1, 1, 26, HYPER_THREADING_PROC, 26, -1}, {27, 1, 1, 27, HYPER_THREADING_PROC, 27, -1}, + {28, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {29, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {30, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {31, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {32, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {33, 1, 1, 33, HYPER_THREADING_PROC, 33, -1}, + {34, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {35, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {36, 0, 0, 36, MAIN_CORE_PROC, 36, -1}, {37, 0, 0, 37, MAIN_CORE_PROC, 37, -1}, + {38, 0, 0, 38, MAIN_CORE_PROC, 38, 
-1}, {39, 0, 0, 39, MAIN_CORE_PROC, 39, -1}, + {40, 0, 0, 40, MAIN_CORE_PROC, 40, -1}, {41, 0, 0, 41, MAIN_CORE_PROC, 41, -1}, + {42, 0, 0, 42, MAIN_CORE_PROC, 42, -1}, {43, 0, 0, 43, MAIN_CORE_PROC, 43, -1}, + {44, 0, 0, 44, MAIN_CORE_PROC, 44, -1}, {45, 0, 0, 45, MAIN_CORE_PROC, 45, -1}, + {46, 0, 0, 46, MAIN_CORE_PROC, 46, -1}, {47, 0, 0, 47, MAIN_CORE_PROC, 47, -1}, + {48, 0, 0, 48, MAIN_CORE_PROC, 48, -1}, {49, 0, 0, 49, MAIN_CORE_PROC, 49, -1}, + {50, 0, 0, 50, MAIN_CORE_PROC, 50, -1}, {51, 0, 0, 51, MAIN_CORE_PROC, 51, -1}, + {52, 0, 0, 52, MAIN_CORE_PROC, 52, -1}, {53, 0, 0, 53, MAIN_CORE_PROC, 53, -1}, + {54, 1, 1, 54, MAIN_CORE_PROC, 54, -1}, {55, 1, 1, 55, MAIN_CORE_PROC, 55, -1}, + {56, 1, 1, 56, MAIN_CORE_PROC, 56, -1}, {57, 1, 1, 57, MAIN_CORE_PROC, 57, -1}, + {58, 1, 1, 58, MAIN_CORE_PROC, 58, -1}, {59, 1, 1, 59, MAIN_CORE_PROC, 59, -1}, + {60, 1, 1, 60, MAIN_CORE_PROC, 60, -1}, {61, 1, 1, 61, MAIN_CORE_PROC, 61, -1}, + {62, 1, 1, 62, MAIN_CORE_PROC, 62, -1}, {63, 1, 1, 63, MAIN_CORE_PROC, 63, -1}, + {64, 1, 1, 64, MAIN_CORE_PROC, 64, -1}, {65, 1, 1, 65, MAIN_CORE_PROC, 65, -1}, + {66, 1, 1, 66, MAIN_CORE_PROC, 66, -1}, {67, 1, 1, 67, MAIN_CORE_PROC, 67, -1}, + {68, 1, 1, 68, MAIN_CORE_PROC, 68, -1}, {69, 1, 1, 69, MAIN_CORE_PROC, 69, -1}, + {70, 1, 1, 70, MAIN_CORE_PROC, 70, -1}, {71, 1, 1, 71, MAIN_CORE_PROC, 71, -1}, + }, + {{3, MAIN_CORE_PROC, 5, 0, 0}, {3, MAIN_CORE_PROC, 5, 1, 1}, {1, MAIN_CORE_PROC, 5, -1, -1}}, + { + {36, 37, 38, 39, 40}, + {41, 42, 43, 44, 45}, + {46, 47, 48, 49, 50}, + {54, 55, 56, 57, 58}, + {59, 60, 61, 62, 63}, + {64, 65, 66, 67, 68}, + {51, 52, 53, 69, 70}, + }, + NOT_USED, +}; +LinuxCpuReserveTestCase _2sockets_72cores_hyper_8streams = { + 72, + 2, + {{72, 36, 0, 36, -1, -1}, {36, 18, 0, 18, 0, 0}, {36, 18, 0, 18, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, 
HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {10, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {11, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {12, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {13, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {14, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {15, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {16, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {17, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {18, 1, 1, 18, HYPER_THREADING_PROC, 18, -1}, {19, 1, 1, 19, HYPER_THREADING_PROC, 19, -1}, + {20, 1, 1, 20, HYPER_THREADING_PROC, 20, -1}, {21, 1, 1, 21, HYPER_THREADING_PROC, 21, -1}, + {22, 1, 1, 22, HYPER_THREADING_PROC, 22, -1}, {23, 1, 1, 23, HYPER_THREADING_PROC, 23, -1}, + {24, 1, 1, 24, HYPER_THREADING_PROC, 24, -1}, {25, 1, 1, 25, HYPER_THREADING_PROC, 25, -1}, + {26, 1, 1, 26, HYPER_THREADING_PROC, 26, -1}, {27, 1, 1, 27, HYPER_THREADING_PROC, 27, -1}, + {28, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {29, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {30, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {31, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {32, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {33, 1, 1, 33, HYPER_THREADING_PROC, 33, -1}, + {34, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {35, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {36, 0, 0, 36, MAIN_CORE_PROC, 36, -1}, {37, 0, 0, 37, MAIN_CORE_PROC, 37, -1}, + {38, 0, 0, 38, MAIN_CORE_PROC, 38, -1}, {39, 0, 0, 39, MAIN_CORE_PROC, 39, -1}, + {40, 0, 0, 40, MAIN_CORE_PROC, 40, -1}, {41, 0, 0, 41, MAIN_CORE_PROC, 41, -1}, + {42, 0, 0, 42, MAIN_CORE_PROC, 42, -1}, {43, 0, 0, 43, MAIN_CORE_PROC, 43, -1}, + {44, 0, 0, 44, MAIN_CORE_PROC, 44, -1}, {45, 0, 0, 45, MAIN_CORE_PROC, 45, -1}, + {46, 0, 0, 46, MAIN_CORE_PROC, 46, -1}, {47, 0, 0, 47, MAIN_CORE_PROC, 47, -1}, + {48, 0, 0, 48, MAIN_CORE_PROC, 48, 
-1}, {49, 0, 0, 49, MAIN_CORE_PROC, 49, -1}, + {50, 0, 0, 50, MAIN_CORE_PROC, 50, -1}, {51, 0, 0, 51, MAIN_CORE_PROC, 51, -1}, + {52, 0, 0, 52, MAIN_CORE_PROC, 52, -1}, {53, 0, 0, 53, MAIN_CORE_PROC, 53, -1}, + {54, 1, 1, 54, MAIN_CORE_PROC, 54, -1}, {55, 1, 1, 55, MAIN_CORE_PROC, 55, -1}, + {56, 1, 1, 56, MAIN_CORE_PROC, 56, -1}, {57, 1, 1, 57, MAIN_CORE_PROC, 57, -1}, + {58, 1, 1, 58, MAIN_CORE_PROC, 58, -1}, {59, 1, 1, 59, MAIN_CORE_PROC, 59, -1}, + {60, 1, 1, 60, MAIN_CORE_PROC, 60, -1}, {61, 1, 1, 61, MAIN_CORE_PROC, 61, -1}, + {62, 1, 1, 62, MAIN_CORE_PROC, 62, -1}, {63, 1, 1, 63, MAIN_CORE_PROC, 63, -1}, + {64, 1, 1, 64, MAIN_CORE_PROC, 64, -1}, {65, 1, 1, 65, MAIN_CORE_PROC, 65, -1}, + {66, 1, 1, 66, MAIN_CORE_PROC, 66, -1}, {67, 1, 1, 67, MAIN_CORE_PROC, 67, -1}, + {68, 1, 1, 68, MAIN_CORE_PROC, 68, -1}, {69, 1, 1, 69, MAIN_CORE_PROC, 69, -1}, + {70, 1, 1, 70, MAIN_CORE_PROC, 70, -1}, {71, 1, 1, 71, MAIN_CORE_PROC, 71, -1}, + }, + {{4, MAIN_CORE_PROC, 4, 0, 0}, {4, MAIN_CORE_PROC, 4, 1, 1}}, + { + {36, 37, 38, 39}, + {40, 41, 42, 43}, + {44, 45, 46, 47}, + {48, 49, 50, 51}, + {54, 55, 56, 57}, + {58, 59, 60, 61}, + {62, 63, 64, 65}, + {66, 67, 68, 69}, + }, + NOT_USED, +}; +LinuxCpuReserveTestCase _2sockets_72cores_hyper_9streams = { + 72, + 2, + {{72, 36, 0, 36, -1, -1}, {36, 18, 0, 18, 0, 0}, {36, 18, 0, 18, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {10, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {11, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {12, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {13, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {14, 0, 0, 14, 
HYPER_THREADING_PROC, 14, -1}, {15, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {16, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {17, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {18, 1, 1, 18, HYPER_THREADING_PROC, 18, -1}, {19, 1, 1, 19, HYPER_THREADING_PROC, 19, -1}, + {20, 1, 1, 20, HYPER_THREADING_PROC, 20, -1}, {21, 1, 1, 21, HYPER_THREADING_PROC, 21, -1}, + {22, 1, 1, 22, HYPER_THREADING_PROC, 22, -1}, {23, 1, 1, 23, HYPER_THREADING_PROC, 23, -1}, + {24, 1, 1, 24, HYPER_THREADING_PROC, 24, -1}, {25, 1, 1, 25, HYPER_THREADING_PROC, 25, -1}, + {26, 1, 1, 26, HYPER_THREADING_PROC, 26, -1}, {27, 1, 1, 27, HYPER_THREADING_PROC, 27, -1}, + {28, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {29, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {30, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {31, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {32, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {33, 1, 1, 33, HYPER_THREADING_PROC, 33, -1}, + {34, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {35, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {36, 0, 0, 36, MAIN_CORE_PROC, 0, -1}, {37, 0, 0, 37, MAIN_CORE_PROC, 1, -1}, + {38, 0, 0, 38, MAIN_CORE_PROC, 2, -1}, {39, 0, 0, 39, MAIN_CORE_PROC, 3, -1}, + {40, 0, 0, 40, MAIN_CORE_PROC, 4, -1}, {41, 0, 0, 41, MAIN_CORE_PROC, 5, -1}, + {42, 0, 0, 42, MAIN_CORE_PROC, 6, -1}, {43, 0, 0, 43, MAIN_CORE_PROC, 7, -1}, + {44, 0, 0, 44, MAIN_CORE_PROC, 8, -1}, {45, 0, 0, 45, MAIN_CORE_PROC, 9, -1}, + {46, 0, 0, 46, MAIN_CORE_PROC, 10, -1}, {47, 0, 0, 47, MAIN_CORE_PROC, 11, -1}, + {48, 0, 0, 48, MAIN_CORE_PROC, 12, -1}, {49, 0, 0, 49, MAIN_CORE_PROC, 13, -1}, + {50, 0, 0, 50, MAIN_CORE_PROC, 14, -1}, {51, 0, 0, 51, MAIN_CORE_PROC, 15, -1}, + {52, 0, 0, 52, MAIN_CORE_PROC, 16, -1}, {53, 0, 0, 53, MAIN_CORE_PROC, 17, -1}, + {54, 1, 1, 54, MAIN_CORE_PROC, 18, -1}, {55, 1, 1, 55, MAIN_CORE_PROC, 19, -1}, + {56, 1, 1, 56, MAIN_CORE_PROC, 20, -1}, {57, 1, 1, 57, MAIN_CORE_PROC, 21, -1}, + {58, 1, 1, 58, MAIN_CORE_PROC, 22, -1}, {59, 1, 1, 59, MAIN_CORE_PROC, 23, -1}, + {60, 
1, 1, 60, MAIN_CORE_PROC, 24, -1}, {61, 1, 1, 61, MAIN_CORE_PROC, 25, -1}, + {62, 1, 1, 62, MAIN_CORE_PROC, 26, -1}, {63, 1, 1, 63, MAIN_CORE_PROC, 27, -1}, + {64, 1, 1, 64, MAIN_CORE_PROC, 28, -1}, {65, 1, 1, 65, MAIN_CORE_PROC, 29, -1}, + {66, 1, 1, 66, MAIN_CORE_PROC, 30, -1}, {67, 1, 1, 67, MAIN_CORE_PROC, 31, -1}, + {68, 1, 1, 68, MAIN_CORE_PROC, 32, -1}, {69, 1, 1, 69, MAIN_CORE_PROC, 33, -1}, + {70, 1, 1, 70, MAIN_CORE_PROC, 34, -1}, {71, 1, 1, 71, MAIN_CORE_PROC, 35, -1}, + }, + {{4, MAIN_CORE_PROC, 4, 0, 0}, {4, MAIN_CORE_PROC, 4, 1, 1}, {1, MAIN_CORE_PROC, 4, -1, -1}}, + { + {36, 37, 38, 39}, + {40, 41, 42, 43}, + {44, 45, 46, 47}, + {48, 49, 50, 51}, + {54, 55, 56, 57}, + {58, 59, 60, 61}, + {62, 63, 64, 65}, + {66, 67, 68, 69}, + {52, 53, 70, 71}, + }, + NOT_USED, +}; +LinuxCpuReserveTestCase _2sockets_72cores_hyper_3streams = { + 72, + 2, + {{72, 36, 0, 36, -1, -1}, {36, 18, 0, 18, 0, 0}, {36, 18, 0, 18, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {10, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {11, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {12, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {13, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {14, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {15, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {16, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {17, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {18, 1, 1, 18, HYPER_THREADING_PROC, 18, -1}, {19, 1, 1, 19, HYPER_THREADING_PROC, 19, -1}, + {20, 1, 1, 20, HYPER_THREADING_PROC, 20, -1}, {21, 1, 1, 21, HYPER_THREADING_PROC, 21, -1}, + {22, 1, 1, 22, HYPER_THREADING_PROC, 22, -1}, {23, 
1, 1, 23, HYPER_THREADING_PROC, 23, -1}, + {24, 1, 1, 24, HYPER_THREADING_PROC, 24, -1}, {25, 1, 1, 25, HYPER_THREADING_PROC, 25, -1}, + {26, 1, 1, 26, HYPER_THREADING_PROC, 26, -1}, {27, 1, 1, 27, HYPER_THREADING_PROC, 27, -1}, + {28, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {29, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {30, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {31, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {32, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {33, 1, 1, 33, HYPER_THREADING_PROC, 33, -1}, + {34, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {35, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {36, 0, 0, 36, MAIN_CORE_PROC, 0, -1}, {37, 0, 0, 37, MAIN_CORE_PROC, 1, -1}, + {38, 0, 0, 38, MAIN_CORE_PROC, 2, -1}, {39, 0, 0, 39, MAIN_CORE_PROC, 3, -1}, + {40, 0, 0, 40, MAIN_CORE_PROC, 4, -1}, {41, 0, 0, 41, MAIN_CORE_PROC, 5, -1}, + {42, 0, 0, 42, MAIN_CORE_PROC, 6, -1}, {43, 0, 0, 43, MAIN_CORE_PROC, 7, -1}, + {44, 0, 0, 44, MAIN_CORE_PROC, 8, -1}, {45, 0, 0, 45, MAIN_CORE_PROC, 9, -1}, + {46, 0, 0, 46, MAIN_CORE_PROC, 10, -1}, {47, 0, 0, 47, MAIN_CORE_PROC, 11, -1}, + {48, 0, 0, 48, MAIN_CORE_PROC, 12, -1}, {49, 0, 0, 49, MAIN_CORE_PROC, 13, -1}, + {50, 0, 0, 50, MAIN_CORE_PROC, 14, -1}, {51, 0, 0, 51, MAIN_CORE_PROC, 15, -1}, + {52, 0, 0, 52, MAIN_CORE_PROC, 16, -1}, {53, 0, 0, 53, MAIN_CORE_PROC, 17, -1}, + {54, 1, 1, 54, MAIN_CORE_PROC, 18, -1}, {55, 1, 1, 55, MAIN_CORE_PROC, 19, -1}, + {56, 1, 1, 56, MAIN_CORE_PROC, 20, -1}, {57, 1, 1, 57, MAIN_CORE_PROC, 21, -1}, + {58, 1, 1, 58, MAIN_CORE_PROC, 22, -1}, {59, 1, 1, 59, MAIN_CORE_PROC, 23, -1}, + {60, 1, 1, 60, MAIN_CORE_PROC, 24, -1}, {61, 1, 1, 61, MAIN_CORE_PROC, 25, -1}, + {62, 1, 1, 62, MAIN_CORE_PROC, 26, -1}, {63, 1, 1, 63, MAIN_CORE_PROC, 27, -1}, + {64, 1, 1, 64, MAIN_CORE_PROC, 28, -1}, {65, 1, 1, 65, MAIN_CORE_PROC, 29, -1}, + {66, 1, 1, 66, MAIN_CORE_PROC, 30, -1}, {67, 1, 1, 67, MAIN_CORE_PROC, 31, -1}, + {68, 1, 1, 68, MAIN_CORE_PROC, 32, -1}, {69, 1, 1, 69, MAIN_CORE_PROC, 33, -1}, + {70, 1, 
1, 70, MAIN_CORE_PROC, 34, -1}, {71, 1, 1, 71, MAIN_CORE_PROC, 35, -1}, + }, + {{3, MAIN_CORE_PROC, 4, 0, 0}}, + { + {36, 37, 38, 39}, + {40, 41, 42, 43}, + {44, 45, 46, 47}, + }, + NOT_USED, +}; +// The first four streams are on numa_node=0, and the fifth stream is assigned to numa_node=1, because there are only 2 +// processors left in numa_node=0, which is not enough for one stream +LinuxCpuReserveTestCase _2sockets_72cores_hyper_5streams = { + 72, + 2, + {{72, 36, 0, 36, -1, -1}, {36, 18, 0, 18, 0, 0}, {36, 18, 0, 18, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {10, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {11, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {12, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {13, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {14, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {15, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {16, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {17, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {18, 1, 1, 18, HYPER_THREADING_PROC, 18, -1}, {19, 1, 1, 19, HYPER_THREADING_PROC, 19, -1}, + {20, 1, 1, 20, HYPER_THREADING_PROC, 20, -1}, {21, 1, 1, 21, HYPER_THREADING_PROC, 21, -1}, + {22, 1, 1, 22, HYPER_THREADING_PROC, 22, -1}, {23, 1, 1, 23, HYPER_THREADING_PROC, 23, -1}, + {24, 1, 1, 24, HYPER_THREADING_PROC, 24, -1}, {25, 1, 1, 25, HYPER_THREADING_PROC, 25, -1}, + {26, 1, 1, 26, HYPER_THREADING_PROC, 26, -1}, {27, 1, 1, 27, HYPER_THREADING_PROC, 27, -1}, + {28, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {29, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {30, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {31, 1, 1, 31, 
HYPER_THREADING_PROC, 31, -1}, + {32, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {33, 1, 1, 33, HYPER_THREADING_PROC, 33, -1}, + {34, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {35, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {36, 0, 0, 36, MAIN_CORE_PROC, 0, -1}, {37, 0, 0, 37, MAIN_CORE_PROC, 1, -1}, + {38, 0, 0, 38, MAIN_CORE_PROC, 2, -1}, {39, 0, 0, 39, MAIN_CORE_PROC, 3, -1}, + {40, 0, 0, 40, MAIN_CORE_PROC, 4, -1}, {41, 0, 0, 41, MAIN_CORE_PROC, 5, -1}, + {42, 0, 0, 42, MAIN_CORE_PROC, 6, -1}, {43, 0, 0, 43, MAIN_CORE_PROC, 7, -1}, + {44, 0, 0, 44, MAIN_CORE_PROC, 8, -1}, {45, 0, 0, 45, MAIN_CORE_PROC, 9, -1}, + {46, 0, 0, 46, MAIN_CORE_PROC, 10, -1}, {47, 0, 0, 47, MAIN_CORE_PROC, 11, -1}, + {48, 0, 0, 48, MAIN_CORE_PROC, 12, -1}, {49, 0, 0, 49, MAIN_CORE_PROC, 13, -1}, + {50, 0, 0, 50, MAIN_CORE_PROC, 14, -1}, {51, 0, 0, 51, MAIN_CORE_PROC, 15, -1}, + {52, 0, 0, 52, MAIN_CORE_PROC, 16, -1}, {53, 0, 0, 53, MAIN_CORE_PROC, 17, -1}, + {54, 1, 1, 54, MAIN_CORE_PROC, 18, -1}, {55, 1, 1, 55, MAIN_CORE_PROC, 19, -1}, + {56, 1, 1, 56, MAIN_CORE_PROC, 20, -1}, {57, 1, 1, 57, MAIN_CORE_PROC, 21, -1}, + {58, 1, 1, 58, MAIN_CORE_PROC, 22, -1}, {59, 1, 1, 59, MAIN_CORE_PROC, 23, -1}, + {60, 1, 1, 60, MAIN_CORE_PROC, 24, -1}, {61, 1, 1, 61, MAIN_CORE_PROC, 25, -1}, + {62, 1, 1, 62, MAIN_CORE_PROC, 26, -1}, {63, 1, 1, 63, MAIN_CORE_PROC, 27, -1}, + {64, 1, 1, 64, MAIN_CORE_PROC, 28, -1}, {65, 1, 1, 65, MAIN_CORE_PROC, 29, -1}, + {66, 1, 1, 66, MAIN_CORE_PROC, 30, -1}, {67, 1, 1, 67, MAIN_CORE_PROC, 31, -1}, + {68, 1, 1, 68, MAIN_CORE_PROC, 32, -1}, {69, 1, 1, 69, MAIN_CORE_PROC, 33, -1}, + {70, 1, 1, 70, MAIN_CORE_PROC, 34, -1}, {71, 1, 1, 71, MAIN_CORE_PROC, 35, -1}, + }, + {{4, MAIN_CORE_PROC, 4, 0, 0}, {1, MAIN_CORE_PROC, 4, 1, 1}}, + { + {36, 37, 38, 39}, // numa_node = 0 + {40, 41, 42, 43}, // numa_node = 0 + {44, 45, 46, 47}, // numa_node = 0 + {48, 49, 50, 51}, // numa_node = 0 + {54, 55, 56, 57}, // numa_node = 1 + }, + NOT_USED, +}; +// The method of plugin reserve 
cpu: assigns the streams on the numa_node with more cpu resources +LinuxCpuReserveTestCase _2sockets_72cores_hyper_3streams_plugin_reserve = { + 72, + 2, + {{72, 36, 0, 36, -1, -1}, {36, 18, 0, 18, 0, 0}, {36, 18, 0, 18, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {10, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {11, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {12, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {13, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {14, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {15, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {16, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {17, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {18, 1, 1, 18, HYPER_THREADING_PROC, 18, -1}, {19, 1, 1, 19, HYPER_THREADING_PROC, 19, -1}, + {20, 1, 1, 20, HYPER_THREADING_PROC, 20, -1}, {21, 1, 1, 21, HYPER_THREADING_PROC, 21, -1}, + {22, 1, 1, 22, HYPER_THREADING_PROC, 22, -1}, {23, 1, 1, 23, HYPER_THREADING_PROC, 23, -1}, + {24, 1, 1, 24, HYPER_THREADING_PROC, 24, -1}, {25, 1, 1, 25, HYPER_THREADING_PROC, 25, -1}, + {26, 1, 1, 26, HYPER_THREADING_PROC, 26, -1}, {27, 1, 1, 27, HYPER_THREADING_PROC, 27, -1}, + {28, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {29, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {30, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {31, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {32, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {33, 1, 1, 33, HYPER_THREADING_PROC, 33, -1}, + {34, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {35, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {36, 0, 0, 36, MAIN_CORE_PROC, 0, -1}, {37, 0, 0, 37, MAIN_CORE_PROC, 1, -1}, + {38, 0, 0, 38, 
MAIN_CORE_PROC, 2, -1}, {39, 0, 0, 39, MAIN_CORE_PROC, 3, -1}, + {40, 0, 0, 40, MAIN_CORE_PROC, 4, -1}, {41, 0, 0, 41, MAIN_CORE_PROC, 5, -1}, + {42, 0, 0, 42, MAIN_CORE_PROC, 6, -1}, {43, 0, 0, 43, MAIN_CORE_PROC, 7, -1}, + {44, 0, 0, 44, MAIN_CORE_PROC, 8, -1}, {45, 0, 0, 45, MAIN_CORE_PROC, 9, -1}, + {46, 0, 0, 46, MAIN_CORE_PROC, 10, -1}, {47, 0, 0, 47, MAIN_CORE_PROC, 11, -1}, + {48, 0, 0, 48, MAIN_CORE_PROC, 12, -1}, {49, 0, 0, 49, MAIN_CORE_PROC, 13, -1}, + {50, 0, 0, 50, MAIN_CORE_PROC, 14, -1}, {51, 0, 0, 51, MAIN_CORE_PROC, 15, -1}, + {52, 0, 0, 52, MAIN_CORE_PROC, 16, -1}, {53, 0, 0, 53, MAIN_CORE_PROC, 17, -1}, + {54, 1, 1, 54, MAIN_CORE_PROC, 18, -1}, {55, 1, 1, 55, MAIN_CORE_PROC, 19, -1}, + {56, 1, 1, 56, MAIN_CORE_PROC, 20, -1}, {57, 1, 1, 57, MAIN_CORE_PROC, 21, -1}, + {58, 1, 1, 58, MAIN_CORE_PROC, 22, -1}, {59, 1, 1, 59, MAIN_CORE_PROC, 23, -1}, + {60, 1, 1, 60, MAIN_CORE_PROC, 24, -1}, {61, 1, 1, 61, MAIN_CORE_PROC, 25, -1}, + {62, 1, 1, 62, MAIN_CORE_PROC, 26, -1}, {63, 1, 1, 63, MAIN_CORE_PROC, 27, -1}, + {64, 1, 1, 64, MAIN_CORE_PROC, 28, -1}, {65, 1, 1, 65, MAIN_CORE_PROC, 29, -1}, + {66, 1, 1, 66, MAIN_CORE_PROC, 30, -1}, {67, 1, 1, 67, MAIN_CORE_PROC, 31, -1}, + {68, 1, 1, 68, MAIN_CORE_PROC, 32, -1}, {69, 1, 1, 69, MAIN_CORE_PROC, 33, -1}, + {70, 1, 1, 70, MAIN_CORE_PROC, 34, -1}, {71, 1, 1, 71, MAIN_CORE_PROC, 35, -1}, + }, + {{3, MAIN_CORE_PROC, 4, 1, 1}}, + { + {54, 55, 56, 57}, // numa_node = 1 + {58, 59, 60, 61}, // numa_node = 1 + {62, 63, 64, 65}, // numa_node = 1 + }, + CPU_USED, +}; +LinuxCpuReserveTestCase _2sockets_20cores_hyper_20streams = { + 40, + 2, + {{40, 20, 0, 20, -1, -1}, {20, 10, 0, 10, 0, 0}, {20, 10, 0, 10, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 1, 1, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 1, 1, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 1, 1, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 
6, -1}, {7, 1, 1, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 1, 1, 9, HYPER_THREADING_PROC, 9, -1}, + {10, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {11, 1, 1, 11, HYPER_THREADING_PROC, 11, -1}, + {12, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {13, 1, 1, 13, HYPER_THREADING_PROC, 13, -1}, + {14, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {15, 1, 1, 15, HYPER_THREADING_PROC, 15, -1}, + {16, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {17, 1, 1, 17, HYPER_THREADING_PROC, 17, -1}, + {18, 0, 0, 18, HYPER_THREADING_PROC, 18, -1}, {19, 1, 1, 19, HYPER_THREADING_PROC, 19, -1}, + {20, 0, 0, 20, MAIN_CORE_PROC, 0, -1}, {21, 1, 1, 21, MAIN_CORE_PROC, 1, -1}, + {22, 0, 0, 22, MAIN_CORE_PROC, 2, -1}, {23, 1, 1, 23, MAIN_CORE_PROC, 3, -1}, + {24, 0, 0, 24, MAIN_CORE_PROC, 4, -1}, {25, 1, 1, 25, MAIN_CORE_PROC, 5, -1}, + {26, 0, 0, 26, MAIN_CORE_PROC, 6, -1}, {27, 1, 1, 27, MAIN_CORE_PROC, 7, -1}, + {28, 0, 0, 28, MAIN_CORE_PROC, 8, -1}, {29, 1, 1, 29, MAIN_CORE_PROC, 9, -1}, + {30, 0, 0, 30, MAIN_CORE_PROC, 10, -1}, {31, 1, 1, 31, MAIN_CORE_PROC, 11, -1}, + {32, 0, 0, 32, MAIN_CORE_PROC, 12, -1}, {33, 1, 1, 33, MAIN_CORE_PROC, 13, -1}, + {34, 0, 0, 34, MAIN_CORE_PROC, 14, -1}, {35, 1, 1, 35, MAIN_CORE_PROC, 15, -1}, + {36, 0, 0, 36, MAIN_CORE_PROC, 16, -1}, {37, 1, 1, 37, MAIN_CORE_PROC, 17, -1}, + {38, 0, 0, 38, MAIN_CORE_PROC, 18, -1}, {39, 1, 1, 39, MAIN_CORE_PROC, 19, -1}, + + }, + {{10, MAIN_CORE_PROC, 1, 0, 0}, {10, MAIN_CORE_PROC, 1, 1, 1}}, + { + {20}, {22}, {24}, {26}, {28}, {30}, {32}, {34}, {36}, {38}, + {21}, {23}, {25}, {27}, {29}, {31}, {33}, {35}, {37}, {39}, + }, + NOT_USED, +}; +LinuxCpuReserveTestCase _2sockets_20cores_hyper_4streams = { + 40, + 2, + {{40, 20, 0, 20, -1, -1}, {20, 10, 0, 10, 0, 0}, {20, 10, 0, 10, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 1, 1, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 1, 1, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, 
HYPER_THREADING_PROC, 4, -1}, {5, 1, 1, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 1, 1, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 1, 1, 9, HYPER_THREADING_PROC, 9, -1}, + {10, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {11, 1, 1, 11, HYPER_THREADING_PROC, 11, -1}, + {12, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {13, 1, 1, 13, HYPER_THREADING_PROC, 13, -1}, + {14, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {15, 1, 1, 15, HYPER_THREADING_PROC, 15, -1}, + {16, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {17, 1, 1, 17, HYPER_THREADING_PROC, 17, -1}, + {18, 0, 0, 18, HYPER_THREADING_PROC, 18, -1}, {19, 1, 1, 19, HYPER_THREADING_PROC, 19, -1}, + {20, 0, 0, 20, MAIN_CORE_PROC, 0, -1}, {21, 1, 1, 21, MAIN_CORE_PROC, 1, -1}, + {22, 0, 0, 22, MAIN_CORE_PROC, 2, -1}, {23, 1, 1, 23, MAIN_CORE_PROC, 3, -1}, + {24, 0, 0, 24, MAIN_CORE_PROC, 4, -1}, {25, 1, 1, 25, MAIN_CORE_PROC, 5, -1}, + {26, 0, 0, 26, MAIN_CORE_PROC, 6, -1}, {27, 1, 1, 27, MAIN_CORE_PROC, 7, -1}, + {28, 0, 0, 28, MAIN_CORE_PROC, 8, -1}, {29, 1, 1, 29, MAIN_CORE_PROC, 9, -1}, + {30, 0, 0, 30, MAIN_CORE_PROC, 10, -1}, {31, 1, 1, 31, MAIN_CORE_PROC, 11, -1}, + {32, 0, 0, 32, MAIN_CORE_PROC, 12, -1}, {33, 1, 1, 33, MAIN_CORE_PROC, 13, -1}, + {34, 0, 0, 34, MAIN_CORE_PROC, 14, -1}, {35, 1, 1, 35, MAIN_CORE_PROC, 15, -1}, + {36, 0, 0, 36, MAIN_CORE_PROC, 16, -1}, {37, 1, 1, 37, MAIN_CORE_PROC, 17, -1}, + {38, 0, 0, 38, MAIN_CORE_PROC, 18, -1}, {39, 1, 1, 39, MAIN_CORE_PROC, 19, -1}, + + }, + {{2, MAIN_CORE_PROC, 5, 0, 0}, {2, MAIN_CORE_PROC, 5, 1, 1}}, + { + {20, 22, 24, 26, 28}, + {30, 32, 34, 36, 38}, + {21, 23, 25, 27, 29}, + {31, 33, 35, 37, 39}, + }, + CPU_USED, +}; +LinuxCpuReserveTestCase _2sockets_20cores_hyper_5streams = { + 40, + 2, + {{40, 20, 0, 20, -1, -1}, {20, 10, 0, 10, 0, 0}, {20, 10, 0, 10, 1, 1}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 1, 1, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 
2, -1}, {3, 1, 1, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 1, 1, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 1, 1, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 1, 1, 9, HYPER_THREADING_PROC, 9, -1}, + {10, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {11, 1, 1, 11, HYPER_THREADING_PROC, 11, -1}, + {12, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {13, 1, 1, 13, HYPER_THREADING_PROC, 13, -1}, + {14, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {15, 1, 1, 15, HYPER_THREADING_PROC, 15, -1}, + {16, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {17, 1, 1, 17, HYPER_THREADING_PROC, 17, -1}, + {18, 0, 0, 18, HYPER_THREADING_PROC, 18, -1}, {19, 1, 1, 19, HYPER_THREADING_PROC, 19, -1}, + {20, 0, 0, 20, MAIN_CORE_PROC, 0, -1}, {21, 1, 1, 21, MAIN_CORE_PROC, 1, -1}, + {22, 0, 0, 22, MAIN_CORE_PROC, 2, -1}, {23, 1, 1, 23, MAIN_CORE_PROC, 3, -1}, + {24, 0, 0, 24, MAIN_CORE_PROC, 4, -1}, {25, 1, 1, 25, MAIN_CORE_PROC, 5, -1}, + {26, 0, 0, 26, MAIN_CORE_PROC, 6, -1}, {27, 1, 1, 27, MAIN_CORE_PROC, 7, -1}, + {28, 0, 0, 28, MAIN_CORE_PROC, 8, -1}, {29, 1, 1, 29, MAIN_CORE_PROC, 9, -1}, + {30, 0, 0, 30, MAIN_CORE_PROC, 10, -1}, {31, 1, 1, 31, MAIN_CORE_PROC, 11, -1}, + {32, 0, 0, 32, MAIN_CORE_PROC, 12, -1}, {33, 1, 1, 33, MAIN_CORE_PROC, 13, -1}, + {34, 0, 0, 34, MAIN_CORE_PROC, 14, -1}, {35, 1, 1, 35, MAIN_CORE_PROC, 15, -1}, + {36, 0, 0, 36, MAIN_CORE_PROC, 16, -1}, {37, 1, 1, 37, MAIN_CORE_PROC, 17, -1}, + {38, 0, 0, 38, MAIN_CORE_PROC, 18, -1}, {39, 1, 1, 39, MAIN_CORE_PROC, 19, -1}, + + }, + {{2, MAIN_CORE_PROC, 4, 0, 0}, {2, MAIN_CORE_PROC, 4, 1, 1}, {1, MAIN_CORE_PROC, 4, -1, -1}}, + { + {20, 22, 24, 26}, + {28, 30, 32, 34}, + {21, 23, 25, 27}, + {29, 31, 33, 35}, + {36, 37, 38, 39}, + }, + CPU_USED, +}; +LinuxCpuReserveTestCase _1socket_4cores_hyper_1streams = { + 8, + 1, + {{8, 4, 0, 4, 0, 0}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, + {1, 0, 0, 1, 
HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, + {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {5, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {6, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + }, + {{1, MAIN_CORE_PROC, 4, 0, 0}}, + {{4, 5, 6, 7}}, + NOT_USED, +}; +LinuxCpuReserveTestCase _1socket_4cores_hyper_4streams = { + 8, + 1, + {{8, 4, 0, 4, 0, 0}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, + {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, + {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {5, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {6, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + }, + {{2, MAIN_CORE_PROC, 2, 0, 0}, {2, HYPER_THREADING_PROC, 2, 0, 0}}, + {{4, 5}, {6, 7}, {0, 1}, {2, 3}}, + NOT_USED, +}; +LinuxCpuReserveTestCase _1socket_16cores_hyper_20streams = { + 24, + 1, + {{24, 8, 8, 8, 0, 0}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {2, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, {3, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {4, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {5, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, + {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {12, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {13, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, + {14, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, {15, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {16, 0, 0, 8, EFFICIENT_CORE_PROC, 8, -1}, {17, 0, 0, 9, EFFICIENT_CORE_PROC, 8, -1}, + {18, 0, 0, 10, EFFICIENT_CORE_PROC, 8, -1}, {19, 0, 0, 11, EFFICIENT_CORE_PROC, 8, -1}, + {20, 0, 0, 12, EFFICIENT_CORE_PROC, 9, -1}, {21, 0, 0, 13, EFFICIENT_CORE_PROC, 9, -1}, + {22, 0, 0, 14, EFFICIENT_CORE_PROC, 9, -1}, {23, 0, 0, 
15, EFFICIENT_CORE_PROC, 9, -1}, + }, + {{8, MAIN_CORE_PROC, 1, 0, 0}, {4, EFFICIENT_CORE_PROC, 2, 0, 0}, {8, HYPER_THREADING_PROC, 1, 0, 0}}, + { + {1}, {3}, {5}, {7}, {9}, {11}, {13}, {15}, {16, 17}, {18, 19}, + {20, 21}, {22, 23}, {0}, {2}, {4}, {6}, {8}, {10}, {12}, {14}, + }, + NOT_USED, +}; +LinuxCpuReserveTestCase _1socket_16cores_hyper_1streams = { + 24, + 1, + {{24, 8, 8, 8, 0, 0}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {2, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, {3, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {4, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {5, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, + {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {12, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {13, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, + {14, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, {15, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {16, 0, 0, 8, EFFICIENT_CORE_PROC, 8, -1}, {17, 0, 0, 9, EFFICIENT_CORE_PROC, 8, -1}, + {18, 0, 0, 10, EFFICIENT_CORE_PROC, 8, -1}, {19, 0, 0, 11, EFFICIENT_CORE_PROC, 8, -1}, + {20, 0, 0, 12, EFFICIENT_CORE_PROC, 9, -1}, {21, 0, 0, 13, EFFICIENT_CORE_PROC, 9, -1}, + {22, 0, 0, 14, EFFICIENT_CORE_PROC, 9, -1}, {23, 0, 0, 15, EFFICIENT_CORE_PROC, 9, -1}, + }, + {{1, MAIN_CORE_PROC, 8, 0, 0}}, + {{1, 3, 5, 7, 9, 11, 13, 15}}, + NOT_USED, +}; +LinuxCpuReserveTestCase _1socket_16cores_hyper_4streams = { + 24, + 1, + {{24, 8, 8, 8, 0, 0}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {2, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, {3, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {4, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {5, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {9, 0, 0, 4, 
MAIN_CORE_PROC, 4, -1}, + {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {12, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {13, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, + {14, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, {15, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {16, 0, 0, 8, EFFICIENT_CORE_PROC, 8, -1}, {17, 0, 0, 9, EFFICIENT_CORE_PROC, 8, -1}, + {18, 0, 0, 10, EFFICIENT_CORE_PROC, 8, -1}, {19, 0, 0, 11, EFFICIENT_CORE_PROC, 8, -1}, + {20, 0, 0, 12, EFFICIENT_CORE_PROC, 9, -1}, {21, 0, 0, 13, EFFICIENT_CORE_PROC, 9, -1}, + {22, 0, 0, 14, EFFICIENT_CORE_PROC, 9, -1}, {23, 0, 0, 15, EFFICIENT_CORE_PROC, 9, -1}, + }, + {{2, MAIN_CORE_PROC, 4, 0, 0}, {2, EFFICIENT_CORE_PROC, 4, 0, 0}}, + { + {1, 3, 5, 7}, + {9, 11, 13, 15}, + {16, 17, 18, 19}, + {20, 21, 22, 23}, + }, + NOT_USED, +}; +LinuxCpuReserveTestCase _1socket_16cores_hyper_4streams_ecoreonly = { + 24, + 1, + {{24, 8, 8, 8, 0, 0}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {2, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, {3, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {4, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {5, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, + {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {12, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {13, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, + {14, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, {15, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {16, 0, 0, 8, EFFICIENT_CORE_PROC, 8, -1}, {17, 0, 0, 9, EFFICIENT_CORE_PROC, 8, -1}, + {18, 0, 0, 10, EFFICIENT_CORE_PROC, 8, -1}, {19, 0, 0, 11, EFFICIENT_CORE_PROC, 8, -1}, + {20, 0, 0, 12, EFFICIENT_CORE_PROC, 9, -1}, {21, 0, 0, 13, EFFICIENT_CORE_PROC, 9, -1}, + {22, 0, 0, 14, EFFICIENT_CORE_PROC, 9, -1}, {23, 0, 0, 15, EFFICIENT_CORE_PROC, 9, -1}, + }, + {{4, EFFICIENT_CORE_PROC, 2, 0, 0}}, + { + {16, 17}, 
+ {18, 19}, + {20, 21}, + {22, 23}, + }, + NOT_USED, +}; +// streams_info_table={1, MAIN_CORE_PROC, 36}, but the number of physical cores is 18, +// in this case, threads are assigned on physical and logical cores. +LinuxCpuReserveTestCase _1socket_18cores_hyper_1streams = { + 36, + 1, + {{36, 18, 0, 18, 0, 0}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, + {2, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, {3, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {4, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {5, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, + {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, {7, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {9, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, + {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {12, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {13, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, + {14, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, {15, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {16, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {17, 0, 0, 9, HYPER_THREADING_PROC, 8, -1}, + {18, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, {19, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {20, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, {21, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {22, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, {23, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {24, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, {25, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {26, 0, 0, 8, MAIN_CORE_PROC, 8, -1}, {27, 0, 0, 9, MAIN_CORE_PROC, 9, -1}, + {28, 0, 0, 10, MAIN_CORE_PROC, 10, -1}, {29, 0, 0, 11, MAIN_CORE_PROC, 11, -1}, + {30, 0, 0, 12, MAIN_CORE_PROC, 12, -1}, {31, 0, 0, 13, MAIN_CORE_PROC, 13, -1}, + {32, 0, 0, 14, MAIN_CORE_PROC, 14, -1}, {33, 0, 0, 15, MAIN_CORE_PROC, 15, -1}, + {34, 0, 0, 16, MAIN_CORE_PROC, 16, -1}, {35, 0, 0, 17, MAIN_CORE_PROC, 17, -1}, + }, + {{1, MAIN_CORE_PROC, 36, 0, 0}}, + { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 
31, 32, 33, 34, 35}, + }, + NOT_USED, +}; +LinuxCpuReserveTestCase _1socket_18cores_hyper_2streams = { + 36, + 1, + {{36, 18, 0, 18, 0, 0}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, + {2, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, {3, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {4, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {5, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, + {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, {7, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {9, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, + {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {12, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {13, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, + {14, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, {15, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {16, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {17, 0, 0, 9, HYPER_THREADING_PROC, 8, -1}, + {18, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, {19, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {20, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, {21, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {22, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, {23, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {24, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, {25, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {26, 0, 0, 8, MAIN_CORE_PROC, 8, -1}, {27, 0, 0, 9, MAIN_CORE_PROC, 9, -1}, + {28, 0, 0, 10, MAIN_CORE_PROC, 10, -1}, {29, 0, 0, 11, MAIN_CORE_PROC, 11, -1}, + {30, 0, 0, 12, MAIN_CORE_PROC, 12, -1}, {31, 0, 0, 13, MAIN_CORE_PROC, 13, -1}, + {32, 0, 0, 14, MAIN_CORE_PROC, 14, -1}, {33, 0, 0, 15, MAIN_CORE_PROC, 15, -1}, + {34, 0, 0, 16, MAIN_CORE_PROC, 16, -1}, {35, 0, 0, 17, MAIN_CORE_PROC, 17, -1}, + }, + {{1, MAIN_CORE_PROC, 18, 0, 0}, {1, HYPER_THREADING_PROC, 18, 0, 0}}, + { + {18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}, + }, + NOT_USED, +}; +LinuxCpuReserveTestCase _1socket_32cores_hyper_1streams = { + 32, + 1, + {{32, 8, 16, 8, 0, 
0}}, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {2, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, {3, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {4, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {5, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, + {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {12, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {13, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, + {14, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, {15, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {16, 0, 0, 8, EFFICIENT_CORE_PROC, 8, -1}, {17, 0, 0, 9, EFFICIENT_CORE_PROC, 8, -1}, + {18, 0, 0, 10, EFFICIENT_CORE_PROC, 8, -1}, {19, 0, 0, 11, EFFICIENT_CORE_PROC, 8, -1}, + {20, 0, 0, 12, EFFICIENT_CORE_PROC, 9, -1}, {21, 0, 0, 13, EFFICIENT_CORE_PROC, 9, -1}, + {22, 0, 0, 14, EFFICIENT_CORE_PROC, 9, -1}, {23, 0, 0, 15, EFFICIENT_CORE_PROC, 9, -1}, + {24, 0, 0, 16, EFFICIENT_CORE_PROC, 10, -1}, {25, 0, 0, 17, EFFICIENT_CORE_PROC, 10, -1}, + {26, 0, 0, 18, EFFICIENT_CORE_PROC, 10, -1}, {27, 0, 0, 19, EFFICIENT_CORE_PROC, 10, -1}, + {28, 0, 0, 20, EFFICIENT_CORE_PROC, 11, -1}, {29, 0, 0, 21, EFFICIENT_CORE_PROC, 11, -1}, + {30, 0, 0, 22, EFFICIENT_CORE_PROC, 11, -1}, {31, 0, 0, 23, EFFICIENT_CORE_PROC, 11, -1}, + }, + {{1, ALL_PROC, 24, 0, 0}, {0, MAIN_CORE_PROC, 8, 0, 0}, {0, EFFICIENT_CORE_PROC, 16, 0, 0}}, + { + {}, + }, + NOT_USED, +}; + +TEST_P(LinuxCpuReserveTests, LinuxCpuReserve) {} + +INSTANTIATE_TEST_SUITE_P(CPUReserve, + LinuxCpuReserveTests, + testing::Values(_2sockets_72cores_hyper_36streams, + _2sockets_72cores_hyper_2streams, + _2sockets_72cores_hyper_7streams, + _2sockets_72cores_hyper_8streams, + _2sockets_72cores_hyper_9streams, + _2sockets_72cores_hyper_3streams, + _2sockets_72cores_hyper_5streams, + _2sockets_72cores_hyper_3streams_plugin_reserve, + 
_2sockets_20cores_hyper_20streams, + _2sockets_20cores_hyper_4streams, + _2sockets_20cores_hyper_5streams, + _1socket_4cores_hyper_1streams, + _1socket_4cores_hyper_4streams, + _1socket_16cores_hyper_20streams, + _1socket_16cores_hyper_1streams, + _1socket_16cores_hyper_4streams, + _1socket_16cores_hyper_4streams_ecoreonly, + _1socket_18cores_hyper_1streams, + _1socket_18cores_hyper_2streams, + _1socket_32cores_hyper_1streams)); +#endif +} // namespace diff --git a/src/inference/tests/unit/cpu_stream_info_test.cpp b/src/inference/tests/unit/cpu_stream_info_test.cpp new file mode 100644 index 00000000000000..3d432d5cb60804 --- /dev/null +++ b/src/inference/tests/unit/cpu_stream_info_test.cpp @@ -0,0 +1,535 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include + +#include "ie_system_conf.h" +#include "openvino/runtime/threading/cpu_streams_executor_internal.hpp" +#include "os/cpu_map_info.hpp" + +using namespace testing; +using namespace ov; +using namespace threading; + +namespace { + +#if defined(__linux__) || defined(_WIN32) + +# define NUMA_ALL -1 + +struct LinuxCpuStreamTypeCase { + bool _cpu_reservation; + int _numa_nodes; + std::vector> _cpu_mapping_table; + std::vector> _proc_type_table; + std::vector> _streams_info_table; + std::vector _stream_type; + std::vector _concurrency; + std::vector _core_type; + std::vector _numa_node_id; +}; + +class LinuxCpuStreamTypeTests : public CommonTestUtils::TestsCommon, + public testing::WithParamInterface> { +public: + void SetUp() override { + auto test_data = std::get<0>(GetParam()); + + std::vector> stream_processor_ids; + std::vector test_stream_types; + std::vector test_concurrencys; + std::vector test_core_types; + std::vector test_numa_node_ids; + int streams = 0; + + for (size_t i = 0; i < test_data._streams_info_table.size(); i++) { + streams += test_data._streams_info_table[i][NUMBER_OF_STREAMS]; + } + + 
ov::threading::reserve_cpu_by_streams_info(test_data._streams_info_table, + test_data._numa_nodes, + test_data._cpu_mapping_table, + test_data._proc_type_table, + stream_processor_ids, + NOT_USED); + + for (auto i = 0; i < streams; i++) { + StreamCreateType test_stream_type; + int test_concurrency; + int test_core_type; + int test_numa_node_id; + get_cur_stream_info(i, + test_data._cpu_reservation, + test_data._proc_type_table, + test_data._streams_info_table, + test_stream_type, + test_concurrency, + test_core_type, + test_numa_node_id); + test_stream_types.push_back(test_stream_type); + test_concurrencys.push_back(test_concurrency); + test_core_types.push_back(test_core_type); + test_numa_node_ids.push_back(test_numa_node_id); + } + + ASSERT_EQ(test_data._stream_type, test_stream_types); + ASSERT_EQ(test_data._concurrency, test_concurrencys); + ASSERT_EQ(test_data._core_type, test_core_types); + ASSERT_EQ(test_data._numa_node_id, test_numa_node_ids); + } +}; + +LinuxCpuStreamTypeCase _2sockets_72cores_nobinding_36streams = { + false, // param[in]: cpu_reservation + 2, // param[in]: number of numa nodes + // param[in]: cpu_mapping_table, {PROCESSOR_ID, SOCKET_ID, CORE_ID, CORE_TYPE, GROUP_ID, Used} + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {10, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {11, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {12, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {13, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {14, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {15, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {16, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {17, 
0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {18, 1, 1, 18, HYPER_THREADING_PROC, 18, -1}, {19, 1, 1, 19, HYPER_THREADING_PROC, 19, -1}, + {20, 1, 1, 20, HYPER_THREADING_PROC, 20, -1}, {21, 1, 1, 21, HYPER_THREADING_PROC, 21, -1}, + {22, 1, 1, 22, HYPER_THREADING_PROC, 22, -1}, {23, 1, 1, 23, HYPER_THREADING_PROC, 23, -1}, + {24, 1, 1, 24, HYPER_THREADING_PROC, 24, -1}, {25, 1, 1, 25, HYPER_THREADING_PROC, 25, -1}, + {26, 1, 1, 26, HYPER_THREADING_PROC, 26, -1}, {27, 1, 1, 27, HYPER_THREADING_PROC, 27, -1}, + {28, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {29, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {30, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {31, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {32, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {33, 1, 1, 33, HYPER_THREADING_PROC, 33, -1}, + {34, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {35, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {36, 0, 0, 36, MAIN_CORE_PROC, 36, -1}, {37, 0, 0, 37, MAIN_CORE_PROC, 37, -1}, + {38, 0, 0, 38, MAIN_CORE_PROC, 38, -1}, {39, 0, 0, 39, MAIN_CORE_PROC, 39, -1}, + {40, 0, 0, 40, MAIN_CORE_PROC, 40, -1}, {41, 0, 0, 41, MAIN_CORE_PROC, 41, -1}, + {42, 0, 0, 42, MAIN_CORE_PROC, 42, -1}, {43, 0, 0, 43, MAIN_CORE_PROC, 43, -1}, + {44, 0, 0, 44, MAIN_CORE_PROC, 44, -1}, {45, 0, 0, 45, MAIN_CORE_PROC, 45, -1}, + {46, 0, 0, 46, MAIN_CORE_PROC, 46, -1}, {47, 0, 0, 47, MAIN_CORE_PROC, 47, -1}, + {48, 0, 0, 48, MAIN_CORE_PROC, 48, -1}, {49, 0, 0, 49, MAIN_CORE_PROC, 49, -1}, + {50, 0, 0, 50, MAIN_CORE_PROC, 50, -1}, {51, 0, 0, 51, MAIN_CORE_PROC, 51, -1}, + {52, 0, 0, 52, MAIN_CORE_PROC, 52, -1}, {53, 0, 0, 53, MAIN_CORE_PROC, 53, -1}, + {54, 1, 1, 54, MAIN_CORE_PROC, 54, -1}, {55, 1, 1, 55, MAIN_CORE_PROC, 55, -1}, + {56, 1, 1, 56, MAIN_CORE_PROC, 56, -1}, {57, 1, 1, 57, MAIN_CORE_PROC, 57, -1}, + {58, 1, 1, 58, MAIN_CORE_PROC, 58, -1}, {59, 1, 1, 59, MAIN_CORE_PROC, 59, -1}, + {60, 1, 1, 60, MAIN_CORE_PROC, 60, -1}, {61, 1, 1, 61, MAIN_CORE_PROC, 61, -1}, + {62, 1, 1, 62, MAIN_CORE_PROC, 62, -1}, {63, 
1, 1, 63, MAIN_CORE_PROC, 63, -1}, + {64, 1, 1, 64, MAIN_CORE_PROC, 64, -1}, {65, 1, 1, 65, MAIN_CORE_PROC, 65, -1}, + {66, 1, 1, 66, MAIN_CORE_PROC, 66, -1}, {67, 1, 1, 67, MAIN_CORE_PROC, 67, -1}, + {68, 1, 1, 68, MAIN_CORE_PROC, 68, -1}, {69, 1, 1, 69, MAIN_CORE_PROC, 69, -1}, + {70, 1, 1, 70, MAIN_CORE_PROC, 70, -1}, {71, 1, 1, 71, MAIN_CORE_PROC, 71, -1}, + }, + // param[in]: proc_type_table, + {{72, 36, 0, 36, -1, -1}, {36, 18, 0, 18, 0, 0}, {36, 18, 0, 18, 1, 1}}, + // param[in]: streams_info_table, {NUMBER_OF_STREAMS, PROC_TYPE, THREADS_PER_STREAM} + {{18, MAIN_CORE_PROC, 1, 0, 0}, {18, MAIN_CORE_PROC, 1, 1, 1}}, + // param[out]: stream_type per stream used in new task_arena + { + STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, + }, + // param[out]: concurrency per stream used in new task_arena + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + // param[out]: core_type per stream used in new task_arena + { + MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, + MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, + MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, 
MAIN_CORE_PROC, + MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, + MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, + MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, MAIN_CORE_PROC, + }, + // param[out]: numa_node_id per stream used in new task_arena + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, +}; +LinuxCpuStreamTypeCase _2sockets_72cores_nobinding_9streams = { + false, + 2, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {10, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {11, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {12, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {13, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {14, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {15, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {16, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {17, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {18, 1, 1, 18, HYPER_THREADING_PROC, 18, -1}, {19, 1, 1, 19, HYPER_THREADING_PROC, 19, -1}, + {20, 1, 1, 20, HYPER_THREADING_PROC, 20, -1}, {21, 1, 1, 21, HYPER_THREADING_PROC, 21, -1}, + {22, 1, 1, 22, HYPER_THREADING_PROC, 22, -1}, {23, 1, 1, 23, HYPER_THREADING_PROC, 23, -1}, + {24, 1, 1, 24, HYPER_THREADING_PROC, 24, -1}, {25, 1, 1, 25, HYPER_THREADING_PROC, 25, -1}, + {26, 1, 1, 26, HYPER_THREADING_PROC, 26, -1}, {27, 1, 1, 27, HYPER_THREADING_PROC, 27, -1}, + {28, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {29, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {30, 1, 1, 30, HYPER_THREADING_PROC, 
30, -1}, {31, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {32, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {33, 1, 1, 33, HYPER_THREADING_PROC, 33, -1}, + {34, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {35, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {36, 0, 0, 36, MAIN_CORE_PROC, 36, -1}, {37, 0, 0, 37, MAIN_CORE_PROC, 37, -1}, + {38, 0, 0, 38, MAIN_CORE_PROC, 38, -1}, {39, 0, 0, 39, MAIN_CORE_PROC, 39, -1}, + {40, 0, 0, 40, MAIN_CORE_PROC, 40, -1}, {41, 0, 0, 41, MAIN_CORE_PROC, 41, -1}, + {42, 0, 0, 42, MAIN_CORE_PROC, 42, -1}, {43, 0, 0, 43, MAIN_CORE_PROC, 43, -1}, + {44, 0, 0, 44, MAIN_CORE_PROC, 44, -1}, {45, 0, 0, 45, MAIN_CORE_PROC, 45, -1}, + {46, 0, 0, 46, MAIN_CORE_PROC, 46, -1}, {47, 0, 0, 47, MAIN_CORE_PROC, 47, -1}, + {48, 0, 0, 48, MAIN_CORE_PROC, 48, -1}, {49, 0, 0, 49, MAIN_CORE_PROC, 49, -1}, + {50, 0, 0, 50, MAIN_CORE_PROC, 50, -1}, {51, 0, 0, 51, MAIN_CORE_PROC, 51, -1}, + {52, 0, 0, 52, MAIN_CORE_PROC, 52, -1}, {53, 0, 0, 53, MAIN_CORE_PROC, 53, -1}, + {54, 1, 1, 54, MAIN_CORE_PROC, 54, -1}, {55, 1, 1, 55, MAIN_CORE_PROC, 55, -1}, + {56, 1, 1, 56, MAIN_CORE_PROC, 56, -1}, {57, 1, 1, 57, MAIN_CORE_PROC, 57, -1}, + {58, 1, 1, 58, MAIN_CORE_PROC, 58, -1}, {59, 1, 1, 59, MAIN_CORE_PROC, 59, -1}, + {60, 1, 1, 60, MAIN_CORE_PROC, 60, -1}, {61, 1, 1, 61, MAIN_CORE_PROC, 61, -1}, + {62, 1, 1, 62, MAIN_CORE_PROC, 62, -1}, {63, 1, 1, 63, MAIN_CORE_PROC, 63, -1}, + {64, 1, 1, 64, MAIN_CORE_PROC, 64, -1}, {65, 1, 1, 65, MAIN_CORE_PROC, 65, -1}, + {66, 1, 1, 66, MAIN_CORE_PROC, 66, -1}, {67, 1, 1, 67, MAIN_CORE_PROC, 67, -1}, + {68, 1, 1, 68, MAIN_CORE_PROC, 68, -1}, {69, 1, 1, 69, MAIN_CORE_PROC, 69, -1}, + {70, 1, 1, 70, MAIN_CORE_PROC, 70, -1}, {71, 1, 1, 71, MAIN_CORE_PROC, 71, -1}, + }, + {{72, 36, 0, 36, -1, -1}, {36, 18, 0, 18, 0, 0}, {36, 18, 0, 18, 1, 1}}, + {{4, MAIN_CORE_PROC, 4, 0, 0}, {4, MAIN_CORE_PROC, 4, 1, 1}, {1, MAIN_CORE_PROC, 4, -1, -1}}, + { + STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, + 
STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, + STREAM_WITHOUT_PARAM, + }, + {4, 4, 4, 4, 4, 4, 4, 4, 4}, + { + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + }, + {0, 0, 0, 0, 1, 1, 1, 1, NUMA_ALL}, +}; +LinuxCpuStreamTypeCase _2sockets_72cores_binding_9streams = { + true, + 2, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {5, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, + {6, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {7, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, + {8, 0, 0, 8, HYPER_THREADING_PROC, 8, -1}, {9, 0, 0, 9, HYPER_THREADING_PROC, 9, -1}, + {10, 0, 0, 10, HYPER_THREADING_PROC, 10, -1}, {11, 0, 0, 11, HYPER_THREADING_PROC, 11, -1}, + {12, 0, 0, 12, HYPER_THREADING_PROC, 12, -1}, {13, 0, 0, 13, HYPER_THREADING_PROC, 13, -1}, + {14, 0, 0, 14, HYPER_THREADING_PROC, 14, -1}, {15, 0, 0, 15, HYPER_THREADING_PROC, 15, -1}, + {16, 0, 0, 16, HYPER_THREADING_PROC, 16, -1}, {17, 0, 0, 17, HYPER_THREADING_PROC, 17, -1}, + {18, 1, 1, 18, HYPER_THREADING_PROC, 18, -1}, {19, 1, 1, 19, HYPER_THREADING_PROC, 19, -1}, + {20, 1, 1, 20, HYPER_THREADING_PROC, 20, -1}, {21, 1, 1, 21, HYPER_THREADING_PROC, 21, -1}, + {22, 1, 1, 22, HYPER_THREADING_PROC, 22, -1}, {23, 1, 1, 23, HYPER_THREADING_PROC, 23, -1}, + {24, 1, 1, 24, HYPER_THREADING_PROC, 24, -1}, {25, 1, 1, 25, HYPER_THREADING_PROC, 25, -1}, + {26, 1, 1, 26, HYPER_THREADING_PROC, 26, -1}, {27, 1, 1, 27, HYPER_THREADING_PROC, 27, -1}, + {28, 1, 1, 28, HYPER_THREADING_PROC, 28, -1}, {29, 1, 1, 29, HYPER_THREADING_PROC, 29, -1}, + {30, 1, 1, 30, HYPER_THREADING_PROC, 30, -1}, {31, 1, 1, 31, HYPER_THREADING_PROC, 31, -1}, + {32, 1, 1, 32, HYPER_THREADING_PROC, 32, -1}, {33, 1, 1, 33, 
HYPER_THREADING_PROC, 33, -1}, + {34, 1, 1, 34, HYPER_THREADING_PROC, 34, -1}, {35, 1, 1, 35, HYPER_THREADING_PROC, 35, -1}, + {36, 0, 0, 36, MAIN_CORE_PROC, 36, -1}, {37, 0, 0, 37, MAIN_CORE_PROC, 37, -1}, + {38, 0, 0, 38, MAIN_CORE_PROC, 38, -1}, {39, 0, 0, 39, MAIN_CORE_PROC, 39, -1}, + {40, 0, 0, 40, MAIN_CORE_PROC, 40, -1}, {41, 0, 0, 41, MAIN_CORE_PROC, 41, -1}, + {42, 0, 0, 42, MAIN_CORE_PROC, 42, -1}, {43, 0, 0, 43, MAIN_CORE_PROC, 43, -1}, + {44, 0, 0, 44, MAIN_CORE_PROC, 44, -1}, {45, 0, 0, 45, MAIN_CORE_PROC, 45, -1}, + {46, 0, 0, 46, MAIN_CORE_PROC, 46, -1}, {47, 0, 0, 47, MAIN_CORE_PROC, 47, -1}, + {48, 0, 0, 48, MAIN_CORE_PROC, 48, -1}, {49, 0, 0, 49, MAIN_CORE_PROC, 49, -1}, + {50, 0, 0, 50, MAIN_CORE_PROC, 50, -1}, {51, 0, 0, 51, MAIN_CORE_PROC, 51, -1}, + {52, 0, 0, 52, MAIN_CORE_PROC, 52, -1}, {53, 0, 0, 53, MAIN_CORE_PROC, 53, -1}, + {54, 1, 1, 54, MAIN_CORE_PROC, 54, -1}, {55, 1, 1, 55, MAIN_CORE_PROC, 55, -1}, + {56, 1, 1, 56, MAIN_CORE_PROC, 56, -1}, {57, 1, 1, 57, MAIN_CORE_PROC, 57, -1}, + {58, 1, 1, 58, MAIN_CORE_PROC, 58, -1}, {59, 1, 1, 59, MAIN_CORE_PROC, 59, -1}, + {60, 1, 1, 60, MAIN_CORE_PROC, 60, -1}, {61, 1, 1, 61, MAIN_CORE_PROC, 61, -1}, + {62, 1, 1, 62, MAIN_CORE_PROC, 62, -1}, {63, 1, 1, 63, MAIN_CORE_PROC, 63, -1}, + {64, 1, 1, 64, MAIN_CORE_PROC, 64, -1}, {65, 1, 1, 65, MAIN_CORE_PROC, 65, -1}, + {66, 1, 1, 66, MAIN_CORE_PROC, 66, -1}, {67, 1, 1, 67, MAIN_CORE_PROC, 67, -1}, + {68, 1, 1, 68, MAIN_CORE_PROC, 68, -1}, {69, 1, 1, 69, MAIN_CORE_PROC, 69, -1}, + {70, 1, 1, 70, MAIN_CORE_PROC, 70, -1}, {71, 1, 1, 71, MAIN_CORE_PROC, 71, -1}, + }, + {{72, 36, 0, 36, -1, -1}, {36, 18, 0, 18, 0, 0}, {36, 18, 0, 18, 1, 1}}, + {{4, MAIN_CORE_PROC, 4, 0, 0}, {4, MAIN_CORE_PROC, 4, 1, 1}, {1, MAIN_CORE_PROC, 4, -1, -1}}, +# if defined(__linux__) + { + STREAM_WITH_OBSERVE, + STREAM_WITH_OBSERVE, + STREAM_WITH_OBSERVE, + STREAM_WITH_OBSERVE, + STREAM_WITH_OBSERVE, + STREAM_WITH_OBSERVE, + STREAM_WITH_OBSERVE, + STREAM_WITH_OBSERVE, + 
STREAM_WITH_OBSERVE, + }, +# else + { + STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, + STREAM_WITH_NUMA_ID, + STREAM_WITHOUT_PARAM, + }, +# endif + {4, 4, 4, 4, 4, 4, 4, 4, 4}, + { + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + }, + {0, 0, 0, 0, 1, 1, 1, 1, NUMA_ALL}, +}; +LinuxCpuStreamTypeCase _1sockets_4cores_nobinding = { + false, + 1, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, + {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, + {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {5, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {6, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + }, + {{8, 4, 0, 4, 0, 0}}, + {{1, MAIN_CORE_PROC, 8, 0, 0}}, + {STREAM_WITHOUT_PARAM}, + {8}, + {MAIN_CORE_PROC}, + {0}, +}; +LinuxCpuStreamTypeCase _1sockets_4cores_binding = { + true, + 1, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, + {1, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, + {2, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, + {3, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, + {4, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {5, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {6, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + }, + {{8, 4, 0, 4, 0, 0}}, + {{4, MAIN_CORE_PROC, 1, 0, 0}}, +# if defined(__linux__) + { + STREAM_WITH_OBSERVE, + STREAM_WITH_OBSERVE, + STREAM_WITH_OBSERVE, + STREAM_WITH_OBSERVE, + }, +# else + { + STREAM_WITHOUT_PARAM, + STREAM_WITHOUT_PARAM, + STREAM_WITHOUT_PARAM, + STREAM_WITHOUT_PARAM, + }, +# endif + {1, 1, 1, 1}, + { + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + MAIN_CORE_PROC, + }, + {0, 0, 0, 0}, +}; +LinuxCpuStreamTypeCase _1sockets_12cores_pcore_nobinding = { + false, + 1, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, 
{1, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {2, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, {3, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {4, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {5, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, + {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {12, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {13, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, + {14, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, {15, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {16, 0, 0, 8, EFFICIENT_CORE_PROC, 8, -1}, {17, 0, 0, 9, EFFICIENT_CORE_PROC, 8, -1}, + {18, 0, 0, 10, EFFICIENT_CORE_PROC, 8, -1}, {19, 0, 0, 11, EFFICIENT_CORE_PROC, 8, -1}, + }, + {{20, 8, 4, 8, 0, 0}}, + {{1, MAIN_CORE_PROC, 8, 0, 0}}, + {STREAM_WITH_CORE_TYPE}, + {8}, + {MAIN_CORE_PROC}, + {0}, +}; +LinuxCpuStreamTypeCase _1sockets_12cores_pcore_binding = { + true, + 1, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {2, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, {3, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {4, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {5, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, + {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {12, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {13, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, + {14, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, {15, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {16, 0, 0, 8, EFFICIENT_CORE_PROC, 8, -1}, {17, 0, 0, 9, EFFICIENT_CORE_PROC, 8, -1}, + {18, 0, 0, 10, EFFICIENT_CORE_PROC, 8, -1}, {19, 0, 0, 11, EFFICIENT_CORE_PROC, 8, -1}, + }, + {{20, 8, 4, 8, 0, 0}}, + {{2, MAIN_CORE_PROC, 4, 0, 0}}, +# if defined(__linux__) + { + STREAM_WITH_OBSERVE, + STREAM_WITH_OBSERVE, + }, +# else + { + 
STREAM_WITH_CORE_TYPE, + STREAM_WITH_CORE_TYPE, + }, +# endif + {4, 4}, + { + MAIN_CORE_PROC, + MAIN_CORE_PROC, + }, + {0, 0}, +}; +LinuxCpuStreamTypeCase _1sockets_12cores_ecore_nobinding = { + false, + 1, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {2, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, {3, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {4, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {5, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, + {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {12, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {13, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, + {14, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, {15, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {16, 0, 0, 8, EFFICIENT_CORE_PROC, 8, -1}, {17, 0, 0, 9, EFFICIENT_CORE_PROC, 8, -1}, + {18, 0, 0, 10, EFFICIENT_CORE_PROC, 8, -1}, {19, 0, 0, 11, EFFICIENT_CORE_PROC, 8, -1}, + }, + {{20, 8, 4, 8, 0, 0}}, + {{2, EFFICIENT_CORE_PROC, 2, 0, 0}}, + { + STREAM_WITH_CORE_TYPE, + STREAM_WITH_CORE_TYPE, + }, + {2, 2}, + { + EFFICIENT_CORE_PROC, + EFFICIENT_CORE_PROC, + }, + {0, 0}, +}; +LinuxCpuStreamTypeCase _1sockets_12cores_ecore_binding = { + true, + 1, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {2, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, {3, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {4, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {5, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, + {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {12, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {13, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, + {14, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, {15, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + 
{16, 0, 0, 8, EFFICIENT_CORE_PROC, 8, -1}, {17, 0, 0, 9, EFFICIENT_CORE_PROC, 8, -1}, + {18, 0, 0, 10, EFFICIENT_CORE_PROC, 8, -1}, {19, 0, 0, 11, EFFICIENT_CORE_PROC, 8, -1}, + }, + {{20, 8, 4, 8, 0, 0}}, + {{4, EFFICIENT_CORE_PROC, 1, 0, 0}}, +# if defined(__linux__) + { + STREAM_WITH_OBSERVE, + STREAM_WITH_OBSERVE, + STREAM_WITH_OBSERVE, + STREAM_WITH_OBSERVE, + }, +# else + { + STREAM_WITH_CORE_TYPE, + STREAM_WITH_CORE_TYPE, + STREAM_WITH_CORE_TYPE, + STREAM_WITH_CORE_TYPE, + }, +# endif + {1, 1, 1, 1}, + { + EFFICIENT_CORE_PROC, + EFFICIENT_CORE_PROC, + EFFICIENT_CORE_PROC, + EFFICIENT_CORE_PROC, + }, + {0, 0, 0, 0}, +}; +LinuxCpuStreamTypeCase _1sockets_24cores_all_proc = { + false, + 1, + { + {0, 0, 0, 0, HYPER_THREADING_PROC, 0, -1}, {1, 0, 0, 0, MAIN_CORE_PROC, 0, -1}, + {2, 0, 0, 1, HYPER_THREADING_PROC, 1, -1}, {3, 0, 0, 1, MAIN_CORE_PROC, 1, -1}, + {4, 0, 0, 2, HYPER_THREADING_PROC, 2, -1}, {5, 0, 0, 2, MAIN_CORE_PROC, 2, -1}, + {6, 0, 0, 3, HYPER_THREADING_PROC, 3, -1}, {7, 0, 0, 3, MAIN_CORE_PROC, 3, -1}, + {8, 0, 0, 4, HYPER_THREADING_PROC, 4, -1}, {9, 0, 0, 4, MAIN_CORE_PROC, 4, -1}, + {10, 0, 0, 5, HYPER_THREADING_PROC, 5, -1}, {11, 0, 0, 5, MAIN_CORE_PROC, 5, -1}, + {12, 0, 0, 6, HYPER_THREADING_PROC, 6, -1}, {13, 0, 0, 6, MAIN_CORE_PROC, 6, -1}, + {14, 0, 0, 7, HYPER_THREADING_PROC, 7, -1}, {15, 0, 0, 7, MAIN_CORE_PROC, 7, -1}, + {16, 0, 0, 8, EFFICIENT_CORE_PROC, 8, -1}, {17, 0, 0, 9, EFFICIENT_CORE_PROC, 8, -1}, + {18, 0, 0, 10, EFFICIENT_CORE_PROC, 8, -1}, {19, 0, 0, 11, EFFICIENT_CORE_PROC, 8, -1}, + {20, 0, 0, 12, EFFICIENT_CORE_PROC, 9, -1}, {21, 0, 0, 13, EFFICIENT_CORE_PROC, 9, -1}, + {22, 0, 0, 14, EFFICIENT_CORE_PROC, 9, -1}, {23, 0, 0, 15, EFFICIENT_CORE_PROC, 9, -1}, + {24, 0, 0, 16, EFFICIENT_CORE_PROC, 10, -1}, {25, 0, 0, 17, EFFICIENT_CORE_PROC, 10, -1}, + {26, 0, 0, 18, EFFICIENT_CORE_PROC, 10, -1}, {27, 0, 0, 19, EFFICIENT_CORE_PROC, 10, -1}, + {28, 0, 0, 20, EFFICIENT_CORE_PROC, 11, -1}, {29, 0, 0, 21, EFFICIENT_CORE_PROC, 11, 
-1}, + {30, 0, 0, 22, EFFICIENT_CORE_PROC, 11, -1}, {31, 0, 0, 23, EFFICIENT_CORE_PROC, 11, -1}, + }, + {{32, 8, 16, 8, 0, 0}}, + {{1, ALL_PROC, 24, 0, 0}}, + {STREAM_WITHOUT_PARAM}, + {24}, + {ALL_PROC}, + {0}, +}; + +TEST_P(LinuxCpuStreamTypeTests, LinuxCpuStreamType) {} + +INSTANTIATE_TEST_SUITE_P(CpuStreamType, + LinuxCpuStreamTypeTests, + testing::Values(_2sockets_72cores_nobinding_36streams, + _2sockets_72cores_nobinding_9streams, + _2sockets_72cores_binding_9streams, + _1sockets_4cores_nobinding, + _1sockets_4cores_binding, + _1sockets_12cores_pcore_nobinding, + _1sockets_12cores_pcore_binding, + _1sockets_12cores_ecore_nobinding, + _1sockets_12cores_ecore_binding, + _1sockets_24cores_all_proc)); +#endif +} // namespace diff --git a/src/plugins/intel_cpu/src/cpu_map_scheduling.cpp b/src/plugins/intel_cpu/src/cpu_map_scheduling.cpp index a76495178d8e36..3554de0636d6f4 100644 --- a/src/plugins/intel_cpu/src/cpu_map_scheduling.cpp +++ b/src/plugins/intel_cpu/src/cpu_map_scheduling.cpp @@ -4,6 +4,7 @@ #include "cpu_map_scheduling.hpp" +#include "cpu_streams_calculation.hpp" #include "ie_parallel.hpp" #include "ie_system_conf.h" @@ -72,9 +73,10 @@ bool get_cpu_pinning(bool& input_value, const bool input_changed, const int num_streams, const threading::IStreamsExecutor::ThreadBindingType bind_type, + const Config::LatencyThreadingMode latency_threading_mode, const std::vector>& proc_type_table) { int result_value; - int num_sockets = get_num_numa_nodes(); + int num_sockets = get_default_latency_streams(latency_threading_mode); bool latency = num_streams <= num_sockets && num_streams > 0; if (proc_type_table[0][EFFICIENT_CORE_PROC] > 0 && @@ -87,12 +89,12 @@ bool get_cpu_pinning(bool& input_value, result_value = input_changed ? input_value : (bind_type == threading::IStreamsExecutor::ThreadBindingType::NUMA ? 
false : true); + } #if (IE_THREAD == IE_THREAD_TBB || IE_THREAD == IE_THREAD_TBB_AUTO) # if defined(__APPLE__) || defined(_WIN32) - result_value = false; + result_value = false; # endif #endif - } input_value = result_value; return result_value; diff --git a/src/plugins/intel_cpu/src/cpu_map_scheduling.hpp b/src/plugins/intel_cpu/src/cpu_map_scheduling.hpp index 1329ed09e1f2c5..6da16053bcae12 100644 --- a/src/plugins/intel_cpu/src/cpu_map_scheduling.hpp +++ b/src/plugins/intel_cpu/src/cpu_map_scheduling.hpp @@ -11,6 +11,7 @@ #include +#include "config.h" #include "openvino/runtime/properties.hpp" #include "openvino/runtime/threading/istreams_executor.hpp" @@ -45,6 +46,7 @@ std::vector> apply_hyper_threading(bool& input_ht_hint, * @param[in] input_changed indicate if value is set by user. * @param[in] num_streams number of streams * @param[in] bind_type thread binding core type + * @param[in] latency_threading_mode is the scope of candidate processors per stream for latency hint * @param[in] proc_type_table candidate processors available at this time * @return whether pinning threads to cpu cores */ @@ -52,6 +54,7 @@ bool get_cpu_pinning(bool& input_value, const bool input_changed, const int num_streams, const threading::IStreamsExecutor::ThreadBindingType bind_type, + const Config::LatencyThreadingMode latency_threading_mode, const std::vector>& proc_type_table); } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp index 952ddd8a2462db..5c958e3eb76b2c 100644 --- a/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp +++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.cpp @@ -13,12 +13,12 @@ #include "cpu_map_scheduling.hpp" #include "graph.h" #include "ie_system_conf.h" +#include "openvino/runtime/threading/cpu_streams_info.hpp" #include "openvino/runtime/threading/istreams_executor.hpp" #include "performance_heuristics.hpp" -#include 
"threading/ie_cpu_streams_info.hpp" -using namespace InferenceEngine; using namespace ov; +using namespace threading; #define INIT_VAL -100 @@ -353,14 +353,15 @@ std::vector> get_streams_info_table(const int input_streams, } int get_model_prefer_threads(const int num_streams, + const Config::LatencyThreadingMode latency_threading_mode, const std::vector> proc_type_table, const std::shared_ptr& ngraphFunc, - const InferenceEngine::IStreamsExecutor::Config streamExecutorConfig) { - const int sockets = get_num_numa_nodes(); + const ov::threading::IStreamsExecutor::Config streamExecutorConfig) { + const int sockets = get_default_latency_streams(latency_threading_mode); auto model_prefer = 0; // latency if (num_streams <= sockets && num_streams > 0) { - if (streamExecutorConfig._threadBindingType == IStreamsExecutor::ThreadBindingType::HYBRID_AWARE) { + if (proc_type_table[0][EFFICIENT_CORE_PROC] > 0 && proc_type_table[0][MAIN_CORE_PROC] > 0) { bool fp_intesive = !ov::op::util::has_op_with_type(ngraphFunc); const int int8_threshold = 4; // ~relative efficiency of the VNNI-intensive code for Big vs Little cores; const int fp32_threshold = 2; // ~relative efficiency of the AVX2 fp32 code for Big vs Little cores; @@ -396,7 +397,7 @@ int get_model_prefer_threads(const int num_streams, const float L2_cache_size = dnnl::utils::get_cache_size(2 /*level*/, true /*per core */); ov::MemBandwidthPressure networkToleranceForLowCache = ov::MemBandwidthPressureTolerance(ngraphFunc, L2_cache_size, memThresholdAssumeLimitedForISA); - model_prefer = IStreamsExecutor::Config::StreamMode::DEFAULT; + model_prefer = ov::threading::IStreamsExecutor::Config::StreamMode::DEFAULT; if (networkToleranceForLowCache.max_mem_tolerance == ov::MemBandwidthPressure::UNKNOWN) { if ((networkToleranceForLowCache.ratio_compute_convs == ov::MemBandwidthPressure::ALL) || (networkToleranceForLowCache.ratio_compute_deconvs == ov::MemBandwidthPressure::ALL)) { @@ -418,27 +419,31 @@ int 
get_model_prefer_threads(const int num_streams, return model_prefer; } -void generate_stream_info(const int streams, - const std::shared_ptr& ngraphFunc, - Config& config, - int preferred_nthreads_per_stream) { +std::vector> generate_stream_info(const int streams, + const std::shared_ptr& ngraphFunc, + Config& config, + std::vector>& proc_type_table, + int preferred_nthreads_per_stream) { int model_prefer_threads = preferred_nthreads_per_stream; InferenceEngine::IStreamsExecutor::Config& executor_config = config.streamExecutorConfig; - auto& orig_proc_type_table = executor_config._orig_proc_type_table; - std::vector> proc_type_table = - apply_scheduling_core_type(config.schedulingCoreType, orig_proc_type_table); + + proc_type_table = apply_scheduling_core_type(config.schedulingCoreType, proc_type_table); proc_type_table = apply_hyper_threading(config.enableHyperThreading, config.changedHyperThreading, config.perfHintsConfig.ovPerfHint, proc_type_table); - executor_config._proc_type_table = proc_type_table; - executor_config._cpu_pinning = get_cpu_pinning(config.enableCpuPinning, - config.changedCpuPinning, - streams, - executor_config._threadBindingType, - proc_type_table); + executor_config._cpu_reservation = get_cpu_pinning(config.enableCpuPinning, + config.changedCpuPinning, + streams, + executor_config._threadBindingType, + config.latencyThreadingMode, + proc_type_table); if (-1 == preferred_nthreads_per_stream) { - model_prefer_threads = get_model_prefer_threads(streams, proc_type_table, ngraphFunc, executor_config); + model_prefer_threads = get_model_prefer_threads(streams, + config.latencyThreadingMode, + proc_type_table, + ngraphFunc, + executor_config); } executor_config._streams_info_table = get_streams_info_table(executor_config._streams, @@ -449,37 +454,28 @@ void generate_stream_info(const int streams, config.perfHintsConfig.ovPerfHint, config.latencyThreadingMode, proc_type_table); + return proc_type_table; } -void get_num_streams(const int streams, - 
const std::shared_ptr& ngraphFunc, - Config& config) { +void get_num_streams(const int streams, const std::shared_ptr& ngraphFunc, Config& config) { InferenceEngine::IStreamsExecutor::Config& executor_config = config.streamExecutorConfig; - std::vector stream_ids; - std::string log = "[ streams info ]"; - std::vector core_type_str = {" Any core: ", " PCore: ", " ECore: ", " Logical core: "}; - - std::vector> orig_proc_type_table = get_proc_type_table(); + std::vector> proc_type_table = get_proc_type_table(); - executor_config._orig_proc_type_table = orig_proc_type_table; - generate_stream_info(streams, ngraphFunc, config); + generate_stream_info(streams, ngraphFunc, config, proc_type_table); - executor_config._stream_core_ids = reserve_available_cpus(executor_config._streams_info_table); + executor_config = InferenceEngine::IStreamsExecutor::Config::reserve_cpu_threads(executor_config); executor_config._threadsPerStream = executor_config._streams_info_table[0][THREADS_PER_STREAM]; - executor_config._streams = 0; - executor_config._threads = 0; - for (size_t i = 0; i < executor_config._streams_info_table.size(); i++) { - executor_config._streams += executor_config._streams_info_table[i][NUMBER_OF_STREAMS]; - executor_config._threads += executor_config._streams_info_table[i][NUMBER_OF_STREAMS] * - executor_config._streams_info_table[i][THREADS_PER_STREAM]; - stream_ids.insert(stream_ids.end(), executor_config._streams_info_table[i][NUMBER_OF_STREAMS], i); - log += core_type_str[executor_config._streams_info_table[i][PROC_TYPE]] + - std::to_string(executor_config._streams_info_table[i][NUMBER_OF_STREAMS]) + "(" + - std::to_string(executor_config._streams_info_table[i][THREADS_PER_STREAM]) + ")"; +} + +int get_default_latency_streams(Config::LatencyThreadingMode latency_threading_mode) { + if (latency_threading_mode == Config::LatencyThreadingMode::PER_NUMA_NODE) { + return get_num_sockets(); + } else if (latency_threading_mode == 
Config::LatencyThreadingMode::PER_SOCKET) { + return get_num_numa_nodes(); + } else { + return 1; } - executor_config._stream_ids = stream_ids; - log += " Total: " + std::to_string(executor_config._streams) + "(" + std::to_string(executor_config._threads) + ")"; - DEBUG_LOG(log); } + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp b/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp index 16821971b1e61b..4c425bb157066a 100644 --- a/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp +++ b/src/plugins/intel_cpu/src/cpu_streams_calculation.hpp @@ -56,27 +56,34 @@ std::vector> get_streams_info_table(const int input_streams, * @param[in] num_streams is target streams set by user via NUM_STREAMS or hints. * - input "0" mean function generate the optimal number of streams * - LATENCY hint equals 1 stream. + * @param[in] latency_threading_mode is the scope of candidate processors per stream for latency hint + * - user can select all processors per numa node, per socket, or per platform. 
* @param[in] proc_type_table candidate processors available at this time * - candidate processors have benn updated based on properties like "Ecore only" in previous function * @param[in] ngraphFunc ngraph function * @return model_prefer_threads "0" means generating the optimal threads per stream based on platform */ int get_model_prefer_threads(const int num_streams, + const Config::LatencyThreadingMode latency_threading_mode, const std::vector> proc_type_table, const std::shared_ptr& ngraphFunc, - const InferenceEngine::IStreamsExecutor::Config streamExecutorConfig); + const ov::threading::IStreamsExecutor::Config streamExecutorConfig); /** * @brief Generate streams information according to processors type table * @param[in] streams number of streams * @param[in] ngraphFunc graph handle * @param[in] config intel cpu configuration + * @param[in] proc_type_table candidate processors available at current platform * @param[in] preferred_nthreads_per_stream is initial preferred number of threads per stream + * @return candidate processors have benn updated based on user input hints like ov::hint::scheduling_core_type and + * ov::hint::enable_hyper_threading */ -void generate_stream_info(const int streams, - const std::shared_ptr& ngraphFunc, - Config& config, - int preferred_nthreads_per_stream = -1); +std::vector> generate_stream_info(const int streams, + const std::shared_ptr& ngraphFunc, + Config& config, + std::vector>& proc_type_table, + int preferred_nthreads_per_stream = -1); struct StreamCfg { int num_streams; // Number of streams @@ -98,5 +105,12 @@ void get_num_streams(const int streams, const std::shared_ptr& ngraphFunc, Config& config); +/** + * @brief Get default number of streams in certain latency threading mode + * @param[in] latency_threading_mode is the scope of candidate processors per stream for latency hint + * @return number of streams + */ +int get_default_latency_streams(Config::LatencyThreadingMode latency_threading_mode); + } // namespace 
intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index 639d492e1f6ce5..f303ca3eeeef1e 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -17,7 +17,7 @@ #include "ie_icore.hpp" #include "ie_plugin_config.hpp" #include "ie_system_conf.h" -#include "threading/ie_cpu_streams_info.hpp" +#include "openvino/runtime/threading/cpu_streams_info.hpp" #include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" #include "openvino/runtime/intel_cpu/properties.hpp" @@ -286,7 +286,7 @@ void Engine::GetPerformanceStreams(Config& config, const std::shared_ptr Date: Thu, 20 Jul 2023 17:35:11 +0200 Subject: [PATCH 6/6] [PT FE] Add quantized::conv2d and quantized::conv2d_relu (#18651) * Add quantized conv2d * Fix schema * Remove mark_output * Remove tests from pre-commit --- .../pytorch/src/op/quantized_convnd.cpp | 95 +++++++++++++++++++ .../pytorch/src/op/quantized_linear.cpp | 2 +- src/frontends/pytorch/src/op_table.cpp | 4 + .../pytorch_tests/test_quantized_convnd.py | 85 +++++++++++++++++ .../pytorch_tests/test_quantized_linear.py | 2 +- 5 files changed, 186 insertions(+), 2 deletions(-) create mode 100644 src/frontends/pytorch/src/op/quantized_convnd.cpp create mode 100644 tests/layer_tests/pytorch_tests/test_quantized_convnd.py diff --git a/src/frontends/pytorch/src/op/quantized_convnd.cpp b/src/frontends/pytorch/src/op/quantized_convnd.cpp new file mode 100644 index 00000000000000..37ab867d72a4ad --- /dev/null +++ b/src/frontends/pytorch/src/op/quantized_convnd.cpp @@ -0,0 +1,95 @@ +// Copyright (C) 2018-2023 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "openvino/frontend/pytorch/node_context.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/convolution.hpp" +#include "openvino/op/group_conv.hpp" +#include "openvino/op/relu.hpp" +#include "utils.hpp" +#include "utils_quantize.hpp" + 
+namespace ov { +namespace frontend { +namespace pytorch { +namespace op { + +using namespace ov::op; + +namespace { +Output translate_quantized_convnd_base(const NodeContext& context) { + auto input = context.get_input(0); + auto packed_params_node = + std::dynamic_pointer_cast(context.get_input(1).get_node_shared_ptr()); + FRONT_END_OP_CONVERSION_CHECK(packed_params_node, "Packed params input node type is required to be FrameworkNode."); + const auto& attrs = packed_params_node->get_attrs(); + FRONT_END_OP_CONVERSION_CHECK((attrs.find(PtFrameworkNode::op_type_key) != attrs.end()), + "Packed params input node does not contain information about op type."); + FRONT_END_OP_CONVERSION_CHECK((attrs.at(PtFrameworkNode::op_type_key) == "prim::GetAttr"), + "Incorrect packed params input node operator type, expected prim::GetAttr."); + auto packed_params = packed_params_node->inputs(); + + FRONT_END_OP_CONVERSION_CHECK(packed_params.size() == 6, + "Packed parameters for quantized conv should contain 6 items."); + // Packed params: weight, bias, stride, padding, dilation, groups + auto weight = packed_params[0].get_source_output(); + auto bias = packed_params[1].get_source_output(); + auto strides = std::dynamic_pointer_cast(packed_params[2].get_source_output().get_node_shared_ptr()) + ->cast_vector(); + auto pads = std::dynamic_pointer_cast(packed_params[3].get_source_output().get_node_shared_ptr()) + ->cast_vector(); + auto dilations = std::dynamic_pointer_cast(packed_params[4].get_source_output().get_node_shared_ptr()) + ->cast_vector(); + int64_t groups = std::dynamic_pointer_cast(packed_params[5].get_source_output().get_node_shared_ptr()) + ->cast_vector()[0]; + + auto pad_type = ov::op::PadType::EXPLICIT; + + std::shared_ptr conv; + if (groups == 1) { + conv = std::make_shared(input, weight, strides, pads, pads, dilations, pad_type); + } else { + conv = std::make_shared(input, + reshape_kernel_for_group(context, weight, groups), + strides, + pads, + pads, + dilations, 
+ pad_type); + } + auto bias_rank = bias.get_partial_shape().rank(); + if (bias_rank == 1) { + bias = reshape_channelwise(context, bias, conv); + } + conv = context.mark_node(std::make_shared(conv, bias)); + + return conv->output(0); +}; +}; // namespace + +OutputVector translate_quantized_convnd(const NodeContext& context) { + // "quantized::conv2d.new(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float + // output_scale, int output_zero_point) -> Tensor" + num_inputs_check(context, 4, 4); + auto scale = context.get_input(2); + auto zero_point = context.get_input(3); + return {quantize(context, translate_quantized_convnd_base(context), scale, zero_point, context.get_input(0))}; +} + +OutputVector translate_quantized_convnd_relu(const NodeContext& context) { + // "quantized::conv2d_relu.new(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, + // float output_scale, int output_zero_point) -> Tensor" + num_inputs_check(context, 4, 4); + auto scale = context.get_input(2); + auto zero_point = context.get_input(3); + auto conv = translate_quantized_convnd_base(context); + auto relu = context.mark_node(std::make_shared(conv)); + return {quantize(context, relu->output(0), scale, zero_point, context.get_input(0))}; +} + +} // namespace op +} // namespace pytorch +} // namespace frontend +} // namespace ov diff --git a/src/frontends/pytorch/src/op/quantized_linear.cpp b/src/frontends/pytorch/src/op/quantized_linear.cpp index 13c19402b30b67..a69013f3fabb6b 100644 --- a/src/frontends/pytorch/src/op/quantized_linear.cpp +++ b/src/frontends/pytorch/src/op/quantized_linear.cpp @@ -37,7 +37,7 @@ OutputVector translate_quantized_linear(const NodeContext& context) { linear = context.mark_node(std::make_shared(linear, bias)); auto scale = context.get_input(2); auto zero_point = context.get_input(3); - return {context.mark_output(quantize(context, linear, scale, zero_point, x))}; + return {quantize(context, linear, 
scale, zero_point, x)}; }; } // namespace op diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index 77e5adc80c7c7c..f39c8aadbbb1b6 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -158,6 +158,8 @@ OP_CONVERTER(translate_var_mean); OP_CONVERTER(translate_where); OP_CONVERTER(translate_zeros); OP_CONVERTER(translate_zeros_like); +OP_CONVERTER(translate_quantized_convnd); +OP_CONVERTER(translate_quantized_convnd_relu); OP_CONVERTER(translate_quantized_linear); } // namespace op @@ -419,6 +421,8 @@ const std::map get_supported_ops() { {"prim::requires_grad", op::return_false_scalar}, {"prim::PythonOp", op::translate_pythonop}, {"prim::type", op::skip_node}, // Used with prim::device, pass PtFrameworkNode. + {"quantized::conv2d", op::translate_quantized_convnd}, + {"quantized::conv2d_relu", op::translate_quantized_convnd_relu}, {"quantized::linear", op::translate_quantized_linear}, {"torchvision::deform_conv2d", op::translate_deform_conv}, {"torchvision::nms", op::translate_nms}, diff --git a/tests/layer_tests/pytorch_tests/test_quantized_convnd.py b/tests/layer_tests/pytorch_tests/test_quantized_convnd.py new file mode 100644 index 00000000000000..7424636eea375e --- /dev/null +++ b/tests/layer_tests/pytorch_tests/test_quantized_convnd.py @@ -0,0 +1,85 @@ +# Copyright (C) 2018-2023 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import pytest +import numpy as np +import torch + +from openvino.frontend import FrontEndManager +from openvino.frontend.pytorch.decoder import TorchScriptPythonDecoder +from pytorch_layer_test_class import PytorchLayerTest + + +class TestQuantizedConv2D(PytorchLayerTest): + def _prepare_input(self): + return (np.random.randn(2, 3, 25, 25).astype(np.float32),) + + def create_model(self, weights_shape, strides, pads, dilations, groups, bias, relu, scale, zero_point): + class quantized_conv2d(torch.nn.Module): + def __init__(self): + 
super(quantized_conv2d, self).__init__() + if not relu: + conv_func = torch.ao.nn.quantized.Conv2d + else: + conv_func = torch.ao.nn.intrinsic.quantized.ConvReLU2d + self.conv = conv_func( + weights_shape[1] * groups, + weights_shape[0], + weights_shape[2:], + strides, + pads, + dilations, + groups, + bias, + ) + if bias: + torch.nn.init.normal_(self.conv.bias()) + self.conv.scale = float(scale) + self.conv.zero_point = int(zero_point) + + def forward(self, x): + x_quantized = torch.quantize_per_tensor(x, 1.0, 0, torch.quint8) + conv = self.conv(x_quantized) + return torch.dequantize(conv).contiguous() + + ref_net = None + if not relu: + op_name = "quantized::conv2d" + else: + op_name = "quantized::conv2d_relu" + + return quantized_conv2d(), ref_net, op_name + + @pytest.mark.parametrize( + "params", + [ + pytest.param( + {"weights_shape": [1, 3, 3, 3], "strides": 1, "pads": 0, "dilations": 1, "groups": 1}, + marks=pytest.mark.xfail( + reason="Output channels equal to 1 creates output that fails to cast to contiguous." 
+ ), + ), + {"weights_shape": [2, 3, 3, 3], "strides": 1, "pads": 0, "dilations": 1, "groups": 1}, + {"weights_shape": [2, 3, 3, 3], "strides": 2, "pads": 0, "dilations": 1, "groups": 1}, + {"weights_shape": [2, 3, 3, 3], "strides": 1, "pads": 1, "dilations": 1, "groups": 1}, + {"weights_shape": [2, 3, 3, 3], "strides": 1, "pads": 0, "dilations": 2, "groups": 1}, + {"weights_shape": [2, 3, 3, 3], "strides": 1, "pads": [0, 1], "dilations": 1, "groups": 1}, + {"weights_shape": [2, 3, 3, 3], "strides": 1, "pads": [1, 0], "dilations": 1, "groups": 1}, + {"weights_shape": [3, 1, 3, 3], "strides": 1, "pads": 0, "dilations": 1, "groups": 3}, + ], + ) + @pytest.mark.parametrize("bias", [True, False]) + @pytest.mark.parametrize("relu", [True, False]) + @pytest.mark.parametrize("scale", [1, 0.3, 1.3]) + @pytest.mark.parametrize("zero_point", [0, 1]) + @pytest.mark.nightly + # @pytest.mark.precommit Test disabled due to sporadic issues + def test_quantized_conv2d(self, params, bias, relu, scale, zero_point, ie_device, precision, ir_version): + self._test( + *self.create_model(**params, bias=bias, relu=relu, scale=scale, zero_point=zero_point), + ie_device, + precision, + ir_version, + trace_model=True, + freeze_model=False + ) diff --git a/tests/layer_tests/pytorch_tests/test_quantized_linear.py b/tests/layer_tests/pytorch_tests/test_quantized_linear.py index 21f1353eeefa85..cc30a313d315d7 100644 --- a/tests/layer_tests/pytorch_tests/test_quantized_linear.py +++ b/tests/layer_tests/pytorch_tests/test_quantized_linear.py @@ -44,7 +44,7 @@ def forward(self, inp): @pytest.mark.parametrize("zero_point", [0, 1]) @pytest.mark.parametrize("trace", [True, False]) @pytest.mark.nightly - @pytest.mark.precommit + # @pytest.mark.precommit Test disabled due to sporadic issues def test_quantized_linear(self, params, scale, zero_point, trace, ie_device, precision, ir_version): input_shape = params.get("input_shape") weight_shape = params.get("weight_shape")