Skip to content

Commit

Permalink
[Snippets] Propagate work amounts from Subgraph to LinearIR (openvino…
Browse files Browse the repository at this point in the history
  • Loading branch information
a-sidorova authored Dec 27, 2023
1 parent 95e8726 commit e0eea90
Show file tree
Hide file tree
Showing 7 changed files with 20 additions and 33 deletions.
12 changes: 3 additions & 9 deletions src/common/snippets/include/snippets/op/subgraph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ class Subgraph : public ov::op::util::SubGraphOp {
const std::vector<pass::Manager::PositionedPass>& data_flow_passes = {},
const lowered::pass::PassPipeline& control_flow_passes_pre_common = {},
const lowered::pass::PassPipeline& control_flow_passes_post_common = {},
size_t min_parallel_work_amount = 8, size_t min_kernel_work_amount = 256,
const std::shared_ptr<IShapeInferSnippetsFactory>& factory = nullptr,
const void* compile_params = nullptr);

Expand All @@ -119,8 +120,6 @@ class Subgraph : public ov::op::util::SubGraphOp {
void set_generator(std::shared_ptr<ov::snippets::Generator> generator);
void set_tile_rank(size_t newRank) {tileRank = newRank;}
void set_virtual_port_count(size_t count);
void set_min_jit_work_amount(size_t jit_work_amount);
void set_min_parallel_work_amount(size_t parallel_work_amount);

void print() const;

Expand All @@ -143,7 +142,8 @@ class Subgraph : public ov::op::util::SubGraphOp {
const std::vector<ov::element::Type>& output_precisions = {},
const std::vector<snippets::pass::Manager::PositionedPass>& = {});
std::shared_ptr<lowered::LinearIR>
convert_body_to_linear_ir(const std::shared_ptr<IShapeInferSnippetsFactory>& shape_infer_factory = std::make_shared<IShapeInferSnippetsFactory>());
convert_body_to_linear_ir(size_t min_parallel_work_amount = 8, size_t min_kernel_work_amount = 256,
const std::shared_ptr<IShapeInferSnippetsFactory>& shape_infer_factory = std::make_shared<IShapeInferSnippetsFactory>());
std::shared_ptr<Subgraph> clone() const;

private:
Expand Down Expand Up @@ -176,12 +176,6 @@ class Subgraph : public ov::op::util::SubGraphOp {
// True if body has operations that don't support plugin-side domain optimizations
// (e.g. Transpose, Softmax, MatMul in general doesn't support dimensions collapsing)
bool m_has_domain_sensitive_ops = false;
// Minimal advised work amount for parallel execution.
// Set by a backend, typically equals to the number of threads available on the machine.
size_t m_min_parallel_work_amount = 8;
// Minimal advised work amount every JIT kernel should process during one execution call
// Set by a backend, should be large enough to compensate for the kernel call overheads
size_t m_min_jit_work_amount = 256;
} config;

std::shared_ptr<ShapeInferSnippetsNode> m_shape_infer = nullptr;
Expand Down
18 changes: 6 additions & 12 deletions src/common/snippets/src/op/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,6 @@ void Subgraph::set_virtual_port_count(const size_t count) {
m_virtual_port_count = count;
}

void Subgraph::set_min_jit_work_amount(const size_t jit_work_amount) {
config.m_min_jit_work_amount = jit_work_amount;
}

void Subgraph::set_min_parallel_work_amount(const size_t parallel_work_amount) {
config.m_min_parallel_work_amount = parallel_work_amount;
}

auto Subgraph::is_domain_sensitive_op(const std::shared_ptr<ov::Node>& op) -> bool {
return ov::is_type<ov::op::v1::Transpose>(op) ||
ov::is_type<ov::op::v1::Softmax>(op) ||
Expand Down Expand Up @@ -347,7 +339,8 @@ VectorDims Subgraph::infer_master_shape() {
}

std::shared_ptr<lowered::LinearIR>
Subgraph::convert_body_to_linear_ir(const std::shared_ptr<IShapeInferSnippetsFactory>& shape_infer_factory) {
Subgraph::convert_body_to_linear_ir(size_t min_parallel_work_amount, size_t min_kernel_work_amount,
const std::shared_ptr<IShapeInferSnippetsFactory>& shape_infer_factory) {
lowered::Config lowering_config;
lowering_config.m_save_expressions = config.m_has_domain_sensitive_ops;
#ifdef SNIPPETS_DEBUG_CAPS
Expand All @@ -356,8 +349,8 @@ Subgraph::convert_body_to_linear_ir(const std::shared_ptr<IShapeInferSnippetsFac
lowering_config.m_need_fill_tail_register = config.m_has_domain_sensitive_ops;
lowering_config.m_loop_depth = tileRank;
lowering_config.m_enable_domain_optimization = !config.m_has_domain_sensitive_ops;
lowering_config.m_min_parallel_work_amount = config.m_min_parallel_work_amount;
lowering_config.m_min_kernel_work_amount = config.m_min_jit_work_amount;
lowering_config.m_min_parallel_work_amount = min_parallel_work_amount;
lowering_config.m_min_kernel_work_amount = min_kernel_work_amount;

m_linear_ir = std::make_shared<lowered::LinearIR>(body_ptr(), shape_infer_factory, lowering_config);
m_shape_infer = m_linear_ir->get_shape_infer_instance();
Expand Down Expand Up @@ -475,10 +468,11 @@ snippets::Schedule Subgraph::generate(const BlockedShapeVector& blocked_input_sh
const std::vector<snippets::pass::Manager::PositionedPass>& data_flow_backend_passes,
const lowered::pass::PassPipeline& backend_passes_pre_common,
const lowered::pass::PassPipeline& backend_passes_post_common,
size_t min_parallel_work_amount, size_t min_kernel_work_amount,
const std::shared_ptr<IShapeInferSnippetsFactory>& factory,
const void* compile_params) {
data_flow_transformations(blocked_input_shapes, input_precisions, output_precisions, data_flow_backend_passes);
convert_body_to_linear_ir(factory);
convert_body_to_linear_ir(min_parallel_work_amount, min_kernel_work_amount, factory);
return generate_from_linear_ir(backend_passes_pre_common, backend_passes_post_common, compile_params);
}

Expand Down
1 change: 1 addition & 0 deletions src/common/snippets/tests/include/lowering_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class LoweringTests : public TransformationTestsF {
const ov::snippets::lowered::pass::PassPipeline& lowered_pre_common = {},
const ov::snippets::lowered::pass::PassPipeline& lowered_post_common = {},
const std::shared_ptr<ov::snippets::Generator>& generator = nullptr,
size_t min_parallel_work_amount = 8, size_t min_kernel_work_amount = 256,
const std::shared_ptr<IShapeInferSnippetsFactory>& factory = std::make_shared<IShapeInferSnippetsFactory>());
static std::shared_ptr<ov::snippets::op::Subgraph> getTokenizedSubgraph(const std::shared_ptr<Model>& f);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,10 @@ void OptimizeDomainTest::SetUp() {

TEST_P(OptimizeDomainTest, DomainOptimization) {
auto subgraph = LoweringTests::getTokenizedSubgraph(m_model);
subgraph->set_min_jit_work_amount(m_domain_opt_params.min_jit_work_amount);
subgraph->set_min_parallel_work_amount(m_domain_opt_params.min_parallel_work_amount);
auto linear_ir = *subgraph->convert_body_to_linear_ir();
auto linear_ir = subgraph->convert_body_to_linear_ir(m_domain_opt_params.min_parallel_work_amount, m_domain_opt_params.min_jit_work_amount);
size_t loop_depth = 1;
ov::snippets::lowered::pass::OptimizeDomain(loop_depth).run(linear_ir);
const auto& master_shape = linear_ir.get_master_shape();
ov::snippets::lowered::pass::OptimizeDomain(loop_depth).run(*linear_ir);
const auto& master_shape = linear_ir->get_master_shape();
EXPECT_EQ(loop_depth, m_domain_opt_params.exp_loop_depth) << "Inconsistent loop depth detected";
EXPECT_THAT(master_shape, testing::ContainerEq(m_domain_opt_params.exp_master_shape)) << "Inconsistent master_shape detected";
}
Expand Down
3 changes: 2 additions & 1 deletion src/common/snippets/tests/src/lowering_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,13 @@ std::shared_ptr<ov::snippets::op::Subgraph>
const ov::snippets::lowered::pass::PassPipeline& lowered_pre_common,
const ov::snippets::lowered::pass::PassPipeline& lowered_post_common,
const std::shared_ptr<ov::snippets::Generator>& generator,
size_t min_parallel_work_amount, size_t min_kernel_work_amount,
const std::shared_ptr<IShapeInferSnippetsFactory>& factory) {
auto subgraph = getTokenizedSubgraph(f);
subgraph->set_generator(generator == nullptr ? std::make_shared<DummyGenerator>() : generator);
subgraph->set_tile_rank(2);
// Note: lowered_pipeline would have no effect on subgraph body, since it's applied on linear IR
subgraph->generate({}, {}, {}, backend_passes, lowered_pre_common, lowered_post_common, factory);
subgraph->generate({}, {}, {}, backend_passes, lowered_pre_common, lowered_post_common, min_parallel_work_amount, min_kernel_work_amount, factory);
return subgraph;
}

Expand Down
10 changes: 4 additions & 6 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,10 @@ void Snippet::initOptimalPrimitiveDescriptor() {
output_precisions.push_back(p);

snippetAttrs.snippet->data_flow_transformations(in_blocked_shapes, input_precisions, output_precisions, backend_passes);
snippetAttrs.snippet->convert_body_to_linear_ir(std::make_shared<snippets::CPUShapeInferSnippetsFactory>());
// Note: minimal JIT work amount is a predefined value that describes the number of kernel iterations (work amount)
// needed to cover kernel call overhead. It is used for balancing between parallel and JIT work amounts in domain optimization.
snippetAttrs.snippet->convert_body_to_linear_ir(static_cast<size_t>(parallel_get_max_threads()), 256,
std::make_shared<snippets::CPUShapeInferSnippetsFactory>());
}

ov::element::Type Snippet::getRuntimePrecision() const {
Expand Down Expand Up @@ -570,11 +573,6 @@ Snippet::SnippetJitExecutor::SnippetJitExecutor(SnippetAttrs attrs, bool is_dyna
if (std::any_of(canonicalShape.begin(), canonicalShape.end(),
[](size_t x){return x == snippets::IShapeInferSnippets::DYNAMIC_DIMENSION;}))
OPENVINO_THROW("Snippets: Canonicalization returned dynamic shape in static pipeline");
snippetAttrs.snippet->set_min_parallel_work_amount(static_cast<size_t>(parallel_get_max_threads()));

// Note: minimal JIT work amount is a predefined value that describes the number of kernel iterations (work amount)
// needed to cover kernel call overhead. It is used for balancing between parallel and JIT work amounts in domain optimization.
snippetAttrs.snippet->set_min_jit_work_amount(256);

// generate
jit_snippets_compile_args jcp;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ TEST_P(MulAddToFMATests, MulAddToFMATests) {
{},
{},
generator,
8, 256,
std::make_shared<ov::snippets::CPUShapeInferSnippetsFactory>());
model = subgraph->body_ptr();
model_ref = snippets_model->getLowered();
Expand Down

0 comments on commit e0eea90

Please sign in to comment.