diff --git a/.ci/env/apt.sh b/.ci/env/apt.sh
index 4c370e8ee5e..d5ca5a3f48e 100755
--- a/.ci/env/apt.sh
+++ b/.ci/env/apt.sh
@@ -23,21 +23,19 @@ function update {
 }
 
 function add_repo {
-    wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-    sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-    rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
-    echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
-    sudo add-apt-repository -y "deb https://apt.repos.intel.com/oneapi all main"
-    sudo apt-get update
+    wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | \
+        gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
+    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
+        | sudo tee /etc/apt/sources.list.d/oneAPI.list
+    sudo apt update
 }
 
 function install_dpcpp {
-    sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp-2024.2
-    sudo bash -c 'echo libintelocl.so > /etc/OpenCL/vendors/intel-cpu.icd'
+    sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp intel-oneapi-runtime-libs
 }
 
 function install_mkl {
-    sudo apt-get install intel-oneapi-mkl-devel
+    sudo apt-get install -y intel-oneapi-mkl-devel=2024.2.1-103
 }
 
 function install_clang-format {
diff --git a/.ci/env/environment.yml b/.ci/env/environment.yml
index 61655fe9e76..171bd54de52 100644
--- a/.ci/env/environment.yml
+++ b/.ci/env/environment.yml
@@ -1,6 +1,5 @@
 name: ci-env
 channels:
   - conda-forge
-  - defaults
 dependencies:
   - impi-devel=2021.12.0
diff --git a/.ci/pipeline/ci.yml b/.ci/pipeline/ci.yml
index a0620f66e9d..c19a367a1b6 100755
--- a/.ci/pipeline/ci.yml
+++ b/.ci/pipeline/ci.yml
@@ -28,6 +28,9 @@ variables:
   TBB_VERSION : 'v2021.10.0'
   VM_IMAGE : 'ubuntu-22.04'
   SYSROOT_OS: 'jammy'
+  WIN_BASEKIT_VERSION: '2024.2.1.101'
+  WINDOWS_BASEKIT_URL: 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/d91caaa0-7306-46ea-a519-79a0423e1903/w_BaseKit_p_$(WIN_BASEKIT_VERSION)_offline.exe'
+  WINDOWS_DPCPP_COMPONENTS: 'intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.tbb.devel'
 
 jobs:
 - job: 'FormatterChecks'
@@ -58,6 +61,9 @@ jobs:
   - script: |
       .ci/env/apt.sh dev-base
     displayName: 'apt-get and conda install'
+  - script: |
+      .ci/env/apt.sh mkl
+    displayName: 'mkl installation'
   - script: |
       .ci/scripts/describe_system.sh
     displayName: 'System info'
@@ -415,6 +421,9 @@ jobs:
   - script: |
       .ci/env/apt.sh dpcpp
     displayName: 'dpcpp installation'
+  - script: |
+      .ci/env/apt.sh mkl
+    displayName: 'mkl installation'
   - script: |
       source /opt/intel/oneapi/compiler/latest/env/vars.sh
       .ci/scripts/describe_system.sh
@@ -435,14 +444,17 @@ jobs:
     continueOnError: true
   - script: |
       source /opt/intel/oneapi/compiler/latest/env/vars.sh
+      source /opt/intel/oneapi/mkl/latest/env/vars.sh
       .ci/scripts/test.sh --test-kind examples --build-dir $(release.dir) --compiler clang --interface daal/cpp --build-system cmake
     displayName: 'daal/cpp examples'
   - script: |
       source /opt/intel/oneapi/compiler/latest/env/vars.sh
+      source /opt/intel/oneapi/mkl/latest/env/vars.sh
       .ci/scripts/test.sh --test-kind examples --build-dir $(release.dir) --compiler clang --interface oneapi/cpp --build-system cmake
     displayName: 'oneapi/cpp examples'
   - script: |
       source /opt/intel/oneapi/compiler/latest/env/vars.sh
+      source /opt/intel/oneapi/mkl/latest/env/vars.sh
       .ci/scripts/test.sh --test-kind samples --build-dir $(release.dir) --compiler gnu --interface daal/cpp/mpi --conda-env ci-env --build-system cmake
     displayName: 'daal/cpp/mpi samples'
   - task: PublishPipelineArtifact@1
@@ -520,12 +532,15 @@ jobs:
                  --test_thread_mode=par
     displayName: 'cpp-examples-thread-release-static'
 
-  - script: |
-      export DALROOT=`pwd`/bazel-bin/release/daal/latest
-      bazel test //examples/oneapi/cpp:all \
-                 --test_link_mode=release_dynamic \
-                 --test_thread_mode=par
-    displayName: 'cpp-examples-thread-release-dynamic'
+  # The issue that bazel doesnt link MKL libs via -Wl, --start-group..
+  # oneDAL make build pass this test
+  # TODO: add cycle linking in bazel
+  # - script: |
+  #     export DALROOT=`pwd`/bazel-bin/release/daal/latest
+      # bazel test //examples/oneapi/cpp:all \
+      #            --test_link_mode=release_dynamic \
+      #            --test_thread_mode=par
+  #   displayName: 'cpp-examples-thread-release-dynamic'
 
   - script: |
       bazel test //cpp/daal:tests
@@ -620,7 +635,7 @@ jobs:
       conda activate CB
       source $(Pipeline.Workspace)/daal/latest/env/vars.sh
       ./sklearnex/conda-recipe/run_test.sh
-    timeoutInMinutes: 15
+    timeoutInMinutes: 20
     displayName: sklearnex test
   - script: |
       source /usr/share/miniconda/etc/profile.d/conda.sh
@@ -645,57 +660,6 @@ jobs:
     displayName: 'Upload conformance tests artifacts'
     continueOnError: true
 
-- job: 'macOSMakeClang'
-  timeoutInMinutes: 0
-  variables:
-    release.dir: '__release_mac_clang'
-    platform.type : 'mac32e'
-  pool:
-    vmImage:  'macos-12'
-  steps:
-  - script: |
-      brew install dos2unix tree
-      conda create -n ci-env -q -y -c conda-forge python=3.10
-      source /usr/local/miniconda/etc/profile.d/conda.sh
-      conda activate ci-env
-      pip install -q cpufeature
-    displayName: 'brew and conda install'
-  - script: |
-      source /usr/local/miniconda/etc/profile.d/conda.sh
-      conda activate ci-env
-      .ci/scripts/describe_system.sh
-    displayName: 'System info'
-  - script: |
-      .ci/scripts/build.sh --compiler clang --target daal --optimizations "sse2 avx2" --conda-env ci-env
-    displayName: 'make daal'
-  - script: |
-      .ci/scripts/build.sh --compiler clang --target onedal_c --optimizations "sse2 avx2"
-    displayName: 'make onedal_c'
-  - task: PublishPipelineArtifact@1
-    inputs:
-      artifactName: '$(platform.type) build'
-      targetPath: '$(Build.Repository.LocalPath)/$(release.dir)'
-    displayName: 'Upload build artifacts'
-    continueOnError: true
-  - script: |
-      .ci/scripts/test.sh --test-kind examples --build-dir $(release.dir) --compiler clang --interface daal/cpp --build-system cmake
-    displayName: 'daal/cpp examples'
-  - script: |
-      .ci/scripts/test.sh --test-kind examples --build-dir $(release.dir) --compiler clang --interface oneapi/cpp --build-system cmake
-    displayName: 'oneapi/cpp examples'
-  - script: |
-      deploy/nuget/prepare_dal_nuget.sh --release-dir $(release.dir) --build-nupkg yes
-      tree -h -I include __nuget/inteldal*/
-      ls -lh __nuget/inteldal*.nupkg
-    displayName: 'nuget pkg'
-  - task: PublishPipelineArtifact@1
-    inputs:
-      artifactName: '$(platform.type) fail'
-      targetPath: '$(Build.Repository.LocalPath)/$(release.dir)'
-    displayName: 'Uploading on fail'
-    condition: failed()
-    continueOnError: true
-
 - job: 'WindowsMakeVC'
   timeoutInMinutes: 0
   variables:
@@ -704,6 +668,8 @@ jobs:
   pool:
     vmImage: 'windows-2022'
   steps:
+  - script: .ci/scripts/install_windows.bat $(WINDOWS_BASEKIT_URL) $(WINDOWS_DPCPP_COMPONENTS)
+    displayName: Install oneAPI Base Toolkit
   - script: |
       set PATH=C:\msys64\usr\bin;%PATH%
       pip install cpufeature
@@ -714,9 +680,13 @@ jobs:
       bash .ci/scripts/describe_system.sh
     displayName: 'System info'
   - script: |
+      call C:\temp\oneapi\setvars.bat --force
+      set MKL_FPK_GPU_VERSION_LINE=2024.0.0
       .\.ci\scripts\build.bat daal vc avx2
     displayName: 'make daal'
   - script: |
+      call C:\temp\oneapi\setvars.bat --force
+      set MKL_FPK_GPU_VERSION_LINE=2024.0.0
       .\.ci\scripts\build.bat onedal_c vc avx2
     displayName: 'make onedal_c'
   - task: PublishPipelineArtifact@1
@@ -726,10 +696,12 @@ jobs:
     displayName: 'Upload build artifacts'
     continueOnError: true
   - script: |
+      call C:\temp\oneapi\setvars.bat --force
       .\.ci\scripts\test.bat daal\cpp lib msvs cmake
       .\.ci\scripts\test.bat daal\cpp dll msvs cmake
     displayName: 'daal/cpp examples'
   - script: |
+      call C:\temp\oneapi\setvars.bat --force
       .\.ci\scripts\test.bat oneapi\cpp lib msvs cmake
       .\.ci\scripts\test.bat oneapi\cpp dll msvs cmake
     displayName: 'oneapi/cpp examples'
diff --git a/.ci/scripts/build.bat b/.ci/scripts/build.bat
index 76c78c620cd..2f460df60c4 100644
--- a/.ci/scripts/build.bat
+++ b/.ci/scripts/build.bat
@@ -28,14 +28,18 @@ set PATH=C:\msys64\usr\bin;%PATH%
 echo pacman -S --noconfirm msys/make msys/dos2unix
 pacman -S --noconfirm msys/make msys/dos2unix
 
-echo call .ci\env\tbb.bat
-if "%TBBROOT%"=="" if not exist .\__deps\tbb\win\tbb call .ci\env\tbb.bat || set errorcode=1
-
-echo call .\dev\download_micromkl.bat
-if "%MKLGPUFPKROOT%"=="" if not exist .\__deps\mklgpufpk\win call .\dev\download_micromkl.bat || set errorcode=1
-
-echo call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall" x64
-if "%VISUALSTUDIOVERSION%"=="" call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall" x64 || set errorcode=1
+IF "%VS_VER%"=="2017_build_tools" (
+    @call "C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\VC\Auxiliary\Build\vcvars64.bat"
+    echo "C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\VC\Auxiliary\Build\vcvars64.bat"
+) ELSE (
+    IF "%VS_VER%"=="2019_build_tools" (
+        @call "C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvars64.bat"
+        echo "C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvars64.bat"
+    ) ELSE (
+          @call "C:\temp\oneapi\setvars-vcvarsall.bat" %VS_VER%
+          echo "C:\temp\oneapi\setvars-vcvarsall.bat" %VS_VER%
+    )
+)
 
 echo make %1 -j%NUMBER_OF_PROCESSORS% COMPILER=%2 PLAT=win32e REQCPU=%3
 make %1 -j%NUMBER_OF_PROCESSORS% COMPILER=%2 PLAT=win32e REQCPU=%3 || set errorcode=1
diff --git a/.ci/scripts/build.sh b/.ci/scripts/build.sh
index f29d7af31e6..f9c25245483 100755
--- a/.ci/scripts/build.sh
+++ b/.ci/scripts/build.sh
@@ -164,8 +164,7 @@ fi
 #main actions
 echo "Call env scripts"
 if [ "${backend_config}" == "mkl" ]; then
-    echo "Sourcing MKL env"
-    "${ONEDAL_DIR}"/dev/download_micromkl.sh with_gpu="${with_gpu}"
+    source /opt/intel/oneapi/mkl/latest/env/vars.sh
 elif [ "${backend_config}" == "ref" ] && [ ! -z "${BLAS_INSTALL_DIR}" ]; then
     export OPENBLASROOT="${BLAS_INSTALL_DIR}"
 elif [ "${backend_config}" == "ref" ]; then
diff --git a/.ci/scripts/install_windows.bat b/.ci/scripts/install_windows.bat
new file mode 100644
index 00000000000..0da86a94960
--- /dev/null
+++ b/.ci/scripts/install_windows.bat
@@ -0,0 +1,31 @@
+@echo off
+rem ============================================================================
+rem Copyright contributors to the oneDAL project
+rem
+rem Licensed under the Apache License, Version 2.0 (the "License");
+rem you may not use this file except in compliance with the License.
+rem You may obtain a copy of the License at
+rem
+rem     http://www.apache.org/licenses/LICENSE-2.0
+rem
+rem Unless required by applicable law or agreed to in writing, software
+rem distributed under the License is distributed on an "AS IS" BASIS,
+rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+rem See the License for the specific language governing permissions and
+rem limitations under the License.
+rem ============================================================================
+
+set URL=%1
+set COMPONENTS=%2
+
+curl.exe --output %TEMP%\webimage.exe --url %URL% --retry 5 --retry-delay 5
+start /b /wait %TEMP%\webimage.exe -s -x -f webimage_extracted --log extract.log
+del %TEMP%\webimage.exe
+if "%COMPONENTS%"=="" (
+  webimage_extracted\bootstrapper.exe -s --action install --eula=accept --install-dir=C:\temp\oneapi\ -p=NEED_VS2017_INTEGRATION=0 -p=NEED_VS2019_INTEGRATION=0 -p=NEED_VS2022_INTEGRATION=0 --log-dir=.
+) else (
+  webimage_extracted\bootstrapper.exe -s --action install --components=%COMPONENTS% --eula=accept --install-dir=C:\temp\oneapi\ -p=NEED_VS2017_INTEGRATION=0 -p=NEED_VS2019_INTEGRATION=0 -p=NEED_VS2022_INTEGRATION=0 --log-dir=.
+)
+set installer_exit_code=%ERRORLEVEL%
+rd /s/q "webimage_extracted"
+exit /b %installer_exit_code%
diff --git a/.ci/scripts/test.bat b/.ci/scripts/test.bat
index aaced0689be..a2ef555a66f 100644
--- a/.ci/scripts/test.bat
+++ b/.ci/scripts/test.bat
@@ -34,8 +34,20 @@ echo CPUCOUNT=%CPUCOUNT%
 echo PATH=C:\msys64\usr\bin;%PATH%
 set PATH=C:\msys64\usr\bin;%PATH%
 
-echo call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall" x64
-call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall" x64 || set errorcode=1
+echo "%VISUALSTUDIOVERSION% HERE"
+
+IF "%VS_VER%"=="2017_build_tools" (
+    @call "C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\VC\Auxiliary\Build\vcvars64.bat"
+    echo "C:\Program Files (x86)\Microsoft Visual Studio\2017\BuildTools\VC\Auxiliary\Build\vcvars64.bat"
+) ELSE (
+    IF "%VS_VER%"=="2019_build_tools" (
+        @call "C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvars64.bat"
+        echo "C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\VC\Auxiliary\Build\vcvars64.bat"
+    ) ELSE (
+          @call "C:\temp\oneapi\setvars-vcvarsall.bat" %VS_VER%
+          echo "C:\temp\oneapi\setvars-vcvarsall.bat" %VS_VER%
+    )
+)
 
 echo call __release_win_vc\daal\latest\env\vars.bat
 call __release_win_vc\daal\latest\env\vars.bat || set errorcode=1
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index d6a47717935..789ff63d6e2 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,6 +1,6 @@
 # Documentation owners and reviewers
-/docs/         @Vika-F @maria-Petrova @Alexsandruss @bdmoore1
-*.md           @Vika-F @maria-Petrova @Alexsandruss @bdmoore1
+/docs/         @Vika-F @maria-Petrova @Alexsandruss @emmwalsh
+*.md           @Vika-F @maria-Petrova @Alexsandruss @emmwalsh
 
 # TTP files
 third-party*   @maria-Petrova
diff --git a/.github/ISSUE_TEMPLATE/RFC.md b/.github/ISSUE_TEMPLATE/RFC.md
new file mode 100644
index 00000000000..1a6440b89d1
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/RFC.md
@@ -0,0 +1,20 @@
+---
+name: RFC for new interface
+about: Use this template to request new functionality or change the behavior of the library
+title: ''
+labels: 'RFC'
+assignees: ''
+---
+
+**Summary**
+Include a short summary of the request. Sections below provide guidance on
+what factors are considered important. 
+
+**Problem statement**
+Describe the problem you are trying to solve with a reasonable level of detail.
+
+**Details**
+* The definition of the function including interface and semantics. Please include how this
+interface will be extendable for different hardware implementations.
+* What existing libraries have implementation of this function and can be used
+under oneDAL interface.
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 61be9541082..5691352fcb4 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -24,6 +24,6 @@ Describe what your are expecting from steps above
 If applicable, add output/screenshots to help explain your problem.
 
 **Environment:**
- - OS: [e.g. Ubuntu 18.04]
- - Compiler: [e.g. GCC9.2]
- - Version: [e.g. 2019 Update 4]
+ - OS: [e.g. Ubuntu 22.04]
+ - Compiler: [e.g. GCC12.1]
+ - Version: [e.g. 2025.1]
diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md
new file mode 100644
index 00000000000..093f63c4752
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/documentation.md
@@ -0,0 +1,19 @@
+---
+name: Request a documentation change
+about: Use this template to report documentation issue or request documentation changes
+title: ''
+labels: 'documentation'
+assignees: ''
+---
+
+**Summary**
+Include a short summary of the issue or request. Sections below provide
+guidance on what factors are considered important for a documentation
+issue.
+
+**URLs**
+Include pointers to documents that are impacted.
+
+**Additional details**
+Please provide a detailed description of the expected changes in documentation
+and any suggestions that you may have.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 00000000000..4c52336018c
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,18 @@
+---
+name: Request a feature
+about: Use this template to request new functionality or change the behavior of the library
+title: ''
+labels: 'new feature'
+assignees: ''
+---
+
+**Summary**
+Include a short summary of the request. 
+
+See the sections below for factors important for a feature request.
+
+**Problem Statement**
+Describe the problem you want to solve with a reasonable level of detail.
+
+**Preferred Solution**
+Provide your ideas regarding problem solutions.
diff --git a/.github/workflows/label-enforcement.yml b/.github/workflows/label-enforcement.yml
index 5ad1f6ff8d9..c862f64b798 100644
--- a/.github/workflows/label-enforcement.yml
+++ b/.github/workflows/label-enforcement.yml
@@ -3,6 +3,9 @@ on:
   pull_request:
     branches: [ "main" ]
 
+permissions:
+  contents: read
+
 jobs:
   label_checker:
     name: Please include labels on your pull request
diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml
index dc9e6a06057..6f802d3b405 100644
--- a/.github/workflows/nightly-build.yml
+++ b/.github/workflows/nightly-build.yml
@@ -24,6 +24,10 @@ on:
 permissions:
   contents: read
 
+env:
+  WINDOWS_BASEKIT_URL: 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/d91caaa0-7306-46ea-a519-79a0423e1903/w_BaseKit_p_2024.2.1.101_offline.exe'
+  WINDOWS_ALL_COMPONENTS: 'intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.tbb.devel'
+
 jobs:
   build_lnx:
     name: oneDAL Linux nightly build
@@ -38,6 +42,8 @@ jobs:
           repository: oneapi-src/oneDAL
       - name: Install DPC++
         run: .ci/env/apt.sh dpcpp
+      - name: Install MKL
+        run: .ci/env/apt.sh mkl
       - name: System Info
         run: |
           source /opt/intel/oneapi/compiler/latest/env/vars.sh
@@ -68,32 +74,34 @@ jobs:
         with:
           repository: oneapi-src/oneDAL
       - name: Install DPC++
+        shell: cmd
         run: |
-          & .ci/scripts/install_dpc.ps1
-      - name: Prepare Intel OpenCL CPU runtime
-        run: |
-          # Store the unpacked runtime to centralize and reduce external downloads
-          & .ci/scripts/collect_opencl_rt.ps1
+            call .\.ci\scripts\install_windows.bat ${{ env.WINDOWS_BASEKIT_URL }} ${{ env.WINDOWS_ALL_COMPONENTS }}
       - name: System Info
         shell: cmd
         run: |
           set PATH=C:\msys64\usr\bin;%PATH%
           pip install cpufeature
-          call .\dpcpp\compiler\latest\env\vars.bat
+          call C:\temp\oneapi\setvars.bat
           bash .ci/scripts/describe_system.sh
       - name: Make daal
         shell: cmd
         run: |
+          call C:\temp\oneapi\setvars.bat
+          set MKL_FPK_GPU_VERSION_LINE=2024.0.0
           call .\.ci\scripts\build.bat daal vc avx2
       - name: Make onedal
         shell: cmd
         run: |
+          call C:\temp\oneapi\setvars.bat
+          set MKL_FPK_GPU_VERSION_LINE=2024.0.0
           call .\.ci\scripts\build.bat onedal_c vc avx2
       - name: Make oneapi_dpc
         shell: cmd
         run: |
-          call .\dpcpp\compiler\latest\env\vars.bat
-          call .\dpcpp\compiler\latest\bin\sycl-ls.exe
+          call C:\temp\oneapi\setvars.bat
+          set MKL_FPK_GPU_VERSION_LINE=2024.0.0
+          call C:\temp\oneapi\compiler\latest\bin\sycl-ls.exe
           call .\.ci\scripts\build.bat onedal_dpc vc avx2
       - name: Archive build
         uses: actions/upload-artifact@v4
@@ -103,14 +111,9 @@ jobs:
       - name: Compress DPC++
         shell: cmd
         run: |
-          tar -cvzf icx.zip .\dpcpp
+          tar -cvzf icx.zip C:\temp\oneapi
       - name: Archive DPC++
         uses: actions/upload-artifact@v4
         with:
           name: icx_compiler
           path: .\icx.zip
-      - name: Archive Intel OpenCL CPU runtime
-        uses: actions/upload-artifact@v4
-        with:
-          name: opencl_rt_installer
-          path: .\opencl_rt.msi
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2644e0f06fe..d3e45c45a9d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -83,6 +83,12 @@ editorconfig-checker
 
 For your convenience we also added [coding guidelines](http://oneapi-src.github.io/oneDAL/contribution/coding_guide.html) with examples and detailed descriptions of the coding style oneDAL follows. We encourage you to consult them when writing your code.
 
+## Custom Components
+
+### Threading Layer
+
+In the source code of the algorithms, oneDAL does not use threading primitives directly. All the threading primitives used within oneDAL form are called the [threading layer](http://oneapi-src.github.io/oneDAL/contribution/threading.html). Contributors should leverage the primitives from the layer to implement parallel algorithms.
+
 ## Documentation Guidelines
 
 oneDAL uses `Doxygen` for inline comments in public header files that are used to build the API reference and  `reStructuredText` for the Developer Guide. See [oneDAL documentation](https://oneapi-src.github.io/oneDAL/) for reference.
diff --git a/WORKSPACE b/WORKSPACE
index 3cba5bcd224..38d50397eca 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -22,21 +22,6 @@ opencl_repo(
     name = "opencl",
 )
 
-load("@onedal//dev/bazel/deps:micromkl.bzl", "micromkl_repo", "micromkl_dpc_repo")
-micromkl_repo(
-    name = "micromkl",
-    root_env_var = "MKLFPKROOT",
-    url = "https://github.com/oneapi-src/oneDAL/releases/download/Dependencies/mklfpk_lnx_20230413.tgz",
-    sha256 = "e99dd6fb18f1fda382c53373262d1bb44c1b58aa6edff94cfb0e9d8dcd3395ed",
-)
-
-micromkl_dpc_repo(
-    name = "micromkl_dpc",
-    root_env_var = "MKLGPUFPKROOT",
-    url = "https://github.com/oneapi-src/oneDAL/releases/download/Dependencies/mklgpufpk_lnx_20240605.tgz",
-    sha256 = "0787a92e9580ed6b9fb97d054a0ed77994dbc18b4b3fb099451cb1e6ebdf4f16",
-)
-
 load("@onedal//dev/bazel/deps:openblas.bzl", "openblas_repo")
 openblas_repo(
     name = "openblas",
@@ -87,16 +72,23 @@ mkl_repo(
     name = "mkl",
     root_env_var = "MKLROOT",
     urls = [
+        # TODO: when the issue with binutils will be solved, replace 2023.0 to 2024.2
         "https://files.pythonhosted.org/packages/76/8c/2e6fb6186fa9335a0feb7845e001e18c22627a06ae68650e5a84ca2b536d/mkl_static-2023.0.0-py2.py3-none-manylinux1_x86_64.whl",
-        "https://files.pythonhosted.org/packages/cf/d1/ea2d769006337d968a89337dd1c3eb09c528f9ac629e8ab99324e1122f03/mkl_include-2023.0.0-py2.py3-none-manylinux1_x86_64.whl",
+        #"https://files.pythonhosted.org/packages/c1/44/42ea3ad7bbaa65acb54c977961118d7b24ea687e7c3d64aba0a019cbfa19/mkl_static-2024.2.0-py2.py3-none-manylinux1_x86_64.whl",
+        "https://files.pythonhosted.org/packages/80/e4/93ddfd475420f1c24d96f3bba1f87ec31a1eea847884c4ccb243cb336a61/mkl_include-2024.2.0-py2.py3-none-manylinux1_x86_64.whl",
+        "https://files.pythonhosted.org/packages/c9/3a/8797ef320a04e0b939a07365f09ce11f5484150bd3600c6400391c5c36e9/mkl_devel_dpcpp-2024.2.0-py2.py3-none-manylinux1_x86_64.whl",
     ],
     sha256s = [
         "49d16f315f6803b1046a4796686af766ad487f9f6d98ea76b6cdb2ebd5b559f9",
-        "14b0958dff799378975d83fbd00ce756645aa36b9f924bdfdb0fb031f72b734d",
+        #"8c2a6c6a144c5619f1df75fd550b32730f3e0632b55a15a42a95516e142ccf47",
+        "63ed16ece64d9420e9fe1d5e1b55e0680632b61ad1c0e5f207b17f85233fcc09",
+        "b80099209aef1b147b8f1c1621a47078fba2c17b2faee131939ea4d32da2c35c",
     ],
     strip_prefixes = [
         "mkl_static-2023.0.0.data/data",
-        "mkl_include-2023.0.0.data/data",
+        #"mkl_static-2024.2.0.data/data",
+        "mkl_include-2024.2.0.data/data",
+        "mkl_devel_dpcpp-2024.2.0.data/data",
     ],
 )
 
diff --git a/cpp/daal/BUILD b/cpp/daal/BUILD
index 0cbf8a50316..4557c88ffac 100644
--- a/cpp/daal/BUILD
+++ b/cpp/daal/BUILD
@@ -18,10 +18,7 @@ daal_module(
     deps = select({
         "@config//:backend_ref": [ ],
         "//conditions:default": [
-                                    "@micromkl//:vml_ipp",
-                                    # TODO: Currently vml_ipp lib depends on TBB, but it shouldn't
-                                    #       Remove TBB from deps once problem with vml_ipp is resolved
-                                    "@tbb//:tbb_binary",
+                                    "@mkl//:mkl_thr",
                                 ],
         }),
 )
@@ -32,7 +29,7 @@ daal_module(
     deps = select({
         "@config//:backend_ref": [ "@openblas//:openblas",
                                  ],
-        "//conditions:default": [ "@micromkl//:mkl_thr",
+        "//conditions:default": [ "@mkl//:mkl_thr",
                                 ],
         }),
 )
@@ -64,8 +61,7 @@ daal_module(
                                  ],
         "//conditions:default": [
                                   ":public_includes",
-                                  "@micromkl//:headers",
-                                  "@micromkl_dpc//:headers",
+                                  "@mkl//:headers",
                                 ],
         }),
 )
diff --git a/cpp/daal/src/algorithms/elastic_net/elastic_net_training_result_fpt.cpp b/cpp/daal/src/algorithms/elastic_net/elastic_net_training_result_fpt.cpp
index fabfcca4732..5dcf868e43e 100644
--- a/cpp/daal/src/algorithms/elastic_net/elastic_net_training_result_fpt.cpp
+++ b/cpp/daal/src/algorithms/elastic_net/elastic_net_training_result_fpt.cpp
@@ -40,7 +40,7 @@ using namespace daal::services;
  * \param[in] method Computation method for the algorithm
  */
 template <typename algorithmFPType>
-Status Result::allocate(const daal::algorithms::Input * input, const Parameter * parameter, const int method)
+DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * input, const Parameter * parameter, const int method)
 {
     const Input * const in = static_cast<const Input *>(input);
 
@@ -57,7 +57,8 @@ Status Result::allocate(const daal::algorithms::Input * input, const Parameter *
     return s;
 }
 
-template services::Status Result::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input, const Parameter * parameter, const int method);
+template DAAL_EXPORT services::Status Result::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input, const Parameter * parameter,
+                                                                    const int method);
 
 } // namespace training
 } // namespace elastic_net
diff --git a/cpp/daal/src/algorithms/export_win32e.def b/cpp/daal/src/algorithms/export_win32e.def
index 443714aef69..7962b0a8844 100644
--- a/cpp/daal/src/algorithms/export_win32e.def
+++ b/cpp/daal/src/algorithms/export_win32e.def
@@ -15,19 +15,3 @@
 ;===============================================================================
 
 EXPORTS
-fpk_serv_malloc
-fpk_serv_free
-fpk_serv_memcpy_s
-fpk_serv_lock
-fpk_serv_unlock
-fpk_serv_strnlen_s
-fpk_serv_strncpy_s
-fpk_serv_strncat_s
-fpk_serv_thread_yield
-fpk_serv_core_register_cleanup
-fpk_serv_calloc
-fpk_serv_printf_s
-fpk_serv_memmove_s
-fpk_serv_realloc
-fpk_serv_print
-fpk_serv_exit
diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_init_impl.i b/cpp/daal/src/algorithms/kmeans/kmeans_init_impl.i
old mode 100755
new mode 100644
diff --git a/cpp/daal/src/algorithms/lasso_regression/lasso_regression_training_result_fpt.cpp b/cpp/daal/src/algorithms/lasso_regression/lasso_regression_training_result_fpt.cpp
index e4a30bc18f2..a331d9f48a9 100644
--- a/cpp/daal/src/algorithms/lasso_regression/lasso_regression_training_result_fpt.cpp
+++ b/cpp/daal/src/algorithms/lasso_regression/lasso_regression_training_result_fpt.cpp
@@ -40,7 +40,7 @@ using namespace daal::services;
  * \param[in] method Computation method for the algorithm
  */
 template <typename algorithmFPType>
-Status Result::allocate(const daal::algorithms::Input * input, const Parameter * parameter, const int method)
+DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * input, const Parameter * parameter, const int method)
 {
     const Input * const in = static_cast<const Input *>(input);
 
@@ -57,7 +57,8 @@ Status Result::allocate(const daal::algorithms::Input * input, const Parameter *
     return s;
 }
 
-template services::Status Result::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input, const Parameter * parameter, const int method);
+template DAAL_EXPORT services::Status Result::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input, const Parameter * parameter,
+                                                                    const int method);
 
 } // namespace training
 } // namespace lasso_regression
diff --git a/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_predict_kernel.h b/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_predict_kernel.h
index 2b9225646ee..065f3a3a1c0 100644
--- a/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_predict_kernel.h
+++ b/cpp/daal/src/algorithms/multiclassclassifier/multiclassclassifier_predict_kernel.h
@@ -22,8 +22,8 @@
 //--
 */
 
-#ifndef __MULTICLASSCLASSIFIER_PREDICT_FPK_H__
-#define __MULTICLASSCLASSIFIER_PREDICT_FPK_H__
+#ifndef __MULTICLASSCLASSIFIER_PREDICT_KERNEL_H__
+#define __MULTICLASSCLASSIFIER_PREDICT_KERNEL_H__
 
 #include "data_management/data/numeric_table.h"
 #include "algorithms/model.h"
diff --git a/cpp/daal/src/algorithms/naivebayes/naivebayes_predict_kernel.h b/cpp/daal/src/algorithms/naivebayes/naivebayes_predict_kernel.h
index 3924f9c67d1..9e88cd86f17 100644
--- a/cpp/daal/src/algorithms/naivebayes/naivebayes_predict_kernel.h
+++ b/cpp/daal/src/algorithms/naivebayes/naivebayes_predict_kernel.h
@@ -21,8 +21,8 @@
 //--
 */
 
-#ifndef _NAIVEBAYES_ASSIGN_FPK_H
-#define _NAIVEBAYES_ASSIGN_FPK_H
+#ifndef __NAIVEBAYES_PREDICT_KERNEL_H__
+#define __NAIVEBAYES_PREDICT_KERNEL_H__
 
 #include "algorithms/naive_bayes/multinomial_naive_bayes_model.h"
 #include "algorithms/naive_bayes/multinomial_naive_bayes_predict_types.h"
diff --git a/cpp/daal/src/algorithms/naivebayes/naivebayes_train_kernel.h b/cpp/daal/src/algorithms/naivebayes/naivebayes_train_kernel.h
index 01e1ce238ba..6a4a40d3047 100644
--- a/cpp/daal/src/algorithms/naivebayes/naivebayes_train_kernel.h
+++ b/cpp/daal/src/algorithms/naivebayes/naivebayes_train_kernel.h
@@ -21,8 +21,8 @@
 //--
 */
 
-#ifndef __NAIVEBAYES_TRAIN_FPK_H__
-#define __NAIVEBAYES_TRAIN_FPK_H__
+#ifndef __NAIVEBAYES_TRAIN_KERNEL_H__
+#define __NAIVEBAYES_TRAIN_KERNEL_H__
 
 #include "algorithms/naive_bayes/multinomial_naive_bayes_model.h"
 #include "algorithms/naive_bayes/multinomial_naive_bayes_training_types.h"
diff --git a/cpp/daal/src/algorithms/normalization/zscore/zscore_fpt.cpp b/cpp/daal/src/algorithms/normalization/zscore/zscore_fpt.cpp
index 243a0ca679a..9ac7fe22846 100644
--- a/cpp/daal/src/algorithms/normalization/zscore/zscore_fpt.cpp
+++ b/cpp/daal/src/algorithms/normalization/zscore/zscore_fpt.cpp
@@ -43,7 +43,7 @@ namespace interface2
  * \param[in] method    Algorithm computation method
  */
 template <typename algorithmFPType>
-Status Result::allocate(const daal::algorithms::Input * input, const daal::algorithms::Parameter * parameter, const int method)
+DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * input, const daal::algorithms::Parameter * parameter, const int method)
 {
     auto impl = ResultImpl::cast(getStorage(*this));
     DAAL_CHECK(impl, ErrorNullPtr);
@@ -61,8 +61,9 @@ Status Result::allocate(const daal::algorithms::Input * input, const int method)
     return allocate<algorithmFPType>(input, NULL, method);
 }
 
+template DAAL_EXPORT services::Status Result::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input,
+                                                                    const daal::algorithms::Parameter * parameter, const int method);
 template Status Result::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input, const int method);
-template Status Result::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input, const daal::algorithms::Parameter * parameter, const int method);
 
 } // namespace interface2
 } // namespace zscore
diff --git a/cpp/daal/src/algorithms/pca/pca_partialresult_correlation_fpt.cpp b/cpp/daal/src/algorithms/pca/pca_partialresult_correlation_fpt.cpp
index cc403b6ca3b..ba577dedc2b 100644
--- a/cpp/daal/src/algorithms/pca/pca_partialresult_correlation_fpt.cpp
+++ b/cpp/daal/src/algorithms/pca/pca_partialresult_correlation_fpt.cpp
@@ -29,10 +29,12 @@ namespace algorithms
 {
 namespace pca
 {
-template services::Status PartialResult<correlationDense>::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input,
-                                                                                 const daal::algorithms::Parameter * parameter, const int method);
-template services::Status PartialResult<correlationDense>::initialize<DAAL_FPTYPE>(const daal::algorithms::Input * input,
-                                                                                   const daal::algorithms::Parameter * parameter, const int method);
+template DAAL_EXPORT services::Status PartialResult<correlationDense>::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input,
+                                                                                             const daal::algorithms::Parameter * parameter,
+                                                                                             const int method);
+template DAAL_EXPORT services::Status PartialResult<correlationDense>::initialize<DAAL_FPTYPE>(const daal::algorithms::Input * input,
+                                                                                               const daal::algorithms::Parameter * parameter,
+                                                                                               const int method);
 
 } // namespace pca
 } // namespace algorithms
diff --git a/cpp/daal/src/algorithms/pca/pca_partialresult_svd.h b/cpp/daal/src/algorithms/pca/pca_partialresult_svd.h
index 4d45de1a8f8..a4ffc43435a 100644
--- a/cpp/daal/src/algorithms/pca/pca_partialresult_svd.h
+++ b/cpp/daal/src/algorithms/pca/pca_partialresult_svd.h
@@ -41,8 +41,8 @@ namespace pca
  * \param[in] method    Computation method
  */
 template <typename algorithmFPType>
-services::Status PartialResult<svdDense>::allocate(const daal::algorithms::Input * input, const daal::algorithms::Parameter * parameter,
-                                                   const int method)
+DAAL_EXPORT services::Status PartialResult<svdDense>::allocate(const daal::algorithms::Input * input, const daal::algorithms::Parameter * parameter,
+                                                               const int method)
 {
     services::Status s;
     set(nObservationsSVD, HomogenNumericTable<algorithmFPType>::create(1, 1, NumericTableIface::doAllocate, 0, &s));
@@ -55,8 +55,8 @@ services::Status PartialResult<svdDense>::allocate(const daal::algorithms::Input
 };
 
 template <typename algorithmFPType>
-services::Status PartialResult<svdDense>::initialize(const daal::algorithms::Input * input, const daal::algorithms::Parameter * parameter,
-                                                     const int method)
+DAAL_EXPORT services::Status PartialResult<svdDense>::initialize(const daal::algorithms::Input * input, const daal::algorithms::Parameter * parameter,
+                                                                 const int method)
 {
     services::Status s;
     DAAL_CHECK_STATUS(s, get(nObservationsSVD)->assign((algorithmFPType)0.0))
diff --git a/cpp/daal/src/algorithms/pca/pca_partialresult_svd_fpt.cpp b/cpp/daal/src/algorithms/pca/pca_partialresult_svd_fpt.cpp
index f0697662c99..25456e4dbb5 100644
--- a/cpp/daal/src/algorithms/pca/pca_partialresult_svd_fpt.cpp
+++ b/cpp/daal/src/algorithms/pca/pca_partialresult_svd_fpt.cpp
@@ -29,10 +29,11 @@ namespace algorithms
 {
 namespace pca
 {
-template services::Status PartialResult<svdDense>::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input,
-                                                                         const daal::algorithms::Parameter * parameter, const int method);
-template services::Status PartialResult<svdDense>::initialize<DAAL_FPTYPE>(const daal::algorithms::Input * input,
-                                                                           const daal::algorithms::Parameter * parameter, const int method);
+template DAAL_EXPORT services::Status PartialResult<svdDense>::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input,
+                                                                                     const daal::algorithms::Parameter * parameter, const int method);
+template DAAL_EXPORT services::Status PartialResult<svdDense>::initialize<DAAL_FPTYPE>(const daal::algorithms::Input * input,
+                                                                                       const daal::algorithms::Parameter * parameter,
+                                                                                       const int method);
 
 } // namespace pca
 } // namespace algorithms
diff --git a/cpp/daal/src/algorithms/pca/pca_result_fpt.cpp b/cpp/daal/src/algorithms/pca/pca_result_fpt.cpp
index 74e05f1f9f4..23586838dcc 100644
--- a/cpp/daal/src/algorithms/pca/pca_result_fpt.cpp
+++ b/cpp/daal/src/algorithms/pca/pca_result_fpt.cpp
@@ -63,7 +63,8 @@ DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * in
  * \param[in] method        Computation method
  */
 template <typename algorithmFPType>
-services::Status Result::allocate(const daal::algorithms::PartialResult * partialResult, daal::algorithms::Parameter * parameter, const Method method)
+DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::PartialResult * partialResult, daal::algorithms::Parameter * parameter,
+                                              const Method method)
 {
     size_t nComponents           = 0;
     DAAL_UINT64 resultsToCompute = eigenvalue;
@@ -76,8 +77,8 @@ services::Status Result::allocate(const daal::algorithms::PartialResult * partia
 
 template DAAL_EXPORT services::Status Result::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input, daal::algorithms::Parameter * parameter,
                                                                     const Method method);
-template services::Status Result::allocate<DAAL_FPTYPE>(const daal::algorithms::PartialResult * partialResult,
-                                                        daal::algorithms::Parameter * parameter, const Method method);
+template DAAL_EXPORT services::Status Result::allocate<DAAL_FPTYPE>(const daal::algorithms::PartialResult * partialResult,
+                                                                    daal::algorithms::Parameter * parameter, const Method method);
 
 } // namespace interface3
 } // namespace pca
diff --git a/cpp/daal/src/algorithms/qr/qr_dense_default_kernel.h b/cpp/daal/src/algorithms/qr/qr_dense_default_kernel.h
index e667b3e81ab..b831dc8b1d9 100644
--- a/cpp/daal/src/algorithms/qr/qr_dense_default_kernel.h
+++ b/cpp/daal/src/algorithms/qr/qr_dense_default_kernel.h
@@ -21,8 +21,8 @@
 //--
 */
 
-#ifndef __QR_FPK_H__
-#define __QR_FPK_H__
+#ifndef __QR_DENSE_DEFAULT_KERNEL_H__
+#define __QR_DENSE_DEFAULT_KERNEL_H__
 
 #include "algorithms/qr/qr_batch.h"
 #include "src/algorithms/kernel.h"
diff --git a/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step2.h b/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step2.h
index 4d44d952bcf..41b61619159 100644
--- a/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step2.h
+++ b/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step2.h
@@ -41,7 +41,8 @@ namespace interface1
  * Allocates memory to store partial results of the SVD algorithm
  */
 template <typename algorithmFPType>
-Status DistributedPartialResult::allocate(const daal::algorithms::Input * input, const daal::algorithms::Parameter * parameter, const int method)
+DAAL_EXPORT services::Status DistributedPartialResult::allocate(const daal::algorithms::Input * input, const daal::algorithms::Parameter * parameter,
+                                                                const int method)
 {
     set(outputOfStep2ForStep3, KeyValueDataCollectionPtr(new KeyValueDataCollection()));
     Argument::set(finalResultFromStep2Master, ResultPtr(new Result()));
@@ -61,7 +62,7 @@ Status DistributedPartialResult::allocate(const daal::algorithms::Input * input,
  * \param[out] nBlocks         Number of rows in the input data set
  */
 template <typename algorithmFPType>
-Status DistributedPartialResult::setPartialResultStorage(KeyValueDataCollection * inCollection, size_t & nBlocks)
+DAAL_EXPORT services::Status DistributedPartialResult::setPartialResultStorage(KeyValueDataCollection * inCollection, size_t & nBlocks)
 {
     KeyValueDataCollectionPtr partialCollection = staticPointerCast<KeyValueDataCollection, SerializationIface>(Argument::get(outputOfStep2ForStep3));
     if (!partialCollection)
diff --git a/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step2_fpt.cpp b/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step2_fpt.cpp
index c41ef6b87a4..a9b1845e0a3 100644
--- a/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step2_fpt.cpp
+++ b/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step2_fpt.cpp
@@ -30,10 +30,11 @@ namespace svd
 {
 namespace interface1
 {
-template services::Status DistributedPartialResult::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input,
-                                                                          const daal::algorithms::Parameter * parameter, const int method);
-template services::Status DistributedPartialResult::setPartialResultStorage<DAAL_FPTYPE>(data_management::KeyValueDataCollection * inCollection,
-                                                                                         size_t & nBlocks);
+template DAAL_EXPORT services::Status DistributedPartialResult::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input,
+                                                                                      const daal::algorithms::Parameter * parameter,
+                                                                                      const int method);
+template DAAL_EXPORT services::Status DistributedPartialResult::setPartialResultStorage<DAAL_FPTYPE>(
+    data_management::KeyValueDataCollection * inCollection, size_t & nBlocks);
 
 } // namespace interface1
 } // namespace svd
diff --git a/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step3.h b/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step3.h
index e7b8e664c3e..7461ab7e380 100644
--- a/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step3.h
+++ b/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step3.h
@@ -42,7 +42,8 @@ namespace interface1
  * \param[in] method    Algorithm computation method
  */
 template <typename algorithmFPType>
-Status DistributedPartialResultStep3::allocate(const daal::algorithms::Input * input, const daal::algorithms::Parameter * parameter, const int method)
+DAAL_EXPORT services::Status DistributedPartialResultStep3::allocate(const daal::algorithms::Input * input,
+                                                                     const daal::algorithms::Parameter * parameter, const int method)
 {
     Argument::set(finalResultFromStep3, ResultPtr(new Result()));
     return Status();
@@ -54,7 +55,7 @@ Status DistributedPartialResultStep3::allocate(const daal::algorithms::Input * i
  * \param[in]  qCollection  DataCollection of all partial results from step 1 of the SVD algorithm in the distributed processing mode
  */
 template <typename algorithmFPType>
-Status DistributedPartialResultStep3::setPartialResultStorage(data_management::DataCollection * qCollection)
+DAAL_EXPORT services::Status DistributedPartialResultStep3::setPartialResultStorage(data_management::DataCollection * qCollection)
 {
     size_t qSize = qCollection->size();
     size_t m     = 0;
diff --git a/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step3_fpt.cpp b/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step3_fpt.cpp
index 1e5ec49aea0..24892f71a28 100644
--- a/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step3_fpt.cpp
+++ b/cpp/daal/src/algorithms/svd/svd_dense_default_distr_step3_fpt.cpp
@@ -30,9 +30,11 @@ namespace svd
 {
 namespace interface1
 {
-template services::Status DistributedPartialResultStep3::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input,
-                                                                               const daal::algorithms::Parameter * parameter, const int method);
-template services::Status DistributedPartialResultStep3::setPartialResultStorage<DAAL_FPTYPE>(data_management::DataCollection * qCollection);
+template DAAL_EXPORT services::Status DistributedPartialResultStep3::allocate<DAAL_FPTYPE>(const daal::algorithms::Input * input,
+                                                                                           const daal::algorithms::Parameter * parameter,
+                                                                                           const int method);
+template DAAL_EXPORT services::Status DistributedPartialResultStep3::setPartialResultStorage<DAAL_FPTYPE>(
+    data_management::DataCollection * qCollection);
 
 } // namespace interface1
 } // namespace svd
diff --git a/cpp/daal/src/algorithms/svd/svd_dense_default_kernel.h b/cpp/daal/src/algorithms/svd/svd_dense_default_kernel.h
index f0a19e8eb03..c8f8f333467 100644
--- a/cpp/daal/src/algorithms/svd/svd_dense_default_kernel.h
+++ b/cpp/daal/src/algorithms/svd/svd_dense_default_kernel.h
@@ -21,8 +21,8 @@
 //--
 */
 
-#ifndef __SVD_FPK_H__
-#define __SVD_FPK_H__
+#ifndef __SVD_DENSE_DEFAULT_KERNEL_H__
+#define __SVD_DENSE_DEFAULT_KERNEL_H__
 
 #include "algorithms/svd/svd_batch.h"
 #include "src/algorithms/kernel.h"
diff --git a/cpp/daal/src/externals/config_mkl.h b/cpp/daal/src/externals/config_mkl.h
index 8952ca2c40b..3c6465886e2 100644
--- a/cpp/daal/src/externals/config_mkl.h
+++ b/cpp/daal/src/externals/config_mkl.h
@@ -27,6 +27,7 @@
 #include "services/daal_defines.h"
 #include "services/env_detect.h"
 
+#include "src/externals/service_thread_declar_mkl.h"
 #include "src/externals/service_blas_mkl.h"
 #include "src/externals/service_lapack_mkl.h"
 #include "src/externals/service_math_mkl.h"
diff --git a/cpp/daal/src/externals/core_threading_win_dll.cpp b/cpp/daal/src/externals/core_threading_win_dll.cpp
index 7a25a0eddd1..f24689a341e 100644
--- a/cpp/daal/src/externals/core_threading_win_dll.cpp
+++ b/cpp/daal/src/externals/core_threading_win_dll.cpp
@@ -788,396 +788,3 @@ DAAL_EXPORT void * _getThreadPinner(bool create_pinner, void (*read_topo)(int &,
     return _getThreadPinner_ptr(create_pinner, read_topo, deleter);
 }
 #endif
-
-#define CALL_VOID_FUNC_FROM_DLL(fn_dpref, fn_name, argdecl, argcall)          \
-    typedef void(*fn_dpref##fn_name##_t) argdecl;                             \
-    static fn_dpref##fn_name##_t fn_dpref##fn_name##_ptr = NULL;              \
-    CALL_VOID_FUNC_FROM_DLL_CPU(fn_dpref, avx512_, fn_name, argdecl, argcall) \
-    CALL_VOID_FUNC_FROM_DLL_CPU(fn_dpref, avx2_, fn_name, argdecl, argcall)   \
-    CALL_VOID_FUNC_FROM_DLL_CPU(fn_dpref, sse42_, fn_name, argdecl, argcall)  \
-    CALL_VOID_FUNC_FROM_DLL_CPU(fn_dpref, sse2_, fn_name, argdecl, argcall)
-
-#define CALL_VOID_FUNC_FROM_DLL_CPU(fn_dpref, fn_cpu, fn_name, argdecl, argcall)                             \
-    extern "C" DAAL_EXPORT void fn_dpref##fn_cpu##fn_name argdecl                                            \
-    {                                                                                                        \
-        load_daal_thr_dll();                                                                                 \
-        if (fn_dpref##fn_name##_ptr == NULL)                                                                 \
-        {                                                                                                    \
-            fn_dpref##fn_name##_ptr = (fn_dpref##fn_name##_t)load_daal_thr_func(#fn_dpref #fn_cpu #fn_name); \
-        }                                                                                                    \
-        fn_dpref##fn_name##_ptr argcall;                                                                     \
-    }
-
-#if defined(_WIN64)
-    #define CALL_VOID_FUNC_FROM_DLL_CPU_MIC(fn_dpref, fn_cpu, fn_name, argdecl, argcall)                         \
-        extern "C" DAAL_EXPORT void fn_dpref##fn_cpu##fn_name argdecl                                            \
-        {                                                                                                        \
-            load_daal_thr_dll();                                                                                 \
-            if (fn_dpref##fn_name##_ptr == NULL)                                                                 \
-            {                                                                                                    \
-                fn_dpref##fn_name##_ptr = (fn_dpref##fn_name##_t)load_daal_thr_func(#fn_dpref #fn_cpu #fn_name); \
-            }                                                                                                    \
-            fn_dpref##fn_name##_ptr argcall;                                                                     \
-        }
-#else
-    #define CALL_VOID_FUNC_FROM_DLL_CPU_MIC(fn_dpref, fn_cpu, fn_name, argdecl, argcall)
-#endif
-
-#define CALL_RET_FUNC_FROM_DLL(ret_type, fn_dpref, fn_name, argdecl, argcall)          \
-    typedef ret_type(*fn_dpref##fn_name##_t) argdecl;                                  \
-    static fn_dpref##fn_name##_t fn_dpref##fn_name##_ptr = NULL;                       \
-    CALL_RET_FUNC_FROM_DLL_CPU(ret_type, fn_dpref, avx512_, fn_name, argdecl, argcall) \
-    CALL_RET_FUNC_FROM_DLL_CPU(ret_type, fn_dpref, avx2_, fn_name, argdecl, argcall)   \
-    CALL_RET_FUNC_FROM_DLL_CPU(ret_type, fn_dpref, sse42_, fn_name, argdecl, argcall)  \
-    CALL_RET_FUNC_FROM_DLL_CPU(ret_type, fn_dpref, sse2_, fn_name, argdecl, argcall)
-
-#define CALL_RET_FUNC_FROM_DLL_CPU(ret_type, fn_dpref, fn_cpu, fn_name, argdecl, argcall)                    \
-    extern "C" DAAL_EXPORT ret_type fn_dpref##fn_cpu##fn_name argdecl                                        \
-    {                                                                                                        \
-        load_daal_thr_dll();                                                                                 \
-        if (fn_dpref##fn_name##_ptr == NULL)                                                                 \
-        {                                                                                                    \
-            fn_dpref##fn_name##_ptr = (fn_dpref##fn_name##_t)load_daal_thr_func(#fn_dpref #fn_cpu #fn_name); \
-        }                                                                                                    \
-        return fn_dpref##fn_name##_ptr argcall;                                                              \
-    }
-
-#if defined(_WIN64)
-    #define CALL_RET_FUNC_FROM_DLL_CPU_MIC(ret_type, fn_dpref, fn_cpu, fn_name, argdecl, argcall)                \
-        extern "C" DAAL_EXPORT ret_type fn_dpref##fn_cpu##fn_name argdecl                                        \
-        {                                                                                                        \
-            load_daal_thr_dll();                                                                                 \
-            if (fn_dpref##fn_name##_ptr == NULL)                                                                 \
-            {                                                                                                    \
-                fn_dpref##fn_name##_ptr = (fn_dpref##fn_name##_t)load_daal_thr_func(#fn_dpref #fn_cpu #fn_name); \
-            }                                                                                                    \
-            return fn_dpref##fn_name##_ptr argcall;                                                              \
-        }
-#else
-    #define CALL_RET_FUNC_FROM_DLL_CPU_MIC(ret_type, fn_dpref, fn_cpu, fn_name, argdecl, argcall)
-#endif
-
-/* Used directly in Intel(R) oneAPI Data Analytics Library (oneDAL) */
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, dsyrk,
-                        (const char * uplo, const char * trans, const DAAL_INT * n, const DAAL_INT * k, const double * alpha, const double * a,
-                         const DAAL_INT * lda, const double * beta, double * c, const DAAL_INT * ldc),
-                        (uplo, trans, n, k, alpha, a, lda, beta, c, ldc));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, ssyrk,
-                        (const char * uplo, const char * trans, const DAAL_INT * n, const DAAL_INT * k, const float * alpha, const float * a,
-                         const DAAL_INT * lda, const float * beta, float * c, const DAAL_INT * ldc),
-                        (uplo, trans, n, k, alpha, a, lda, beta, c, ldc));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, dsyr,
-                        (const char * uplo, const DAAL_INT * n, const double * alpha, const double * x, const DAAL_INT * incx, double * a,
-                         const DAAL_INT * lda),
-                        (uplo, n, alpha, x, incx, a, lda));
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, ssyr,
-                        (const char * uplo, const DAAL_INT * n, const float * alpha, const float * x, const DAAL_INT * incx, float * a,
-                         const DAAL_INT * lda),
-                        (uplo, n, alpha, x, incx, a, lda));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, dgemm,
-                        (const char * transa, const char * transb, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, const double * alpha,
-                         const double * a, const DAAL_INT * lda, const double * b, const DAAL_INT * ldb, const double * beta, double * c,
-                         const DAAL_INT * ldc),
-                        (transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, sgemm,
-                        (const char * transa, const char * transb, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, const float * alpha,
-                         const float * a, const DAAL_INT * lda, const float * b, const DAAL_INT * ldb, const float * beta, float * c,
-                         const DAAL_INT * ldc),
-                        (transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, xdgemm,
-                        (const char * transa, const char * transb, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, const double * alpha,
-                         const double * a, const DAAL_INT * lda, const double * b, const DAAL_INT * ldb, const double * beta, double * c,
-                         const DAAL_INT * ldc),
-                        (transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, xsgemm,
-                        (const char * transa, const char * transb, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, const float * alpha,
-                         const float * a, const DAAL_INT * lda, const float * b, const DAAL_INT * ldb, const float * beta, float * c,
-                         const DAAL_INT * ldc),
-                        (transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, dsymm,
-                        (const char * side, const char * uplo, const DAAL_INT * m, const DAAL_INT * n, const double * alpha, const double * a,
-                         const DAAL_INT * lda, const double * b, const DAAL_INT * ldb, const double * beta, double * c, const DAAL_INT * ldc),
-                        (side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc));
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, ssymm,
-                        (const char * side, const char * uplo, const DAAL_INT * m, const DAAL_INT * n, const float * alpha, const float * a,
-                         const DAAL_INT * lda, const float * b, const DAAL_INT * ldb, const float * beta, float * c, const DAAL_INT * ldc),
-                        (side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, dgemv,
-                        (const char * trans, const DAAL_INT * m, const DAAL_INT * n, const double * alpha, const double * a, const DAAL_INT * lda,
-                         const double * x, const DAAL_INT * incx, const double * beta, double * y, const DAAL_INT * incy),
-                        (trans, m, n, alpha, a, lda, x, incx, beta, y, incy));
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, sgemv,
-                        (const char * trans, const DAAL_INT * m, const DAAL_INT * n, const float * alpha, const float * a, const DAAL_INT * lda,
-                         const float * x, const DAAL_INT * incx, const float * beta, float * y, const DAAL_INT * incy),
-                        (trans, m, n, alpha, a, lda, x, incx, beta, y, incy));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, daxpy,
-                        (const DAAL_INT * n, const double * alpha, const double * x, const DAAL_INT * incx, double * y, const DAAL_INT * incy),
-                        (n, alpha, x, incx, y, incy));
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, saxpy,
-                        (const DAAL_INT * n, const float * alpha, const float * x, const DAAL_INT * incx, float * y, const DAAL_INT * incy),
-                        (n, alpha, x, incx, y, incy));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, xdsyr,
-                        (const char * uplo, const DAAL_INT * n, const double * alpha, const double * x, const DAAL_INT * incx, double * a,
-                         const DAAL_INT * lda),
-                        (uplo, n, alpha, x, incx, a, lda));
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, xssyr,
-                        (const char * uplo, const DAAL_INT * n, const float * alpha, const float * x, const DAAL_INT * incx, float * a,
-                         const DAAL_INT * lda),
-                        (uplo, n, alpha, x, incx, a, lda));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, xdsyrk,
-                        (const char * uplo, const char * trans, const DAAL_INT * n, const DAAL_INT * k, const double * alpha, const double * a,
-                         const DAAL_INT * lda, const double * beta, double * c, const DAAL_INT * ldc),
-                        (uplo, trans, n, k, alpha, a, lda, beta, c, ldc));
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, xssyrk,
-                        (const char * uplo, const char * trans, const DAAL_INT * n, const DAAL_INT * k, const float * alpha, const float * a,
-                         const DAAL_INT * lda, const float * beta, float * c, const DAAL_INT * ldc),
-                        (uplo, trans, n, k, alpha, a, lda, beta, c, ldc));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, xdsymm,
-                        (const char * side, const char * uplo, const DAAL_INT * m, const DAAL_INT * n, const double * alpha, const double * a,
-                         const DAAL_INT * lda, const double * b, const DAAL_INT * ldb, const double * beta, double * c, const DAAL_INT * ldc),
-                        (side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc));
-CALL_VOID_FUNC_FROM_DLL(fpk_blas_, xssymm,
-                        (const char * side, const char * uplo, const DAAL_INT * m, const DAAL_INT * n, const float * alpha, const float * a,
-                         const DAAL_INT * lda, const float * b, const DAAL_INT * ldb, const float * beta, float * c, const DAAL_INT * ldc),
-                        (side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_spblas_, mkl_dcsrmultd,
-                        (const char * transa, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, double * a, DAAL_INT * ja, DAAL_INT * ia,
-                         double * b, DAAL_INT * jb, DAAL_INT * ib, double * c, DAAL_INT * ldc),
-                        (transa, m, n, k, a, ja, ia, b, jb, ib, c, ldc));
-CALL_VOID_FUNC_FROM_DLL(fpk_spblas_, mkl_scsrmultd,
-                        (const char * transa, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, float * a, DAAL_INT * ja, DAAL_INT * ia,
-                         float * b, DAAL_INT * jb, DAAL_INT * ib, float * c, DAAL_INT * ldc),
-                        (transa, m, n, k, a, ja, ia, b, jb, ib, c, ldc));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_spblas_, mkl_dcsrmv,
-                        (const char * transa, const DAAL_INT * m, const DAAL_INT * k, const double * alpha, const char * matdescra,
-                         const double * val, const DAAL_INT * indx, const DAAL_INT * pntrb, const DAAL_INT * pntre, const double * x,
-                         const double * beta, double * y),
-                        (transa, m, k, alpha, matdescra, val, indx, pntrb, pntre, x, beta, y));
-CALL_VOID_FUNC_FROM_DLL(fpk_spblas_, mkl_scsrmv,
-                        (const char * transa, const DAAL_INT * m, const DAAL_INT * k, const float * alpha, const char * matdescra, const float * val,
-                         const DAAL_INT * indx, const DAAL_INT * pntrb, const DAAL_INT * pntre, const float * x, const float * beta, float * y),
-                        (transa, m, k, alpha, matdescra, val, indx, pntrb, pntre, x, beta, y));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dpotrf, (const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, DAAL_INT * info, int iuplo),
-                        (uplo, n, a, lda, info, iuplo));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, spotrf, (const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, DAAL_INT * info, int iuplo),
-                        (uplo, n, a, lda, info, iuplo));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dpotrs,
-                        (const char * uplo, const DAAL_INT * n, const DAAL_INT * nrhs, const double * a, const DAAL_INT * lda, double * b,
-                         const DAAL_INT * ldb, DAAL_INT * info, int iuplo),
-                        (uplo, n, nrhs, a, lda, b, ldb, info, iuplo));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, spotrs,
-                        (const char * uplo, const DAAL_INT * n, const DAAL_INT * nrhs, const float * a, const DAAL_INT * lda, float * b,
-                         const DAAL_INT * ldb, DAAL_INT * info, int iuplo),
-                        (uplo, n, nrhs, a, lda, b, ldb, info, iuplo));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dgetrf,
-                        (const DAAL_INT * m, const DAAL_INT * n, const double * a, const DAAL_INT * lda, const DAAL_INT * ipiv, DAAL_INT * info),
-                        (m, n, a, lda, ipiv, info));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, sgetrf,
-                        (const DAAL_INT * m, const DAAL_INT * n, const float * a, const DAAL_INT * lda, const DAAL_INT * ipiv, DAAL_INT * info),
-                        (m, n, a, lda, ipiv, info));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dgetrs,
-                        (const char * trans, const DAAL_INT * n, const DAAL_INT * nrhs, const double * a, const DAAL_INT * lda, const DAAL_INT * ipiv,
-                         double * b, const DAAL_INT * ldb, DAAL_INT * info, int iuplo),
-                        (trans, n, nrhs, a, lda, ipiv, b, ldb, info, iuplo));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, sgetrs,
-                        (const char * trans, const DAAL_INT * n, const DAAL_INT * nrhs, const float * a, const DAAL_INT * lda, const DAAL_INT * ipiv,
-                         float * b, const DAAL_INT * ldb, DAAL_INT * info, int iuplo),
-                        (trans, n, nrhs, a, lda, ipiv, b, ldb, info, iuplo));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dpotri, (const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, DAAL_INT * info, int iuplo),
-                        (uplo, n, a, lda, info, iuplo));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, spotri, (const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, DAAL_INT * info, int iuplo),
-                        (uplo, n, a, lda, info, iuplo));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dgerqf,
-                        (const DAAL_INT * m, const DAAL_INT * n, double * a, const DAAL_INT * lda, double * tau, double * work,
-                         const DAAL_INT * lwork, DAAL_INT * info),
-                        (m, n, a, lda, tau, work, lwork, info));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, sgerqf,
-                        (const DAAL_INT * m, const DAAL_INT * n, float * a, const DAAL_INT * lda, float * tau, float * work, const DAAL_INT * lwork,
-                         DAAL_INT * info),
-                        (m, n, a, lda, tau, work, lwork, info));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dormrq,
-                        (const char * side, const char * trans, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, const double * a,
-                         const DAAL_INT * lda, const double * tau, double * c, const DAAL_INT * ldc, double * work, const DAAL_INT * lwork,
-                         DAAL_INT * info, int iside, int itrans),
-                        (side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info, iside, itrans));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, sormrq,
-                        (const char * side, const char * trans, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, const float * a,
-                         const DAAL_INT * lda, const float * tau, float * c, const DAAL_INT * ldc, float * work, const DAAL_INT * lwork,
-                         DAAL_INT * info, int iside, int itrans),
-                        (side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info, iside, itrans));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dormqr,
-                        (const char * side, const char * trans, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, const double * a,
-                         const DAAL_INT * lda, const double * tau, double * c, const DAAL_INT * ldc, double * work, const DAAL_INT * lwork,
-                         DAAL_INT * info, int iside, int itrans),
-                        (side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info, iside, itrans));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, sormqr,
-                        (const char * side, const char * trans, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, const float * a,
-                         const DAAL_INT * lda, const float * tau, float * c, const DAAL_INT * ldc, float * work, const DAAL_INT * lwork,
-                         DAAL_INT * info, int iside, int itrans),
-                        (side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info, iside, itrans));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dtrtrs,
-                        (const char * uplo, const char * trans, const char * diag, const DAAL_INT * n, const DAAL_INT * nrhs, const double * a,
-                         const DAAL_INT * lda, double * b, const DAAL_INT * ldb, DAAL_INT * info, int iuplo, int itrans, int idiag),
-                        (uplo, trans, diag, n, nrhs, a, lda, b, ldb, info, iuplo, itrans, idiag));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, strtrs,
-                        (const char * uplo, const char * trans, const char * diag, const DAAL_INT * n, const DAAL_INT * nrhs, const float * a,
-                         const DAAL_INT * lda, float * b, const DAAL_INT * ldb, DAAL_INT * info, int iuplo, int itrans, int idiag),
-                        (uplo, trans, diag, n, nrhs, a, lda, b, ldb, info, iuplo, itrans, idiag));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dpptrf, (const char * uplo, const DAAL_INT * n, double * ap, DAAL_INT * info, int iuplo),
-                        (uplo, n, ap, info, iuplo));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, spptrf, (const char * uplo, const DAAL_INT * n, float * ap, DAAL_INT * info, int iuplo),
-                        (uplo, n, ap, info, iuplo));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dgeqrf,
-                        (const DAAL_INT * m, const DAAL_INT * n, double * a, const DAAL_INT * lda, double * tau, double * work,
-                         const DAAL_INT * lwork, DAAL_INT * info),
-                        (m, n, a, lda, tau, work, lwork, info));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, sgeqrf,
-                        (const DAAL_INT * m, const DAAL_INT * n, float * a, const DAAL_INT * lda, float * tau, float * work, const DAAL_INT * lwork,
-                         DAAL_INT * info),
-                        (m, n, a, lda, tau, work, lwork, info));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dgeqp3,
-                        (const DAAL_INT * m, const DAAL_INT * n, double * a, const DAAL_INT * lda, DAAL_INT * jpvt, double * tau, double * work,
-                         const DAAL_INT * lwork, DAAL_INT * info),
-                        (m, n, a, lda, jpvt, tau, work, lwork, info));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, sgeqp3,
-                        (const DAAL_INT * m, const DAAL_INT * n, float * a, const DAAL_INT * lda, DAAL_INT * jpvt, float * tau, float * work,
-                         const DAAL_INT * lwork, DAAL_INT * info),
-                        (m, n, a, lda, jpvt, tau, work, lwork, info));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dorgqr,
-                        (const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, double * a, const DAAL_INT * lda, const double * tau,
-                         double * work, const DAAL_INT * lwork, DAAL_INT * info),
-                        (m, n, k, a, lda, tau, work, lwork, info));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, sorgqr,
-                        (const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, float * a, const DAAL_INT * lda, const float * tau, float * work,
-                         const DAAL_INT * lwork, DAAL_INT * info),
-                        (m, n, k, a, lda, tau, work, lwork, info));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dgesvd,
-                        (const char * jobu, const char * jobvt, const DAAL_INT * m, const DAAL_INT * n, double * a, const DAAL_INT * lda, double * s,
-                         double * u, const DAAL_INT * ldu, double * vt, const DAAL_INT * ldvt, double * work, const DAAL_INT * lwork, DAAL_INT * info,
-                         int ijobu, int ijobvt),
-                        (jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, info, ijobu, ijobvt));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, sgesvd,
-                        (const char * jobu, const char * jobvt, const DAAL_INT * m, const DAAL_INT * n, float * a, const DAAL_INT * lda, float * s,
-                         float * u, const DAAL_INT * ldu, float * vt, const DAAL_INT * ldvt, float * work, const DAAL_INT * lwork, DAAL_INT * info,
-                         int ijobu, int ijobvt),
-                        (jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, work, lwork, info, ijobu, ijobvt));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dsyevd,
-                        (const char * jobz, const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, double * w, double * work,
-                         const DAAL_INT * lwork, DAAL_INT * iwork, const DAAL_INT * liwork, DAAL_INT * info, int ijobz, int iuplo),
-                        (jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info, ijobz, iuplo));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, ssyevd,
-                        (const char * jobz, const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, float * w, float * work,
-                         const DAAL_INT * lwork, DAAL_INT * iwork, const DAAL_INT * liwork, DAAL_INT * info, int ijobz, int iuplo),
-                        (jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info, ijobz, iuplo));
-
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, dsyev,
-                        (const char * jobz, const char * uplo, const DAAL_INT * n, double * a, const DAAL_INT * lda, double * w, double * work,
-                         const DAAL_INT * lwork, DAAL_INT * info, int ijobz, int iuplo),
-                        (jobz, uplo, n, a, lda, w, work, lwork, info, ijobz, iuplo));
-CALL_VOID_FUNC_FROM_DLL(fpk_lapack_, ssyev,
-                        (const char * jobz, const char * uplo, const DAAL_INT * n, float * a, const DAAL_INT * lda, float * w, float * work,
-                         const DAAL_INT * lwork, DAAL_INT * info, int ijobz, int iuplo),
-                        (jobz, uplo, n, a, lda, w, work, lwork, info, ijobz, iuplo));
-
-CALL_RET_FUNC_FROM_DLL(double, fpk_blas_, xddot,
-                       (const DAAL_INT * n, const double * x, const DAAL_INT * incx, const double * y, const DAAL_INT * incy), (n, x, incx, y, incy));
-CALL_RET_FUNC_FROM_DLL(float, fpk_blas_, xsdot, (const DAAL_INT * n, const float * x, const DAAL_INT * incx, const float * y, const DAAL_INT * incy),
-                       (n, x, incx, y, incy));
-
-#define CSRMM_ARGS(FPTYPE)                                                                                                                       \
-    const char *transa, const DAAL_INT *m, const DAAL_INT *n, const DAAL_INT *k, const FPTYPE *alpha, const char *matdescra, const FPTYPE *val,  \
-        const DAAL_INT *indx, const DAAL_INT *pntrb, const DAAL_INT *pntre, const FPTYPE *b, const DAAL_INT *ldb, const FPTYPE *beta, FPTYPE *c, \
-        const DAAL_INT *ldc
-
-CALL_VOID_FUNC_FROM_DLL(fpk_spblas_, mkl_scsrmm, (CSRMM_ARGS(float)),
-                        (transa, m, n, k, alpha, matdescra, val, indx, pntrb, pntre, b, ldb, beta, c, ldc));
-CALL_VOID_FUNC_FROM_DLL(fpk_spblas_, mkl_dcsrmm, (CSRMM_ARGS(double)),
-                        (transa, m, n, k, alpha, matdescra, val, indx, pntrb, pntre, b, ldb, beta, c, ldc));
-
-typedef int IppStatus;
-typedef unsigned char Ipp8u;
-typedef unsigned short Ipp16u;
-typedef unsigned int Ipp32u;
-typedef signed short Ipp16s;
-typedef signed int Ipp32s;
-typedef float Ipp32f;
-typedef double Ipp64f;
-
-/* Used in Intel(R) oneAPI Data Analytics Library (oneDAL) via SS */
-CALL_RET_FUNC_FROM_DLL(IppStatus, fpk_dft_, ippsSortRadixAscend_64f_I, (Ipp64f * pSrcDst, Ipp64f * pTmp, Ipp32s len), (pSrcDst, pTmp, len));
-CALL_RET_FUNC_FROM_DLL(IppStatus, fpk_dft_, ippsSortRadixAscend_32f_I, (Ipp32f * pSrcDst, Ipp32f * pTmp, Ipp32s len), (pSrcDst, pTmp, len));
-
-#define CALL_VOID_FUNC_FROM_DLL_ALONE(fn_dpref, fn_name, argdecl, argcall)                           \
-    typedef void(*fn_dpref##fn_name##_t) argdecl;                                                    \
-    static fn_dpref##fn_name##_t fn_dpref##fn_name##_ptr = NULL;                                     \
-    extern "C" DAAL_EXPORT void fn_dpref##fn_name argdecl                                            \
-    {                                                                                                \
-        load_daal_thr_dll();                                                                         \
-        if (fn_dpref##fn_name##_ptr == NULL)                                                         \
-        {                                                                                            \
-            fn_dpref##fn_name##_ptr = (fn_dpref##fn_name##_t)load_daal_thr_func(#fn_dpref #fn_name); \
-        }                                                                                            \
-        fn_dpref##fn_name##_ptr argcall;                                                             \
-    }
-
-#define CALL_RET_FUNC_FROM_DLL_ALONE(ret_type, fn_dpref, fn_name, argdecl, argcall)                  \
-    typedef ret_type(*fn_dpref##fn_name##_t) argdecl;                                                \
-    static fn_dpref##fn_name##_t fn_dpref##fn_name##_ptr = NULL;                                     \
-    extern "C" DAAL_EXPORT ret_type fn_dpref##fn_name argdecl                                        \
-    {                                                                                                \
-        load_daal_thr_dll();                                                                         \
-        if (fn_dpref##fn_name##_ptr == NULL)                                                         \
-        {                                                                                            \
-            fn_dpref##fn_name##_ptr = (fn_dpref##fn_name##_t)load_daal_thr_func(#fn_dpref #fn_name); \
-        }                                                                                            \
-        return fn_dpref##fn_name##_ptr argcall;                                                      \
-    }
-
-CALL_VOID_FUNC_FROM_DLL_ALONE(fpk_serv_, set_num_threads, (int nth), (nth));
-CALL_RET_FUNC_FROM_DLL_ALONE(int, fpk_serv_, get_max_threads, (void), ());
-CALL_RET_FUNC_FROM_DLL_ALONE(int, fpk_serv_, set_num_threads_local, (int nth), (nth));
-CALL_RET_FUNC_FROM_DLL_ALONE(int, fpk_serv_, get_ncpus, (void), ());
-CALL_RET_FUNC_FROM_DLL_ALONE(int, fpk_serv_, get_ncorespercpu, (void), ());
-CALL_RET_FUNC_FROM_DLL_ALONE(int, fpk_serv_, get_ht, (void), ());
-CALL_RET_FUNC_FROM_DLL_ALONE(int, fpk_serv_, get_nlogicalcores, (void), ());
-CALL_RET_FUNC_FROM_DLL_ALONE(int, fpk_serv_, cpuisknm, (void), ());
-CALL_RET_FUNC_FROM_DLL_ALONE(int, fpk_serv_, enable_instructions, (int nth), (nth));
-CALL_RET_FUNC_FROM_DLL_ALONE(int, fpk_serv_, memmove_s, (void * dest, size_t dmax, const void * src, size_t smax), (dest, dmax, src, smax));
-
-typedef void (*func_type)(DAAL_INT, DAAL_INT, DAAL_INT, void *);
-
-CALL_VOID_FUNC_FROM_DLL_ALONE(fpk_vsl_serv_, threader_for, (DAAL_INT n, DAAL_INT threads_request, void * a, func_type func),
-                              (n, threads_request, a, func));
-CALL_VOID_FUNC_FROM_DLL_ALONE(fpk_vsl_serv_, threader_for_ordered, (DAAL_INT n, DAAL_INT threads_request, void * a, func_type func),
-                              (n, threads_request, a, func));
-CALL_VOID_FUNC_FROM_DLL_ALONE(fpk_vsl_serv_, threader_sections, (DAAL_INT threads_request, void * a, func_type func), (threads_request, a, func));
-CALL_VOID_FUNC_FROM_DLL_ALONE(fpk_vsl_serv_, threader_ordered, (DAAL_INT i, DAAL_INT th_idx, DAAL_INT th_num, void * a, func_type func),
-                              (i, th_idx, th_num, a, func));
-CALL_RET_FUNC_FROM_DLL_ALONE(DAAL_INT, fpk_vsl_serv_, threader_get_num_threads_limit, (void), ());
diff --git a/cpp/daal/src/externals/service_blas_mkl.h b/cpp/daal/src/externals/service_blas_mkl.h
old mode 100755
new mode 100644
index 58b505a6067..b30fd5f6ee9
--- a/cpp/daal/src/externals/service_blas_mkl.h
+++ b/cpp/daal/src/externals/service_blas_mkl.h
@@ -25,65 +25,11 @@
 #define __SERVICE_BLAS_MKL_H__
 
 #include "services/daal_defines.h"
-#include "mkl_daal.h"
+#include <mkl.h>
 
-#if !defined(__DAAL_CONCAT4)
-    #define __DAAL_CONCAT4(a, b, c, d)  __DAAL_CONCAT41(a, b, c, d)
-    #define __DAAL_CONCAT41(a, b, c, d) a##b##c##d
-#endif
-
-#if !defined(__DAAL_CONCAT5)
-    #define __DAAL_CONCAT5(a, b, c, d, e)  __DAAL_CONCAT51(a, b, c, d, e)
-    #define __DAAL_CONCAT51(a, b, c, d, e) a##b##c##d##e
-#endif
-
-#if defined(__APPLE__)
-    #define __DAAL_MKL_SSE2  avx_
-    #define __DAAL_MKL_SSE42 avx_
-#else
-    #define __DAAL_MKL_SSE2  sse2_
-    #define __DAAL_MKL_SSE42 sse42_
-#endif
-
-#define __DAAL_MKLFN(f_cpu, f_pref, f_name)              __DAAL_CONCAT4(fpk_, f_pref, f_cpu, f_name)
-#define __DAAL_MKLFN_CALL(f_pref, f_name, f_args)        __DAAL_MKLFN_CALL1(f_pref, f_name, f_args)
-#define __DAAL_MKLFN_CALL_RETURN(f_pref, f_name, f_args) __DAAL_MKLFN_CALL2(f_pref, f_name, f_args)
-
-#define __DAAL_MKLFN_CALL1(f_pref, f_name, f_args)             \
-    if (avx512 == cpu)                                         \
-    {                                                          \
-        __DAAL_MKLFN(avx512_, f_pref, f_name) f_args;          \
-    }                                                          \
-    if (avx2 == cpu)                                           \
-    {                                                          \
-        __DAAL_MKLFN(avx2_, f_pref, f_name) f_args;            \
-    }                                                          \
-    if (sse42 == cpu)                                          \
-    {                                                          \
-        __DAAL_MKLFN(__DAAL_MKL_SSE42, f_pref, f_name) f_args; \
-    }                                                          \
-    if (sse2 == cpu)                                           \
-    {                                                          \
-        __DAAL_MKLFN(__DAAL_MKL_SSE2, f_pref, f_name) f_args;  \
-    }
+#define __DAAL_MKLFN_CALL_BLAS(f_name, f_args) f_name f_args;
 
-#define __DAAL_MKLFN_CALL2(f_pref, f_name, f_args)                    \
-    if (avx512 == cpu)                                                \
-    {                                                                 \
-        return __DAAL_MKLFN(avx512_, f_pref, f_name) f_args;          \
-    }                                                                 \
-    if (avx2 == cpu)                                                  \
-    {                                                                 \
-        return __DAAL_MKLFN(avx2_, f_pref, f_name) f_args;            \
-    }                                                                 \
-    if (sse42 == cpu)                                                 \
-    {                                                                 \
-        return __DAAL_MKLFN(__DAAL_MKL_SSE42, f_pref, f_name) f_args; \
-    }                                                                 \
-    if (sse2 == cpu)                                                  \
-    {                                                                 \
-        return __DAAL_MKLFN(__DAAL_MKL_SSE2, f_pref, f_name) f_args;  \
-    }
+#define __DAAL_MKLFN_CALL_RETURN_BLAS(f_name, f_args, res) res = f_name f_args;
 
 namespace daal
 {
@@ -107,87 +53,96 @@ struct MklBlas<double, cpu>
     static void xsyrk(char * uplo, char * trans, DAAL_INT * p, DAAL_INT * n, double * alpha, double * a, DAAL_INT * lda, double * beta, double * ata,
                       DAAL_INT * ldata)
     {
-        __DAAL_MKLFN_CALL(blas_, dsyrk, (uplo, trans, p, n, alpha, a, lda, beta, ata, ldata));
+        __DAAL_MKLFN_CALL_BLAS(dsyrk, (uplo, trans, (MKL_INT *)p, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, beta, ata, (MKL_INT *)ldata));
     }
 
     static void xxsyrk(char * uplo, char * trans, DAAL_INT * p, DAAL_INT * n, double * alpha, double * a, DAAL_INT * lda, double * beta, double * ata,
                        DAAL_INT * ldata)
     {
-        __DAAL_MKLFN_CALL(blas_, xdsyrk, (uplo, trans, p, n, alpha, a, lda, beta, ata, ldata));
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_BLAS(dsyrk, (uplo, trans, (MKL_INT *)p, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, beta, ata, (MKL_INT *)ldata));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xsyr(const char * uplo, const DAAL_INT * n, const double * alpha, const double * x, const DAAL_INT * incx, double * a,
                      const DAAL_INT * lda)
     {
-        __DAAL_MKLFN_CALL(blas_, dsyr, (uplo, n, alpha, x, incx, a, lda));
+        __DAAL_MKLFN_CALL_BLAS(dsyr, (uplo, (MKL_INT *)n, alpha, x, (MKL_INT *)incx, a, (MKL_INT *)lda));
     }
 
     static void xxsyr(const char * uplo, const DAAL_INT * n, const double * alpha, const double * x, const DAAL_INT * incx, double * a,
                       const DAAL_INT * lda)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(blas_, dsyr, (uplo, n, alpha, x, incx, a, lda));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_BLAS(dsyr, (uplo, (MKL_INT *)n, alpha, x, (MKL_INT *)incx, a, (MKL_INT *)lda));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xgemm(const char * transa, const char * transb, const DAAL_INT * p, const DAAL_INT * ny, const DAAL_INT * n, const double * alpha,
                       const double * a, const DAAL_INT * lda, const double * y, const DAAL_INT * ldy, const double * beta, double * aty,
                       const DAAL_INT * ldaty)
     {
-        __DAAL_MKLFN_CALL(blas_, dgemm, (transa, transb, p, ny, n, alpha, a, lda, y, ldy, beta, aty, ldaty));
+        __DAAL_MKLFN_CALL_BLAS(dgemm, (transa, transb, (MKL_INT *)p, (MKL_INT *)ny, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, y, (MKL_INT *)ldy, beta,
+                                       aty, (MKL_INT *)ldaty));
     }
 
     static void xxgemm(const char * transa, const char * transb, const DAAL_INT * p, const DAAL_INT * ny, const DAAL_INT * n, const double * alpha,
                        const double * a, const DAAL_INT * lda, const double * y, const DAAL_INT * ldy, const double * beta, double * aty,
                        const DAAL_INT * ldaty)
     {
-        __DAAL_MKLFN_CALL(blas_, xdgemm, (transa, transb, p, ny, n, alpha, a, lda, y, ldy, beta, aty, ldaty));
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_BLAS(dgemm, (transa, transb, (MKL_INT *)p, (MKL_INT *)ny, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, y, (MKL_INT *)ldy, beta,
+                                       aty, (MKL_INT *)ldaty));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xsymm(const char * side, const char * uplo, const DAAL_INT * m, const DAAL_INT * n, const double * alpha, const double * a,
                       const DAAL_INT * lda, const double * b, const DAAL_INT * ldb, const double * beta, double * c, const DAAL_INT * ldc)
     {
-        __DAAL_MKLFN_CALL(blas_, dsymm, (side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc));
+        __DAAL_MKLFN_CALL_BLAS(dsymm, (side, uplo, (MKL_INT *)m, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, b, (MKL_INT *)ldb, beta, c, (MKL_INT *)ldc));
     }
 
     static void xxsymm(char * side, char * uplo, DAAL_INT * m, DAAL_INT * n, double * alpha, double * a, DAAL_INT * lda, double * b, DAAL_INT * ldb,
                        double * beta, double * c, DAAL_INT * ldc)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(blas_, dsymm, (side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_BLAS(dsymm, (side, uplo, (MKL_INT *)m, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, b, (MKL_INT *)ldb, beta, c, (MKL_INT *)ldc));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xgemv(const char * trans, const DAAL_INT * m, const DAAL_INT * n, const double * alpha, const double * a, const DAAL_INT * lda,
                       const double * x, const DAAL_INT * incx, const double * beta, double * y, const DAAL_INT * incy)
     {
-        __DAAL_MKLFN_CALL(blas_, dgemv, (trans, m, n, alpha, a, lda, x, incx, beta, y, incy));
+        __DAAL_MKLFN_CALL_BLAS(dgemv, (trans, (MKL_INT *)m, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, x, (MKL_INT *)incx, beta, y, (MKL_INT *)incy));
     }
 
     static void xxgemv(const char * trans, const DAAL_INT * m, const DAAL_INT * n, const double * alpha, const double * a, const DAAL_INT * lda,
                        const double * x, const DAAL_INT * incx, const double * beta, double * y, const DAAL_INT * incy)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(blas_, dgemv, (trans, m, n, alpha, a, lda, x, incx, beta, y, incy));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_BLAS(dgemv, (trans, (MKL_INT *)m, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, x, (MKL_INT *)incx, beta, y, (MKL_INT *)incy));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xaxpy(DAAL_INT * n, double * a, double * x, DAAL_INT * incx, double * y, DAAL_INT * incy)
     {
-        __DAAL_MKLFN_CALL(blas_, daxpy, (n, a, x, incx, y, incy));
+        __DAAL_MKLFN_CALL_BLAS(daxpy, ((MKL_INT *)n, a, x, (MKL_INT *)incx, y, (MKL_INT *)incy));
     }
 
     static void xxaxpy(const DAAL_INT * n, const double * a, const double * x, const DAAL_INT * incx, double * y, const DAAL_INT * incy)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(blas_, daxpy, (n, a, x, incx, y, incy));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_BLAS(daxpy, ((MKL_INT *)n, a, x, (MKL_INT *)incx, y, (MKL_INT *)incy));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static double xxdot(const DAAL_INT * n, const double * x, const DAAL_INT * incx, const double * y, const DAAL_INT * incy)
     {
-        __DAAL_MKLFN_CALL_RETURN(blas_, xddot, (n, x, incx, y, incy));
-        return 0;
+        int old_nthr = mkl_set_num_threads_local(1);
+        double res;
+        __DAAL_MKLFN_CALL_RETURN_BLAS(ddot, ((MKL_INT *)n, x, (MKL_INT *)incx, y, (MKL_INT *)incy), res);
+        mkl_set_num_threads_local(old_nthr);
+        return res;
     }
 };
 
@@ -203,87 +158,96 @@ struct MklBlas<float, cpu>
     static void xsyrk(char * uplo, char * trans, DAAL_INT * p, DAAL_INT * n, float * alpha, float * a, DAAL_INT * lda, float * beta, float * ata,
                       DAAL_INT * ldata)
     {
-        __DAAL_MKLFN_CALL(blas_, ssyrk, (uplo, trans, p, n, alpha, a, lda, beta, ata, ldata));
+        __DAAL_MKLFN_CALL_BLAS(ssyrk, (uplo, trans, (MKL_INT *)p, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, beta, ata, (MKL_INT *)ldata));
     }
 
     static void xxsyrk(char * uplo, char * trans, DAAL_INT * p, DAAL_INT * n, float * alpha, float * a, DAAL_INT * lda, float * beta, float * ata,
                        DAAL_INT * ldata)
     {
-        __DAAL_MKLFN_CALL(blas_, xssyrk, (uplo, trans, p, n, alpha, a, lda, beta, ata, ldata));
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_BLAS(ssyrk, (uplo, trans, (MKL_INT *)p, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, beta, ata, (MKL_INT *)ldata));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xsyr(const char * uplo, const DAAL_INT * n, const float * alpha, const float * x, const DAAL_INT * incx, float * a,
                      const DAAL_INT * lda)
     {
-        __DAAL_MKLFN_CALL(blas_, ssyr, (uplo, n, alpha, x, incx, a, lda));
+        __DAAL_MKLFN_CALL_BLAS(ssyr, (uplo, (MKL_INT *)n, alpha, x, (MKL_INT *)incx, a, (MKL_INT *)lda));
     }
 
     static void xxsyr(const char * uplo, const DAAL_INT * n, const float * alpha, const float * x, const DAAL_INT * incx, float * a,
                       const DAAL_INT * lda)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(blas_, ssyr, (uplo, n, alpha, x, incx, a, lda));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_BLAS(ssyr, (uplo, (MKL_INT *)n, alpha, x, (MKL_INT *)incx, a, (MKL_INT *)lda));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xgemm(const char * transa, const char * transb, const DAAL_INT * p, const DAAL_INT * ny, const DAAL_INT * n, const float * alpha,
                       const float * a, const DAAL_INT * lda, const float * y, const DAAL_INT * ldy, const float * beta, float * aty,
                       const DAAL_INT * ldaty)
     {
-        __DAAL_MKLFN_CALL(blas_, sgemm, (transa, transb, p, ny, n, alpha, a, lda, y, ldy, beta, aty, ldaty));
+        __DAAL_MKLFN_CALL_BLAS(sgemm, (transa, transb, (MKL_INT *)p, (MKL_INT *)ny, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, y, (MKL_INT *)ldy, beta,
+                                       aty, (MKL_INT *)ldaty));
     }
 
     static void xxgemm(const char * transa, const char * transb, const DAAL_INT * p, const DAAL_INT * ny, const DAAL_INT * n, const float * alpha,
                        const float * a, const DAAL_INT * lda, const float * y, const DAAL_INT * ldy, const float * beta, float * aty,
                        const DAAL_INT * ldaty)
     {
-        __DAAL_MKLFN_CALL(blas_, xsgemm, (transa, transb, p, ny, n, alpha, a, lda, y, ldy, beta, aty, ldaty));
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_BLAS(sgemm, (transa, transb, (MKL_INT *)p, (MKL_INT *)ny, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, y, (MKL_INT *)ldy, beta,
+                                       aty, (MKL_INT *)ldaty));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xsymm(const char * side, const char * uplo, const DAAL_INT * m, const DAAL_INT * n, const float * alpha, const float * a,
                       const DAAL_INT * lda, const float * b, const DAAL_INT * ldb, const float * beta, float * c, const DAAL_INT * ldc)
     {
-        __DAAL_MKLFN_CALL(blas_, ssymm, (side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc));
+        __DAAL_MKLFN_CALL_BLAS(ssymm, (side, uplo, (MKL_INT *)m, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, b, (MKL_INT *)ldb, beta, c, (MKL_INT *)ldc));
     }
 
     static void xxsymm(char * side, char * uplo, DAAL_INT * m, DAAL_INT * n, float * alpha, float * a, DAAL_INT * lda, float * b, DAAL_INT * ldb,
                        float * beta, float * c, DAAL_INT * ldc)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(blas_, ssymm, (side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_BLAS(ssymm, (side, uplo, (MKL_INT *)m, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, b, (MKL_INT *)ldb, beta, c, (MKL_INT *)ldc));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xgemv(const char * trans, const DAAL_INT * m, const DAAL_INT * n, const float * alpha, const float * a, const DAAL_INT * lda,
                       const float * x, const DAAL_INT * incx, const float * beta, float * y, const DAAL_INT * incy)
     {
-        __DAAL_MKLFN_CALL(blas_, sgemv, (trans, m, n, alpha, a, lda, x, incx, beta, y, incy));
+        __DAAL_MKLFN_CALL_BLAS(sgemv, (trans, (MKL_INT *)m, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, x, (MKL_INT *)incx, beta, y, (MKL_INT *)incy));
     }
 
     static void xxgemv(const char * trans, const DAAL_INT * m, const DAAL_INT * n, const float * alpha, const float * a, const DAAL_INT * lda,
                        const float * x, const DAAL_INT * incx, const float * beta, float * y, const DAAL_INT * incy)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(blas_, sgemv, (trans, m, n, alpha, a, lda, x, incx, beta, y, incy));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_BLAS(sgemv, (trans, (MKL_INT *)m, (MKL_INT *)n, alpha, a, (MKL_INT *)lda, x, (MKL_INT *)incx, beta, y, (MKL_INT *)incy));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xaxpy(DAAL_INT * n, float * a, float * x, DAAL_INT * incx, float * y, DAAL_INT * incy)
     {
-        __DAAL_MKLFN_CALL(blas_, saxpy, (n, a, x, incx, y, incy));
+        __DAAL_MKLFN_CALL_BLAS(saxpy, ((MKL_INT *)n, a, x, (MKL_INT *)incx, y, (MKL_INT *)incy));
     }
 
     static void xxaxpy(const DAAL_INT * n, const float * a, const float * x, const DAAL_INT * incx, float * y, const DAAL_INT * incy)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(blas_, saxpy, (n, a, x, incx, y, incy));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_BLAS(saxpy, ((MKL_INT *)n, a, x, (MKL_INT *)incx, y, (MKL_INT *)incy));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static float xxdot(const DAAL_INT * n, const float * x, const DAAL_INT * incx, const float * y, const DAAL_INT * incy)
     {
-        __DAAL_MKLFN_CALL_RETURN(blas_, xsdot, (n, x, incx, y, incy));
-        return 0;
+        int old_nthr = mkl_set_num_threads_local(1);
+        float res;
+        __DAAL_MKLFN_CALL_RETURN_BLAS(sdot, ((MKL_INT *)n, x, (MKL_INT *)incx, y, (MKL_INT *)incy), res);
+        mkl_set_num_threads_local(old_nthr);
+        return res;
     }
 };
 
diff --git a/cpp/daal/src/externals/service_lapack_mkl.h b/cpp/daal/src/externals/service_lapack_mkl.h
index 6bcbef317bc..37a81c3262f 100644
--- a/cpp/daal/src/externals/service_lapack_mkl.h
+++ b/cpp/daal/src/externals/service_lapack_mkl.h
@@ -25,65 +25,11 @@
 #define __SERVICE_LAPACK_MKL_H__
 
 #include "services/daal_defines.h"
-#include "mkl_daal.h"
+#include <mkl.h>
 
-#if !defined(__DAAL_CONCAT4)
-    #define __DAAL_CONCAT4(a, b, c, d)  __DAAL_CONCAT41(a, b, c, d)
-    #define __DAAL_CONCAT41(a, b, c, d) a##b##c##d
-#endif
-
-#if !defined(__DAAL_CONCAT5)
-    #define __DAAL_CONCAT5(a, b, c, d, e)  __DAAL_CONCAT51(a, b, c, d, e)
-    #define __DAAL_CONCAT51(a, b, c, d, e) a##b##c##d##e
-#endif
-
-#if defined(__APPLE__)
-    #define __DAAL_MKL_SSE2  avx_
-    #define __DAAL_MKL_SSE42 avx_
-#else
-    #define __DAAL_MKL_SSE2  sse2_
-    #define __DAAL_MKL_SSE42 sse42_
-#endif
+#define __DAAL_MKLFN_CALL_LAPACK(f_name, f_args) f_name f_args;
 
-#define __DAAL_MKLFN(f_cpu, f_pref, f_name)              __DAAL_CONCAT4(fpk_, f_pref, f_cpu, f_name)
-#define __DAAL_MKLFN_CALL(f_pref, f_name, f_args)        __DAAL_MKLFN_CALL1(f_pref, f_name, f_args)
-#define __DAAL_MKLFN_CALL_RETURN(f_pref, f_name, f_args) __DAAL_MKLFN_CALL2(f_pref, f_name, f_args)
-
-#define __DAAL_MKLFN_CALL1(f_pref, f_name, f_args)             \
-    if (avx512 == cpu)                                         \
-    {                                                          \
-        __DAAL_MKLFN(avx512_, f_pref, f_name) f_args;          \
-    }                                                          \
-    if (avx2 == cpu)                                           \
-    {                                                          \
-        __DAAL_MKLFN(avx2_, f_pref, f_name) f_args;            \
-    }                                                          \
-    if (sse42 == cpu)                                          \
-    {                                                          \
-        __DAAL_MKLFN(__DAAL_MKL_SSE42, f_pref, f_name) f_args; \
-    }                                                          \
-    if (sse2 == cpu)                                           \
-    {                                                          \
-        __DAAL_MKLFN(__DAAL_MKL_SSE2, f_pref, f_name) f_args;  \
-    }
-
-#define __DAAL_MKLFN_CALL2(f_pref, f_name, f_args)                    \
-    if (avx512 == cpu)                                                \
-    {                                                                 \
-        return __DAAL_MKLFN(avx512_, f_pref, f_name) f_args;          \
-    }                                                                 \
-    if (avx2 == cpu)                                                  \
-    {                                                                 \
-        return __DAAL_MKLFN(avx2_, f_pref, f_name) f_args;            \
-    }                                                                 \
-    if (sse42 == cpu)                                                 \
-    {                                                                 \
-        return __DAAL_MKLFN(__DAAL_MKL_SSE42, f_pref, f_name) f_args; \
-    }                                                                 \
-    if (sse2 == cpu)                                                  \
-    {                                                                 \
-        return __DAAL_MKLFN(__DAAL_MKL_SSE2, f_pref, f_name) f_args;  \
-    }
+#define __DAAL_MKLFN_CALL_RETURN_LAPACK(f_name, f_args) return f_name f_args;
 
 namespace daal
 {
@@ -106,194 +52,211 @@ struct MklLapack<double, cpu>
 
     static void xgetrf(DAAL_INT * m, DAAL_INT * n, double * a, DAAL_INT * lda, DAAL_INT * ipiv, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, dgetrf, (m, n, a, lda, ipiv, info));
+        __DAAL_MKLFN_CALL_LAPACK(dgetrf, ((MKL_INT *)m, (MKL_INT *)n, a, (MKL_INT *)lda, (MKL_INT *)ipiv, (MKL_INT *)info));
     }
 
     static void xxgetrf(DAAL_INT * m, DAAL_INT * n, double * a, DAAL_INT * lda, DAAL_INT * ipiv, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, dgetrf, (m, n, a, lda, ipiv, info));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(dgetrf, ((MKL_INT *)m, (MKL_INT *)n, a, (MKL_INT *)lda, (MKL_INT *)ipiv, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xgetrs(char * trans, DAAL_INT * n, DAAL_INT * nrhs, double * a, DAAL_INT * lda, DAAL_INT * ipiv, double * b, DAAL_INT * ldb,
                        DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, dgetrs, (trans, n, nrhs, a, lda, ipiv, b, ldb, info, 1));
+        __DAAL_MKLFN_CALL_LAPACK(dgetrs,
+                                 (trans, (MKL_INT *)n, (MKL_INT *)nrhs, a, (MKL_INT *)lda, (MKL_INT *)ipiv, b, (MKL_INT *)ldb, (MKL_INT *)info));
     }
 
     static void xxgetrs(char * trans, DAAL_INT * n, DAAL_INT * nrhs, double * a, DAAL_INT * lda, DAAL_INT * ipiv, double * b, DAAL_INT * ldb,
                         DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, dgetrs, (trans, n, nrhs, a, lda, ipiv, b, ldb, info, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(dgetrs,
+                                 (trans, (MKL_INT *)n, (MKL_INT *)nrhs, a, (MKL_INT *)lda, (MKL_INT *)ipiv, b, (MKL_INT *)ldb, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xpotrf(char * uplo, DAAL_INT * p, double * ata, DAAL_INT * ldata, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, dpotrf, (uplo, p, ata, ldata, info, 1));
+        __DAAL_MKLFN_CALL_LAPACK(dpotrf, (uplo, (MKL_INT *)p, ata, (MKL_INT *)ldata, (MKL_INT *)info));
     }
 
     static void xxpotrf(char * uplo, DAAL_INT * p, double * ata, DAAL_INT * ldata, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, dpotrf, (uplo, p, ata, ldata, info, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(dpotrf, (uplo, (MKL_INT *)p, ata, (MKL_INT *)ldata, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xpotrs(char * uplo, DAAL_INT * p, DAAL_INT * ny, double * ata, DAAL_INT * ldata, double * beta, DAAL_INT * ldaty, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, dpotrs, (uplo, p, ny, ata, ldata, beta, ldaty, info, 1));
+        __DAAL_MKLFN_CALL_LAPACK(dpotrs, (uplo, (MKL_INT *)p, (MKL_INT *)ny, ata, (MKL_INT *)ldata, beta, (MKL_INT *)ldaty, (MKL_INT *)info));
     }
 
     static void xxpotrs(char * uplo, DAAL_INT * p, DAAL_INT * ny, double * ata, DAAL_INT * ldata, double * beta, DAAL_INT * ldaty, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, dpotrs, (uplo, p, ny, ata, ldata, beta, ldaty, info, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(dpotrs, (uplo, (MKL_INT *)p, (MKL_INT *)ny, ata, (MKL_INT *)ldata, beta, (MKL_INT *)ldaty, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xpotri(char * uplo, DAAL_INT * p, double * ata, DAAL_INT * ldata, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, dpotri, (uplo, p, ata, ldata, info, 1));
+        __DAAL_MKLFN_CALL_LAPACK(dpotri, (uplo, (MKL_INT *)p, ata, (MKL_INT *)ldata, (MKL_INT *)info));
     }
 
     static void xxpotri(char * uplo, DAAL_INT * p, double * ata, DAAL_INT * ldata, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, dpotri, (uplo, p, ata, ldata, info, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(dpotri, (uplo, (MKL_INT *)p, ata, (MKL_INT *)ldata, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xgerqf(DAAL_INT * m, DAAL_INT * n, double * a, DAAL_INT * lda, double * tau, double * work, DAAL_INT * lwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, dgerqf, (m, n, a, lda, tau, work, lwork, info));
+        __DAAL_MKLFN_CALL_LAPACK(dgerqf, ((MKL_INT *)m, (MKL_INT *)n, a, (MKL_INT *)lda, tau, work, (MKL_INT *)lwork, (MKL_INT *)info));
     }
 
     static void xxgerqf(DAAL_INT * m, DAAL_INT * n, double * a, DAAL_INT * lda, double * tau, double * work, DAAL_INT * lwork, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, dgerqf, (m, n, a, lda, tau, work, lwork, info));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(dgerqf, ((MKL_INT *)m, (MKL_INT *)n, a, (MKL_INT *)lda, tau, work, (MKL_INT *)lwork, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xormrq(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, double * a, DAAL_INT * lda, double * tau, double * c,
                        DAAL_INT * ldc, double * work, DAAL_INT * lwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, dormrq, (side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info, 1, 1));
+        __DAAL_MKLFN_CALL_LAPACK(dormrq, (side, trans, (MKL_INT *)m, (MKL_INT *)n, (MKL_INT *)k, a, (MKL_INT *)lda, tau, c, (MKL_INT *)ldc, work,
+                                          (MKL_INT *)lwork, (MKL_INT *)info));
     }
 
     static void xxormrq(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, double * a, DAAL_INT * lda, double * tau, double * c,
                         DAAL_INT * ldc, double * work, DAAL_INT * lwork, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, dormrq, (side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info, 1, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(dormrq, (side, trans, (MKL_INT *)m, (MKL_INT *)n, (MKL_INT *)k, a, (MKL_INT *)lda, tau, c, (MKL_INT *)ldc, work,
+                                          (MKL_INT *)lwork, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xtrtrs(char * uplo, char * trans, char * diag, DAAL_INT * n, DAAL_INT * nrhs, double * a, DAAL_INT * lda, double * b, DAAL_INT * ldb,
                        DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, dtrtrs, (uplo, trans, diag, n, nrhs, a, lda, b, ldb, info, 1, 1, 1));
+        __DAAL_MKLFN_CALL_LAPACK(dtrtrs, (uplo, trans, diag, (MKL_INT *)n, (MKL_INT *)nrhs, a, (MKL_INT *)lda, b, (MKL_INT *)ldb, (MKL_INT *)info));
     }
 
     static void xxtrtrs(char * uplo, char * trans, char * diag, DAAL_INT * n, DAAL_INT * nrhs, double * a, DAAL_INT * lda, double * b, DAAL_INT * ldb,
                         DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, dtrtrs, (uplo, trans, diag, n, nrhs, a, lda, b, ldb, info, 1, 1, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(dtrtrs, (uplo, trans, diag, (MKL_INT *)n, (MKL_INT *)nrhs, a, (MKL_INT *)lda, b, (MKL_INT *)ldb, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
-    static void xpptrf(char * uplo, DAAL_INT * n, double * ap, DAAL_INT * info) { __DAAL_MKLFN_CALL(lapack_, dpptrf, (uplo, n, ap, info, 1)); }
+    static void xpptrf(char * uplo, DAAL_INT * n, double * ap, DAAL_INT * info)
+    {
+        __DAAL_MKLFN_CALL_LAPACK(dpptrf, (uplo, (MKL_INT *)n, ap, (MKL_INT *)info));
+    }
 
     static void xxpptrf(char * uplo, DAAL_INT * n, double * ap, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, dpptrf, (uplo, n, ap, info, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(dpptrf, (uplo, (MKL_INT *)n, ap, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xgeqrf(DAAL_INT m, DAAL_INT n, double * a, DAAL_INT lda, double * tau, double * work, DAAL_INT lwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, dgeqrf, (&m, &n, a, &lda, tau, work, &lwork, info));
+        __DAAL_MKLFN_CALL_LAPACK(dgeqrf, ((MKL_INT *)(&m), (MKL_INT *)(&n), a, (MKL_INT *)(&lda), tau, work, (MKL_INT *)(&lwork), (MKL_INT *)info));
     }
 
     static void xxgeqrf(DAAL_INT m, DAAL_INT n, double * a, DAAL_INT lda, double * tau, double * work, DAAL_INT lwork, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, dgeqrf, (&m, &n, a, &lda, tau, work, &lwork, info));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(dgeqrf, ((MKL_INT *)(&m), (MKL_INT *)(&n), a, (MKL_INT *)(&lda), tau, work, (MKL_INT *)(&lwork), (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xgeqp3(const DAAL_INT m, const DAAL_INT n, double * a, const DAAL_INT lda, DAAL_INT * jpvt, double * tau, double * work,
                        const DAAL_INT lwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, dgeqp3, (&m, &n, a, &lda, jpvt, tau, work, &lwork, info));
+        __DAAL_MKLFN_CALL_LAPACK(
+            dgeqp3, ((MKL_INT *)(&m), (MKL_INT *)(&n), a, (MKL_INT *)(&lda), (MKL_INT *)jpvt, tau, work, (MKL_INT *)(&lwork), (MKL_INT *)info));
     }
 
     static void xxgeqp3(DAAL_INT m, DAAL_INT n, double * a, DAAL_INT lda, DAAL_INT * jpvt, double * tau, double * work, DAAL_INT lwork,
                         DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, dgeqp3, (&m, &n, a, &lda, jpvt, tau, work, &lwork, info));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(
+            dgeqp3, ((MKL_INT *)(&m), (MKL_INT *)(&n), a, (MKL_INT *)(&lda), (MKL_INT *)jpvt, tau, work, (MKL_INT *)(&lwork), (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xorgqr(const DAAL_INT m, const DAAL_INT n, const DAAL_INT k, double * a, const DAAL_INT lda, const double * tau, double * work,
                        const DAAL_INT lwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, dorgqr, (&m, &n, &k, a, &lda, tau, work, &lwork, info));
+        __DAAL_MKLFN_CALL_LAPACK(
+            dorgqr, ((MKL_INT *)(&m), (MKL_INT *)(&n), (MKL_INT *)(&k), a, (MKL_INT *)(&lda), tau, work, (MKL_INT *)(&lwork), (MKL_INT *)info));
     }
 
     static void xxorgqr(DAAL_INT m, DAAL_INT n, DAAL_INT k, double * a, DAAL_INT lda, double * tau, double * work, DAAL_INT lwork, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, dorgqr, (&m, &n, &k, a, &lda, tau, work, &lwork, info));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(
+            dorgqr, ((MKL_INT *)(&m), (MKL_INT *)(&n), (MKL_INT *)(&k), a, (MKL_INT *)(&lda), tau, work, (MKL_INT *)(&lwork), (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xgesvd(char jobu, char jobvt, DAAL_INT m, DAAL_INT n, double * a, DAAL_INT lda, double * s, double * u, DAAL_INT ldu, double * vt,
                        DAAL_INT ldvt, double * work, DAAL_INT lwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, dgesvd, (&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, info, 1, 1));
+        __DAAL_MKLFN_CALL_LAPACK(dgesvd, (&jobu, &jobvt, (MKL_INT *)(&m), (MKL_INT *)(&n), a, (MKL_INT *)(&lda), s, u, (MKL_INT *)(&ldu), vt,
+                                          (MKL_INT *)(&ldvt), work, (MKL_INT *)(&lwork), (MKL_INT *)info));
     }
 
     static void xxgesvd(char jobu, char jobvt, DAAL_INT m, DAAL_INT n, double * a, DAAL_INT lda, double * s, double * u, DAAL_INT ldu, double * vt,
                         DAAL_INT ldvt, double * work, DAAL_INT lwork, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, dgesvd, (&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, info, 1, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(dgesvd, (&jobu, &jobvt, (MKL_INT *)(&m), (MKL_INT *)(&n), a, (MKL_INT *)(&lda), s, u, (MKL_INT *)(&ldu), vt,
+                                          (MKL_INT *)(&ldvt), work, (MKL_INT *)(&lwork), (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xsyevd(char * jobz, char * uplo, DAAL_INT * n, double * a, DAAL_INT * lda, double * w, double * work, DAAL_INT * lwork,
                        DAAL_INT * iwork, DAAL_INT * liwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, dsyevd, (jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info, 1, 1));
+        __DAAL_MKLFN_CALL_LAPACK(
+            dsyevd, (jobz, uplo, (MKL_INT *)n, a, (MKL_INT *)lda, w, work, (MKL_INT *)lwork, (MKL_INT *)iwork, (MKL_INT *)liwork, (MKL_INT *)info));
     }
 
     static void xxsyevd(char * jobz, char * uplo, DAAL_INT * n, double * a, DAAL_INT * lda, double * w, double * work, DAAL_INT * lwork,
                         DAAL_INT * iwork, DAAL_INT * liwork, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, dsyevd, (jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info, 1, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(
+            dsyevd, (jobz, uplo, (MKL_INT *)n, a, (MKL_INT *)lda, w, work, (MKL_INT *)lwork, (MKL_INT *)iwork, (MKL_INT *)liwork, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xormqr(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, double * a, DAAL_INT * lda, double * tau, double * c,
                        DAAL_INT * ldc, double * work, DAAL_INT * lwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, dormqr, (side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info, 1, 1));
+        __DAAL_MKLFN_CALL_LAPACK(dormqr, (side, trans, (MKL_INT *)m, (MKL_INT *)n, (MKL_INT *)k, a, (MKL_INT *)lda, tau, c, (MKL_INT *)ldc, work,
+                                          (MKL_INT *)lwork, (MKL_INT *)info));
     }
 
     static void xxormqr(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, double * a, DAAL_INT * lda, double * tau, double * c,
                         DAAL_INT * ldc, double * work, DAAL_INT * lwork, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, dormqr, (side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info, 1, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(dormqr, (side, trans, (MKL_INT *)m, (MKL_INT *)n, (MKL_INT *)k, a, (MKL_INT *)lda, tau, c, (MKL_INT *)ldc, work,
+                                          (MKL_INT *)lwork, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 };
 
@@ -308,193 +271,210 @@ struct MklLapack<float, cpu>
 
     static void xgetrf(DAAL_INT * m, DAAL_INT * n, float * a, DAAL_INT * lda, DAAL_INT * ipiv, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, sgetrf, (m, n, a, lda, ipiv, info));
+        __DAAL_MKLFN_CALL_LAPACK(sgetrf, ((MKL_INT *)m, (MKL_INT *)n, a, (MKL_INT *)lda, (MKL_INT *)ipiv, (MKL_INT *)info));
     }
 
     static void xxgetrf(DAAL_INT * m, DAAL_INT * n, float * a, DAAL_INT * lda, DAAL_INT * ipiv, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, sgetrf, (m, n, a, lda, ipiv, info));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(sgetrf, ((MKL_INT *)m, (MKL_INT *)n, a, (MKL_INT *)lda, (MKL_INT *)ipiv, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xgetrs(char * trans, DAAL_INT * n, DAAL_INT * nrhs, float * a, DAAL_INT * lda, DAAL_INT * ipiv, float * b, DAAL_INT * ldb,
                        DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, sgetrs, (trans, n, nrhs, a, lda, ipiv, b, ldb, info, 1));
+        __DAAL_MKLFN_CALL_LAPACK(sgetrs,
+                                 (trans, (MKL_INT *)n, (MKL_INT *)nrhs, a, (MKL_INT *)lda, (MKL_INT *)ipiv, b, (MKL_INT *)ldb, (MKL_INT *)info));
     }
 
     static void xxgetrs(char * trans, DAAL_INT * n, DAAL_INT * nrhs, float * a, DAAL_INT * lda, DAAL_INT * ipiv, float * b, DAAL_INT * ldb,
                         DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, sgetrs, (trans, n, nrhs, a, lda, ipiv, b, ldb, info, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(sgetrs,
+                                 (trans, (MKL_INT *)n, (MKL_INT *)nrhs, a, (MKL_INT *)lda, (MKL_INT *)ipiv, b, (MKL_INT *)ldb, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xpotrf(char * uplo, DAAL_INT * p, float * ata, DAAL_INT * ldata, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, spotrf, (uplo, p, ata, ldata, info, 1));
+        __DAAL_MKLFN_CALL_LAPACK(spotrf, (uplo, (MKL_INT *)p, ata, (MKL_INT *)ldata, (MKL_INT *)info));
     }
 
     static void xxpotrf(char * uplo, DAAL_INT * p, float * ata, DAAL_INT * ldata, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, spotrf, (uplo, p, ata, ldata, info, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(spotrf, (uplo, (MKL_INT *)p, ata, (MKL_INT *)ldata, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xpotrs(char * uplo, DAAL_INT * p, DAAL_INT * ny, float * ata, DAAL_INT * ldata, float * beta, DAAL_INT * ldaty, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, spotrs, (uplo, p, ny, ata, ldata, beta, ldaty, info, 1));
+        __DAAL_MKLFN_CALL_LAPACK(spotrs, (uplo, (MKL_INT *)p, (MKL_INT *)ny, ata, (MKL_INT *)ldata, beta, (MKL_INT *)ldaty, (MKL_INT *)info));
     }
 
     static void xxpotrs(char * uplo, DAAL_INT * p, DAAL_INT * ny, float * ata, DAAL_INT * ldata, float * beta, DAAL_INT * ldaty, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, spotrs, (uplo, p, ny, ata, ldata, beta, ldaty, info, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(spotrs, (uplo, (MKL_INT *)p, (MKL_INT *)ny, ata, (MKL_INT *)ldata, beta, (MKL_INT *)ldaty, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xpotri(char * uplo, DAAL_INT * p, float * ata, DAAL_INT * ldata, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, spotri, (uplo, p, ata, ldata, info, 1));
+        __DAAL_MKLFN_CALL_LAPACK(spotri, (uplo, (MKL_INT *)p, ata, (MKL_INT *)ldata, (MKL_INT *)info));
     }
 
     static void xxpotri(char * uplo, DAAL_INT * p, float * ata, DAAL_INT * ldata, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, spotri, (uplo, p, ata, ldata, info, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(spotri, (uplo, (MKL_INT *)p, ata, (MKL_INT *)ldata, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xgerqf(DAAL_INT * m, DAAL_INT * n, float * a, DAAL_INT * lda, float * tau, float * work, DAAL_INT * lwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, sgerqf, (m, n, a, lda, tau, work, lwork, info));
+        __DAAL_MKLFN_CALL_LAPACK(sgerqf, ((MKL_INT *)m, (MKL_INT *)n, a, (MKL_INT *)lda, tau, work, (MKL_INT *)lwork, (MKL_INT *)info));
     }
 
     static void xxgerqf(DAAL_INT * m, DAAL_INT * n, float * a, DAAL_INT * lda, float * tau, float * work, DAAL_INT * lwork, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, sgerqf, (m, n, a, lda, tau, work, lwork, info));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(sgerqf, ((MKL_INT *)m, (MKL_INT *)n, a, (MKL_INT *)lda, tau, work, (MKL_INT *)lwork, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xormrq(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, float * a, DAAL_INT * lda, float * tau, float * c,
                        DAAL_INT * ldc, float * work, DAAL_INT * lwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, sormrq, (side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info, 1, 1));
+        __DAAL_MKLFN_CALL_LAPACK(sormrq, (side, trans, (MKL_INT *)m, (MKL_INT *)n, (MKL_INT *)k, a, (MKL_INT *)lda, tau, c, (MKL_INT *)ldc, work,
+                                          (MKL_INT *)lwork, (MKL_INT *)info));
     }
 
     static void xxormrq(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, float * a, DAAL_INT * lda, float * tau, float * c,
                         DAAL_INT * ldc, float * work, DAAL_INT * lwork, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, sormrq, (side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info, 1, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(sormrq, (side, trans, (MKL_INT *)m, (MKL_INT *)n, (MKL_INT *)k, a, (MKL_INT *)lda, tau, c, (MKL_INT *)ldc, work,
+                                          (MKL_INT *)lwork, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xtrtrs(char * uplo, char * trans, char * diag, DAAL_INT * n, DAAL_INT * nrhs, float * a, DAAL_INT * lda, float * b, DAAL_INT * ldb,
                        DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, strtrs, (uplo, trans, diag, n, nrhs, a, lda, b, ldb, info, 1, 1, 1));
+        __DAAL_MKLFN_CALL_LAPACK(strtrs, (uplo, trans, diag, (MKL_INT *)n, (MKL_INT *)nrhs, a, (MKL_INT *)lda, b, (MKL_INT *)ldb, (MKL_INT *)info));
     }
 
     static void xxtrtrs(char * uplo, char * trans, char * diag, DAAL_INT * n, DAAL_INT * nrhs, float * a, DAAL_INT * lda, float * b, DAAL_INT * ldb,
                         DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, strtrs, (uplo, trans, diag, n, nrhs, a, lda, b, ldb, info, 1, 1, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(strtrs, (uplo, trans, diag, (MKL_INT *)n, (MKL_INT *)nrhs, a, (MKL_INT *)lda, b, (MKL_INT *)ldb, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
-    static void xpptrf(char * uplo, DAAL_INT * n, float * ap, DAAL_INT * info) { __DAAL_MKLFN_CALL(lapack_, spptrf, (uplo, n, ap, info, 1)); }
+    static void xpptrf(char * uplo, DAAL_INT * n, float * ap, DAAL_INT * info)
+    {
+        __DAAL_MKLFN_CALL_LAPACK(spptrf, (uplo, (MKL_INT *)n, ap, (MKL_INT *)info));
+    }
 
     static void xxpptrf(char * uplo, DAAL_INT * n, float * ap, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, spptrf, (uplo, n, ap, info, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(spptrf, (uplo, (MKL_INT *)n, ap, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xgeqrf(DAAL_INT m, DAAL_INT n, float * a, DAAL_INT lda, float * tau, float * work, DAAL_INT lwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, sgeqrf, (&m, &n, a, &lda, tau, work, &lwork, info));
+        __DAAL_MKLFN_CALL_LAPACK(sgeqrf, ((MKL_INT *)(&m), (MKL_INT *)(&n), a, (MKL_INT *)(&lda), tau, work, (MKL_INT *)(&lwork), (MKL_INT *)info));
     }
 
     static void xxgeqrf(DAAL_INT m, DAAL_INT n, float * a, DAAL_INT lda, float * tau, float * work, DAAL_INT lwork, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, sgeqrf, (&m, &n, a, &lda, tau, work, &lwork, info));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(sgeqrf, ((MKL_INT *)(&m), (MKL_INT *)(&n), a, (MKL_INT *)(&lda), tau, work, (MKL_INT *)(&lwork), (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xgeqp3(const DAAL_INT m, const DAAL_INT n, float * a, const DAAL_INT lda, DAAL_INT * jpvt, float * tau, float * work,
                        const DAAL_INT lwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, sgeqp3, (&m, &n, a, &lda, jpvt, tau, work, &lwork, info));
+        __DAAL_MKLFN_CALL_LAPACK(
+            sgeqp3, ((MKL_INT *)(&m), (MKL_INT *)(&n), a, (MKL_INT *)(&lda), (MKL_INT *)jpvt, tau, work, (MKL_INT *)(&lwork), (MKL_INT *)info));
     }
 
     static void xxgeqp3(DAAL_INT m, DAAL_INT n, float * a, DAAL_INT lda, DAAL_INT * jpvt, float * tau, float * work, DAAL_INT lwork, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, sgeqp3, (&m, &n, a, &lda, jpvt, tau, work, &lwork, info));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(
+            sgeqp3, ((MKL_INT *)(&m), (MKL_INT *)(&n), a, (MKL_INT *)(&lda), (MKL_INT *)jpvt, tau, work, (MKL_INT *)(&lwork), (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xorgqr(const DAAL_INT m, const DAAL_INT n, const DAAL_INT k, float * a, const DAAL_INT lda, const float * tau, float * work,
                        const DAAL_INT lwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, sorgqr, (&m, &n, &k, a, &lda, tau, work, &lwork, info));
+        __DAAL_MKLFN_CALL_LAPACK(
+            sorgqr, ((MKL_INT *)(&m), (MKL_INT *)(&n), (MKL_INT *)(&k), a, (MKL_INT *)(&lda), tau, work, (MKL_INT *)(&lwork), (MKL_INT *)info));
     }
 
     static void xxorgqr(DAAL_INT m, DAAL_INT n, DAAL_INT k, float * a, DAAL_INT lda, float * tau, float * work, DAAL_INT lwork, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, sorgqr, (&m, &n, &k, a, &lda, tau, work, &lwork, info));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(
+            sorgqr, ((MKL_INT *)(&m), (MKL_INT *)(&n), (MKL_INT *)(&k), a, (MKL_INT *)(&lda), tau, work, (MKL_INT *)(&lwork), (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xgesvd(char jobu, char jobvt, DAAL_INT m, DAAL_INT n, float * a, DAAL_INT lda, float * s, float * u, DAAL_INT ldu, float * vt,
                        DAAL_INT ldvt, float * work, DAAL_INT lwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, sgesvd, (&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, info, 1, 1));
+        __DAAL_MKLFN_CALL_LAPACK(sgesvd, (&jobu, &jobvt, (MKL_INT *)(&m), (MKL_INT *)(&n), a, (MKL_INT *)(&lda), s, u, (MKL_INT *)(&ldu), vt,
+                                          (MKL_INT *)(&ldvt), work, (MKL_INT *)(&lwork), (MKL_INT *)info));
     }
 
     static void xxgesvd(char jobu, char jobvt, DAAL_INT m, DAAL_INT n, float * a, DAAL_INT lda, float * s, float * u, DAAL_INT ldu, float * vt,
                         DAAL_INT ldvt, float * work, DAAL_INT lwork, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, sgesvd, (&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, &lwork, info, 1, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(sgesvd, (&jobu, &jobvt, (MKL_INT *)(&m), (MKL_INT *)(&n), a, (MKL_INT *)(&lda), s, u, (MKL_INT *)(&ldu), vt,
+                                          (MKL_INT *)(&ldvt), work, (MKL_INT *)(&lwork), (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xsyevd(char * jobz, char * uplo, DAAL_INT * n, float * a, DAAL_INT * lda, float * w, float * work, DAAL_INT * lwork, DAAL_INT * iwork,
                        DAAL_INT * liwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, ssyevd, (jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info, 1, 1));
+        __DAAL_MKLFN_CALL_LAPACK(
+            ssyevd, (jobz, uplo, (MKL_INT *)n, a, (MKL_INT *)lda, w, work, (MKL_INT *)lwork, (MKL_INT *)iwork, (MKL_INT *)liwork, (MKL_INT *)info));
     }
 
     static void xxsyevd(char * jobz, char * uplo, DAAL_INT * n, float * a, DAAL_INT * lda, float * w, float * work, DAAL_INT * lwork,
                         DAAL_INT * iwork, DAAL_INT * liwork, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, ssyevd, (jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info, 1, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(
+            ssyevd, (jobz, uplo, (MKL_INT *)n, a, (MKL_INT *)lda, w, work, (MKL_INT *)lwork, (MKL_INT *)iwork, (MKL_INT *)liwork, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 
     static void xormqr(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, float * a, DAAL_INT * lda, float * tau, float * c,
                        DAAL_INT * ldc, float * work, DAAL_INT * lwork, DAAL_INT * info)
     {
-        __DAAL_MKLFN_CALL(lapack_, sormqr, (side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info, 1, 1));
+        __DAAL_MKLFN_CALL_LAPACK(sormqr, (side, trans, (MKL_INT *)m, (MKL_INT *)n, (MKL_INT *)k, a, (MKL_INT *)lda, tau, c, (MKL_INT *)ldc, work,
+                                          (MKL_INT *)lwork, (MKL_INT *)info));
     }
 
     static void xxormqr(char * side, char * trans, DAAL_INT * m, DAAL_INT * n, DAAL_INT * k, float * a, DAAL_INT * lda, float * tau, float * c,
                         DAAL_INT * ldc, float * work, DAAL_INT * lwork, DAAL_INT * info)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(lapack_, sormqr, (side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, info, 1, 1));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr = mkl_set_num_threads_local(1);
+        __DAAL_MKLFN_CALL_LAPACK(sormqr, (side, trans, (MKL_INT *)m, (MKL_INT *)n, (MKL_INT *)k, a, (MKL_INT *)lda, tau, c, (MKL_INT *)ldc, work,
+                                          (MKL_INT *)lwork, (MKL_INT *)info));
+        mkl_set_num_threads_local(old_nthr);
     }
 };
 
diff --git a/cpp/daal/src/externals/service_math_mkl.h b/cpp/daal/src/externals/service_math_mkl.h
index a8bde41720f..fa5ce46a5ea 100644
--- a/cpp/daal/src/externals/service_math_mkl.h
+++ b/cpp/daal/src/externals/service_math_mkl.h
@@ -25,46 +25,12 @@
 #define __SERVICE_MATH_MKL_H__
 
 #include <math.h>
-#include "vmlvsl.h"
+#include <mkl.h>
 #include "src/services/service_defines.h"
 
-#if !defined(__DAAL_CONCAT5)
-    #define __DAAL_CONCAT5(a, b, c, d, e)  __DAAL_CONCAT51(a, b, c, d, e)
-    #define __DAAL_CONCAT51(a, b, c, d, e) a##b##c##d##e
-#endif
-
-#define VMLFN(f_cpu, f_name, f_suff)       __DAAL_CONCAT5(fpk_vml_, f_name, _, f_cpu, f_suff)
-#define VMLFN_CALL(f_name, f_suff, f_args) VMLFN_CALL1(f_name, f_suff, f_args)
-
-#if defined(__APPLE__)
-    #define __DAAL_MKLVML_SSE2  E9
-    #define __DAAL_MKLVML_SSE42 E9
-#else
-    #define __DAAL_MKLVML_SSE2  EX
-    #define __DAAL_MKLVML_SSE42 H8
-#endif
-
-#define VMLFN_CALL1(f_name, f_suff, f_args)                \
-    if (avx512 == cpu)                                     \
-    {                                                      \
-        VMLFN(Z0, f_name, f_suff) f_args;                  \
-        return;                                            \
-    }                                                      \
-    if (avx2 == cpu)                                       \
-    {                                                      \
-        VMLFN(L9, f_name, f_suff) f_args;                  \
-        return;                                            \
-    }                                                      \
-    if (sse42 == cpu)                                      \
-    {                                                      \
-        VMLFN(__DAAL_MKLVML_SSE42, f_name, f_suff) f_args; \
-        return;                                            \
-    }                                                      \
-    if (sse2 == cpu)                                       \
-    {                                                      \
-        VMLFN(__DAAL_MKLVML_SSE2, f_name, f_suff) f_args;  \
-        return;                                            \
-    }
+#define __DAAL_MKLFN_CALL_MATH(f_name, f_args) \
+    f_name f_args;                             \
+    return;
 
 namespace daal
 {
@@ -135,27 +101,57 @@ struct MklMath<double, cpu>
         return r;
     }
 
-    static void vPowx(SizeType n, const double * in, double in1, double * out) { VMLFN_CALL(dPowx, HAynn, ((int)n, in, in1, out)); }
+    static void vPowx(SizeType n, const double * in, double in1, double * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmdPowx, ((int)n, in, in1, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vCeil(SizeType n, const double * in, double * out) { VMLFN_CALL(dCeil, HAynn, ((int)n, in, out)); }
+    static void vCeil(SizeType n, const double * in, double * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmdCeil, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vErfInv(SizeType n, const double * in, double * out) { VMLFN_CALL(dErfInv, HAynn, ((int)n, in, out)); }
+    static void vErfInv(SizeType n, const double * in, double * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmdErfInv, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vErf(SizeType n, const double * in, double * out) { VMLFN_CALL(dErf, HAynn, ((int)n, in, out)); }
+    static void vErf(SizeType n, const double * in, double * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmdErf, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vExp(SizeType n, const double * in, double * out) { VMLFN_CALL(dExp, HAynn, ((int)n, in, out)); }
+    static void vExp(SizeType n, const double * in, double * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmdExp, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
     static double vExpThreshold() { return -650.0; }
 
-    static void vTanh(SizeType n, const double * in, double * out) { VMLFN_CALL(dTanh, HAynn, ((int)n, in, out)); }
+    static void vTanh(SizeType n, const double * in, double * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmdTanh, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vSqrt(SizeType n, const double * in, double * out) { VMLFN_CALL(dSqrt, HAynn, ((int)n, in, out)); }
+    static void vSqrt(SizeType n, const double * in, double * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmdSqrt, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vLog(SizeType n, const double * in, double * out) { VMLFN_CALL(dLn, HAynn, ((int)n, in, out)); }
+    static void vLog(SizeType n, const double * in, double * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmdLn, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vLog1p(SizeType n, const double * in, double * out) { VMLFN_CALL(dLog1p, HAynn, ((int)n, in, out)); }
+    static void vLog1p(SizeType n, const double * in, double * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmdLog1p, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vCdfNormInv(SizeType n, const double * in, double * out) { VMLFN_CALL(dCdfNormInv, HAynn, ((int)n, in, out)); }
+    static void vCdfNormInv(SizeType n, const double * in, double * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmdCdfNormInv, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 };
 
 /*
@@ -217,27 +213,57 @@ struct MklMath<float, cpu>
         return r;
     }
 
-    static void vPowx(SizeType n, const float * in, float in1, float * out) { VMLFN_CALL(sPowx, HAynn, ((int)n, in, in1, out)); }
+    static void vPowx(SizeType n, const float * in, float in1, float * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmsPowx, ((int)n, in, in1, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vCeil(SizeType n, const float * in, float * out) { VMLFN_CALL(sCeil, HAynn, ((int)n, in, out)); }
+    static void vCeil(SizeType n, const float * in, float * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmsCeil, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vErfInv(SizeType n, const float * in, float * out) { VMLFN_CALL(sErfInv, HAynn, ((int)n, in, out)); }
+    static void vErfInv(SizeType n, const float * in, float * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmsErfInv, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vErf(SizeType n, const float * in, float * out) { VMLFN_CALL(sErf, HAynn, ((int)n, in, out)); }
+    static void vErf(SizeType n, const float * in, float * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmsErf, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vExp(SizeType n, const float * in, float * out) { VMLFN_CALL(sExp, HAynn, ((int)n, in, out)); }
+    static void vExp(SizeType n, const float * in, float * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmsExp, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
     static float vExpThreshold() { return -75.0f; }
 
-    static void vTanh(SizeType n, const float * in, float * out) { VMLFN_CALL(sTanh, HAynn, ((int)n, in, out)); }
+    static void vTanh(SizeType n, const float * in, float * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmsTanh, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vSqrt(SizeType n, const float * in, float * out) { VMLFN_CALL(sSqrt, HAynn, ((int)n, in, out)); }
+    static void vSqrt(SizeType n, const float * in, float * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmsSqrt, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vLog(SizeType n, const float * in, float * out) { VMLFN_CALL(sLn, HAynn, ((int)n, in, out)); }
+    static void vLog(SizeType n, const float * in, float * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmsLn, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vLog1p(SizeType n, const float * in, float * out) { VMLFN_CALL(sLog1p, HAynn, ((int)n, in, out)); }
+    static void vLog1p(SizeType n, const float * in, float * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmsLog1p, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 
-    static void vCdfNormInv(SizeType n, const float * in, float * out) { VMLFN_CALL(sCdfNormInv, HAynn, ((int)n, in, out)); }
+    static void vCdfNormInv(SizeType n, const float * in, float * out)
+    {
+        __DAAL_MKLFN_CALL_MATH(vmsCdfNormInv, ((int)n, in, out, (VML_HA | VML_FTZDAZ_ON | VML_ERRMODE_IGNORE)));
+    }
 };
 
 } // namespace mkl
diff --git a/cpp/daal/src/externals/service_rng_mkl.h b/cpp/daal/src/externals/service_rng_mkl.h
index f8ddfba394d..b2dcd81b78b 100644
--- a/cpp/daal/src/externals/service_rng_mkl.h
+++ b/cpp/daal/src/externals/service_rng_mkl.h
@@ -24,7 +24,7 @@
 #ifndef __SERVICE_RNG_MKL_H__
 #define __SERVICE_RNG_MKL_H__
 
-#include "vmlvsl.h"
+#include <mkl.h>
 #include "src/externals/service_stat_rng_mkl.h"
 #include "src/externals/service_rng_common.h"
 
@@ -67,7 +67,7 @@ int uniformRNG(const size_t cn, size_t * r, void * stream, const size_t a, const
         int nb     = len / 2;
         int nn     = (int)n;
         int * rr   = (int *)r;
-        __DAAL_VSLFN_CALL_NR_WHILE(fpk_vsl_kernel, iRngUniform, (method, stream, nn, rr, na, nb), errcode);
+        __DAAL_VSLFN_CALL_NR_WHILE(iRngUniform, ((const MKL_INT)method, stream, (const MKL_INT)nn, rr, na, nb), errcode);
 
         if (errcode != 0)
         {
@@ -90,7 +90,7 @@ int uniformRNG(const size_t cn, size_t * r, void * stream, const size_t a, const
             int nb     = len / 2;
             int nn     = (int)n;
             int * rr   = (int *)r + n;
-            __DAAL_VSLFN_CALL_NR_WHILE(fpk_vsl_kernel, iRngUniform, (method, stream, nn, rr, na, nb), errcode);
+            __DAAL_VSLFN_CALL_NR_WHILE(iRngUniform, ((const MKL_INT)method, stream, (const MKL_INT)nn, rr, na, nb), errcode);
 
             if (errcode != 0)
             {
@@ -120,7 +120,7 @@ int uniformRNG(const size_t cn, size_t * r, void * stream, const size_t a, const
                 for (int i = 0; i < 64; i++) dv /= 2.0;
                 int nn                = (int)n;
                 unsigned __int64 * rr = cr;
-                __DAAL_VSLFN_CALL_NR_WHILE(fpk_vsl_kernel, iRngUniformBits64, (method, stream, nn, rr), errcode);
+                __DAAL_VSLFN_CALL_NR_WHILE(iRngUniformBits64, ((const MKL_INT)method, stream, (const MKL_INT)nn, (unsigned MKL_INT64 *)rr), errcode);
 
                 if (errcode != 0)
                 {
@@ -136,7 +136,8 @@ int uniformRNG(const size_t cn, size_t * r, void * stream, const size_t a, const
                     n                     = cn - pos;
                     int nn                = (int)n;
                     unsigned __int64 * rr = cr + pos;
-                    __DAAL_VSLFN_CALL_NR_WHILE(fpk_vsl_kernel, iRngUniformBits64, (method, stream, nn, rr), errcode);
+                    __DAAL_VSLFN_CALL_NR_WHILE(iRngUniformBits64, ((const MKL_INT)method, stream, (const MKL_INT)nn, (unsigned MKL_INT64 *)rr),
+                                               errcode);
 
                     if (errcode != 0)
                     {
@@ -169,7 +170,7 @@ int uniformRNG(const size_t n, int * r, void * stream, const int a, const int b,
     int errcode = 0;
     int nn      = (int)n;
     int * rr    = r;
-    __DAAL_VSLFN_CALL_NR_WHILE(fpk_vsl_kernel, iRngUniform, (method, stream, nn, rr, a, b), errcode);
+    __DAAL_VSLFN_CALL_NR_WHILE(iRngUniform, ((const MKL_INT)method, stream, (const MKL_INT)nn, rr, a, b), errcode);
     return errcode;
 }
 
@@ -179,7 +180,7 @@ int uniformRNG(const size_t n, float * r, void * stream, const float a, const fl
     int errcode = 0;
     int nn      = (int)n;
     float * rr  = r;
-    __DAAL_VSLFN_CALL_NR_WHILE(fpk_vsl_kernel, sRngUniform, (method, stream, nn, rr, a, b), errcode);
+    __DAAL_VSLFN_CALL_NR_WHILE(sRngUniform, ((const MKL_INT)method, stream, (const MKL_INT)nn, rr, a, b), errcode);
     return errcode;
 }
 
@@ -189,7 +190,7 @@ int uniformRNG(const size_t n, double * r, void * stream, const double a, const
     int errcode = 0;
     int nn      = (int)n;
     double * rr = r;
-    __DAAL_VSLFN_CALL_NR_WHILE(fpk_vsl_kernel, dRngUniform, (method, stream, nn, rr, a, b), errcode);
+    __DAAL_VSLFN_CALL_NR_WHILE(dRngUniform, ((const MKL_INT)method, stream, (const MKL_INT)nn, rr, a, b), errcode);
     return errcode;
 }
 
@@ -199,7 +200,7 @@ int uniformBits32RNG(const size_t n, unsigned int * r, void * stream, const int
     int errcode       = 0;
     int nn            = (int)n;
     unsigned int * rr = r;
-    __DAAL_VSLFN_CALL_NR_WHILE(fpk_vsl_kernel, iRngUniformBits32, (method, stream, nn, rr), errcode);
+    __DAAL_VSLFN_CALL_NR_WHILE(iRngUniformBits32, ((const MKL_INT)method, stream, (const MKL_INT)nn, rr), errcode);
     return errcode;
 }
 
@@ -213,7 +214,7 @@ int gaussianRNG(const size_t n, float * r, void * stream, const float a, const f
     int errcode = 0;
     int nn      = (int)n;
     float * rr  = r;
-    __DAAL_VSLFN_CALL_NR_WHILE(fpk_vsl_kernel, sRngGaussian, (method, stream, nn, rr, a, sigma), errcode);
+    __DAAL_VSLFN_CALL_NR_WHILE(sRngGaussian, ((const MKL_INT)method, stream, (const MKL_INT)nn, rr, a, sigma), errcode);
     return errcode;
 }
 
@@ -223,7 +224,7 @@ int gaussianRNG(const size_t n, double * r, void * stream, const double a, const
     int errcode = 0;
     int nn      = (int)n;
     double * rr = r;
-    __DAAL_VSLFN_CALL_NR_WHILE(fpk_vsl_kernel, dRngGaussian, (method, stream, nn, rr, a, sigma), errcode);
+    __DAAL_VSLFN_CALL_NR_WHILE(dRngGaussian, ((const MKL_INT)method, stream, (const MKL_INT)nn, rr, a, sigma), errcode);
     return errcode;
 }
 
@@ -237,7 +238,7 @@ int bernoulliRNG(const size_t n, int * r, void * stream, const double p, const i
     int errcode = 0;
     int nn      = (int)n;
     int * rr    = r;
-    __DAAL_VSLFN_CALL_NR_WHILE(fpk_vsl_kernel, iRngBernoulli, (method, stream, nn, rr, p), errcode);
+    __DAAL_VSLFN_CALL_NR_WHILE(iRngBernoulli, ((const MKL_INT)method, stream, (const MKL_INT)nn, rr, p), errcode);
     return errcode;
 }
 
@@ -252,7 +253,7 @@ class BaseRNG : public BaseRNGIface<cpu>
         {
             _seed[0]    = seed;
             int errcode = 0;
-            __DAAL_VSLFN_CALL_NR(fpk_vsl_sub_kernel, vslNewStreamEx, (&_stream, brngId, 1, &seed), errcode);
+            __DAAL_VSLFN_CALL_NR(vslNewStreamEx, (&_stream, (const MKL_INT)brngId, (const MKL_INT)1, &seed), errcode);
         }
     }
 
@@ -270,7 +271,7 @@ class BaseRNG : public BaseRNGIface<cpu>
                 }
             }
             int errcode = 0;
-            __DAAL_VSLFN_CALL_NR(fpk_vsl_sub_kernel, vslNewStreamEx, (&_stream, brngId, n, seed), errcode);
+            __DAAL_VSLFN_CALL_NR(vslNewStreamEx, (&_stream, (const MKL_INT)brngId, (const MKL_INT)n, seed), errcode);
         }
     }
 
@@ -284,8 +285,8 @@ class BaseRNG : public BaseRNGIface<cpu>
                 _seed[i] = other._seed[i];
             }
             int errcode = 0;
-            __DAAL_VSLFN_CALL_NR(fpk_vsl_sub_kernel, vslNewStreamEx, (&_stream, _brngId, _seedSize, _seed), errcode);
-            if (!errcode) __DAAL_VSLFN_CALL_NR(fpk_vsl_sub_kernel, vslCopyStreamState, (_stream, other._stream), errcode);
+            __DAAL_VSLFN_CALL_NR(vslNewStreamEx, (&_stream, (const MKL_INT)_brngId, (const MKL_INT)_seedSize, _seed), errcode);
+            if (!errcode) __DAAL_VSLFN_CALL_NR(vslCopyStreamState, (_stream, other._stream), errcode);
         }
     }
 
@@ -293,42 +294,42 @@ class BaseRNG : public BaseRNGIface<cpu>
     {
         daal::services::daal_free((void *)_seed);
         int errcode = 0;
-        __DAAL_VSLFN_CALL_NR(fpk_vsl_sub_kernel, vslDeleteStream, (&_stream), errcode);
+        __DAAL_VSLFN_CALL_NR(vslDeleteStream, (&_stream), errcode);
     }
 
     int getStateSize() const
     {
         int res = 0;
-        __DAAL_VSLFN_CALL_NR(fpk_vsl_sub_kernel, vslGetStreamSize, (_stream), res);
+        __DAAL_VSLFN_CALL_NR(vslGetStreamSize, (_stream), res);
         return res;
     }
 
     int saveState(void * dest) const
     {
         int errcode = 0;
-        __DAAL_VSLFN_CALL_NR(fpk_vsl_sub_kernel, vslSaveStreamM, (_stream, (char *)dest), errcode);
+        __DAAL_VSLFN_CALL_NR(vslSaveStreamM, (_stream, (char *)dest), errcode);
         return errcode;
     }
 
     int loadState(const void * src)
     {
         int errcode = 0;
-        __DAAL_VSLFN_CALL_NR(fpk_vsl_sub_kernel, vslDeleteStream, (&_stream), errcode);
-        if (!errcode) __DAAL_VSLFN_CALL_NR(fpk_vsl_sub_kernel, vslLoadStreamM, (&_stream, (const char *)src), errcode);
+        __DAAL_VSLFN_CALL_NR(vslDeleteStream, (&_stream), errcode);
+        if (!errcode) __DAAL_VSLFN_CALL_NR(vslLoadStreamM, (&_stream, (const char *)src), errcode);
         return errcode;
     }
 
     int leapfrog(size_t threadNum, size_t nThreads)
     {
         int errcode = 0;
-        __DAAL_VSLFN_CALL_NR(fpk_vsl_sub_kernel, vslLeapfrogStream, (_stream, threadNum, nThreads), errcode);
+        __DAAL_VSLFN_CALL_NR(vslLeapfrogStream, (_stream, (const MKL_INT)threadNum, (const MKL_INT)nThreads), errcode);
         return errcode;
     }
 
     int skipAhead(size_t nSkip)
     {
         int errcode = 0;
-        __DAAL_VSLFN_CALL_NR(fpk_vsl_sub_kernel, vslSkipAheadStream, (_stream, nSkip), errcode);
+        __DAAL_VSLFN_CALL_NR(vslSkipAheadStream, (_stream, nSkip), errcode);
         return errcode;
     }
 
diff --git a/cpp/daal/src/externals/service_service_mkl.h b/cpp/daal/src/externals/service_service_mkl.h
index 335fe2a5c21..50ebd34dab5 100644
--- a/cpp/daal/src/externals/service_service_mkl.h
+++ b/cpp/daal/src/externals/service_service_mkl.h
@@ -25,9 +25,10 @@
 #define __SERVICE_SERVICE_MKL_H__
 
 #include "services/daal_defines.h"
-#include "mkl_daal.h"
-#include "istrconv_daal.h"
-#include "istrconv_daal_el.h"
+#include "src/services/service_topo.h"
+#include <mkl.h>
+#include <mkl_service.h>
+#include <string.h>
 
 namespace daal
 {
@@ -37,49 +38,103 @@ namespace mkl
 {
 struct MklService
 {
-    static void * serv_malloc(size_t size, size_t alignment) { return fpk_serv_malloc(size, alignment); }
+    static void * serv_malloc(size_t size, size_t alignment) { return MKL_malloc(size, alignment); }
 
-    static void serv_free(void * ptr) { fpk_serv_free(ptr); }
+    static void serv_free(void * ptr) { MKL_free(ptr); }
 
-    static void serv_free_buffers() { fpk_serv_free_buffers(); }
+    static void serv_free_buffers() { MKL_Free_Buffers(); }
 
     static int serv_memcpy_s(void * dest, size_t destSize, const void * src, size_t srcSize)
     {
-        return fpk_serv_memcpy_s(dest, destSize, src, srcSize);
+        if (destSize < srcSize) return static_cast<int>(ENOMEM);
+        memcpy(dest, src, srcSize);
+        return 0;
+        // TODO: safe funtion
+        // return memcpy_s(dest, destSize, src, srcSize);
     }
 
-    static int serv_memmove_s(void * dest, size_t destSize, const void * src, size_t smax) { return fpk_serv_memmove_s(dest, destSize, src, smax); }
-
-    static int serv_get_ht() { return fpk_serv_get_ht(); }
+    static int serv_memmove_s(void * dest, size_t destSize, const void * src, size_t smax)
+    {
+        if (destSize < smax) return static_cast<int>(ENOMEM);
+        memmove(dest, src, smax);
+        return 0;
+        // TODO: safe funtion
+        // return memmove_s(dest, destSize, src, smax);
+    }
 
-    static int serv_get_ncpus() { return fpk_serv_get_ncpus(); }
+    static int serv_get_ht() { return (serv_get_ncorespercpu() > 1 ? 1 : 0); }
 
-    static int serv_get_ncorespercpu() { return fpk_serv_get_ncorespercpu(); }
+    static int serv_get_ncpus()
+    {
+        unsigned int ncores = daal::services::internal::_internal_daal_GetProcessorCoreCount();
+        return (ncores ? ncores : 1);
+    }
 
-    static int serv_set_memory_limit(int type, size_t limit) { return fpk_serv_set_memory_limit(type, limit); }
+    static int serv_get_ncorespercpu()
+    {
+        unsigned int nlogicalcpu = daal::services::internal::_internal_daal_GetProcessorCoreCount();
+        unsigned int ncpus       = serv_get_ncpus();
+        return (ncpus > 0 && nlogicalcpu > 0 && nlogicalcpu > ncpus ? nlogicalcpu / ncpus : 1);
+    }
 
+    // TODO: The real call should be delegated to a backend library if the option is supported
+    static int serv_set_memory_limit(int type, size_t limit) { return MKL_Set_Memory_Limit(type, limit); }
     // Added for interface compatibility - not expected to be called
-    static size_t serv_strnlen_s(const char * src, size_t slen)
+    static size_t serv_strnlen_s(const char * src, size_t slen) { return strnlen(src, slen); }
+
+    static int serv_strncpy_s(char * dest, size_t dmax, const char * src, size_t slen)
     {
-        size_t i = 0;
-        for (; i < slen && src[i] != '\0'; ++i)
-            ;
-        return i;
+        if (dmax < slen) return static_cast<int>(ENOMEM);
+        strncpy(dest, src, slen);
+        return 0;
     }
 
-    static int serv_strncpy_s(char * dest, size_t dmax, const char * src, size_t slen) { return fpk_serv_strncpy_s(dest, dmax, src, slen); }
-
-    static int serv_strncat_s(char * dest, size_t dmax, const char * src, size_t slen) { return fpk_serv_strncat_s(dest, dmax, src, slen); }
+    static int serv_strncat_s(char * dest, size_t dmax, const char * src, size_t slen)
+    {
+        if (dmax < slen) return static_cast<int>(ENOMEM);
+        strncat(dest, src, slen);
+        return 0;
+    }
 
-    static float serv_string_to_float(const char * nptr, char ** endptr) { return __FPK_string_to_float(nptr, endptr); }
+    // TODO: not a safe function - no control for the input buffer end
+    static double serv_string_to_double(const char * nptr, char ** endptr)
+    {
+        const char * cur = nptr;
+        for (; isdigit(*cur) || *cur == '-' || *cur == 'e' || *cur == 'E' || *cur == '.'; ++cur)
+            ;
+        if (endptr) *endptr = const_cast<char *>(cur);
+        size_t size = cur - nptr;
+        // TODO replace with static buffer
+        char * buffer = static_cast<char *>(malloc(size + 1));
+        for (size_t i = 0; i < size; ++i) buffer[i] = nptr[i];
+        buffer[size] = '\0';
+        double val   = atof(buffer);
+        free(buffer);
+        return val;
+    }
 
-    static double serv_string_to_double(const char * nptr, char ** endptr) { return __FPK_string_to_double(nptr, endptr); }
+    static float serv_string_to_float(const char * nptr, char ** endptr) { return static_cast<float>(serv_string_to_double(nptr, endptr)); }
 
-    static int serv_string_to_int(const char * nptr, char ** endptr) { return __FPK_string_to_int_generic(nptr, endptr); }
+    // TODO: not a safe function - no control for the input buffer end
+    static int serv_string_to_int(const char * nptr, char ** endptr)
+    {
+        const char * cur = nptr;
+        for (; isdigit(*cur) || *cur == '-'; ++cur)
+            ;
+        if (endptr) *endptr = const_cast<char *>(cur);
+        size_t size = cur - nptr;
+        // TODO replace with static buffer
+        char * buffer = static_cast<char *>(malloc(size + 1));
+        for (size_t i = 0; i < size; ++i) buffer[i] = nptr[i];
+        buffer[size] = '\0';
+        int val      = atoi(buffer);
+        free(buffer);
+        return val;
+    }
 
-    static int serv_int_to_string(char * buffer, size_t n, int value) { return __FPK_int_to_string(buffer, n, value); }
+    static int serv_int_to_string(char * buffer, size_t n, int value) { return snprintf(buffer, n, "%d", value); }
 
-    static int serv_double_to_string(char * buffer, size_t n, double value) { return __FPK_double_to_string_f(buffer, n, value); }
+    static int serv_double_to_string(char * buffer, size_t n, double value) { return snprintf(buffer, n, "%E", value); }
 };
 
 } // namespace mkl
diff --git a/cpp/daal/src/externals/service_service_ref.h b/cpp/daal/src/externals/service_service_ref.h
index da67ef66e0f..a76884a5039 100644
--- a/cpp/daal/src/externals/service_service_ref.h
+++ b/cpp/daal/src/externals/service_service_ref.h
@@ -93,7 +93,7 @@ struct RefService
     {
         return 0;
         // Old one - just to see what the method is for
-        // return fpk_serv_set_memory_limit(type, limit);
+        // return mkl_serv_set_memory_limit(type, limit);
     }
 
     static size_t serv_strnlen_s(const char * src, size_t slen)
diff --git a/cpp/daal/src/externals/service_spblas_mkl.h b/cpp/daal/src/externals/service_spblas_mkl.h
index 6e2ca981572..9cf80a5b6aa 100644
--- a/cpp/daal/src/externals/service_spblas_mkl.h
+++ b/cpp/daal/src/externals/service_spblas_mkl.h
@@ -25,65 +25,7 @@
 #define __SERVICE_SPBLAS_MKL_H__
 
 #include "services/daal_defines.h"
-#include "mkl_daal.h"
-
-#if !defined(__DAAL_CONCAT4)
-    #define __DAAL_CONCAT4(a, b, c, d)  __DAAL_CONCAT41(a, b, c, d)
-    #define __DAAL_CONCAT41(a, b, c, d) a##b##c##d
-#endif
-
-#if !defined(__DAAL_CONCAT5)
-    #define __DAAL_CONCAT5(a, b, c, d, e)  __DAAL_CONCAT51(a, b, c, d, e)
-    #define __DAAL_CONCAT51(a, b, c, d, e) a##b##c##d##e
-#endif
-
-#if defined(__APPLE__)
-    #define __DAAL_MKL_SSE2  avx_
-    #define __DAAL_MKL_SSE42 avx_
-#else
-    #define __DAAL_MKL_SSE2  sse2_
-    #define __DAAL_MKL_SSE42 sse42_
-#endif
-
-#define __DAAL_MKLFN(f_cpu, f_pref, f_name)              __DAAL_CONCAT4(fpk_, f_pref, f_cpu, f_name)
-#define __DAAL_MKLFN_CALL(f_pref, f_name, f_args)        __DAAL_MKLFN_CALL1(f_pref, f_name, f_args)
-#define __DAAL_MKLFN_CALL_RETURN(f_pref, f_name, f_args) __DAAL_MKLFN_CALL2(f_pref, f_name, f_args)
-
-#define __DAAL_MKLFN_CALL1(f_pref, f_name, f_args)             \
-    if (avx512 == cpu)                                         \
-    {                                                          \
-        __DAAL_MKLFN(avx512_, f_pref, f_name) f_args;          \
-    }                                                          \
-    if (avx2 == cpu)                                           \
-    {                                                          \
-        __DAAL_MKLFN(avx2_, f_pref, f_name) f_args;            \
-    }                                                          \
-    if (sse42 == cpu)                                          \
-    {                                                          \
-        __DAAL_MKLFN(__DAAL_MKL_SSE42, f_pref, f_name) f_args; \
-    }                                                          \
-    if (sse2 == cpu)                                           \
-    {                                                          \
-        __DAAL_MKLFN(__DAAL_MKL_SSE2, f_pref, f_name) f_args;  \
-    }
-
-#define __DAAL_MKLFN_CALL2(f_pref, f_name, f_args)                    \
-    if (avx512 == cpu)                                                \
-    {                                                                 \
-        return __DAAL_MKLFN(avx512_, f_pref, f_name) f_args;          \
-    }                                                                 \
-    if (avx2 == cpu)                                                  \
-    {                                                                 \
-        return __DAAL_MKLFN(avx2_, f_pref, f_name) f_args;            \
-    }                                                                 \
-    if (sse42 == cpu)                                                 \
-    {                                                                 \
-        return __DAAL_MKLFN(__DAAL_MKL_SSE42, f_pref, f_name) f_args; \
-    }                                                                 \
-    if (sse2 == cpu)                                                  \
-    {                                                                 \
-        return __DAAL_MKLFN(__DAAL_MKL_SSE2, f_pref, f_name) f_args;  \
-    }
+#include <mkl.h>
 
 namespace daal
 {
@@ -107,29 +49,96 @@ struct MklSpBlas<double, cpu>
     static void xcsrmultd(const char * transa, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, double * a, DAAL_INT * ja, DAAL_INT * ia,
                           double * b, DAAL_INT * jb, DAAL_INT * ib, double * c, DAAL_INT * ldc)
     {
-        __DAAL_MKLFN_CALL(spblas_, mkl_dcsrmultd, (transa, m, n, k, a, ja, ia, b, jb, ib, c, ldc));
+        sparse_matrix_t csrA = NULL;
+        struct matrix_descr descrA;
+        descrA.type = SPARSE_MATRIX_TYPE_GENERAL;
+        mkl_sparse_d_create_csr(&csrA, SPARSE_INDEX_BASE_ONE, (const MKL_INT)*m, (const MKL_INT)*n, (MKL_INT *)ia, (MKL_INT *)ia + 1, (MKL_INT *)ja,
+                                a);
+
+        sparse_matrix_t csrB = NULL;
+        struct matrix_descr descrB;
+        descrB.type = SPARSE_MATRIX_TYPE_GENERAL;
+        mkl_sparse_d_create_csr(&csrB, SPARSE_INDEX_BASE_ONE, (const MKL_INT)*m, (const MKL_INT)*k, (MKL_INT *)ib, (MKL_INT *)ib + 1, (MKL_INT *)jb,
+                                b);
+
+        if (*transa == 'n' || *transa == 'N')
+        {
+            mkl_sparse_d_spmmd(SPARSE_OPERATION_NON_TRANSPOSE, csrA, csrB, SPARSE_LAYOUT_COLUMN_MAJOR, c, (const MKL_INT)*ldc);
+        }
+        else
+        {
+            mkl_sparse_d_spmmd(SPARSE_OPERATION_TRANSPOSE, csrA, csrB, SPARSE_LAYOUT_COLUMN_MAJOR, c, (const MKL_INT)*ldc);
+        }
+        mkl_sparse_destroy(csrA);
+        mkl_sparse_destroy(csrB);
     }
 
     static void xcsrmv(const char * transa, const DAAL_INT * m, const DAAL_INT * k, const double * alpha, const char * matdescra, const double * val,
                        const DAAL_INT * indx, const DAAL_INT * pntrb, const DAAL_INT * pntre, const double * x, const double * beta, double * y)
     {
-        __DAAL_MKLFN_CALL(spblas_, mkl_dcsrmv, (transa, m, k, alpha, matdescra, val, indx, pntrb, pntre, x, beta, y));
+        sparse_matrix_t csrA = NULL;
+        struct matrix_descr descrA;
+        descrA.type = SPARSE_MATRIX_TYPE_GENERAL;
+        mkl_sparse_d_create_csr(&csrA, SPARSE_INDEX_BASE_ONE, (const MKL_INT)*m, (const MKL_INT)*k, (MKL_INT *)pntre, (MKL_INT *)pntrb,
+                                (MKL_INT *)indx, (double *)val);
+        if (*transa == 'n' || *transa == 'N')
+        {
+            mkl_sparse_d_mv(SPARSE_OPERATION_NON_TRANSPOSE, *alpha, csrA, descrA, x, *beta, y);
+        }
+        else
+        {
+            mkl_sparse_d_mv(SPARSE_OPERATION_TRANSPOSE, *alpha, csrA, descrA, x, *beta, y);
+        }
+        mkl_sparse_destroy(csrA);
     }
 
     static void xcsrmm(const char * transa, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, const double * alpha, const char * matdescra,
                        const double * val, const DAAL_INT * indx, const DAAL_INT * pntrb, const double * b, const DAAL_INT * ldb, const double * beta,
                        double * c, const DAAL_INT * ldc)
     {
-        __DAAL_MKLFN_CALL(spblas_, mkl_dcsrmm, (transa, m, n, k, alpha, matdescra, val, indx, pntrb, pntrb + 1, b, ldb, beta, c, ldc));
+        sparse_matrix_t csrA = NULL;
+        struct matrix_descr descrA;
+        descrA.type = SPARSE_MATRIX_TYPE_GENERAL;
+        mkl_sparse_d_create_csr(&csrA, SPARSE_INDEX_BASE_ONE, (const MKL_INT)*m, (const MKL_INT)*k, (MKL_INT *)pntrb, (MKL_INT *)(pntrb + 1),
+                                (MKL_INT *)indx, (double *)val);
+
+        if (*transa == 'n' || *transa == 'N')
+        {
+            mkl_sparse_d_mm(SPARSE_OPERATION_NON_TRANSPOSE, *alpha, csrA, descrA, SPARSE_LAYOUT_COLUMN_MAJOR, b, (const MKL_INT)*n,
+                            (const MKL_INT)*ldb, *beta, c, (const MKL_INT)*ldc);
+        }
+        else
+        {
+            mkl_sparse_d_mm(SPARSE_OPERATION_TRANSPOSE, *alpha, csrA, descrA, SPARSE_LAYOUT_COLUMN_MAJOR, b, (const MKL_INT)*n, (const MKL_INT)*ldb,
+                            *beta, c, (const MKL_INT)*ldc);
+        }
+        mkl_sparse_destroy(csrA);
     }
 
     static void xxcsrmm(const char * transa, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, const double * alpha, const char * matdescra,
                         const double * val, const DAAL_INT * indx, const DAAL_INT * pntrb, const double * b, const DAAL_INT * ldb,
                         const double * beta, double * c, const DAAL_INT * ldc)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(spblas_, mkl_dcsrmm, (transa, m, n, k, alpha, matdescra, val, indx, pntrb, pntrb + 1, b, ldb, beta, c, ldc));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr         = mkl_set_num_threads_local(1);
+        sparse_matrix_t csrA = NULL;
+        struct matrix_descr descrA;
+        descrA.type = SPARSE_MATRIX_TYPE_GENERAL;
+        mkl_sparse_d_create_csr(&csrA, SPARSE_INDEX_BASE_ONE, (const MKL_INT)*m, (const MKL_INT)*k, (MKL_INT *)pntrb, (MKL_INT *)(pntrb + 1),
+                                (MKL_INT *)indx, (double *)val);
+
+        if (*transa == 'n' || *transa == 'N')
+        {
+            mkl_sparse_d_mm(SPARSE_OPERATION_NON_TRANSPOSE, *alpha, csrA, descrA, SPARSE_LAYOUT_COLUMN_MAJOR, b, (const MKL_INT)*n,
+                            (const MKL_INT)*ldb, *beta, c, (const MKL_INT)*ldc);
+        }
+        else
+        {
+            mkl_sparse_d_mm(SPARSE_OPERATION_TRANSPOSE, *alpha, csrA, descrA, SPARSE_LAYOUT_COLUMN_MAJOR, b, (const MKL_INT)*n, (const MKL_INT)*ldb,
+                            *beta, c, (const MKL_INT)*ldc);
+        }
+        mkl_sparse_destroy(csrA);
+
+        mkl_set_num_threads_local(old_nthr);
     }
 };
 
@@ -145,29 +154,97 @@ struct MklSpBlas<float, cpu>
     static void xcsrmultd(const char * transa, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, float * a, DAAL_INT * ja, DAAL_INT * ia,
                           float * b, DAAL_INT * jb, DAAL_INT * ib, float * c, DAAL_INT * ldc)
     {
-        __DAAL_MKLFN_CALL(spblas_, mkl_scsrmultd, (transa, m, n, k, a, ja, ia, b, jb, ib, c, ldc));
+        sparse_matrix_t csrA = NULL;
+        struct matrix_descr descrA;
+        descrA.type = SPARSE_MATRIX_TYPE_GENERAL;
+        mkl_sparse_s_create_csr(&csrA, SPARSE_INDEX_BASE_ONE, (const MKL_INT)*m, (const MKL_INT)*n, (MKL_INT *)ia, (MKL_INT *)ia + 1, (MKL_INT *)ja,
+                                a);
+
+        sparse_matrix_t csrB = NULL;
+        struct matrix_descr descrB;
+        descrB.type = SPARSE_MATRIX_TYPE_GENERAL;
+        mkl_sparse_s_create_csr(&csrB, SPARSE_INDEX_BASE_ONE, (const MKL_INT)*m, (const MKL_INT)*k, (MKL_INT *)ib, (MKL_INT *)ib + 1, (MKL_INT *)jb,
+                                b);
+
+        if (*transa == 'n' || *transa == 'N')
+        {
+            mkl_sparse_s_spmmd(SPARSE_OPERATION_NON_TRANSPOSE, csrA, csrB, SPARSE_LAYOUT_COLUMN_MAJOR, c, (const MKL_INT)*ldc);
+        }
+        else
+        {
+            mkl_sparse_s_spmmd(SPARSE_OPERATION_TRANSPOSE, csrA, csrB, SPARSE_LAYOUT_COLUMN_MAJOR, c, (const MKL_INT)*ldc);
+        }
+        mkl_sparse_destroy(csrA);
+        mkl_sparse_destroy(csrB);
     }
 
     static void xcsrmv(const char * transa, const DAAL_INT * m, const DAAL_INT * k, const float * alpha, const char * matdescra, const float * val,
                        const DAAL_INT * indx, const DAAL_INT * pntrb, const DAAL_INT * pntre, const float * x, const float * beta, float * y)
     {
-        __DAAL_MKLFN_CALL(spblas_, mkl_scsrmv, (transa, m, k, alpha, matdescra, val, indx, pntrb, pntre, x, beta, y));
+        sparse_matrix_t csrA = NULL;
+        struct matrix_descr descrA;
+        descrA.type = SPARSE_MATRIX_TYPE_GENERAL;
+        mkl_sparse_s_create_csr(&csrA, SPARSE_INDEX_BASE_ONE, (const MKL_INT)*m, (const MKL_INT)*k, (MKL_INT *)pntre, (MKL_INT *)pntrb,
+                                (MKL_INT *)indx, (float *)val);
+
+        if (*transa == 'n' || *transa == 'N')
+        {
+            mkl_sparse_s_mv(SPARSE_OPERATION_NON_TRANSPOSE, *alpha, csrA, descrA, x, *beta, y);
+        }
+        else
+        {
+            mkl_sparse_s_mv(SPARSE_OPERATION_TRANSPOSE, *alpha, csrA, descrA, x, *beta, y);
+        }
+        mkl_sparse_destroy(csrA);
     }
 
     static void xcsrmm(const char * transa, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, const float * alpha, const char * matdescra,
                        const float * val, const DAAL_INT * indx, const DAAL_INT * pntrb, const float * b, const DAAL_INT * ldb, const float * beta,
                        float * c, const DAAL_INT * ldc)
     {
-        __DAAL_MKLFN_CALL(spblas_, mkl_scsrmm, (transa, m, n, k, alpha, matdescra, val, indx, pntrb, pntrb + 1, b, ldb, beta, c, ldc));
+        sparse_matrix_t csrA = NULL;
+        struct matrix_descr descrA;
+        descrA.type = SPARSE_MATRIX_TYPE_GENERAL;
+        mkl_sparse_s_create_csr(&csrA, SPARSE_INDEX_BASE_ONE, (const MKL_INT)*m, (const MKL_INT)*k, (MKL_INT *)pntrb, (MKL_INT *)(pntrb + 1),
+                                (MKL_INT *)indx, (float *)val);
+
+        if (*transa == 'n' || *transa == 'N')
+        {
+            mkl_sparse_s_mm(SPARSE_OPERATION_NON_TRANSPOSE, *alpha, csrA, descrA, SPARSE_LAYOUT_COLUMN_MAJOR, b, (const MKL_INT)*n,
+                            (const MKL_INT)*ldb, *beta, c, (const MKL_INT)*ldc);
+        }
+        else
+        {
+            mkl_sparse_s_mm(SPARSE_OPERATION_TRANSPOSE, *alpha, csrA, descrA, SPARSE_LAYOUT_COLUMN_MAJOR, b, (const MKL_INT)*n, (const MKL_INT)*ldb,
+                            *beta, c, (const MKL_INT)*ldc);
+        }
+        mkl_sparse_destroy(csrA);
     }
 
     static void xxcsrmm(const char * transa, const DAAL_INT * m, const DAAL_INT * n, const DAAL_INT * k, const float * alpha, const char * matdescra,
                         const float * val, const DAAL_INT * indx, const DAAL_INT * pntrb, const float * b, const DAAL_INT * ldb, const float * beta,
                         float * c, const DAAL_INT * ldc)
     {
-        int old_threads = fpk_serv_set_num_threads_local(1);
-        __DAAL_MKLFN_CALL(spblas_, mkl_scsrmm, (transa, m, n, k, alpha, matdescra, val, indx, pntrb, pntrb + 1, b, ldb, beta, c, ldc));
-        fpk_serv_set_num_threads_local(old_threads);
+        int old_nthr         = mkl_set_num_threads_local(1);
+        sparse_matrix_t csrA = NULL;
+        struct matrix_descr descrA;
+        descrA.type = SPARSE_MATRIX_TYPE_GENERAL;
+        mkl_sparse_s_create_csr(&csrA, SPARSE_INDEX_BASE_ONE, (const MKL_INT)*m, (const MKL_INT)*k, (MKL_INT *)pntrb, (MKL_INT *)(pntrb + 1),
+                                (MKL_INT *)indx, (float *)val);
+
+        if (*transa == 'n' || *transa == 'N')
+        {
+            mkl_sparse_s_mm(SPARSE_OPERATION_NON_TRANSPOSE, *alpha, csrA, descrA, SPARSE_LAYOUT_COLUMN_MAJOR, b, (const MKL_INT)*n,
+                            (const MKL_INT)*ldb, *beta, c, (const MKL_INT)*ldc);
+        }
+        else
+        {
+            mkl_sparse_s_mm(SPARSE_OPERATION_TRANSPOSE, *alpha, csrA, descrA, SPARSE_LAYOUT_COLUMN_MAJOR, b, (const MKL_INT)*n, (const MKL_INT)*ldb,
+                            *beta, c, (const MKL_INT)*ldc);
+        }
+        mkl_sparse_destroy(csrA);
+
+        mkl_set_num_threads_local(old_nthr);
     }
 };
 
diff --git a/cpp/daal/src/externals/service_stat_mkl.h b/cpp/daal/src/externals/service_stat_mkl.h
index 05ab508589a..83160afd15a 100644
--- a/cpp/daal/src/externals/service_stat_mkl.h
+++ b/cpp/daal/src/externals/service_stat_mkl.h
@@ -24,10 +24,16 @@
 #ifndef __SERVICE_STAT_MKL_H__
 #define __SERVICE_STAT_MKL_H__
 
-#include "vmlvsl.h"
+#include <mkl.h>
+#include <mkl_vsl_functions.h>
 #include "src/externals/service_memory.h"
 #include "src/externals/service_stat_rng_mkl.h"
 
+typedef void (*func_type)(DAAL_INT, DAAL_INT, DAAL_INT, void *);
+
+#undef __DAAL_VSLFN_CALL
+#define __DAAL_VSLFN_CALL(f_name, f_args, errcode) errcode = f_name f_args;
+
 #if defined(_WIN64) || defined(__x86_64__)
     #define __SS_ILP_FLAG__ 1
 #else
@@ -77,81 +83,6 @@ extern "C"
 
 #define __DAAL_VSL_SS_ERROR_BAD_QUANT_ORDER       VSL_SS_ERROR_BAD_QUANT_ORDER
 #define __DAAL_VSL_SS_ERROR_INDICES_NOT_SUPPORTED VSL_SS_ERROR_INDICES_NOT_SUPPORTED
-
-    typedef void (*threadfuncfor)(DAAL_INT, DAAL_INT, void *, func_type);
-    typedef void (*threadfuncforordered)(DAAL_INT, DAAL_INT, void *, func_type);
-    typedef void (*threadfuncsection)(DAAL_INT, void *, func_type);
-    typedef void (*threadfuncordered)(DAAL_INT, DAAL_INT, DAAL_INT, void *, func_type);
-    typedef DAAL_INT (*threadgetlimit)(void);
-
-    struct ThreadingFuncs
-    {
-        threadfuncfor funcfor;
-        threadfuncfor funcforordered;
-        threadfuncsection funcsection;
-        threadfuncordered funcordered;
-        threadgetlimit getlimit;
-    };
-
-    [[maybe_unused]] static void _daal_mkl_threader_for_sequential(DAAL_INT n, DAAL_INT threads_request, void * a, func_type func)
-    {
-        DAAL_INT i;
-
-        for (i = 0; i < n; i++)
-        {
-            func(i, 0, 1, a);
-        }
-    }
-
-    [[maybe_unused]] static void _daal_mkl_threader_for_ordered_sequential(DAAL_INT n, DAAL_INT threads_request, void * a, func_type func)
-    {
-        DAAL_INT i;
-
-        for (i = 0; i < n; i++)
-        {
-            func(i, 0, 1, a);
-        }
-    }
-
-    [[maybe_unused]] static void _daal_mkl_threader_sections_sequential(DAAL_INT threads_request, void * a, func_type func)
-    {
-        func(0, 0, 1, a);
-    }
-
-    [[maybe_unused]] static void _daal_mkl_threader_ordered_sequential(DAAL_INT i, DAAL_INT th_idx, DAAL_INT th_num, void * a, func_type func)
-    {
-        func(i, th_idx, th_num, a);
-    }
-
-    [[maybe_unused]] static DAAL_INT _daal_mkl_threader_get_max_threads_sequential()
-    {
-        return 1;
-    }
-
-    [[maybe_unused]] static void _daal_mkl_threader_for(DAAL_INT n, DAAL_INT threads_request, void * a, func_type func)
-    {
-        fpk_vsl_serv_threader_for(n, threads_request, a, func);
-    }
-
-    [[maybe_unused]] static void _daal_mkl_threader_for_ordered(DAAL_INT n, DAAL_INT threads_request, void * a, func_type func)
-    {
-        fpk_vsl_serv_threader_for_ordered(n, threads_request, a, func);
-    }
-
-    [[maybe_unused]] static void _daal_mkl_threader_sections(DAAL_INT threads_request, void * a, func_type func)
-    {
-        fpk_vsl_serv_threader_sections(threads_request, a, func);
-    }
-
-    [[maybe_unused]] static void _daal_mkl_threader_ordered(DAAL_INT i, DAAL_INT th_idx, DAAL_INT th_num, void * a, func_type func)
-    {
-        //not used. To be implemented if needed.
-    }
-
-    [[maybe_unused]] static DAAL_INT _daal_mkl_threader_get_max_threads()
-    {
-        return fpk_vsl_serv_threader_get_num_threads_limit();
-    }
 }
 
 namespace daal
@@ -178,7 +109,7 @@ struct MklStatistics<double, cpu>
     static int xcp(double * data, __int64 nFeatures, __int64 nVectors, double * nPreviousObservations, double * sum, double * crossProduct,
                    __int64 method)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
@@ -197,18 +128,15 @@ struct MklStatistics<double, cpu>
 
         double weight[2] = { *nPreviousObservations, *nPreviousObservations };
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_CP, crossProduct), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsliSSEditTask, (task, __DAAL_VSL_SS_ED_CP_STORAGE, &cpStorage), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, weight), errcode);
-
-        ThreadingFuncs threading = { _daal_mkl_threader_for, _daal_mkl_threader_for_ordered, _daal_mkl_threader_sections, _daal_mkl_threader_ordered,
-                                     _daal_mkl_threader_get_max_threads };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, dSSBasic, (task, __DAAL_VSL_SS_CP | __DAAL_VSL_SS_SUM, method, &threading), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vsldSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&dataStorage, data, 0, 0),
+                          errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_CP, crossProduct), errcode);
+        __DAAL_VSLFN_CALL(vsliSSEditTask, (task, __DAAL_VSL_SS_ED_CP_STORAGE, (const MKL_INT *)&cpStorage), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, weight), errcode);
+        __DAAL_VSLFN_CALL(vsldSSCompute, (task, __DAAL_VSL_SS_CP | __DAAL_VSL_SS_SUM, method), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
 
         daal::services::daal_free(mean);
         mean = NULL;
@@ -218,7 +146,7 @@ struct MklStatistics<double, cpu>
     static int xxcp_weight(double * data, __int64 nFeatures, __int64 nVectors, double * weight, double * accumWeight, double * mean,
                            double * crossProduct, __int64 method)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
@@ -228,20 +156,16 @@ struct MklStatistics<double, cpu>
 
         double accumWeightsAll[2] = { 0, 0 };
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_WEIGHTS, weight), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_CP, crossProduct), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsliSSEditTask, (task, __DAAL_VSL_SS_ED_CP_STORAGE, &cpStorage), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, accumWeightsAll), errcode);
-
-        ThreadingFuncs threading = { _daal_mkl_threader_for_sequential, _daal_mkl_threader_for_ordered_sequential,
-                                     _daal_mkl_threader_sections_sequential, _daal_mkl_threader_ordered_sequential,
-                                     _daal_mkl_threader_get_max_threads_sequential };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, dSSBasic, (task, __DAAL_VSL_SS_CP | __DAAL_VSL_SS_MEAN, method, &threading), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vsldSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&dataStorage, data, 0, 0),
+                          errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_WEIGHTS, weight), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_CP, crossProduct), errcode);
+        __DAAL_VSLFN_CALL(vsliSSEditTask, (task, __DAAL_VSL_SS_ED_CP_STORAGE, (const MKL_INT *)&cpStorage), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, accumWeightsAll), errcode);
+        __DAAL_VSLFN_CALL(vsldSSCompute, (task, __DAAL_VSL_SS_CP | __DAAL_VSL_SS_MEAN, method), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
 
         *accumWeight = accumWeightsAll[0];
 
@@ -253,7 +177,7 @@ struct MklStatistics<double, cpu>
     static int xxvar_weight(double * data, __int64 nFeatures, __int64 nVectors, double * weight, double * accumWeight, double * mean,
                             double * sampleVariance, __int64 method)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
@@ -263,20 +187,16 @@ struct MklStatistics<double, cpu>
 
         double accumWeightsAll[2] = { 0, 0 };
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_WEIGHTS, weight), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2C_SUM, sampleVariance), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, rawSecond), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, accumWeightsAll), errcode);
-
-        ThreadingFuncs threading = { _daal_mkl_threader_for_sequential, _daal_mkl_threader_for_ordered_sequential,
-                                     _daal_mkl_threader_sections_sequential, _daal_mkl_threader_ordered_sequential,
-                                     _daal_mkl_threader_get_max_threads_sequential };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, dSSBasic, (task, __DAAL_VSL_SS_2C_SUM | __DAAL_VSL_SS_MEAN, method, &threading), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vsldSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&dataStorage, data, 0, 0),
+                          errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_WEIGHTS, weight), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2C_SUM, sampleVariance), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, rawSecond), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, accumWeightsAll), errcode);
+        __DAAL_VSLFN_CALL(vsldSSCompute, (task, __DAAL_VSL_SS_2C_SUM | __DAAL_VSL_SS_MEAN, method), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
 
         *accumWeight = accumWeightsAll[0];
 
@@ -288,7 +208,7 @@ struct MklStatistics<double, cpu>
 
     static int x2c_mom(const double * data, const __int64 nFeatures, const __int64 nVectors, double * variance, const __int64 method)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
@@ -296,16 +216,13 @@ struct MklStatistics<double, cpu>
         double * mean                 = (double *)daal::services::daal_malloc(nFeatures * sizeof(double));
         double * secondOrderRawMoment = (double *)daal::services::daal_malloc(nFeatures * sizeof(double));
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, secondOrderRawMoment), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2C_MOM, variance), errcode);
-
-        ThreadingFuncs threading = { _daal_mkl_threader_for, _daal_mkl_threader_for_ordered, _daal_mkl_threader_sections, _daal_mkl_threader_ordered,
-                                     _daal_mkl_threader_get_max_threads };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, dSSBasic, (task, __DAAL_VSL_SS_2C_MOM, method, &threading), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vsldSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&dataStorage, data, 0, 0),
+                          errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, secondOrderRawMoment), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2C_MOM, variance), errcode);
+        __DAAL_VSLFN_CALL(vsldSSCompute, (task, __DAAL_VSL_SS_2C_MOM, method), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
 
         daal::services::daal_free(mean);
         daal::services::daal_free(secondOrderRawMoment);
@@ -317,164 +234,149 @@ struct MklStatistics<double, cpu>
     static int xoutlierdetection(const double * data, const __int64 nFeatures, const __int64 nVectors, const __int64 nParams,
                                  const double * baconParams, double * baconWeights)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditOutDetect, (task, &nParams, baconParams, baconWeights), errcode);
-
-        ThreadingFuncs threading = { _daal_mkl_threader_for, _daal_mkl_threader_for_ordered, _daal_mkl_threader_sections, _daal_mkl_threader_ordered,
-                                     _daal_mkl_threader_get_max_threads };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, dSSOutliersDetection, (task, __DAAL_VSL_SS_OUTLIERS, __DAAL_VSL_SS_METHOD_BACON, &threading), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vsldSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&dataStorage, data, 0, 0),
+                          errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditOutliersDetection, (task, (const MKL_INT *)&nParams, baconParams, baconWeights), errcode);
+        __DAAL_VSLFN_CALL(vsldSSCompute, (task, __DAAL_VSL_SS_OUTLIERS, __DAAL_VSL_SS_METHOD_BACON), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
         return errcode;
     }
 
     static int xLowOrderMoments(double * data, __int64 nFeatures, __int64 nVectors, __int64 method, double * sum, double * mean,
                                 double * secondOrderRawMoment, double * variance, double * variation)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
-
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, secondOrderRawMoment), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2C_MOM, variance), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_VARIATION, variation), errcode);
-
-        ThreadingFuncs threading = { _daal_mkl_threader_for, _daal_mkl_threader_for_ordered, _daal_mkl_threader_sections, _daal_mkl_threader_ordered,
-                                     _daal_mkl_threader_get_max_threads };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, dSSBasic,
-                          (task, __DAAL_VSL_SS_SUM | __DAAL_VSL_SS_MEAN | __DAAL_VSL_SS_2R_MOM | __DAAL_VSL_SS_2C_MOM | __DAAL_VSL_SS_VARIATION,
-                           method, &threading),
+        __DAAL_VSLFN_CALL(vsldSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&dataStorage, data, 0, 0),
                           errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, secondOrderRawMoment), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2C_MOM, variance), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_VARIATION, variation), errcode);
+        __DAAL_VSLFN_CALL(
+            vsldSSCompute,
+            (task, __DAAL_VSL_SS_SUM | __DAAL_VSL_SS_MEAN | __DAAL_VSL_SS_2R_MOM | __DAAL_VSL_SS_2C_MOM | __DAAL_VSL_SS_VARIATION, method), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
         return errcode;
     }
 
     static int xSumAndVariance(double * data, __int64 nFeatures, __int64 nVectors, double * nPreviousObservations, __int64 method, double * sum,
                                double * mean, double * secondOrderRawMoment, double * variance)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
+        __DAAL_VSLFN_CALL(vsldSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&dataStorage, data, 0, 0),
+                          errcode);
 
         double weight[2] = { *nPreviousObservations, *nPreviousObservations };
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, secondOrderRawMoment), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2C_MOM, variance), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, weight), errcode);
-
-        ThreadingFuncs threading = { _daal_mkl_threader_for, _daal_mkl_threader_for_ordered, _daal_mkl_threader_sections, _daal_mkl_threader_ordered,
-                                     _daal_mkl_threader_get_max_threads };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, dSSBasic,
-                          (task, __DAAL_VSL_SS_SUM | __DAAL_VSL_SS_MEAN | __DAAL_VSL_SS_2R_MOM | __DAAL_VSL_SS_2C_MOM, method, &threading), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, secondOrderRawMoment), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_2C_MOM, variance), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, weight), errcode);
+        __DAAL_VSLFN_CALL(vsldSSCompute, (task, __DAAL_VSL_SS_SUM | __DAAL_VSL_SS_MEAN | __DAAL_VSL_SS_2R_MOM | __DAAL_VSL_SS_2C_MOM, method),
+                          errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
         return errcode;
     }
 
     static int xQuantiles(const double * data, const __int64 nFeatures, const __int64 nVectors, const __int64 quantOrderN, const double * quantOrder,
                           double * quants)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
+        __DAAL_VSLFN_CALL(vsldSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&dataStorage, data, 0, 0),
+                          errcode);
         if (errcode)
         {
             return errcode;
         }
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsliSSEditTask, (task, __DAAL_VSL_SS_ED_QUANT_ORDER_N, &quantOrderN), errcode);
+        __DAAL_VSLFN_CALL(vsliSSEditTask, (task, __DAAL_VSL_SS_ED_QUANT_ORDER_N, (const MKL_INT *)&quantOrderN), errcode);
         if (errcode)
         {
-            __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+            __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
             return errcode;
         }
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_QUANT_ORDER, quantOrder), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_QUANT_ORDER, quantOrder), errcode);
         if (errcode)
         {
-            __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+            __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
             return errcode;
         }
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_QUANT_QUANTILES, quants), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_QUANT_QUANTILES, quants), errcode);
         if (errcode)
         {
-            __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+            __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
             return errcode;
         }
-
-        ThreadingFuncs threading = { _daal_mkl_threader_for, _daal_mkl_threader_for_ordered, _daal_mkl_threader_sections, _daal_mkl_threader_ordered,
-                                     _daal_mkl_threader_get_max_threads };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, dSSQuantiles, (task, __DAAL_VSL_SS_QUANTS, __DAAL_VSL_SS_METHOD_FAST, &threading), errcode);
+        __DAAL_VSLFN_CALL(vsldSSCompute, (task, __DAAL_VSL_SS_QUANTS, __DAAL_VSL_SS_METHOD_FAST), errcode);
         if (errcode)
         {
-            __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+            __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
             return errcode;
         }
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
         return errcode;
     }
 
     static int xSort(double * data, __int64 nFeatures, __int64 nVectors, double * sortedData)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 inputStorage  = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
         __int64 outputStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSNewTask, (&task, &nFeatures, &nVectors, &inputStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
+        __DAAL_VSLFN_CALL(vsldSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&inputStorage, data, 0, 0),
+                          errcode);
         if (errcode)
         {
             return errcode;
         }
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsldSSEditTask, (task, __DAAL_VSL_SS_ED_SORTED_OBSERV, sortedData), errcode);
+        __DAAL_VSLFN_CALL(vsldSSEditTask, (task, __DAAL_VSL_SS_ED_SORTED_OBSERV, sortedData), errcode);
         if (errcode)
         {
-            __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+            __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
             return errcode;
         }
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsliSSEditTask, (task, __DAAL_VSL_SS_ED_SORTED_OBSERV_STORAGE, &outputStorage), errcode);
+        __DAAL_VSLFN_CALL(vsliSSEditTask, (task, __DAAL_VSL_SS_ED_SORTED_OBSERV_STORAGE, (const MKL_INT *)&outputStorage), errcode);
         if (errcode)
         {
-            __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+            __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
             return errcode;
         }
 
-        ThreadingFuncs threading = { _daal_mkl_threader_for, _daal_mkl_threader_for_ordered, _daal_mkl_threader_sections, _daal_mkl_threader_ordered,
-                                     _daal_mkl_threader_get_max_threads };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, dSSSort, (task, __DAAL_VSL_SS_SORTED_OBSERV, __DAAL_VSL_SS_METHOD_RADIX, &threading), errcode);
+        __DAAL_VSLFN_CALL(vsldSSCompute, (task, __DAAL_VSL_SS_SORTED_OBSERV, __DAAL_VSL_SS_METHOD_RADIX), errcode);
         if (errcode)
         {
-            __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+            __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
             return errcode;
         }
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
         return errcode;
     }
 };
@@ -493,7 +395,7 @@ struct MklStatistics<float, cpu>
     static int xcp(float * data, __int64 nFeatures, __int64 nVectors, float * nPreviousObservations, float * sum, float * crossProduct,
                    __int64 method)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
@@ -512,18 +414,15 @@ struct MklStatistics<float, cpu>
 
         float weight[2] = { *nPreviousObservations, *nPreviousObservations };
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_CP, crossProduct), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsliSSEditTask, (task, __DAAL_VSL_SS_ED_CP_STORAGE, &cpStorage), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, weight), errcode);
-
-        ThreadingFuncs threading = { _daal_mkl_threader_for, _daal_mkl_threader_for_ordered, _daal_mkl_threader_sections, _daal_mkl_threader_ordered,
-                                     _daal_mkl_threader_get_max_threads };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, sSSBasic, (task, __DAAL_VSL_SS_CP | __DAAL_VSL_SS_SUM, method, &threading), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vslsSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (MKL_INT *)&dataStorage, data, 0, 0),
+                          errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_CP, crossProduct), errcode);
+        __DAAL_VSLFN_CALL(vsliSSEditTask, (task, __DAAL_VSL_SS_ED_CP_STORAGE, (const MKL_INT *)&cpStorage), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, weight), errcode);
+        __DAAL_VSLFN_CALL(vslsSSCompute, (task, __DAAL_VSL_SS_CP | __DAAL_VSL_SS_SUM, method), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
 
         daal::services::daal_free(mean);
         mean = NULL;
@@ -533,7 +432,7 @@ struct MklStatistics<float, cpu>
     static int xxcp_weight(float * data, __int64 nFeatures, __int64 nVectors, float * weight, float * accumWeight, float * mean, float * crossProduct,
                            __int64 method)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
@@ -543,20 +442,16 @@ struct MklStatistics<float, cpu>
 
         float accumWeightsAll[2] = { 0, 0 };
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_WEIGHTS, weight), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_CP, crossProduct), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsliSSEditTask, (task, __DAAL_VSL_SS_ED_CP_STORAGE, &cpStorage), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, accumWeightsAll), errcode);
-
-        ThreadingFuncs threading = { _daal_mkl_threader_for_sequential, _daal_mkl_threader_for_ordered_sequential,
-                                     _daal_mkl_threader_sections_sequential, _daal_mkl_threader_ordered_sequential,
-                                     _daal_mkl_threader_get_max_threads_sequential };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, sSSBasic, (task, __DAAL_VSL_SS_CP | __DAAL_VSL_SS_MEAN, method, &threading), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vslsSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&dataStorage, data, 0, 0),
+                          errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_WEIGHTS, weight), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_CP, crossProduct), errcode);
+        __DAAL_VSLFN_CALL(vsliSSEditTask, (task, __DAAL_VSL_SS_ED_CP_STORAGE, (const MKL_INT *)&cpStorage), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, accumWeightsAll), errcode);
+        __DAAL_VSLFN_CALL(vslsSSCompute, (task, __DAAL_VSL_SS_CP | __DAAL_VSL_SS_MEAN, method), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
 
         *accumWeight = accumWeightsAll[0];
 
@@ -568,7 +463,7 @@ struct MklStatistics<float, cpu>
     static int xxvar_weight(float * data, __int64 nFeatures, __int64 nVectors, float * weight, float * accumWeight, float * mean,
                             float * sampleVariance, __int64 method)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
@@ -578,20 +473,16 @@ struct MklStatistics<float, cpu>
 
         float accumWeightsAll[2] = { 0, 0 };
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_WEIGHTS, weight), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2C_SUM, sampleVariance), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, rawSecond), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, accumWeightsAll), errcode);
-
-        ThreadingFuncs threading = { _daal_mkl_threader_for_sequential, _daal_mkl_threader_for_ordered_sequential,
-                                     _daal_mkl_threader_sections_sequential, _daal_mkl_threader_ordered_sequential,
-                                     _daal_mkl_threader_get_max_threads_sequential };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, sSSBasic, (task, __DAAL_VSL_SS_2C_SUM | __DAAL_VSL_SS_MEAN, method, &threading), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vslsSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&dataStorage, data, 0, 0),
+                          errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_WEIGHTS, weight), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2C_SUM, sampleVariance), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, rawSecond), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, accumWeightsAll), errcode);
+        __DAAL_VSLFN_CALL(vslsSSCompute, (task, __DAAL_VSL_SS_2C_SUM | __DAAL_VSL_SS_MEAN, method), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
 
         *accumWeight = accumWeightsAll[0];
 
@@ -602,7 +493,7 @@ struct MklStatistics<float, cpu>
 
     static int x2c_mom(const float * data, const __int64 nFeatures, const __int64 nVectors, float * variance, const __int64 method)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
@@ -610,16 +501,13 @@ struct MklStatistics<float, cpu>
         float * mean                 = (float *)daal::services::daal_malloc(nFeatures * sizeof(float));
         float * secondOrderRawMoment = (float *)daal::services::daal_malloc(nFeatures * sizeof(float));
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, secondOrderRawMoment), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2C_MOM, variance), errcode);
-
-        ThreadingFuncs threading = { _daal_mkl_threader_for, _daal_mkl_threader_for_ordered, _daal_mkl_threader_sections, _daal_mkl_threader_ordered,
-                                     _daal_mkl_threader_get_max_threads };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, sSSBasic, (task, __DAAL_VSL_SS_2C_MOM, method, &threading), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vslsSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&dataStorage, data, 0, 0),
+                          errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, secondOrderRawMoment), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2C_MOM, variance), errcode);
+        __DAAL_VSLFN_CALL(vslsSSCompute, (task, __DAAL_VSL_SS_2C_MOM, method), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
 
         daal::services::daal_free(mean);
         daal::services::daal_free(secondOrderRawMoment);
@@ -632,164 +520,150 @@ struct MklStatistics<float, cpu>
     static int xoutlierdetection(const float * data, const __int64 nFeatures, const __int64 nVectors, const __int64 nParams,
                                  const float * baconParams, float * baconWeights)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditOutDetect, (task, &nParams, baconParams, baconWeights), errcode);
-
-        ThreadingFuncs threading = { _daal_mkl_threader_for, _daal_mkl_threader_for_ordered, _daal_mkl_threader_sections, _daal_mkl_threader_ordered,
-                                     _daal_mkl_threader_get_max_threads };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, sSSOutliersDetection, (task, __DAAL_VSL_SS_OUTLIERS, __DAAL_VSL_SS_METHOD_BACON, &threading), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vslsSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&dataStorage, data, 0, 0),
+                          errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditOutliersDetection, (task, (const MKL_INT *)&nParams, baconParams, baconWeights), errcode);
+        __DAAL_VSLFN_CALL(vslsSSCompute, (task, __DAAL_VSL_SS_OUTLIERS, __DAAL_VSL_SS_METHOD_BACON), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
         return errcode;
     }
 
     static int xLowOrderMoments(float * data, __int64 nFeatures, __int64 nVectors, __int64 method, float * sum, float * mean,
                                 float * secondOrderRawMoment, float * variance, float * variation)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
-
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, secondOrderRawMoment), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2C_MOM, variance), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_VARIATION, variation), errcode);
-
-        ThreadingFuncs threading = { _daal_mkl_threader_for, _daal_mkl_threader_for_ordered, _daal_mkl_threader_sections, _daal_mkl_threader_ordered,
-                                     _daal_mkl_threader_get_max_threads };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, sSSBasic,
-                          (task, __DAAL_VSL_SS_SUM | __DAAL_VSL_SS_MEAN | __DAAL_VSL_SS_2R_MOM | __DAAL_VSL_SS_2C_MOM | __DAAL_VSL_SS_VARIATION,
-                           method, &threading),
+        __DAAL_VSLFN_CALL(vslsSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&dataStorage, data, 0, 0),
                           errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, secondOrderRawMoment), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2C_MOM, variance), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_VARIATION, variation), errcode);
+        __DAAL_VSLFN_CALL(
+            vslsSSCompute,
+            (task, __DAAL_VSL_SS_SUM | __DAAL_VSL_SS_MEAN | __DAAL_VSL_SS_2R_MOM | __DAAL_VSL_SS_2C_MOM | __DAAL_VSL_SS_VARIATION, method), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
         return errcode;
     }
 
     static int xSumAndVariance(float * data, __int64 nFeatures, __int64 nVectors, float * nPreviousObservations, __int64 method, float * sum,
                                float * mean, float * secondOrderRawMoment, float * variance)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
+        __DAAL_VSLFN_CALL(vslsSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&dataStorage, data, 0, 0),
+                          errcode);
 
         float weight[2] = { *nPreviousObservations, *nPreviousObservations };
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, secondOrderRawMoment), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2C_MOM, variance), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, weight), errcode);
-
-        ThreadingFuncs threading = { _daal_mkl_threader_for, _daal_mkl_threader_for_ordered, _daal_mkl_threader_sections, _daal_mkl_threader_ordered,
-                                     _daal_mkl_threader_get_max_threads };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, sSSBasic,
-                          (task, __DAAL_VSL_SS_SUM | __DAAL_VSL_SS_MEAN | __DAAL_VSL_SS_2R_MOM | __DAAL_VSL_SS_2C_MOM, method, &threading), errcode);
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_SUM, sum), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_MEAN, mean), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2R_MOM, secondOrderRawMoment), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_2C_MOM, variance), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_ACCUM_WEIGHT, weight), errcode);
+        __DAAL_VSLFN_CALL(vslsSSCompute, (task, __DAAL_VSL_SS_SUM | __DAAL_VSL_SS_MEAN | __DAAL_VSL_SS_2R_MOM | __DAAL_VSL_SS_2C_MOM, method),
+                          errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
         return errcode;
     }
 
     static int xQuantiles(const float * data, const __int64 nFeatures, const __int64 nVectors, const __int64 quantOrderN, const float * quantOrder,
                           float * quants)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 dataStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSNewTask, (&task, &nFeatures, &nVectors, &dataStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
+        __DAAL_VSLFN_CALL(vslsSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&dataStorage, data, 0, 0),
+                          errcode);
         if (errcode)
         {
             return errcode;
         }
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsliSSEditTask, (task, __DAAL_VSL_SS_ED_QUANT_ORDER_N, &quantOrderN), errcode);
+        __DAAL_VSLFN_CALL(vsliSSEditTask, (task, __DAAL_VSL_SS_ED_QUANT_ORDER_N, (const MKL_INT *)&quantOrderN), errcode);
         if (errcode)
         {
-            __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+            __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
             return errcode;
         }
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_QUANT_ORDER, quantOrder), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_QUANT_ORDER, quantOrder), errcode);
         if (errcode)
         {
-            __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+            __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
             return errcode;
         }
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_QUANT_QUANTILES, quants), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_QUANT_QUANTILES, quants), errcode);
         if (errcode)
         {
-            __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+            __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
             return errcode;
         }
 
-        ThreadingFuncs threading = { _daal_mkl_threader_for, _daal_mkl_threader_for_ordered, _daal_mkl_threader_sections, _daal_mkl_threader_ordered,
-                                     _daal_mkl_threader_get_max_threads };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, sSSQuantiles, (task, __DAAL_VSL_SS_QUANTS, __DAAL_VSL_SS_METHOD_FAST, &threading), errcode);
+        __DAAL_VSLFN_CALL(vslsSSCompute, (task, __DAAL_VSL_SS_QUANTS, __DAAL_VSL_SS_METHOD_FAST), errcode);
         if (errcode)
         {
-            __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+            __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
             return errcode;
         }
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
         return errcode;
     }
 
     static int xSort(float * data, __int64 nFeatures, __int64 nVectors, float * sortedData)
     {
-        DAAL_VSLSSTaskPtr task;
+        VSLSSTaskPtr task;
         int errcode = 0;
 
         __int64 inputStorage  = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
         __int64 outputStorage = __DAAL_VSL_SS_MATRIX_STORAGE_COLS;
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSNewTask, (&task, &nFeatures, &nVectors, &inputStorage, data, 0, 0, __SS_ILP_FLAG__), errcode);
+        __DAAL_VSLFN_CALL(vslsSSNewTask, (&task, (const MKL_INT *)&nFeatures, (const MKL_INT *)&nVectors, (const MKL_INT *)&inputStorage, data, 0, 0),
+                          errcode);
         if (errcode)
         {
             return errcode;
         }
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslsSSEditTask, (task, __DAAL_VSL_SS_ED_SORTED_OBSERV, sortedData), errcode);
+        __DAAL_VSLFN_CALL(vslsSSEditTask, (task, __DAAL_VSL_SS_ED_SORTED_OBSERV, sortedData), errcode);
         if (errcode)
         {
-            __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+            __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
             return errcode;
         }
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vsliSSEditTask, (task, __DAAL_VSL_SS_ED_SORTED_OBSERV_STORAGE, &outputStorage), errcode);
+        __DAAL_VSLFN_CALL(vsliSSEditTask, (task, __DAAL_VSL_SS_ED_SORTED_OBSERV_STORAGE, (const MKL_INT *)&outputStorage), errcode);
         if (errcode)
         {
-            __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+            __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
             return errcode;
         }
 
-        ThreadingFuncs threading = { _daal_mkl_threader_for, _daal_mkl_threader_for_ordered, _daal_mkl_threader_sections, _daal_mkl_threader_ordered,
-                                     _daal_mkl_threader_get_max_threads };
-
-        __DAAL_VSLFN_CALL(fpk_vsl_kernel, sSSSort, (task, __DAAL_VSL_SS_SORTED_OBSERV, __DAAL_VSL_SS_METHOD_RADIX, &threading), errcode);
+        __DAAL_VSLFN_CALL(vslsSSCompute, (task, __DAAL_VSL_SS_SORTED_OBSERV, __DAAL_VSL_SS_METHOD_RADIX), errcode);
         if (errcode)
         {
-            __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+            __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
             return errcode;
         }
 
-        __DAAL_VSLFN_CALL(fpk_vsl_sub_kernel, vslSSDeleteTask, (&task), errcode);
+        __DAAL_VSLFN_CALL(vslSSDeleteTask, (&task), errcode);
         return errcode;
     }
 };
diff --git a/cpp/daal/src/externals/service_stat_ref.h b/cpp/daal/src/externals/service_stat_ref.h
index 0ff35505527..e37a067f146 100644
--- a/cpp/daal/src/externals/service_stat_ref.h
+++ b/cpp/daal/src/externals/service_stat_ref.h
@@ -72,81 +72,6 @@ extern "C"
 
 #define __DAAL_VSL_SS_ERROR_BAD_QUANT_ORDER       -4022
 #define __DAAL_VSL_SS_ERROR_INDICES_NOT_SUPPORTED -4085
-
-    typedef void (*threadfuncfor)(DAAL_INT, DAAL_INT, void *, func_type);
-    typedef void (*threadfuncforordered)(DAAL_INT, DAAL_INT, void *, func_type);
-    typedef void (*threadfuncsection)(DAAL_INT, void *, func_type);
-    typedef void (*threadfuncordered)(DAAL_INT, DAAL_INT, DAAL_INT, void *, func_type);
-    typedef DAAL_INT (*threadgetlimit)(void);
-
-    struct ThreadingFuncs
-    {
-        threadfuncfor funcfor;
-        threadfuncfor funcforordered;
-        threadfuncsection funcsection;
-        threadfuncordered funcordered;
-        threadgetlimit getlimit;
-    };
-
-    [[maybe_unused]] static void _daal_mkl_threader_for_sequential(DAAL_INT n, DAAL_INT threads_request, void * a, func_type func)
-    {
-        DAAL_INT i;
-
-        for (i = 0; i < n; i++)
-        {
-            func(i, 0, 1, a);
-        }
-    }
-
-    [[maybe_unused]] static void _daal_mkl_threader_for_ordered_sequential(DAAL_INT n, DAAL_INT threads_request, void * a, func_type func)
-    {
-        DAAL_INT i;
-
-        for (i = 0; i < n; i++)
-        {
-            func(i, 0, 1, a);
-        }
-    }
-
-    [[maybe_unused]] static void _daal_mkl_threader_sections_sequential(DAAL_INT threads_request, void * a, func_type func)
-    {
-        func(0, 0, 1, a);
-    }
-
-    [[maybe_unused]] static void _daal_mkl_threader_ordered_sequential(DAAL_INT i, DAAL_INT th_idx, DAAL_INT th_num, void * a, func_type func)
-    {
-        func(i, th_idx, th_num, a);
-    }
-
-    [[maybe_unused]] static DAAL_INT _daal_mkl_threader_get_max_threads_sequential()
-    {
-        return 1;
-    }
-
-    [[maybe_unused]] static void _daal_mkl_threader_for(DAAL_INT n, DAAL_INT threads_request, void * a, func_type func)
-    {
-        //fpk_vsl_serv_threader_for(n, threads_request, a, func);
-    }
-
-    [[maybe_unused]] static void _daal_mkl_threader_for_ordered(DAAL_INT n, DAAL_INT threads_request, void * a, func_type func)
-    {
-        //fpk_vsl_serv_threader_for_ordered(n, threads_request, a, func);
-    }
-
-    [[maybe_unused]] static void _daal_mkl_threader_sections(DAAL_INT threads_request, void * a, func_type func)
-    {
-        //fpk_vsl_serv_threader_sections(threads_request, a, func);
-    }
-
-    [[maybe_unused]] static void _daal_mkl_threader_ordered(DAAL_INT i, DAAL_INT th_idx, DAAL_INT th_num, void * a, func_type func)
-    {
-        //not used. To be implemented if needed.
-    }
-
-    [[maybe_unused]] static DAAL_INT _daal_mkl_threader_get_max_threads()
-    {
-        return 1;
-    }
 }
 
 namespace daal
diff --git a/cpp/daal/src/externals/service_stat_rng_mkl.h b/cpp/daal/src/externals/service_stat_rng_mkl.h
index d0719eeeaba..f3c8f05758b 100644
--- a/cpp/daal/src/externals/service_stat_rng_mkl.h
+++ b/cpp/daal/src/externals/service_stat_rng_mkl.h
@@ -24,22 +24,15 @@
 #ifndef __SERVICE_STAT_RNG_MKL_H__
 #define __SERVICE_STAT_RNG_MKL_H__
 
-#if !defined(__DAAL_CONCAT5)
-    #define __DAAL_CONCAT5(a, b, c, d, e)  __DAAL_CONCAT51(a, b, c, d, e)
-    #define __DAAL_CONCAT51(a, b, c, d, e) a##b##c##d##e
-#endif
-
-#define __DAAL_VSLFN(f_cpu, f_pref, f_name)                   __DAAL_CONCAT5(f_pref, _, f_cpu, _, f_name)
-#define __DAAL_VSLFN_CALL(f_pref, f_name, f_args, errcode)    __DAAL_VSLFN_CALL1(f_pref, f_name, f_args, errcode)
-#define __DAAL_VSLFN_CALL_NR(f_pref, f_name, f_args, errcode) __DAAL_VSLFN_CALL2(f_pref, f_name, f_args, errcode)
-#define __DAAL_VSLFN_CALL_NR_WHILE(f_pref, f_name, f_args, errcode)   \
+#define __DAAL_VSLFN_CALL_NR(f_name, f_args, errcode) __DAAL_VSLFN_CALL_NO_V(f_name, f_args, errcode)
+#define __DAAL_VSLFN_CALL_NR_WHILE(f_name, f_args, errcode)           \
     {                                                                 \
         size_t nn_left = n;                                           \
         while (nn_left > 0)                                           \
         {                                                             \
             nn = (nn_left > 0xFFFFFFFL) ? 0xFFFFFFF : (int)(nn_left); \
                                                                       \
-            __DAAL_VSLFN_CALL2(f_pref, f_name, f_args, errcode);      \
+            __DAAL_VSLFN_CALL_V(f_name, f_args, errcode);             \
             if (errcode < 0) return errcode;                          \
                                                                       \
             rr += nn;                                                 \
@@ -47,51 +40,14 @@
         }                                                             \
     }
 
-#if defined(__APPLE__)
-    #define __DAAL_MKLVSL_SSE2  e9
-    #define __DAAL_MKLVSL_SSE42 e9
-#else
-    #define __DAAL_MKLVSL_SSE2  ex
-    #define __DAAL_MKLVSL_SSE42 h8
-#endif
-
-#define __DAAL_VSLFN_CALL1(f_pref, f_name, f_args, errcode)                 \
-    if (avx512 == cpu)                                                      \
-    {                                                                       \
-        errcode = __DAAL_VSLFN(z0, f_pref, f_name) f_args;                  \
-    }                                                                       \
-    if (avx2 == cpu)                                                        \
-    {                                                                       \
-        errcode = __DAAL_VSLFN(l9, f_pref, f_name) f_args;                  \
-    }                                                                       \
-    if (sse42 == cpu)                                                       \
-    {                                                                       \
-        errcode = __DAAL_VSLFN(__DAAL_MKLVSL_SSE42, f_pref, f_name) f_args; \
-    }                                                                       \
-    if (sse2 == cpu)                                                        \
-    {                                                                       \
-        errcode = __DAAL_VSLFN(__DAAL_MKLVSL_SSE2, f_pref, f_name) f_args;  \
-    }                                                                       \
-    if (errcode != 0)                                                       \
-    {                                                                       \
-        return errcode;                                                     \
+#define __DAAL_VSLFN_CALL_V(f_name, f_args, retcode) \
+    {                                                \
+        retcode = v##f_name f_args;                  \
     }
-#define __DAAL_VSLFN_CALL2(f_pref, f_name, f_args, retcode)                 \
-    if (avx512 == cpu)                                                      \
-    {                                                                       \
-        retcode = __DAAL_VSLFN(z0, f_pref, f_name) f_args;                  \
-    }                                                                       \
-    if (avx2 == cpu)                                                        \
-    {                                                                       \
-        retcode = __DAAL_VSLFN(l9, f_pref, f_name) f_args;                  \
-    }                                                                       \
-    if (sse42 == cpu)                                                       \
-    {                                                                       \
-        retcode = __DAAL_VSLFN(__DAAL_MKLVSL_SSE42, f_pref, f_name) f_args; \
-    }                                                                       \
-    if (sse2 == cpu)                                                        \
-    {                                                                       \
-        retcode = __DAAL_VSLFN(__DAAL_MKLVSL_SSE2, f_pref, f_name) f_args;  \
+
+#define __DAAL_VSLFN_CALL_NO_V(f_name, f_args, retcode) \
+    {                                                   \
+        retcode = f_name f_args;                        \
     }
 
 #endif
diff --git a/cpp/daal/src/externals/service_stat_rng_ref.h b/cpp/daal/src/externals/service_stat_rng_ref.h
index 182d16ca8b1..eb5526242a7 100644
--- a/cpp/daal/src/externals/service_stat_rng_ref.h
+++ b/cpp/daal/src/externals/service_stat_rng_ref.h
@@ -27,11 +27,6 @@
 
 #include "src/externals/service_stat_rng_ref.h"
 
-#if !defined(__DAAL_CONCAT2)
-    #define __DAAL_CONCAT2(a, b) a##b
-#endif
-
-#define __DAAL_VSLFN(f_pref, f_name)                          __DAAL_CONCAT2(f_pref, f_name)
 #define __DAAL_VSLFN_CALL_NR(f_pref, f_name, f_args, errcode) __DAAL_VSLFN_CALL(f_pref, f_name, f_args, errcode)
 #define __DAAL_VSLFN_CALL_NR_WHILE(f_pref, f_name, f_args, errcode) \
     {                                                               \
diff --git a/cpp/daal/src/externals/istrconv_daal_el.h b/cpp/daal/src/externals/service_thread_declar_mkl.cpp
old mode 100755
new mode 100644
similarity index 60%
rename from cpp/daal/src/externals/istrconv_daal_el.h
rename to cpp/daal/src/externals/service_thread_declar_mkl.cpp
index 32d0ef187ee..b94fc76cdda
--- a/cpp/daal/src/externals/istrconv_daal_el.h
+++ b/cpp/daal/src/externals/service_thread_declar_mkl.cpp
@@ -1,5 +1,6 @@
+/* file: service_thread_declar_mkl.cpp */
 /*******************************************************************************
-* Copyright 2014 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,22 +15,18 @@
 * limitations under the License.
 *******************************************************************************/
 
-//
-// Abstract:
-//
-// External header file for libistrconv.
-//
-// =============================================================================
-
-#ifndef _ISTRCONV_EL_H_
-#define _ISTRCONV_EL_H_
-
-#if defined(__cplusplus)
-    #define _ISTRCONV_EXTERN_C extern "C"
-#else
-    #define _ISTRCONV_EXTERN_C extern
-#endif
-
-_ISTRCONV_EXTERN_C int __FPK_string_to_int_generic(const char * nptr, char ** endptr);
-
-#endif /*_ISTRCONV_H_*/
+namespace daal
+{
+namespace internal
+{
+namespace mkl
+{
+//It's a placeholder, the real function calls exact in xfunctions.
+//TODO: add correct threading control
+int mkl_serv_set_num_threads_local(int nthreads)
+{
+    return nthreads;
+}
+} // namespace mkl
+} // namespace internal
+} // namespace daal
diff --git a/cpp/oneapi/dal/backend/micromkl/micromkl.hpp b/cpp/daal/src/externals/service_thread_declar_mkl.h
similarity index 58%
rename from cpp/oneapi/dal/backend/micromkl/micromkl.hpp
rename to cpp/daal/src/externals/service_thread_declar_mkl.h
index 6f64b784c93..9ea71b4aa16 100644
--- a/cpp/oneapi/dal/backend/micromkl/micromkl.hpp
+++ b/cpp/daal/src/externals/service_thread_declar_mkl.h
@@ -1,5 +1,6 @@
+/* file: service_thread_declar_mkl.h */
 /*******************************************************************************
-* Copyright 2021 Intel Corporation
+* Copyright contributors to the oneDAL project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,23 +15,14 @@
 * limitations under the License.
 *******************************************************************************/
 
-#pragma once
-
-#include "oneapi/dal/common.hpp"
-
-namespace oneapi::dal::backend::micromkl {
-
-template <typename Cpu, typename Float>
-void syevd(char jobz,
-           char uplo,
-           std::int64_t n,
-           Float* a,
-           std::int64_t lda,
-           Float* w,
-           Float* work,
-           std::int64_t lwork,
-           std::int64_t* iwork,
-           std::int64_t liwork,
-           std::int64_t& info);
-
-} // namespace oneapi::dal::backend::micromkl
+namespace daal
+{
+namespace internal
+{
+namespace mkl
+{
+//It's a placeholder, the real function calls exact in xfunctions.
+int mkl_serv_set_num_threads_local(int nthreads);
+} // namespace mkl
+} // namespace internal
+} // namespace daal
diff --git a/cpp/daal/src/services/library_version_info.cpp b/cpp/daal/src/services/library_version_info.cpp
index 75c5169998c..513afe181ee 100644
--- a/cpp/daal/src/services/library_version_info.cpp
+++ b/cpp/daal/src/services/library_version_info.cpp
@@ -27,7 +27,7 @@
 #include "services/env_detect.h"
 
 #ifndef DAAL_REF // temporary!!! should depend both on BACKEND and TARGETARCH
-    #include "mkl_daal.h"
+    #include <mkl.h>
 static const char * cpu_long_names[] = { "Generic",
                                          "Supplemental Streaming SIMD Extensions 3",
                                          "Intel(R) Streaming SIMD Extensions 4.2",
@@ -50,8 +50,7 @@ DAAL_EXPORT daal::services::LibraryVersionInfo::LibraryVersionInfo()
       build_rev(BUILD_REV),
       name(PRODUCT_NAME_STR),
 #ifndef DAAL_REF
-      //    fpk_serv_cpuisknm might be instantiated from backed like other MKL functions
-      processor(cpu_long_names[daal::services::Environment::getInstance()->getCpuId() + 2 * fpk_serv_cpuisknm()])
+      processor(cpu_long_names[daal::services::Environment::getInstance()->getCpuId()])
 #else
       processor(cpu_long_names[0])
 #endif
diff --git a/cpp/daal/src/services/service_algo_utils.cpp b/cpp/daal/src/services/service_algo_utils.cpp
index ca81000067c..eb3cc8b1e3e 100755
--- a/cpp/daal/src/services/service_algo_utils.cpp
+++ b/cpp/daal/src/services/service_algo_utils.cpp
@@ -21,9 +21,10 @@
 //--
 */
 
+#include "src/services/service_utils.h"
+#include "src/services/service_algo_utils.h"
 #include "services/error_indexes.h"
 #include "services/error_handling.h"
-#include "src/services/service_algo_utils.h"
 
 namespace daal
 {
diff --git a/cpp/daal/src/services/service_topo.cpp b/cpp/daal/src/services/service_topo.cpp
index 230124af36f..adaccc7f983 100644
--- a/cpp/daal/src/services/service_topo.cpp
+++ b/cpp/daal/src/services/service_topo.cpp
@@ -1817,14 +1817,14 @@ unsigned _internal_daal_GetSysLogicalProcessorCount()
 }
 
 /*
- * _internal_daal_GetSysProcessorCoreCount
+ * _internal_daal_GetProcessorCoreCount
  *
  * Returns count of processor cores in the system that were enumerated by this app
  *
  * Arguments: None
  * Return: Number of physical processors or 0 if number can not be calculated
  */
-unsigned _internal_daal_GetSysProcessorCoreCount()
+unsigned _internal_daal_GetProcessorCoreCount()
 {
     if (!glbl_obj.isInit) __internal_daal_initCpuTopology();
 
@@ -1870,7 +1870,7 @@ unsigned _internal_daal_GetCoreCountPerEachCache(unsigned subleaf, unsigned cach
 unsigned _internal_daal_GetLogicalProcessorQueue(int * queue)
 {
     const int cpus = _internal_daal_GetSysLogicalProcessorCount();
-    int cores      = _internal_daal_GetSysProcessorCoreCount();
+    int cores      = _internal_daal_GetProcessorCoreCount();
 
     if (cores == 0) cores = 1;
 
diff --git a/cpp/daal/src/services/service_topo.h b/cpp/daal/src/services/service_topo.h
index c2c0ad142d9..f60bcef199d 100644
--- a/cpp/daal/src/services/service_topo.h
+++ b/cpp/daal/src/services/service_topo.h
@@ -111,7 +111,7 @@ constexpr LNX_PTR2INT LNX_MY1CON = 1LL;
         #endif
 
     #else /* WINDOWS */
-
+        #define NOMINMAX
         #include <windows.h>
 
         #ifdef _M_IA64
@@ -337,17 +337,17 @@ struct glktsn
     ~glktsn() { FreeArrays(); }
 };
 
-static unsigned long __internal_daal_getBitsFromDWORD(const unsigned int val, const char from, const char to);
-static unsigned __internal_daal_createMask(unsigned numEntries, unsigned * maskLength);
-static unsigned __internal_daal_slectOrdfromPkg(unsigned package, unsigned core, unsigned logical);
-static unsigned __internal_daal_getAPICID(unsigned processor);
-static void __internal_daal_initCpuTopology();
-static int __internal_daal_bindContext(unsigned cpu, void * prevAffinity);
-static void __internal_daal_restoreContext(void * prevAffinity);
-static void __internal_daal_setChkProcessAffinityConsistency(unsigned lcl_OSProcessorCount);
-static void __internal_daal_setGenericAffinityBit(GenericAffinityMask * pAffinityMap, unsigned cpu);
-static void __internal_daal_getCpuidInfo(CPUIDinfo * info, const unsigned int func, const unsigned int subfunc);
-static int __internal_daal_countBits(DWORD_PTR x);
+[[maybe_unused]] static unsigned long __internal_daal_getBitsFromDWORD(const unsigned int val, const char from, const char to);
+[[maybe_unused]] static unsigned __internal_daal_createMask(unsigned numEntries, unsigned * maskLength);
+[[maybe_unused]] static unsigned __internal_daal_slectOrdfromPkg(unsigned package, unsigned core, unsigned logical);
+[[maybe_unused]] static unsigned __internal_daal_getAPICID(unsigned processor);
+[[maybe_unused]] static void __internal_daal_initCpuTopology();
+[[maybe_unused]] static int __internal_daal_bindContext(unsigned cpu, void * prevAffinity);
+[[maybe_unused]] static void __internal_daal_restoreContext(void * prevAffinity);
+[[maybe_unused]] static void __internal_daal_setChkProcessAffinityConsistency(unsigned lcl_OSProcessorCount);
+[[maybe_unused]] static void __internal_daal_setGenericAffinityBit(GenericAffinityMask * pAffinityMap, unsigned cpu);
+[[maybe_unused]] static void __internal_daal_getCpuidInfo(CPUIDinfo * info, const unsigned int func, const unsigned int subfunc);
+[[maybe_unused]] static int __internal_daal_countBits(DWORD_PTR x);
 
 unsigned _internal_daal_GetMaxCPUSupportedByOS();
 unsigned _internal_daal_GetOSLogicalProcessorCount();
diff --git a/cpp/daal/src/threading/export.def b/cpp/daal/src/threading/export.def
index 4ee8912077a..7962b0a8844 100644
--- a/cpp/daal/src/threading/export.def
+++ b/cpp/daal/src/threading/export.def
@@ -15,266 +15,3 @@
 ;===============================================================================
 
 EXPORTS
-fpk_blas_avx2_daxpy
-fpk_blas_avx2_dgemm
-fpk_blas_avx2_dgemv
-fpk_blas_avx2_dsymm
-fpk_blas_avx2_dsyr
-fpk_blas_avx2_dsyrk
-fpk_blas_avx2_saxpy
-fpk_blas_avx2_sgemm
-fpk_blas_avx2_sgemv
-fpk_blas_avx2_ssymm
-fpk_blas_avx2_ssyr
-fpk_blas_avx2_ssyrk
-fpk_blas_avx2_xdgemm
-fpk_blas_avx2_xdsymm
-fpk_blas_avx2_xdsyr
-fpk_blas_avx2_xdsyrk
-fpk_blas_avx2_xsgemm
-fpk_blas_avx2_xssymm
-fpk_blas_avx2_xssyr
-fpk_blas_avx2_xssyrk
-fpk_blas_avx2_xsdot
-fpk_blas_avx2_xddot
-fpk_blas_avx512_daxpy
-fpk_blas_avx512_dgemm
-fpk_blas_avx512_dgemv
-fpk_blas_avx512_dsymm
-fpk_blas_avx512_dsyr
-fpk_blas_avx512_dsyrk
-fpk_blas_avx512_saxpy
-fpk_blas_avx512_sgemm
-fpk_blas_avx512_sgemv
-fpk_blas_avx512_ssymm
-fpk_blas_avx512_ssyr
-fpk_blas_avx512_ssyrk
-fpk_blas_avx512_xdgemm
-fpk_blas_avx512_xdsymm
-fpk_blas_avx512_xdsyr
-fpk_blas_avx512_xdsyrk
-fpk_blas_avx512_xsgemm
-fpk_blas_avx512_xssymm
-fpk_blas_avx512_xssyr
-fpk_blas_avx512_xssyrk
-fpk_blas_avx512_xsdot
-fpk_blas_avx512_xddot
-fpk_blas_sse2_daxpy
-fpk_blas_sse2_dgemm
-fpk_blas_sse2_dgemv
-fpk_blas_sse2_dsymm
-fpk_blas_sse2_dsyr
-fpk_blas_sse2_dsyrk
-fpk_blas_sse2_saxpy
-fpk_blas_sse2_sgemm
-fpk_blas_sse2_sgemv
-fpk_blas_sse2_ssymm
-fpk_blas_sse2_ssyr
-fpk_blas_sse2_ssyrk
-fpk_blas_sse2_xdgemm
-fpk_blas_sse2_xdsymm
-fpk_blas_sse2_xdsyr
-fpk_blas_sse2_xdsyrk
-fpk_blas_sse2_xsgemm
-fpk_blas_sse2_xssymm
-fpk_blas_sse2_xssyr
-fpk_blas_sse2_xssyrk
-fpk_blas_sse2_xsdot
-fpk_blas_sse2_xddot
-fpk_blas_sse42_daxpy
-fpk_blas_sse42_dgemm
-fpk_blas_sse42_dgemv
-fpk_blas_sse42_dsymm
-fpk_blas_sse42_dsyr
-fpk_blas_sse42_dsyrk
-fpk_blas_sse42_saxpy
-fpk_blas_sse42_sgemm
-fpk_blas_sse42_sgemv
-fpk_blas_sse42_ssymm
-fpk_blas_sse42_ssyr
-fpk_blas_sse42_ssyrk
-fpk_blas_sse42_xdgemm
-fpk_blas_sse42_xdsymm
-fpk_blas_sse42_xdsyr
-fpk_blas_sse42_xdsyrk
-fpk_blas_sse42_xsgemm
-fpk_blas_sse42_xssymm
-fpk_blas_sse42_xssyr
-fpk_blas_sse42_xssyrk
-fpk_blas_sse42_xsdot
-fpk_blas_sse42_xddot
-fpk_dft_avx2_ippsSortRadixAscend_32f_I
-fpk_dft_avx2_ippsSortRadixAscend_64f_I
-fpk_dft_avx512_ippsSortRadixAscend_32f_I
-fpk_dft_avx512_ippsSortRadixAscend_64f_I
-fpk_dft_sse2_ippsSortRadixAscend_32f_I
-fpk_dft_sse2_ippsSortRadixAscend_64f_I
-fpk_dft_sse42_ippsSortRadixAscend_32f_I
-fpk_dft_sse42_ippsSortRadixAscend_64f_I
-fpk_lapack_avx2_dgeqp3
-fpk_lapack_avx2_dgeqrf
-fpk_lapack_avx2_dgerqf
-fpk_lapack_avx2_dgesvd
-fpk_lapack_avx2_dorgqr
-fpk_lapack_avx2_dormqr
-fpk_lapack_avx2_dormrq
-fpk_lapack_avx2_dpotrf
-fpk_lapack_avx2_dpotri
-fpk_lapack_avx2_dpotrs
-fpk_lapack_avx2_dgetrf
-fpk_lapack_avx2_dgetrs
-fpk_lapack_avx2_dpptrf
-fpk_lapack_avx2_dsyev
-fpk_lapack_avx2_dsyevd
-fpk_lapack_avx2_dtrtrs
-fpk_lapack_avx2_sgeqp3
-fpk_lapack_avx2_sgeqrf
-fpk_lapack_avx2_sgerqf
-fpk_lapack_avx2_sgesvd
-fpk_lapack_avx2_sorgqr
-fpk_lapack_avx2_sormqr
-fpk_lapack_avx2_sormrq
-fpk_lapack_avx2_spotrf
-fpk_lapack_avx2_spotri
-fpk_lapack_avx2_spotrs
-fpk_lapack_avx2_sgetrf
-fpk_lapack_avx2_sgetrs
-fpk_lapack_avx2_spptrf
-fpk_lapack_avx2_ssyev
-fpk_lapack_avx2_ssyevd
-fpk_lapack_avx2_strtrs
-fpk_lapack_avx512_dgeqp3
-fpk_lapack_avx512_dgeqrf
-fpk_lapack_avx512_dgerqf
-fpk_lapack_avx512_dgesvd
-fpk_lapack_avx512_dorgqr
-fpk_lapack_avx512_dormqr
-fpk_lapack_avx512_dormrq
-fpk_lapack_avx512_dpotrf
-fpk_lapack_avx512_dpotri
-fpk_lapack_avx512_dpotrs
-fpk_lapack_avx512_dgetrf
-fpk_lapack_avx512_dgetrs
-fpk_lapack_avx512_dpptrf
-fpk_lapack_avx512_dsyev
-fpk_lapack_avx512_dsyevd
-fpk_lapack_avx512_dtrtrs
-fpk_lapack_avx512_sgeqp3
-fpk_lapack_avx512_sgeqrf
-fpk_lapack_avx512_sgerqf
-fpk_lapack_avx512_sgesvd
-fpk_lapack_avx512_sorgqr
-fpk_lapack_avx512_sormqr
-fpk_lapack_avx512_sormrq
-fpk_lapack_avx512_spotrf
-fpk_lapack_avx512_spotri
-fpk_lapack_avx512_spotrs
-fpk_lapack_avx512_sgetrf
-fpk_lapack_avx512_sgetrs
-fpk_lapack_avx512_spptrf
-fpk_lapack_avx512_ssyev
-fpk_lapack_avx512_ssyevd
-fpk_lapack_avx512_strtrs
-fpk_lapack_sse2_dgeqp3
-fpk_lapack_sse2_dgeqrf
-fpk_lapack_sse2_dgerqf
-fpk_lapack_sse2_dgesvd
-fpk_lapack_sse2_dorgqr
-fpk_lapack_sse2_dormqr
-fpk_lapack_sse2_dormrq
-fpk_lapack_sse2_dpotrf
-fpk_lapack_sse2_dpotri
-fpk_lapack_sse2_dpotrs
-fpk_lapack_sse2_dgetrf
-fpk_lapack_sse2_dgetrs
-fpk_lapack_sse2_dpptrf
-fpk_lapack_sse2_dsyev
-fpk_lapack_sse2_dsyevd
-fpk_lapack_sse2_dtrtrs
-fpk_lapack_sse2_sgeqp3
-fpk_lapack_sse2_sgeqrf
-fpk_lapack_sse2_sgerqf
-fpk_lapack_sse2_sgesvd
-fpk_lapack_sse2_sorgqr
-fpk_lapack_sse2_sormqr
-fpk_lapack_sse2_sormrq
-fpk_lapack_sse2_spotrf
-fpk_lapack_sse2_spotri
-fpk_lapack_sse2_spotrs
-fpk_lapack_sse2_sgetrf
-fpk_lapack_sse2_sgetrs
-fpk_lapack_sse2_spptrf
-fpk_lapack_sse2_ssyev
-fpk_lapack_sse2_ssyevd
-fpk_lapack_sse2_strtrs
-fpk_lapack_sse42_dgeqp3
-fpk_lapack_sse42_dgeqrf
-fpk_lapack_sse42_dgerqf
-fpk_lapack_sse42_dgesvd
-fpk_lapack_sse42_dorgqr
-fpk_lapack_sse42_dormqr
-fpk_lapack_sse42_dormrq
-fpk_lapack_sse42_dpotrf
-fpk_lapack_sse42_dpotri
-fpk_lapack_sse42_dpotrs
-fpk_lapack_sse42_dgetrf
-fpk_lapack_sse42_dgetrs
-fpk_lapack_sse42_dpptrf
-fpk_lapack_sse42_dsyev
-fpk_lapack_sse42_dsyevd
-fpk_lapack_sse42_dtrtrs
-fpk_lapack_sse42_sgeqp3
-fpk_lapack_sse42_sgeqrf
-fpk_lapack_sse42_sgerqf
-fpk_lapack_sse42_sgesvd
-fpk_lapack_sse42_sorgqr
-fpk_lapack_sse42_sormqr
-fpk_lapack_sse42_sormrq
-fpk_lapack_sse42_spotrf
-fpk_lapack_sse42_spotri
-fpk_lapack_sse42_spotrs
-fpk_lapack_sse42_sgetrf
-fpk_lapack_sse42_sgetrs
-fpk_lapack_sse42_spptrf
-fpk_lapack_sse42_ssyev
-fpk_lapack_sse42_ssyevd
-fpk_lapack_sse42_strtrs
-fpk_serv_get_max_threads
-fpk_serv_set_num_threads
-fpk_serv_set_num_threads_local
-fpk_serv_get_ncpus
-fpk_serv_get_ncorespercpu
-fpk_serv_get_ht
-fpk_serv_get_nlogicalcores
-fpk_spblas_avx2_mkl_dcsrmm
-fpk_spblas_avx2_mkl_dcsrmultd
-fpk_spblas_avx2_mkl_dcsrmv
-fpk_spblas_avx2_mkl_scsrmm
-fpk_spblas_avx2_mkl_scsrmultd
-fpk_spblas_avx2_mkl_scsrmv
-fpk_spblas_avx512_mkl_dcsrmm
-fpk_spblas_avx512_mkl_dcsrmultd
-fpk_spblas_avx512_mkl_dcsrmv
-fpk_spblas_avx512_mkl_scsrmm
-fpk_spblas_avx512_mkl_scsrmultd
-fpk_spblas_avx512_mkl_scsrmv
-fpk_spblas_sse2_mkl_dcsrmm
-fpk_spblas_sse2_mkl_dcsrmultd
-fpk_spblas_sse2_mkl_dcsrmv
-fpk_spblas_sse2_mkl_scsrmm
-fpk_spblas_sse2_mkl_scsrmultd
-fpk_spblas_sse2_mkl_scsrmv
-fpk_spblas_sse42_mkl_dcsrmm
-fpk_spblas_sse42_mkl_dcsrmultd
-fpk_spblas_sse42_mkl_dcsrmv
-fpk_spblas_sse42_mkl_scsrmm
-fpk_spblas_sse42_mkl_scsrmultd
-fpk_spblas_sse42_mkl_scsrmv
-fpk_serv_enable_instructions
-fpk_serv_cpuisknm
-fpk_serv_memmove_s
-fpk_vsl_serv_threader_for
-fpk_vsl_serv_threader_for_ordered
-fpk_vsl_serv_threader_sections
-fpk_vsl_serv_threader_ordered
-fpk_vsl_serv_threader_get_num_threads_limit
diff --git a/cpp/daal/src/threading/export_lnx32e.mkl.def b/cpp/daal/src/threading/export_lnx32e.mkl.def
index 5a0da223cda..7962b0a8844 100644
--- a/cpp/daal/src/threading/export_lnx32e.mkl.def
+++ b/cpp/daal/src/threading/export_lnx32e.mkl.def
@@ -15,266 +15,3 @@
 ;===============================================================================
 
 EXPORTS
-fpk_blas_avx2_daxpy
-fpk_blas_avx2_dgemm
-fpk_blas_avx2_dgemv
-fpk_blas_avx2_dsymm
-fpk_blas_avx2_dsyr
-fpk_blas_avx2_dsyrk
-fpk_blas_avx2_saxpy
-fpk_blas_avx2_sgemm
-fpk_blas_avx2_sgemv
-fpk_blas_avx2_ssymm
-fpk_blas_avx2_ssyr
-fpk_blas_avx2_ssyrk
-fpk_blas_avx2_xdgemm
-fpk_blas_avx2_xdsymm
-fpk_blas_avx2_xdsyr
-fpk_blas_avx2_xdsyrk
-fpk_blas_avx2_xsgemm
-fpk_blas_avx2_xssymm
-fpk_blas_avx2_xssyr
-fpk_blas_avx2_xssyrk
-fpk_blas_avx2_xsdot
-fpk_blas_avx2_xddot
-fpk_blas_avx512_daxpy
-fpk_blas_avx512_dgemm
-fpk_blas_avx512_dgemv
-fpk_blas_avx512_dsymm
-fpk_blas_avx512_dsyr
-fpk_blas_avx512_dsyrk
-fpk_blas_avx512_xsdot
-fpk_blas_avx512_xddot
-fpk_blas_avx512_saxpy
-fpk_blas_avx512_sgemm
-fpk_blas_avx512_sgemv
-fpk_blas_avx512_ssymm
-fpk_blas_avx512_ssyr
-fpk_blas_avx512_ssyrk
-fpk_blas_avx512_xdgemm
-fpk_blas_avx512_xdsymm
-fpk_blas_avx512_xdsyr
-fpk_blas_avx512_xdsyrk
-fpk_blas_avx512_xsgemm
-fpk_blas_avx512_xssymm
-fpk_blas_avx512_xssyr
-fpk_blas_avx512_xssyrk
-fpk_blas_sse2_daxpy
-fpk_blas_sse2_dgemm
-fpk_blas_sse2_dgemv
-fpk_blas_sse2_dsymm
-fpk_blas_sse2_dsyr
-fpk_blas_sse2_dsyrk
-fpk_blas_sse2_saxpy
-fpk_blas_sse2_sgemm
-fpk_blas_sse2_sgemv
-fpk_blas_sse2_ssymm
-fpk_blas_sse2_ssyr
-fpk_blas_sse2_ssyrk
-fpk_blas_sse2_xdgemm
-fpk_blas_sse2_xdsymm
-fpk_blas_sse2_xdsyr
-fpk_blas_sse2_xdsyrk
-fpk_blas_sse2_xsgemm
-fpk_blas_sse2_xssymm
-fpk_blas_sse2_xssyr
-fpk_blas_sse2_xssyrk
-fpk_blas_sse2_xsdot
-fpk_blas_sse2_xddot
-fpk_blas_sse42_daxpy
-fpk_blas_sse42_dgemm
-fpk_blas_sse42_dgemv
-fpk_blas_sse42_dsymm
-fpk_blas_sse42_dsyr
-fpk_blas_sse42_dsyrk
-fpk_blas_sse42_saxpy
-fpk_blas_sse42_sgemm
-fpk_blas_sse42_sgemv
-fpk_blas_sse42_ssymm
-fpk_blas_sse42_ssyr
-fpk_blas_sse42_ssyrk
-fpk_blas_sse42_xdgemm
-fpk_blas_sse42_xdsymm
-fpk_blas_sse42_xdsyr
-fpk_blas_sse42_xdsyrk
-fpk_blas_sse42_xsgemm
-fpk_blas_sse42_xssymm
-fpk_blas_sse42_xssyr
-fpk_blas_sse42_xssyrk
-fpk_blas_sse42_xsdot
-fpk_blas_sse42_xddot
-fpk_dft_avx2_ippsSortRadixAscend_32f_I
-fpk_dft_avx2_ippsSortRadixAscend_64f_I
-fpk_dft_avx512_ippsSortRadixAscend_32f_I
-fpk_dft_avx512_ippsSortRadixAscend_64f_I
-fpk_dft_sse2_ippsSortRadixAscend_32f_I
-fpk_dft_sse2_ippsSortRadixAscend_64f_I
-fpk_dft_sse42_ippsSortRadixAscend_32f_I
-fpk_dft_sse42_ippsSortRadixAscend_64f_I
-fpk_lapack_avx2_dgeqp3
-fpk_lapack_avx2_dgeqrf
-fpk_lapack_avx2_dgerqf
-fpk_lapack_avx2_dgesvd
-fpk_lapack_avx2_dorgqr
-fpk_lapack_avx2_dormqr
-fpk_lapack_avx2_dormrq
-fpk_lapack_avx2_dpotrf
-fpk_lapack_avx2_dpotri
-fpk_lapack_avx2_dpotrs
-fpk_lapack_avx2_dgetrf
-fpk_lapack_avx2_dgetrs
-fpk_lapack_avx2_dpptrf
-fpk_lapack_avx2_dsyev
-fpk_lapack_avx2_dsyevd
-fpk_lapack_avx2_dtrtrs
-fpk_lapack_avx2_sgeqp3
-fpk_lapack_avx2_sgeqrf
-fpk_lapack_avx2_sgerqf
-fpk_lapack_avx2_sgesvd
-fpk_lapack_avx2_sorgqr
-fpk_lapack_avx2_sormqr
-fpk_lapack_avx2_sormrq
-fpk_lapack_avx2_spotrf
-fpk_lapack_avx2_spotri
-fpk_lapack_avx2_spotrs
-fpk_lapack_avx2_sgetrf
-fpk_lapack_avx2_sgetrs
-fpk_lapack_avx2_spptrf
-fpk_lapack_avx2_ssyev
-fpk_lapack_avx2_ssyevd
-fpk_lapack_avx2_strtrs
-fpk_lapack_avx512_dgeqp3
-fpk_lapack_avx512_dgeqrf
-fpk_lapack_avx512_dgerqf
-fpk_lapack_avx512_dgesvd
-fpk_lapack_avx512_dorgqr
-fpk_lapack_avx512_dormqr
-fpk_lapack_avx512_dormrq
-fpk_lapack_avx512_dpotrf
-fpk_lapack_avx512_dpotri
-fpk_lapack_avx512_dpotrs
-fpk_lapack_avx512_dgetrf
-fpk_lapack_avx512_dgetrs
-fpk_lapack_avx512_dpptrf
-fpk_lapack_avx512_dsyev
-fpk_lapack_avx512_dsyevd
-fpk_lapack_avx512_dtrtrs
-fpk_lapack_avx512_sgeqp3
-fpk_lapack_avx512_sgeqrf
-fpk_lapack_avx512_sgerqf
-fpk_lapack_avx512_sgesvd
-fpk_lapack_avx512_sorgqr
-fpk_lapack_avx512_sormqr
-fpk_lapack_avx512_sormrq
-fpk_lapack_avx512_spotrf
-fpk_lapack_avx512_spotri
-fpk_lapack_avx512_spotrs
-fpk_lapack_avx512_sgetrf
-fpk_lapack_avx512_sgetrs
-fpk_lapack_avx512_spptrf
-fpk_lapack_avx512_ssyev
-fpk_lapack_avx512_ssyevd
-fpk_lapack_avx512_strtrs
-fpk_lapack_sse2_dgeqp3
-fpk_lapack_sse2_dgeqrf
-fpk_lapack_sse2_dgerqf
-fpk_lapack_sse2_dgesvd
-fpk_lapack_sse2_dorgqr
-fpk_lapack_sse2_dormqr
-fpk_lapack_sse2_dormrq
-fpk_lapack_sse2_dpotrf
-fpk_lapack_sse2_dpotri
-fpk_lapack_sse2_dpotrs
-fpk_lapack_sse2_dgetrf
-fpk_lapack_sse2_dgetrs
-fpk_lapack_sse2_dpptrf
-fpk_lapack_sse2_dsyev
-fpk_lapack_sse2_dsyevd
-fpk_lapack_sse2_dtrtrs
-fpk_lapack_sse2_sgeqp3
-fpk_lapack_sse2_sgeqrf
-fpk_lapack_sse2_sgerqf
-fpk_lapack_sse2_sgesvd
-fpk_lapack_sse2_sorgqr
-fpk_lapack_sse2_sormqr
-fpk_lapack_sse2_sormrq
-fpk_lapack_sse2_spotrf
-fpk_lapack_sse2_spotri
-fpk_lapack_sse2_spotrs
-fpk_lapack_sse2_sgetrf
-fpk_lapack_sse2_sgetrs
-fpk_lapack_sse2_spptrf
-fpk_lapack_sse2_ssyev
-fpk_lapack_sse2_ssyevd
-fpk_lapack_sse2_strtrs
-fpk_lapack_sse42_dgeqp3
-fpk_lapack_sse42_dgeqrf
-fpk_lapack_sse42_dgerqf
-fpk_lapack_sse42_dgesvd
-fpk_lapack_sse42_dorgqr
-fpk_lapack_sse42_dormqr
-fpk_lapack_sse42_dormrq
-fpk_lapack_sse42_dpotrf
-fpk_lapack_sse42_dpotri
-fpk_lapack_sse42_dpotrs
-fpk_lapack_sse42_dgetrf
-fpk_lapack_sse42_dgetrs
-fpk_lapack_sse42_dpptrf
-fpk_lapack_sse42_dsyev
-fpk_lapack_sse42_dsyevd
-fpk_lapack_sse42_dtrtrs
-fpk_lapack_sse42_sgeqp3
-fpk_lapack_sse42_sgeqrf
-fpk_lapack_sse42_sgerqf
-fpk_lapack_sse42_sgesvd
-fpk_lapack_sse42_sorgqr
-fpk_lapack_sse42_sormqr
-fpk_lapack_sse42_sormrq
-fpk_lapack_sse42_spotrf
-fpk_lapack_sse42_spotri
-fpk_lapack_sse42_spotrs
-fpk_lapack_sse42_sgetrf
-fpk_lapack_sse42_sgetrs
-fpk_lapack_sse42_spptrf
-fpk_lapack_sse42_ssyev
-fpk_lapack_sse42_ssyevd
-fpk_lapack_sse42_strtrs
-fpk_serv_get_max_threads
-fpk_serv_set_num_threads
-fpk_serv_set_num_threads_local
-fpk_serv_get_ncpus
-fpk_serv_get_ncorespercpu
-fpk_serv_get_ht
-fpk_serv_get_nlogicalcores
-fpk_spblas_avx2_mkl_dcsrmm
-fpk_spblas_avx2_mkl_dcsrmultd
-fpk_spblas_avx2_mkl_dcsrmv
-fpk_spblas_avx2_mkl_scsrmm
-fpk_spblas_avx2_mkl_scsrmultd
-fpk_spblas_avx2_mkl_scsrmv
-fpk_spblas_avx512_mkl_dcsrmm
-fpk_spblas_avx512_mkl_dcsrmultd
-fpk_spblas_avx512_mkl_dcsrmv
-fpk_spblas_avx512_mkl_scsrmm
-fpk_spblas_avx512_mkl_scsrmultd
-fpk_spblas_avx512_mkl_scsrmv
-fpk_spblas_sse2_mkl_dcsrmm
-fpk_spblas_sse2_mkl_dcsrmultd
-fpk_spblas_sse2_mkl_dcsrmv
-fpk_spblas_sse2_mkl_scsrmm
-fpk_spblas_sse2_mkl_scsrmultd
-fpk_spblas_sse2_mkl_scsrmv
-fpk_spblas_sse42_mkl_dcsrmm
-fpk_spblas_sse42_mkl_dcsrmultd
-fpk_spblas_sse42_mkl_dcsrmv
-fpk_spblas_sse42_mkl_scsrmm
-fpk_spblas_sse42_mkl_scsrmultd
-fpk_spblas_sse42_mkl_scsrmv
-fpk_serv_enable_instructions
-fpk_serv_cpuisknm
-fpk_serv_memmove_s
-fpk_vsl_serv_threader_for
-fpk_vsl_serv_threader_for_ordered
-fpk_vsl_serv_threader_sections
-fpk_vsl_serv_threader_ordered
-fpk_vsl_serv_threader_get_num_threads_limit
diff --git a/cpp/daal/src/threading/export_mac.def b/cpp/daal/src/threading/export_mac.def
index af053955a02..7962b0a8844 100644
--- a/cpp/daal/src/threading/export_mac.def
+++ b/cpp/daal/src/threading/export_mac.def
@@ -15,204 +15,3 @@
 ;===============================================================================
 
 EXPORTS
-_fpk_blas_avx2_daxpy
-_fpk_blas_avx2_dgemm
-_fpk_blas_avx2_dgemv
-_fpk_blas_avx2_dsymm
-_fpk_blas_avx2_dsyr
-_fpk_blas_avx2_dsyrk
-_fpk_blas_avx2_saxpy
-_fpk_blas_avx2_sgemm
-_fpk_blas_avx2_sgemv
-_fpk_blas_avx2_ssymm
-_fpk_blas_avx2_ssyr
-_fpk_blas_avx2_ssyrk
-_fpk_blas_avx2_xdgemm
-_fpk_blas_avx2_xdsymm
-_fpk_blas_avx2_xdsyr
-_fpk_blas_avx2_xdsyrk
-_fpk_blas_avx2_xsgemm
-_fpk_blas_avx2_xssymm
-_fpk_blas_avx2_xssyr
-_fpk_blas_avx2_xssyrk
-_fpk_blas_avx2_xsdot
-_fpk_blas_avx2_xddot
-_fpk_blas_avx512_daxpy
-_fpk_blas_avx512_dgemm
-_fpk_blas_avx512_dgemv
-_fpk_blas_avx512_dsymm
-_fpk_blas_avx512_dsyr
-_fpk_blas_avx512_dsyrk
-_fpk_blas_avx512_saxpy
-_fpk_blas_avx512_sgemm
-_fpk_blas_avx512_sgemv
-_fpk_blas_avx512_ssymm
-_fpk_blas_avx512_ssyr
-_fpk_blas_avx512_ssyrk
-_fpk_blas_avx512_xdgemm
-_fpk_blas_avx512_xdsymm
-_fpk_blas_avx512_xdsyr
-_fpk_blas_avx512_xdsyrk
-_fpk_blas_avx512_xsgemm
-_fpk_blas_avx512_xssymm
-_fpk_blas_avx512_xssyr
-_fpk_blas_avx512_xssyrk
-_fpk_blas_avx512_xsdot
-_fpk_blas_avx512_xddot
-_fpk_blas_avx_daxpy
-_fpk_blas_avx_dgemm
-_fpk_blas_avx_dgemv
-_fpk_blas_avx_dsymm
-_fpk_blas_avx_dsyr
-_fpk_blas_avx_dsyrk
-_fpk_blas_avx_saxpy
-_fpk_blas_avx_sgemm
-_fpk_blas_avx_sgemv
-_fpk_blas_avx_ssymm
-_fpk_blas_avx_ssyr
-_fpk_blas_avx_ssyrk
-_fpk_blas_avx_xdgemm
-_fpk_blas_avx_xdsymm
-_fpk_blas_avx_xdsyr
-_fpk_blas_avx_xdsyrk
-_fpk_blas_avx_xsgemm
-_fpk_blas_avx_xssymm
-_fpk_blas_avx_xssyr
-_fpk_blas_avx_xssyrk
-_fpk_blas_avx_xsdot
-_fpk_blas_avx_xddot
-_fpk_dft_avx2_ippsSortRadixAscend_32f_I
-_fpk_dft_avx2_ippsSortRadixAscend_64f_I
-_fpk_dft_avx512_ippsSortRadixAscend_32f_I
-_fpk_dft_avx512_ippsSortRadixAscend_64f_I
-_fpk_dft_avx_ippsSortRadixAscend_32f_I
-_fpk_dft_avx_ippsSortRadixAscend_64f_I
-_fpk_lapack_avx2_dgeqp3
-_fpk_lapack_avx2_dgeqrf
-_fpk_lapack_avx2_dgerqf
-_fpk_lapack_avx2_dgesvd
-_fpk_lapack_avx2_dorgqr
-_fpk_lapack_avx2_dormqr
-_fpk_lapack_avx2_dormrq
-_fpk_lapack_avx2_dpotrf
-_fpk_lapack_avx2_dpotri
-_fpk_lapack_avx2_dpotrs
-_fpk_lapack_avx2_dgetrf
-_fpk_lapack_avx2_dgetrs
-_fpk_lapack_avx2_dpptrf
-_fpk_lapack_avx2_dsyev
-_fpk_lapack_avx2_dsyevd
-_fpk_lapack_avx2_dtrtrs
-_fpk_lapack_avx2_sgeqp3
-_fpk_lapack_avx2_sgeqrf
-_fpk_lapack_avx2_sgerqf
-_fpk_lapack_avx2_sgesvd
-_fpk_lapack_avx2_sorgqr
-_fpk_lapack_avx2_sormqr
-_fpk_lapack_avx2_sormrq
-_fpk_lapack_avx2_spotrf
-_fpk_lapack_avx2_spotri
-_fpk_lapack_avx2_spotrs
-_fpk_lapack_avx2_sgetrf
-_fpk_lapack_avx2_sgetrs
-_fpk_lapack_avx2_spptrf
-_fpk_lapack_avx2_ssyev
-_fpk_lapack_avx2_ssyevd
-_fpk_lapack_avx2_strtrs
-_fpk_lapack_avx512_dgeqp3
-_fpk_lapack_avx512_dgeqrf
-_fpk_lapack_avx512_dgerqf
-_fpk_lapack_avx512_dgesvd
-_fpk_lapack_avx512_dorgqr
-_fpk_lapack_avx512_dormqr
-_fpk_lapack_avx512_dormrq
-_fpk_lapack_avx512_dpotrf
-_fpk_lapack_avx512_dpotri
-_fpk_lapack_avx512_dpotrs
-_fpk_lapack_avx512_dgetrf
-_fpk_lapack_avx512_dgetrs
-_fpk_lapack_avx512_dpptrf
-_fpk_lapack_avx512_dsyev
-_fpk_lapack_avx512_dsyevd
-_fpk_lapack_avx512_dtrtrs
-_fpk_lapack_avx512_sgeqp3
-_fpk_lapack_avx512_sgeqrf
-_fpk_lapack_avx512_sgerqf
-_fpk_lapack_avx512_sgesvd
-_fpk_lapack_avx512_sorgqr
-_fpk_lapack_avx512_sormqr
-_fpk_lapack_avx512_sormrq
-_fpk_lapack_avx512_spotrf
-_fpk_lapack_avx512_spotri
-_fpk_lapack_avx512_spotrs
-_fpk_lapack_avx512_sgetrf
-_fpk_lapack_avx512_sgetrs
-_fpk_lapack_avx512_spptrf
-_fpk_lapack_avx512_ssyev
-_fpk_lapack_avx512_ssyevd
-_fpk_lapack_avx512_strtrs
-_fpk_lapack_avx_dgeqp3
-_fpk_lapack_avx_dgeqrf
-_fpk_lapack_avx_dgerqf
-_fpk_lapack_avx_dgesvd
-_fpk_lapack_avx_dorgqr
-_fpk_lapack_avx_dormqr
-_fpk_lapack_avx_dormrq
-_fpk_lapack_avx_dpotrf
-_fpk_lapack_avx_dpotri
-_fpk_lapack_avx_dpotrs
-_fpk_lapack_avx_dgetrf
-_fpk_lapack_avx_dgetrs
-_fpk_lapack_avx_dpptrf
-_fpk_lapack_avx_dsyev
-_fpk_lapack_avx_dsyevd
-_fpk_lapack_avx_dtrtrs
-_fpk_lapack_avx_sgeqp3
-_fpk_lapack_avx_sgeqrf
-_fpk_lapack_avx_sgerqf
-_fpk_lapack_avx_sgesvd
-_fpk_lapack_avx_sorgqr
-_fpk_lapack_avx_sormqr
-_fpk_lapack_avx_sormrq
-_fpk_lapack_avx_spotrf
-_fpk_lapack_avx_spotri
-_fpk_lapack_avx_spotrs
-_fpk_lapack_avx_sgetrf
-_fpk_lapack_avx_sgetrs
-_fpk_lapack_avx_spptrf
-_fpk_lapack_avx_ssyev
-_fpk_lapack_avx_ssyevd
-_fpk_lapack_avx_strtrs
-_fpk_serv_get_max_threads
-_fpk_serv_set_num_threads
-_fpk_serv_set_num_threads_local
-_fpk_serv_get_ncpus
-_fpk_serv_get_ncorespercpu
-_fpk_serv_get_ht
-_fpk_serv_get_nlogicalcores
-_fpk_spblas_avx2_mkl_dcsrmm
-_fpk_spblas_avx2_mkl_dcsrmultd
-_fpk_spblas_avx2_mkl_dcsrmv
-_fpk_spblas_avx2_mkl_scsrmm
-_fpk_spblas_avx2_mkl_scsrmultd
-_fpk_spblas_avx2_mkl_scsrmv
-_fpk_spblas_avx512_mkl_dcsrmm
-_fpk_spblas_avx512_mkl_dcsrmultd
-_fpk_spblas_avx512_mkl_dcsrmv
-_fpk_spblas_avx512_mkl_scsrmm
-_fpk_spblas_avx512_mkl_scsrmultd
-_fpk_spblas_avx512_mkl_scsrmv
-_fpk_spblas_avx_mkl_dcsrmm
-_fpk_spblas_avx_mkl_dcsrmultd
-_fpk_spblas_avx_mkl_dcsrmv
-_fpk_spblas_avx_mkl_scsrmm
-_fpk_spblas_avx_mkl_scsrmultd
-_fpk_spblas_avx_mkl_scsrmv
-_fpk_serv_enable_instructions
-_fpk_serv_cpuisknm
-_fpk_serv_memmove_s
-_fpk_vsl_serv_threader_for
-_fpk_vsl_serv_threader_for_ordered
-_fpk_vsl_serv_threader_sections
-_fpk_vsl_serv_threader_ordered
-_fpk_vsl_serv_threader_get_num_threads_limit
diff --git a/cpp/daal/src/threading/threading.cpp b/cpp/daal/src/threading/threading.cpp
index 15c39368238..3b280229ee7 100644
--- a/cpp/daal/src/threading/threading.cpp
+++ b/cpp/daal/src/threading/threading.cpp
@@ -103,7 +103,7 @@ DAAL_EXPORT size_t _setNumberOfThreads(const size_t numThreads, void ** globalCo
     return 1;
 }
 
-DAAL_EXPORT void _daal_threader_for(int n, int threads_request, const void * a, daal::functype func)
+DAAL_EXPORT void _daal_threader_for(int n, int reserved, const void * a, daal::functype func)
 {
     if (daal::threader_env()->getNumberOfThreads() > 1)
     {
@@ -160,7 +160,7 @@ DAAL_EXPORT void _daal_threader_for_blocked_size(size_t n, size_t block, const v
     }
 }
 
-DAAL_EXPORT void _daal_threader_for_simple(int n, int threads_request, const void * a, daal::functype func)
+DAAL_EXPORT void _daal_threader_for_simple(int n, int reserved, const void * a, daal::functype func)
 {
     if (daal::threader_env()->getNumberOfThreads() > 1)
     {
@@ -318,7 +318,7 @@ DAAL_PARALLEL_SORT_IMPL(daal::IdxValType<double>, pair_fp64_uint64)
 
 #undef DAAL_PARALLEL_SORT_IMPL
 
-DAAL_EXPORT void _daal_threader_for_blocked(int n, int threads_request, const void * a, daal::functype2 func)
+DAAL_EXPORT void _daal_threader_for_blocked(int n, int reserved, const void * a, daal::functype2 func)
 {
     if (daal::threader_env()->getNumberOfThreads() > 1)
     {
diff --git a/cpp/daal/src/threading/threading.h b/cpp/daal/src/threading/threading.h
index 0b4a9881b97..ca8661f2203 100644
--- a/cpp/daal/src/threading/threading.h
+++ b/cpp/daal/src/threading/threading.h
@@ -198,107 +198,196 @@ inline size_t setNumberOfThreads(const size_t numThreads, void ** globalControl)
 template <typename F>
 inline void threader_func(int i, const void * a)
 {
-    const F & lambda = *static_cast<const F *>(a);
-    lambda(i);
+    const F & func = *static_cast<const F *>(a);
+    func(i);
 }
 
 template <typename F>
 inline void static_threader_func(size_t i, size_t tid, const void * a)
 {
-    const F & lambda = *static_cast<const F *>(a);
-    lambda(i, tid);
+    const F & func = *static_cast<const F *>(a);
+    func(i, tid);
 }
 
 template <typename F>
 inline void threader_func_b(int i0, int in, const void * a)
 {
-    const F & lambda = *static_cast<const F *>(a);
-    lambda(i0, in);
+    const F & func = *static_cast<const F *>(a);
+    func(i0, in);
 }
 
 template <typename F>
 inline void threader_func_break(int i, bool & needBreak, const void * a)
 {
-    const F & lambda = *static_cast<const F *>(a);
-    lambda(i, needBreak);
+    const F & func = *static_cast<const F *>(a);
+    func(i, needBreak);
 }
 
+/// Pass a function to be executed in a for loop to the threading layer.
+/// The maximal number of iterations in the loop is `2^31 - 1 (INT32_MAX)`.
+/// The default scheduling of the threading layer is used to assign
+/// the iterations of the loop to threads.
+/// Data dependencies between the iterations are allowed, but may requre the use
+/// of synchronization primitives.
+///
+/// @tparam F   Callable object of type `[/* captures */](int i) -> void`,
+///             where `i` is the loop's iteration index, `0 <= i < n`.
+///
+/// @param[in] n        Number of iterations in the for loop.
+/// @param[in] reserved Parameter reserved for the future. Currently unused.
+/// @param[in] func     Callable object that defines the loop body.
 template <typename F>
-inline void threader_for(int n, int threads_request, const F & lambda)
+inline void threader_for(int n, int reserved, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
-    _daal_threader_for(n, threads_request, a, threader_func<F>);
+    _daal_threader_for(n, reserved, a, threader_func<F>);
 }
 
+/// Pass a function to be executed in a for loop to the threading layer.
+/// The maximal number of iterations in the loop is `2^63 - 1 (INT64_MAX)`.
+/// The default scheduling of the threading layer is used to assign
+/// the iterations of the loop to threads.
+/// The iterations of the loop should be logically independent.
+/// Data dependencies between the iterations are allowed, but may requre the use
+/// of synchronization primitives.
+///
+/// @tparam F   Callable object of type `[/* captures */](int64_t i) -> void`,
+///             where `i` is the loop's iteration index, `0 <= i < n`.
+///
+/// @param[in] n        Number of iterations in the for loop.
+/// @param[in] func     Callable object that defines the loop body.
 template <typename F>
-inline void threader_for_int64(int64_t n, const F & lambda)
+inline void threader_for_int64(int64_t n, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_threader_for_int64(n, a, threader_func<F>);
 }
 
+/// Pass a function to be executed in a for loop to the threading layer.
+/// The maximal number of iterations in the loop is 2^31 - 1.
+///
+/// The specifics of this loop comparing to `threader_for` is that the iteration space
+/// of the loop is always chunked with chunk size 1.
+/// This means the threading layer tries to assign consecutive iterations to
+/// different threads, if possible.
+/// In case of oneTBB threading backend this means that `simple_partitioner`
+/// (https://oneapi-src.github.io/oneTBB/main/tbb_userguide/Partitioner_Summary.html)
+/// with chunk size 1 is used to produce iteration to threads mappings.
+///
+/// Data dependencies between the iterations are allowed, but may requre the use
+/// of synchronization primitives.
+///
+/// @tparam F   Callable object of type `[/* captures */](int i) -> void`,
+///             where `i` is the loop's iteration index, `0 <= i < n`.
+///
+/// @param[in] n        Number of iterations in the for loop.
+/// @param[in] reserved Parameter reserved for the future. Currently unused.
+/// @param[in] func     Callable object that defines iteration's body.
 template <typename F>
-inline void threader_for_simple(int n, int threads_request, const F & lambda)
+inline void threader_for_simple(int n, int reserved, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
-    _daal_threader_for_simple(n, threads_request, a, threader_func<F>);
+    _daal_threader_for_simple(n, reserved, a, threader_func<F>);
 }
 
 template <typename F>
-inline void threader_for_int32ptr(const int * begin, const int * end, const F & lambda)
+inline void threader_for_int32ptr(const int * begin, const int * end, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_threader_for_int32ptr(begin, end, a, threader_func<F>);
 }
 
+/// Execute the for loop defined by the input parameters in parallel.
+/// The maximal number of iterations in the loop is `SIZE_MAX` in C99 standard.
+///
+/// The work is scheduled statically across threads.
+/// This means that the work is always scheduled in the same way across the threads:
+/// each thread processes the same set of iterations on each invocation of this loop.
+///
+/// It is recommended to use this parallel loop if each iteration of the loop
+/// performs equal amount of work.
+///
+/// Let `t` be the number of threads available to oneDAL. The number of iterations
+/// processed by each threads (except maybe the last one) is computed as:
+/// `nI = (n + t - 1) / t`
+///
+/// Here is how the work is split across the threads:
+/// The 1st thread executes iterations `0, ..., nI - 1`;
+/// the 2nd thread executes iterations `nI, ..., 2 * nI - 1`;
+/// ...
+/// the `t`-th thread executes iterations `(t - 1) * nI, ..., n - 1`.
+///
+/// @tparam F   Callable object of type `[/* captures */](size_t i, size_t tid) -> void`,
+///             where
+///                 `i` is the loop's iteration index, `0 <= i < n`;
+///                 `tid` is the index of the thread, `0 <= tid < t`.
+///
+/// @param[in] n        Number of iterations in the for loop.
+/// @param[in] func     Callable object that defines iteration's body.
 template <typename F>
-inline void static_threader_for(size_t n, const F & lambda)
+inline void static_threader_for(size_t n, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_static_threader_for(n, a, static_threader_func<F>);
 }
 
+/// Pass a function to be executed in a for loop to the threading layer.
+/// The maximal number of iterations in the loop is `2^31 - 1 INT32_MAX`.
+/// The default scheduling of the threading layer is used to assign
+/// the iterations of the loop to threads.
+///
+/// @tparam F   Callable object of type `[/* captures */](int beginRange, int endRange) -> void`
+///             where
+///                 `beginRange` is the starting index of the loop iterations block to be
+///                                processed by a thread, `0 <= beginRange < n`;
+///                 `endRange`   is the index after the end of the loop's iterations block to be
+///                                processed by a thread, `beginRange < endRange <= n`;
+///
+/// @param[in] n        Number of iterations in the for loop.
+/// @param[in] reserved Parameter reserved for the future. Currently unused.
+/// @param[in] func     Callable object that processes the block of loop's iterations
+///                     `[beginRange, endRange)`.
 template <typename F>
-inline void threader_for_blocked(int n, int threads_request, const F & lambda)
+inline void threader_for_blocked(int n, int reserved, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
-    _daal_threader_for_blocked(n, threads_request, a, threader_func_b<F>);
+    _daal_threader_for_blocked(n, reserved, a, threader_func_b<F>);
 }
 
 template <typename F>
-inline void threader_for_optional(int n, int threads_request, const F & lambda)
+inline void threader_for_optional(int n, int threads_request, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_threader_for_optional(n, threads_request, a, threader_func<F>);
 }
 
 template <typename F>
-inline void threader_for_break(int n, int threads_request, const F & lambda)
+inline void threader_for_break(int n, int threads_request, const F & func)
 {
-    const void * a = static_cast<const void *>(&lambda);
+    const void * a = static_cast<const void *>(&func);
 
     _daal_threader_for_break(n, threads_request, a, threader_func_break<F>);
 }
 
-template <typename lambdaType>
+template <typename callableType>
 inline void * tls_func(const void * a)
 {
-    const lambdaType & lambda = *static_cast<const lambdaType *>(a);
-    return lambda();
+    const callableType & func = *static_cast<const callableType *>(a);
+    return func();
 }
 
-template <typename F, typename lambdaType>
+template <typename F, typename callableType>
 inline void tls_reduce_func(void * v, const void * a)
 {
-    const lambdaType & lambda = *static_cast<const lambdaType *>(a);
-    lambda((F)v);
+    const callableType & func = *static_cast<const callableType *>(a);
+    func((F)v);
 }
 
 struct tlsBase
@@ -313,32 +402,49 @@ class tls_deleter : public tlsBase
     virtual void del(void * a) = 0;
 };
 
-template <typename lambdaType>
+template <typename callableType>
 class tls_deleter_ : public tls_deleter
 {
 public:
     virtual ~tls_deleter_() {}
-    virtual void del(void * a) { delete static_cast<lambdaType *>(a); }
+    virtual void del(void * a) { delete static_cast<callableType *>(a); }
 };
 
+/// Thread-local storage (TLS).
+/// Can change its local variable after a nested parallel constructs.
+/// @note Thread-local storage in nested parallel regions is, in general, not thread local.
+/// The use of nested parallelism should be avoided if possible, otherwise extra care
+/// must be taken with thread-local values.
+///
+/// @tparam F  Type of the data located in the storage
 template <typename F>
 class tls : public tlsBase
 {
 public:
-    template <typename lambdaType>
-    explicit tls(const lambdaType & lambda)
+    /// Initialize thread-local storage
+    ///
+    /// @tparam callableType  Callable object of type `[/* captures */]() -> F`
+    ///
+    /// @param func Callable object that initializes a thread-local storage
+    template <typename callableType>
+    explicit tls(const callableType & func)
     {
-        lambdaType * locall = new lambdaType(lambda);
-        d                   = new tls_deleter_<lambdaType>();
+        callableType * localfunc = new callableType(func);
+        d                        = new tls_deleter_<callableType>();
 
-        //const void* ac = static_cast<const void*>(&lambda);
-        const void * ac = static_cast<const void *>(locall);
+        //const void* ac = static_cast<const void*>(&func);
+        const void * ac = static_cast<const void *>(localfunc);
         void * a        = const_cast<void *>(ac);
         voidLambda      = a;
 
-        tlsPtr = _daal_get_tls_ptr(a, tls_func<lambdaType>);
+        tlsPtr = _daal_get_tls_ptr(a, tls_func<callableType>);
     }
 
+    /// Destroys the memory associated with a thread-local storage
+    ///
+    /// @note TLS does not release the memory allocated by a callable object
+    ///       provided to the constructor.
+    ///       Developers are responsible for deletion of that memory.
     virtual ~tls()
     {
         d->del(voidLambda);
@@ -346,26 +452,43 @@ class tls : public tlsBase
         _daal_del_tls_ptr(tlsPtr);
     }
 
+    /// Access a local data of a thread by value
+    ///
+    /// @return When first invoked by a thread, a callable object provided to the constructor is
+    ///         called to initialize the local data of the thread and return it.
+    ///         All the following invocations just return the same thread-local data.
     F local()
     {
         void * pf = _daal_get_tls_local(tlsPtr);
         return (static_cast<F>(pf));
     }
 
-    template <typename lambdaType>
-    void reduce(const lambdaType & lambda)
+    /// Sequential reduction.
+    ///
+    /// @tparam callableType  Callable object of type `[/* captures */](F) -> void`
+    ///
+    /// @param func Callable object that is applied to each element of thread-local
+    ///             storage sequentially.
+    template <typename callableType>
+    void reduce(const callableType & func)
     {
-        const void * ac = static_cast<const void *>(&lambda);
+        const void * ac = static_cast<const void *>(&func);
         void * a        = const_cast<void *>(ac);
-        _daal_reduce_tls(tlsPtr, a, tls_reduce_func<F, lambdaType>);
+        _daal_reduce_tls(tlsPtr, a, tls_reduce_func<F, callableType>);
     }
 
-    template <typename lambdaType>
-    void parallel_reduce(const lambdaType & lambda)
+    /// Parallel reduction.
+    ///
+    /// @tparam callableType  Callable object of type `[/* captures */](F) -> void`
+    ///
+    /// @param func     Callable object that is applied to each element of thread-local
+    ///                 storage in parallel.
+    template <typename callableType>
+    void parallel_reduce(const callableType & func)
     {
-        const void * ac = static_cast<const void *>(&lambda);
+        const void * ac = static_cast<const void *>(&func);
         void * a        = const_cast<void *>(ac);
-        _daal_parallel_reduce_tls(tlsPtr, a, tls_reduce_func<F, lambdaType>);
+        _daal_parallel_reduce_tls(tlsPtr, a, tls_reduce_func<F, callableType>);
     }
 
 private:
@@ -374,11 +497,11 @@ class tls : public tlsBase
     tls_deleter * d;
 };
 
-template <typename F, typename lambdaType>
+template <typename F, typename callableType>
 inline void * creater_func(const void * a)
 {
-    const lambdaType & lambda = *static_cast<const lambdaType *>(a);
-    return lambda();
+    const callableType & func = *static_cast<const callableType *>(a);
+    return func();
 }
 
 class static_tls_deleter
@@ -388,20 +511,28 @@ class static_tls_deleter
     virtual void del(void * a) = 0;
 };
 
-template <typename lambdaType>
+template <typename callableType>
 class static_tls_deleter_ : public static_tls_deleter
 {
 public:
     virtual ~static_tls_deleter_() {}
-    virtual void del(void * a) { delete static_cast<lambdaType *>(a); }
+    virtual void del(void * a) { delete static_cast<callableType *>(a); }
 };
 
+/// Thread-local storage (TLS) for the case of static parallel work scheduling.
+///
+/// @tparam F  Type of the data located in the storage
 template <typename F>
 class static_tls
 {
 public:
-    template <typename lambdaType>
-    explicit static_tls(const lambdaType & lambda)
+    /// Initialize thread-local storage.
+    ///
+    /// @tparam callableType  Callable object of type `[/* captures */]() -> F`
+    ///
+    /// @param func Callable object that initializes a thread-local storage
+    template <typename callableType>
+    explicit static_tls(const callableType & func)
     {
         _nThreads = threader_get_max_threads_number();
 
@@ -417,8 +548,8 @@ class static_tls
             _storage[i] = nullptr;
         }
 
-        lambdaType * locall = new lambdaType(lambda);
-        _deleter            = new static_tls_deleter_<lambdaType>();
+        callableType * locall = new callableType(func);
+        _deleter              = new static_tls_deleter_<callableType>();
         if (!locall || !_deleter)
         {
             return;
@@ -428,9 +559,14 @@ class static_tls
         void * a        = const_cast<void *>(ac);
         _creater        = a;
 
-        _creater_func = creater_func<F, lambdaType>;
+        _creater_func = creater_func<F, callableType>;
     }
 
+    /// Destroys the memory associated with a thread-local storage.
+    ///
+    /// @note Static TLS does not release the memory allocated by a callable object
+    ///       provided to the constructor.
+    ///       Developers are responsible for deletion of that memory.
     virtual ~static_tls()
     {
         if (_deleter)
@@ -441,9 +577,16 @@ class static_tls
         delete[] _storage;
     }
 
+    /// Access a local data of a specified thread by value.
+    ///
+    /// @param tid  Index of the thread.
+    ///
+    /// @return When first invoked by a thread, a callable object provided to the constructor is
+    ///         called to initialize the local data of the thread and return it.
+    ///         All the following invocations just return the same thread-local data.
     F local(size_t tid)
     {
-        if (_storage)
+        if (_storage && tid < _nThreads)
         {
             if (!_storage[tid])
             {
@@ -458,18 +601,27 @@ class static_tls
         }
     }
 
-    template <typename lambdaType>
-    void reduce(const lambdaType & lambda)
+    /// Sequential reduction.
+    ///
+    /// @tparam callableType  Callable object of type `[/* captures */](F) -> void`
+    ///
+    /// @param func Callable object that is applied to each element of thread-local
+    ///             storage sequentially.
+    template <typename callableType>
+    void reduce(const callableType & func)
     {
         if (_storage)
         {
             for (size_t i = 0; i < _nThreads; ++i)
             {
-                if (_storage[i]) lambda(_storage[i]);
+                if (_storage[i]) func(_storage[i]);
             }
         }
     }
 
+    /// Full number of threads.
+    ///
+    /// @return Total number of threads available to oneDAL.
     size_t nthreads() const { return _nThreads; }
 
 private:
@@ -480,25 +632,43 @@ class static_tls
     static_tls_deleter * _deleter    = nullptr;
 };
 
+/// Local storage (LS) for the data of a thread.
+/// Does not change its local variable after nested parallel constructs,
+/// but can have performance penalties compared to thread-local storage type `daal::tls`.
+/// Can be safely used in case of nested parallel regions.
+///
+/// @tparam F  Type of the data located in the storage
 template <typename F>
 class ls : public tlsBase
 {
 public:
-    template <typename lambdaType>
-    explicit ls(const lambdaType & lambda, const bool isTls = false)
+    /// Initialize local storage.
+    ///
+    /// @tparam callableType  Callable object of type `[/* captures */]() -> F`
+    ///
+    /// @param func     Callable object that initializes local storage
+    /// @param isTls    if `true`, then local storage is a thread-local storage (`daal::tls`)
+    ///                 and might have problems in case of nested parallel regions.
+    template <typename callableType>
+    explicit ls(const callableType & func, const bool isTls = false)
     {
-        _isTls              = isTls;
-        lambdaType * locall = new lambdaType(lambda);
-        d                   = new tls_deleter_<lambdaType>();
+        _isTls                   = isTls;
+        callableType * localfunc = new callableType(func);
+        d                        = new tls_deleter_<callableType>();
 
-        //const void* ac = static_cast<const void*>(&lambda);
-        const void * ac = static_cast<const void *>(locall);
+        //const void* ac = static_cast<const void*>(&func);
+        const void * ac = static_cast<const void *>(localfunc);
         void * a        = const_cast<void *>(ac);
         voidLambda      = a;
 
-        lsPtr = _isTls ? _daal_get_tls_ptr(a, tls_func<lambdaType>) : _daal_get_ls_ptr(a, tls_func<lambdaType>);
+        lsPtr = _isTls ? _daal_get_tls_ptr(a, tls_func<callableType>) : _daal_get_ls_ptr(a, tls_func<callableType>);
     }
 
+    /// Destroys the memory associated with local storage.
+    ///
+    /// @note `ls` does not release the memory allocated by a callable object
+    ///       provided to the constructor.
+    ///       Developers are responsible for deletion of that memory.
     virtual ~ls()
     {
         d->del(voidLambda);
@@ -506,6 +676,11 @@ class ls : public tlsBase
         _isTls ? _daal_del_tls_ptr(lsPtr) : _daal_del_ls_ptr(lsPtr);
     }
 
+    /// Access the local data of a thread by value.
+    ///
+    /// @return When first invoked by a thread, a callable object provided to the constructor is
+    ///         called to initialize the local data of the thread and return it.
+    ///         All the following invocations just return the same thread-local data.
     F local()
     {
         void * pf = _isTls ? _daal_get_tls_local(lsPtr) : _daal_get_ls_local(lsPtr);
@@ -517,12 +692,18 @@ class ls : public tlsBase
         if (!_isTls) _daal_release_ls_local(lsPtr, p);
     }
 
-    template <typename lambdaType>
-    void reduce(const lambdaType & lambda)
+    /// Sequential reduction.
+    ///
+    /// @tparam callableType  Callable object of type `[/* captures */](F) -> void`
+    ///
+    /// @param func Callable object that is applied to each element of thread-local
+    ///             storage sequentially.
+    template <typename callableType>
+    void reduce(const callableType & func)
     {
-        const void * ac = static_cast<const void *>(&lambda);
+        const void * ac = static_cast<const void *>(&func);
         void * a        = const_cast<void *>(ac);
-        _isTls ? _daal_reduce_tls(lsPtr, a, tls_reduce_func<F, lambdaType>) : _daal_reduce_ls(lsPtr, a, tls_reduce_func<F, lambdaType>);
+        _isTls ? _daal_reduce_tls(lsPtr, a, tls_reduce_func<F, callableType>) : _daal_reduce_ls(lsPtr, a, tls_reduce_func<F, callableType>);
     }
 
 private:
diff --git a/cpp/oneapi/dal/BUILD b/cpp/oneapi/dal/BUILD
index ff6d770cc7c..7a3ba863105 100644
--- a/cpp/oneapi/dal/BUILD
+++ b/cpp/oneapi/dal/BUILD
@@ -30,7 +30,7 @@ dal_module(
         "@onedal//cpp/daal:data_management",
     ],
     dpc_deps = [
-        "@micromkl_dpc//:mkl_dpc",
+        "@mkl//:mkl_dpc",
     ],
 )
 
@@ -53,7 +53,6 @@ dal_collect_modules(
     modules = [
         "algo",
         "io",
-        "backend/micromkl",
         "backend/primitives",
     ],
 )
@@ -175,7 +174,6 @@ dal_collect_test_suites(
         "io",
         "table",
         "util",
-        "backend/micromkl",
         "backend/primitives",
     ],
     tests = [
diff --git a/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp b/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp
index dba39bb9d01..228e9e01863 100644
--- a/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp
+++ b/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp
@@ -371,7 +371,7 @@ sycl::event handle_empty_clusters(const dal::backend::context_gpu& ctx,
     auto event = queue.submit([&](sycl::handler& cgh) {
         cgh.depends_on(deps);
         cgh.parallel_for(range, [=](auto it) {
-            const auto local_id = it.get_local_id(1);
+            const auto local_id = it.get_local_id()[1];
             for (std::int64_t cluster_id = rank; cluster_id < num_clusters;
                  cluster_id += rank_count) {
                 // no need to handle non-empty clusters
diff --git a/cpp/oneapi/dal/algo/pca/backend/gpu/finalize_train_kernel_cov_impl_dpc.cpp b/cpp/oneapi/dal/algo/pca/backend/gpu/finalize_train_kernel_cov_impl_dpc.cpp
index 12862ab04ba..17d504b804e 100644
--- a/cpp/oneapi/dal/algo/pca/backend/gpu/finalize_train_kernel_cov_impl_dpc.cpp
+++ b/cpp/oneapi/dal/algo/pca/backend/gpu/finalize_train_kernel_cov_impl_dpc.cpp
@@ -113,18 +113,22 @@ result_t finalize_train_kernel_cov_impl<Float>::operator()(const descriptor_t& d
         data_to_compute = corr;
     }
 
-    auto [eigvecs, eigvals] = compute_eigenvectors_on_host(q,
-                                                           std::move(data_to_compute),
-                                                           component_count,
-                                                           { corr_event, vars_event, cov_event });
+    auto [eigvals, syevd_event] =
+        syevd_computation(q, data_to_compute, { cov_event, corr_event, vars_event });
+
+    auto flipped_eigvals_host = flip_eigenvalues(q, eigvals, component_count, { syevd_event });
+
+    auto flipped_eigenvectors_host =
+        flip_eigenvectors(q, data_to_compute, component_count, { syevd_event });
     if (desc.get_result_options().test(result_options::eigenvalues)) {
-        result.set_eigenvalues(homogen_table::wrap(eigvals.flatten(), 1, component_count));
+        result.set_eigenvalues(
+            homogen_table::wrap(flipped_eigvals_host.flatten(), 1, component_count));
     }
 
     if (desc.get_result_options().test(result_options::singular_values)) {
         auto singular_values =
             compute_singular_values_on_host(q,
-                                            eigvals,
+                                            flipped_eigvals_host,
                                             rows_count_global,
                                             { corr_event, vars_event, cov_event });
         result.set_singular_values(
@@ -135,7 +139,7 @@ result_t finalize_train_kernel_cov_impl<Float>::operator()(const descriptor_t& d
         auto vars_host = vars.to_host(q);
         auto explained_variances_ratio =
             compute_explained_variances_on_host(q,
-                                                eigvals,
+                                                flipped_eigvals_host,
                                                 vars_host,
                                                 { corr_event, vars_event, cov_event });
         result.set_explained_variances_ratio(
@@ -143,12 +147,13 @@ result_t finalize_train_kernel_cov_impl<Float>::operator()(const descriptor_t& d
     }
 
     if (desc.get_deterministic()) {
-        sign_flip(eigvecs);
+        sign_flip(flipped_eigenvectors_host);
     }
 
     if (desc.get_result_options().test(result_options::eigenvectors)) {
-        result.set_eigenvectors(
-            homogen_table::wrap(eigvecs.flatten(), component_count, column_count));
+        result.set_eigenvectors(homogen_table::wrap(flipped_eigenvectors_host.flatten(),
+                                                    component_count,
+                                                    column_count));
     }
 
     return result;
diff --git a/cpp/oneapi/dal/algo/pca/backend/gpu/misc.hpp b/cpp/oneapi/dal/algo/pca/backend/gpu/misc.hpp
index d86ee3a04be..8df5d89d985 100644
--- a/cpp/oneapi/dal/algo/pca/backend/gpu/misc.hpp
+++ b/cpp/oneapi/dal/algo/pca/backend/gpu/misc.hpp
@@ -57,6 +57,113 @@ auto compute_sums(sycl::queue& queue,
     return std::make_tuple(sums, sums_event);
 }
 
+///  A wrapper that computes 1d array of eigenvalues and 2d array of eigenvectors from the covariance matrix
+///
+/// @tparam Float Floating-point type used to perform computations
+///
+/// @param[in]  queue The SYCL queue
+/// @param[in]  corr  The input covariance/correlation matrix of size `column_count` x `column_count`
+/// @param[in]  deps  Events indicating availability of the `data` for reading or writing
+///
+/// @return A tuple of two elements, where the first element is the resulting 2d array of eigenvectors
+/// of size `component_count` x `column_count` and the second element is the resulting 1d array of eigenvalues
+template <typename Float>
+auto syevd_computation(sycl::queue& queue,
+                       pr::ndview<Float, 2>& corr,
+                       const bk::event_vector& deps = {}) {
+    sycl::event::wait_and_throw(deps);
+
+    const std::int64_t column_count = corr.get_dimension(1);
+    auto eigenvalues = pr::ndarray<Float, 1>::empty(queue, { column_count }, alloc::device);
+
+    std::int64_t lda = column_count;
+
+    sycl::event syevd_event;
+    {
+        syevd_event = pr::syevd<mkl::job::vec, mkl::uplo::upper>(queue,
+                                                                 column_count,
+                                                                 corr,
+                                                                 lda,
+                                                                 eigenvalues,
+                                                                 { deps });
+    }
+    syevd_event.wait_and_throw();
+    return std::make_tuple(eigenvalues, syevd_event);
+}
+
+///  A wrapper that flips 2d array of eigenvectors from the syevd result in necessary order
+///
+/// @tparam Float Floating-point type used to perform computations
+///
+/// @param[in]  queue The SYCL queue
+/// @param[in]  data  The input eigenvectors in ascending order of size `column_count` x `column_count`
+/// @param[in]  component_count  The number of `component_count` of the descriptor
+/// @param[in]  deps  Events indicating availability of the `data` for reading or writing
+///
+/// @return The resulting 2d array of eigenvectors
+template <typename Float>
+auto flip_eigenvectors(sycl::queue& queue,
+                       pr::ndview<Float, 2>& data,
+                       std::int64_t component_count,
+                       const bk::event_vector& deps = {}) {
+    const std::int64_t column_count = data.get_dimension(1);
+    const std::int64_t row_count = data.get_dimension(0);
+    auto data_ptr = data.get_data();
+    auto eigenvectors =
+        pr::ndarray<Float, 2>::empty(queue, { component_count, column_count }, alloc::device);
+    auto eigenvectors_ptr = eigenvectors.get_mutable_data();
+    auto flip_event = queue.submit([&](sycl::handler& h) {
+        const auto range = bk::make_range_2d(component_count, column_count);
+        h.depends_on(deps);
+        h.parallel_for(range, [=](sycl::id<2> id) {
+            const std::int64_t row = id[0];
+            const std::int64_t column = id[1];
+            eigenvectors_ptr[row * column_count + column] =
+                data_ptr[(row_count - 1 - row) * column_count + column];
+        });
+    });
+
+    flip_event.wait_and_throw();
+    auto flipped_eigenvectors_host = eigenvectors.to_host(queue);
+
+    return flipped_eigenvectors_host;
+}
+
+///  A wrapper that flips 1d array of eigenvalues from syevd result in descending order
+///
+/// @tparam Float Floating-point type used to perform computations
+///
+/// @param[in]  queue The SYCL queue
+/// @param[in]  eigenvalues  The input eigenvalues in ascending order of size `column_count`
+/// @param[in]  component_count  The number of `component_count` of the descriptor
+/// @param[in]  deps  Events indicating availability of the `data` for reading or writing
+///
+/// @return The resulting 1d array of eigenvalues
+template <typename Float>
+auto flip_eigenvalues(sycl::queue& queue,
+                      pr::ndview<Float, 1>& eigenvalues,
+                      std::int64_t component_count,
+                      const bk::event_vector& deps = {}) {
+    auto column_count = eigenvalues.get_dimension(0);
+    auto data_ptr = eigenvalues.get_data();
+    auto flipped_eigenvalues =
+        pr::ndarray<Float, 1>::empty(queue, { component_count }, alloc::device);
+    auto flipped_eigenvalues_ptr = flipped_eigenvalues.get_mutable_data();
+    auto flip_event = queue.submit([&](sycl::handler& h) {
+        const auto range = bk::make_range_1d(component_count);
+        h.depends_on(deps);
+        h.parallel_for(range, [=](sycl::id<1> id) {
+            const std::int64_t col = id[0];
+            flipped_eigenvalues_ptr[col] = data_ptr[(column_count - 1) - col];
+        });
+    });
+
+    flip_event.wait_and_throw();
+    auto flipped_eigenvalues_host = flipped_eigenvalues.to_host(queue);
+
+    return flipped_eigenvalues_host;
+}
+
 ///  A wrapper that computes 1d array of means of the columns from precomputed sums
 ///
 /// @tparam Float Floating-point type used to perform computations
@@ -290,36 +397,6 @@ auto compute_correlation_from_covariance(sycl::queue& queue,
 
 // SVD method
 
-///  A wrapper that computes 1d array of eigenvalues and 2d array of eigenvectors from the covariance matrix
-///
-/// @tparam Float Floating-point type used to perform computations
-///
-/// @param[in]  queue The SYCL queue
-/// @param[in]  corr  The input covariance/correlation matrix of size `column_count` x `column_count`
-/// @param[in]  component_count  The number of `component_count` of the descriptor
-/// @param[in]  deps  Events indicating availability of the `data` for reading or writing
-///
-/// @return A tuple of two elements, where the first element is the resulting 2d array of eigenvectors
-/// of size `component_count` x `column_count` and the second element is the resulting 1d array of eigenvalues
-template <typename Float>
-auto compute_eigenvectors_on_host(sycl::queue& queue,
-                                  pr::ndarray<Float, 2>&& corr,
-                                  std::int64_t component_count,
-                                  const dal::backend::event_vector& deps = {}) {
-    ONEDAL_PROFILER_TASK(compute_eigenvectors_on_host);
-    ONEDAL_ASSERT(corr.get_dimension(0) == corr.get_dimension(1),
-                  "Correlation matrix must be square");
-    ONEDAL_ASSERT(corr.get_dimension(0) > 0);
-    const std::int64_t column_count = corr.get_dimension(0);
-
-    auto eigvecs = pr::ndarray<Float, 2>::empty({ component_count, column_count });
-    auto eigvals = pr::ndarray<Float, 1>::empty(component_count);
-    auto host_corr = corr.to_host(queue, deps);
-    pr::sym_eigvals_descending(host_corr, component_count, eigvecs, eigvals);
-
-    return std::make_tuple(eigvecs, eigvals);
-}
-
 ///  A wrapper that computes 1d array of eigenvalues from the 1d array of the singular values
 ///
 /// @tparam Float Floating-point type used to perform computations
diff --git a/cpp/oneapi/dal/algo/pca/backend/gpu/train_kernel_cov_impl_dpc.cpp b/cpp/oneapi/dal/algo/pca/backend/gpu/train_kernel_cov_impl_dpc.cpp
index feaa810230f..da397a273a5 100644
--- a/cpp/oneapi/dal/algo/pca/backend/gpu/train_kernel_cov_impl_dpc.cpp
+++ b/cpp/oneapi/dal/algo/pca/backend/gpu/train_kernel_cov_impl_dpc.cpp
@@ -70,7 +70,7 @@ result_t train_kernel_cov_impl<Float>::operator()(const descriptor_t& desc, cons
     const auto data_nd = pr::table2ndarray<Float>(q_, data, alloc::device);
 
     auto [sums, sums_event] = compute_sums(q_, data_nd);
-
+    sums_event.wait_and_throw();
     {
         ONEDAL_PROFILER_TASK(allreduce_sums, q_);
         comm_.allreduce(sums.flatten(q_, { sums_event }), spmd::reduce_op::sum).wait();
@@ -97,12 +97,13 @@ result_t train_kernel_cov_impl<Float>::operator()(const descriptor_t& desc, cons
     sycl::event means_event;
     if (desc.get_result_options().test(result_options::means)) {
         auto [means, means_event] = compute_means(q_, sums, rows_count_global, { gemm_event });
+        means_event.wait_and_throw();
         result.set_means(homogen_table::wrap(means.flatten(q_, { means_event }), 1, column_count));
     }
 
     auto [cov, cov_event] =
         compute_covariance(q_, rows_count_global, xtx, sums, bias, { gemm_event });
-
+    cov_event.wait_and_throw();
     auto [vars, vars_event] = compute_variances(q_, cov, { cov_event, means_event });
 
     if (desc.get_result_options().test(result_options::vars)) {
@@ -110,53 +111,57 @@ result_t train_kernel_cov_impl<Float>::operator()(const descriptor_t& desc, cons
             homogen_table::wrap(vars.flatten(q_, { vars_event }), 1, column_count));
     }
 
-    auto data_to_compute = cov;
+    auto eigenvectors = cov;
 
     sycl::event corr_event;
     if (desc.get_normalization_mode() == normalization::zscore) {
         auto corr = pr::ndarray<Float, 2>::empty(q_, { column_count, column_count }, alloc::device);
         corr_event =
             pr::correlation_from_covariance(q_, rows_count_global, cov, corr, bias, { cov_event });
-        data_to_compute = corr;
+        eigenvectors = corr;
+        corr_event.wait_and_throw();
     }
 
-    auto [eigvecs, eigvals] = compute_eigenvectors_on_host(q_,
-                                                           std::move(data_to_compute),
-                                                           component_count,
-                                                           { cov_event, corr_event, vars_event });
+    auto [eigvals, syevd_event] =
+        syevd_computation(q_, eigenvectors, { cov_event, corr_event, vars_event });
+
+    auto flipped_eigvals_host = flip_eigenvalues(q_, eigvals, component_count, { syevd_event });
 
     if (desc.get_result_options().test(result_options::eigenvalues)) {
-        result.set_eigenvalues(homogen_table::wrap(eigvals.flatten(), 1, component_count));
+        result.set_eigenvalues(
+            homogen_table::wrap(flipped_eigvals_host.flatten(), 1, component_count));
     }
 
+    auto flipped_eigenvectors_host =
+        flip_eigenvectors(q_, eigenvectors, component_count, { syevd_event });
+
     if (desc.get_result_options().test(result_options::singular_values)) {
-        auto singular_values =
-            compute_singular_values_on_host(q_,
-                                            eigvals,
-                                            rows_count_global,
-                                            { cov_event, corr_event, vars_event });
+        auto singular_values = compute_singular_values_on_host(q_,
+                                                               flipped_eigvals_host,
+                                                               rows_count_global,
+                                                               { syevd_event });
         result.set_singular_values(
             homogen_table::wrap(singular_values.flatten(), 1, component_count));
     }
 
     if (desc.get_result_options().test(result_options::explained_variances_ratio)) {
         auto vars_host = vars.to_host(q_);
-        auto explained_variances_ratio =
-            compute_explained_variances_on_host(q_,
-                                                eigvals,
-                                                vars_host,
-                                                { cov_event, corr_event, vars_event });
+        auto explained_variances_ratio = compute_explained_variances_on_host(q_,
+                                                                             flipped_eigvals_host,
+                                                                             vars_host,
+                                                                             { syevd_event });
         result.set_explained_variances_ratio(
             homogen_table::wrap(explained_variances_ratio.flatten(), 1, component_count));
     }
 
     if (desc.get_deterministic()) {
-        sign_flip(eigvecs);
+        sign_flip(flipped_eigenvectors_host);
     }
 
     if (desc.get_result_options().test(result_options::eigenvectors)) {
-        result.set_eigenvectors(
-            homogen_table::wrap(eigvecs.flatten(), component_count, column_count));
+        result.set_eigenvectors(homogen_table::wrap(flipped_eigenvectors_host.flatten(),
+                                                    flipped_eigenvectors_host.get_dimension(0),
+                                                    flipped_eigenvectors_host.get_dimension(1)));
     }
 
     return result;
diff --git a/cpp/oneapi/dal/algo/pca/backend/gpu/train_kernel_precomputed_impl_dpc.cpp b/cpp/oneapi/dal/algo/pca/backend/gpu/train_kernel_precomputed_impl_dpc.cpp
index 75970b945f9..a32ffb379a4 100644
--- a/cpp/oneapi/dal/algo/pca/backend/gpu/train_kernel_precomputed_impl_dpc.cpp
+++ b/cpp/oneapi/dal/algo/pca/backend/gpu/train_kernel_precomputed_impl_dpc.cpp
@@ -65,18 +65,25 @@ result_t train_kernel_precomputed_impl<Float>::operator()(const descriptor_t& de
     }
     if (desc.get_result_options().test(result_options::eigenvectors |
                                        result_options::eigenvalues)) {
-        auto [eigvecs, eigvals] =
-            compute_eigenvectors_on_host(q_, std::move(data_nd), component_count);
+        auto [eigvals, syevd_event] = syevd_computation(q_, data_nd, {});
+
+        auto flipped_eigvals_host = flip_eigenvalues(q_, eigvals, component_count, { syevd_event });
+
+        auto flipped_eigenvectors_host =
+            flip_eigenvectors(q_, data_nd, component_count, { syevd_event });
         if (desc.get_result_options().test(result_options::eigenvalues)) {
-            result.set_eigenvalues(homogen_table::wrap(eigvals.flatten(), 1, component_count));
+            result.set_eigenvalues(
+                homogen_table::wrap(flipped_eigvals_host.flatten(), 1, component_count));
         }
 
         if (desc.get_deterministic()) {
-            sign_flip(eigvecs);
+            sign_flip(flipped_eigenvectors_host);
         }
         if (desc.get_result_options().test(result_options::eigenvectors)) {
             result.set_eigenvectors(
-                homogen_table::wrap(eigvecs.flatten(), component_count, column_count));
+                homogen_table::wrap(flipped_eigenvectors_host.flatten(),
+                                    flipped_eigenvectors_host.get_dimension(0),
+                                    flipped_eigenvectors_host.get_dimension(1)));
         }
     }
 
diff --git a/cpp/oneapi/dal/algo/pca/backend/gpu/train_kernel_svd_impl_dpc.cpp b/cpp/oneapi/dal/algo/pca/backend/gpu/train_kernel_svd_impl_dpc.cpp
index d5e6d3f9fbd..87095c5f912 100644
--- a/cpp/oneapi/dal/algo/pca/backend/gpu/train_kernel_svd_impl_dpc.cpp
+++ b/cpp/oneapi/dal/algo/pca/backend/gpu/train_kernel_svd_impl_dpc.cpp
@@ -33,7 +33,7 @@ namespace oneapi::dal::pca::backend {
 
 namespace bk = dal::backend;
 namespace pr = dal::backend::primitives;
-namespace mkl = oneapi::fpk;
+namespace mkl = oneapi::mkl;
 using alloc = sycl::usm::alloc;
 
 using bk::context_gpu;
diff --git a/cpp/oneapi/dal/backend/micromkl/BUILD b/cpp/oneapi/dal/backend/micromkl/BUILD
deleted file mode 100644
index 52a5e4bd86b..00000000000
--- a/cpp/oneapi/dal/backend/micromkl/BUILD
+++ /dev/null
@@ -1,20 +0,0 @@
-package(default_visibility = ["//visibility:public"])
-load("@onedal//dev/bazel:dal.bzl",
-    "dal_module",
-    "dal_test_suite",
-)
-
-dal_module(
-    name = "micromkl",
-    auto = True,
-    dal_deps = [
-        "@onedal//cpp/oneapi/dal:common",
-    ],
-)
-
-dal_test_suite(
-    name = "tests",
-    framework = "catch2",
-    private = True,
-    dal_deps = [],
-)
diff --git a/cpp/oneapi/dal/backend/micromkl/macro.hpp b/cpp/oneapi/dal/backend/micromkl/macro.hpp
deleted file mode 100644
index 35b24d3e701..00000000000
--- a/cpp/oneapi/dal/backend/micromkl/macro.hpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-* Copyright contributors to the oneDAL project
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#pragma once
-
-#include <daal/include/services/daal_defines.h>
-
-#ifndef __MICROMKL_INCLUDE_GUARD__
-#error "This header cannot be included outside of micromkl module"
-#endif
-
-#define STRINGIFY(x)    #x
-#define DAL_EXPAND(...) __VA_ARGS__
-
-#ifdef ONEDAL_REF
-#define FUNC_NAME(prefix, name)          name
-#define FUNC_NAME_CPU(cpu, prefix, name) name
-#else
-#define FUNC_NAME(prefix, name)          prefix##_##name
-#define FUNC_NAME_CPU(cpu, prefix, name) prefix##_##cpu##_##name
-#endif
-
-#define DISPATCH_ID_NAME(cpu) oneapi::dal::backend::cpu_dispatch_##cpu
-
-#define FUNC_CPU_DECL(cpu, prefix, name, argdecl) \
-    extern "C" void FUNC_NAME_CPU(cpu, prefix, name) argdecl;
-
-#define DISPATCH_FUNC_DECL(prefix, name, arcdecl) \
-    template <typename Cpu>                       \
-    ONEDAL_FORCEINLINE void FUNC_NAME(prefix, name) arcdecl;
-
-#define DISPATCH_FUNC_CPU(nominal_cpu, actual_cpu, prefix, name, arcdecl, argcall)           \
-    template <>                                                                              \
-    ONEDAL_FORCEINLINE void FUNC_NAME(prefix, name)<DISPATCH_ID_NAME(nominal_cpu)> arcdecl { \
-        FUNC_NAME_CPU(actual_cpu, prefix, name) argcall;                                     \
-    }
-
-#define FUNC_CPU(nominal_cpu, actual_cpu, prefix, name, argdecl, argcall) \
-    FUNC_CPU_DECL(nominal_cpu, prefix, name, argdecl)                     \
-    DISPATCH_FUNC_CPU(nominal_cpu, actual_cpu, prefix, name, argdecl, argcall)
-
-#if defined(TARGET_X86_64)
-#define FUNC_AVX512(...) DAL_EXPAND(FUNC_CPU(avx512, avx512, __VA_ARGS__))
-#define FUNC_AVX2(...)   DAL_EXPAND(FUNC_CPU(avx2, avx2, __VA_ARGS__))
-#elif defined(TARGET_ARM)
-#define FUNC_A8SVE(...) DAL_EXPAND(FUNC_CPU(sve, sve, __VA_ARGS__))
-#elif defined(TARGET_RISCV64)
-#define FUNC_RV64(...) DAL_EXPAND(FUNC_CPU(rv64, rv64, __VA_ARGS__))
-#endif
-
-#ifdef __APPLE__
-#define FUNC_SSE42(...) DAL_EXPAND(FUNC_CPU(sse42, avx2, __VA_ARGS__))
-#define FUNC_SSE2(...)  DAL_EXPAND(FUNC_CPU(sse2, avx2, __VA_ARGS__))
-#else
-#define FUNC_SSE42(...) DAL_EXPAND(FUNC_CPU(sse42, sse42, __VA_ARGS__))
-#define FUNC_SSE2(...)  DAL_EXPAND(FUNC_CPU(sse2, sse2, __VA_ARGS__))
-#endif
-
-#if defined(TARGET_X86_64)
-#define FUNC(prefix, name, argdecl, argcall)    \
-    DISPATCH_FUNC_DECL(prefix, name, argdecl)   \
-    FUNC_AVX512(prefix, name, argdecl, argcall) \
-    FUNC_AVX2(prefix, name, argdecl, argcall)   \
-    FUNC_SSE42(prefix, name, argdecl, argcall)  \
-    FUNC_SSE2(prefix, name, argdecl, argcall)
-#elif defined(TARGET_ARM)
-#define FUNC(prefix, name, argdecl, argcall)  \
-    DISPATCH_FUNC_DECL(prefix, name, argdecl) \
-    FUNC_A8SVE(prefix, name, argdecl, argcall)
-#elif defined(TARGET_RISCV64)
-#define FUNC(prefix, name, argdecl, argcall)  \
-    DISPATCH_FUNC_DECL(prefix, name, argdecl) \
-    FUNC_RV64(prefix, name, argdecl, argcall)
-#endif
-
-#ifdef ONEDAL_REF
-#define FUNC_DECL(prefix, floatabr, name, argdecl, argcall) \
-    FUNC(prefix, floatabr##name##_, argdecl, argcall)
-
-#define FUNC_CALL(prefix, floatabr, name, cargcall) floatabr##name##_<Cpu> cargcall;
-#else
-#define FUNC_DECL(prefix, floatabr, name, argdecl, argcall) \
-    FUNC(prefix, floatabr##name, argdecl, argcall)
-
-#define FUNC_CALL(prefix, floatabr, name, cargcall) prefix##_##floatabr##name<Cpu> cargcall;
-#endif
-
-#define INSTANTIATE_CPU(cpu, name, Float, argdecl) \
-    template void name<DISPATCH_ID_NAME(cpu), Float> argdecl(Float);
-
-#ifdef ONEDAL_CPU_DISPATCH_A8SVE
-#define INSTANTIATE_A8SVE(...) DAL_EXPAND(INSTANTIATE_CPU(sve, __VA_ARGS__))
-#else
-#define INSTANTIATE_A8SVE(...)
-#endif
-
-#ifdef ONEDAL_CPU_DISPATCH_AVX512
-#define INSTANTIATE_AVX512(...) DAL_EXPAND(INSTANTIATE_CPU(avx512, __VA_ARGS__))
-#else
-#define INSTANTIATE_AVX512(...)
-#endif
-
-#ifdef ONEDAL_CPU_DISPATCH_AVX2
-#define INSTANTIATE_AVX2(...) DAL_EXPAND(INSTANTIATE_CPU(avx2, __VA_ARGS__))
-#else
-#define INSTANTIATE_AVX2(...)
-#endif
-
-#ifdef ONEDAL_CPU_DISPATCH_SSE42
-#define INSTANTIATE_SSE42(...) DAL_EXPAND(INSTANTIATE_CPU(sse42, __VA_ARGS__))
-#else
-#define INSTANTIATE_SSE42(...)
-#endif
-
-#ifdef ONEDAL_CPU_DISPATCH_RV64
-#define INSTANTIATE_RV64(...) DAL_EXPAND(INSTANTIATE_CPU(rv64, __VA_ARGS__))
-#else
-#define INSTANTIATE_RV64(...)
-#endif
-
-#define INSTANTIATE_SSE2(...) DAL_EXPAND(INSTANTIATE_CPU(sse2, __VA_ARGS__))
-
-#if defined(TARGET_X86_64)
-#define INSTANTIATE_FLOAT(name, Float, argdecl) \
-    INSTANTIATE_AVX512(name, Float, argdecl)    \
-    INSTANTIATE_AVX2(name, Float, argdecl)      \
-    INSTANTIATE_SSE42(name, Float, argdecl)     \
-    INSTANTIATE_SSE2(name, Float, argdecl)
-#elif defined(TARGET_ARM)
-#define INSTANTIATE_FLOAT(name, Float, argdecl) INSTANTIATE_A8SVE(name, Float, argdecl)
-#elif defined(TARGET_RISCV64)
-#define INSTANTIATE_FLOAT(name, Float, argdecl) INSTANTIATE_RV64(name, Float, argdecl)
-#endif
-
-#define FUNC_TEMPLATE(prefix, name, fargdecl, cargdecl, fargcall, cargcall) \
-    FUNC_DECL(prefix, s, name, fargdecl(float), fargcall)                   \
-    FUNC_DECL(prefix, d, name, fargdecl(double), fargcall)                  \
-                                                                            \
-    namespace oneapi::dal::backend::micromkl {                              \
-                                                                            \
-    template <typename Cpu, typename Float>                                 \
-    void name cargdecl(Float) {                                             \
-        static_assert(sizeof(std::int64_t) == sizeof(DAAL_INT));            \
-        if constexpr (std::is_same_v<Float, float>) {                       \
-            FUNC_CALL(prefix, s, name, cargcall)                            \
-        }                                                                   \
-        else {                                                              \
-            FUNC_CALL(prefix, d, name, cargcall)                            \
-        }                                                                   \
-    }                                                                       \
-                                                                            \
-    INSTANTIATE_FLOAT(name, float, cargdecl)                                \
-    INSTANTIATE_FLOAT(name, double, cargdecl)                               \
-    }
diff --git a/cpp/oneapi/dal/backend/micromkl/micromkl.cpp b/cpp/oneapi/dal/backend/micromkl/micromkl.cpp
deleted file mode 100644
index 442ae288e10..00000000000
--- a/cpp/oneapi/dal/backend/micromkl/micromkl.cpp
+++ /dev/null
@@ -1,80 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#include <daal/include/services/daal_defines.h>
-#include "oneapi/dal/backend/micromkl/micromkl.hpp"
-#include "oneapi/dal/backend/dispatcher.hpp"
-
-#define __MICROMKL_INCLUDE_GUARD__
-
-#include "oneapi/dal/backend/micromkl/macro.hpp"
-
-/* ================================== SYEVD ================================= */
-#define SYEVD_F_DECLARGS(Float) \
-    (const char* jobz,          \
-     const char* uplo,          \
-     const DAAL_INT* n,         \
-     Float* a,                  \
-     const DAAL_INT* lda,       \
-     Float* w,                  \
-     Float* work,               \
-     const DAAL_INT* lwork,     \
-     DAAL_INT* iwork,           \
-     const DAAL_INT* liwork,    \
-     DAAL_INT* info,            \
-     int ijobz,                 \
-     int iuplo)
-
-#define SYEVD_C_DECLARGS(Float) \
-    (char jobz,                 \
-     char uplo,                 \
-     std::int64_t n,            \
-     Float* a,                  \
-     std::int64_t lda,          \
-     Float* w,                  \
-     Float* work,               \
-     std::int64_t lwork,        \
-     std::int64_t* iwork,       \
-     std::int64_t liwork,       \
-     std::int64_t& info)
-
-#define SYEVD_F_CALLARGS (jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info, ijobz, iuplo)
-
-#define SYEVD_C_CALLARGS                   \
-    (&jobz,                                \
-     &uplo,                                \
-     reinterpret_cast<DAAL_INT*>(&n),      \
-     a,                                    \
-     reinterpret_cast<DAAL_INT*>(&lda),    \
-     w,                                    \
-     work,                                 \
-     reinterpret_cast<DAAL_INT*>(&lwork),  \
-     reinterpret_cast<DAAL_INT*>(iwork),   \
-     reinterpret_cast<DAAL_INT*>(&liwork), \
-     reinterpret_cast<DAAL_INT*>(&info),   \
-     1,                                    \
-     1)
-
-#ifdef ONEDAL_REF
-FUNC_TEMPLATE(unused, syevd, SYEVD_F_DECLARGS, SYEVD_C_DECLARGS, SYEVD_F_CALLARGS, SYEVD_C_CALLARGS)
-#else
-FUNC_TEMPLATE(fpk_lapack,
-              syevd,
-              SYEVD_F_DECLARGS,
-              SYEVD_C_DECLARGS,
-              SYEVD_F_CALLARGS,
-              SYEVD_C_CALLARGS)
-#endif
diff --git a/cpp/oneapi/dal/backend/primitives/blas/gemm_dpc.cpp b/cpp/oneapi/dal/backend/primitives/blas/gemm_dpc.cpp
index 5f00860293d..1cbb5512eb5 100644
--- a/cpp/oneapi/dal/backend/primitives/blas/gemm_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/blas/gemm_dpc.cpp
@@ -18,7 +18,7 @@
 #include "oneapi/dal/backend/primitives/blas/gemm.hpp"
 #include "oneapi/dal/backend/primitives/blas/misc.hpp"
 
-#include <mkl_dal_sycl.hpp>
+#include <oneapi/mkl.hpp>
 
 namespace oneapi::dal::backend::primitives {
 
diff --git a/cpp/oneapi/dal/backend/primitives/blas/gemv_dpc.cpp b/cpp/oneapi/dal/backend/primitives/blas/gemv_dpc.cpp
index a0cc31d8ff8..d13e51e1e00 100644
--- a/cpp/oneapi/dal/backend/primitives/blas/gemv_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/blas/gemv_dpc.cpp
@@ -18,7 +18,7 @@
 #include "oneapi/dal/backend/primitives/blas/gemv.hpp"
 #include "oneapi/dal/backend/primitives/blas/misc.hpp"
 
-#include <mkl_dal_sycl.hpp>
+#include <oneapi/mkl.hpp>
 
 namespace oneapi::dal::backend::primitives {
 
diff --git a/cpp/oneapi/dal/backend/primitives/blas/misc.hpp b/cpp/oneapi/dal/backend/primitives/blas/misc.hpp
index 518c59bdf50..64f9b70fece 100644
--- a/cpp/oneapi/dal/backend/primitives/blas/misc.hpp
+++ b/cpp/oneapi/dal/backend/primitives/blas/misc.hpp
@@ -18,12 +18,13 @@
 
 #include "oneapi/dal/backend/primitives/ndarray.hpp"
 
-#include <mkl_dal_sycl.hpp>
+#include <oneapi/mkl.hpp>
 
 namespace oneapi::dal::backend::primitives {
 
-namespace mkl = oneapi::fpk;
+namespace mkl = oneapi::mkl;
 
+#ifdef ONEDAL_DATA_PARALLEL
 /// Convert oneDAL `ndorder` to oneMKL `layout`
 inline constexpr mkl::layout order_as_layout(ndorder order) {
     return (order == ndorder::c) ? mkl::layout::R /* row-major */
@@ -55,5 +56,5 @@ inline constexpr mkl::uplo ident_uplo(mkl::uplo order) {
     constexpr auto lower = mkl::uplo::lower;
     return (order == upper) ? upper : lower;
 }
-
+#endif
 } // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/blas/syrk.hpp b/cpp/oneapi/dal/backend/primitives/blas/syrk.hpp
index 7bc219b4b41..c254eddaadd 100644
--- a/cpp/oneapi/dal/backend/primitives/blas/syrk.hpp
+++ b/cpp/oneapi/dal/backend/primitives/blas/syrk.hpp
@@ -23,7 +23,7 @@ namespace oneapi::dal::backend::primitives {
 
 #ifdef ONEDAL_DATA_PARALLEL
 
-namespace mkl = oneapi::fpk;
+namespace mkl = oneapi::mkl;
 
 template <mkl::uplo ul, typename Float, ndorder ao>
 sycl::event syrk(sycl::queue& queue,
diff --git a/cpp/oneapi/dal/backend/primitives/blas/syrk_dpc.cpp b/cpp/oneapi/dal/backend/primitives/blas/syrk_dpc.cpp
index 6c91531a2c0..dc883a3e77f 100644
--- a/cpp/oneapi/dal/backend/primitives/blas/syrk_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/blas/syrk_dpc.cpp
@@ -18,7 +18,7 @@
 #include "oneapi/dal/backend/primitives/blas/syrk.hpp"
 #include "oneapi/dal/backend/primitives/blas/misc.hpp"
 
-#include <mkl_dal_sycl.hpp>
+#include <oneapi/mkl.hpp>
 
 namespace oneapi::dal::backend::primitives {
 
diff --git a/cpp/oneapi/dal/backend/primitives/lapack.hpp b/cpp/oneapi/dal/backend/primitives/lapack.hpp
index 8c6fd87e4d9..e5ae59f2a74 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack.hpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "oneapi/dal/backend/primitives/lapack/eigen.hpp"
 #include "oneapi/dal/backend/primitives/lapack/solve.hpp"
 #include "oneapi/dal/backend/primitives/lapack/misc.hpp"
 #include "oneapi/dal/backend/primitives/lapack/gesvd.hpp"
+#include "oneapi/dal/backend/primitives/lapack/syevd.hpp"
diff --git a/cpp/oneapi/dal/backend/primitives/lapack/BUILD b/cpp/oneapi/dal/backend/primitives/lapack/BUILD
index 799117800f9..fced4d31462 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/BUILD
+++ b/cpp/oneapi/dal/backend/primitives/lapack/BUILD
@@ -8,7 +8,6 @@ dal_module(
     name = "lapack",
     auto = True,
     dal_deps = [
-        "@onedal//cpp/oneapi/dal/backend/micromkl",
         "@onedal//cpp/oneapi/dal/backend/primitives:blas",
         "@onedal//cpp/oneapi/dal/backend/primitives:common",
     ],
diff --git a/cpp/oneapi/dal/backend/primitives/lapack/eigen.cpp b/cpp/oneapi/dal/backend/primitives/lapack/eigen.cpp
deleted file mode 100644
index 433001b077a..00000000000
--- a/cpp/oneapi/dal/backend/primitives/lapack/eigen.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#include "oneapi/dal/backend/primitives/lapack/eigen.hpp"
-#include "oneapi/dal/backend/dispatcher.hpp"
-#include "oneapi/dal/backend/micromkl/micromkl.hpp"
-
-namespace oneapi::dal::backend::primitives {
-
-template <typename... Args>
-inline void syevd(Args&&... args) {
-    dispatch_by_cpu(context_cpu{}, [&](auto cpu) {
-        using dal::backend::micromkl::syevd;
-        syevd<decltype(cpu)>(std::forward<Args>(args)...);
-    });
-}
-
-template <typename Float>
-void sym_eigvals_impl(Float* a, std::int64_t n, std::int64_t lda, Float* w) {
-    ONEDAL_ASSERT(a);
-    ONEDAL_ASSERT(w);
-    ONEDAL_ASSERT(n > 0);
-    ONEDAL_ASSERT(lda >= n);
-
-    const std::int64_t lwork = 2 * n * n + 6 * n + 1;
-    const std::int64_t liwork = 5 * n + 3;
-
-    ONEDAL_ASSERT(lwork > n);
-    ONEDAL_ASSERT(liwork > n);
-
-    const auto work = ndarray<Float, 1>::empty(lwork);
-    const auto iwork = ndarray<std::int64_t, 1>::empty(liwork);
-
-    Float* work_ptr = work.get_mutable_data();
-    std::int64_t* iwork_ptr = iwork.get_mutable_data();
-
-    std::int64_t info;
-    syevd('V', 'U', n, a, lda, w, work_ptr, lwork, iwork_ptr, liwork, info);
-
-    if (info != 0) {
-        throw internal_error{ dal::detail::error_messages::failed_to_compute_eigenvectors() };
-    }
-}
-
-template <typename Float>
-void flip_eigvals_impl(Float* a,
-                       Float* w,
-                       std::int64_t n,
-                       std::int64_t lda,
-                       std::int64_t w_count,
-                       Float* a_flipped,
-                       std::int64_t lda_flipped,
-                       Float* w_flipped) {
-    dispatch_by_cpu(context_cpu{}, [&](auto cpu) {
-        flip_eigvals_impl_cpu<decltype(cpu)>(a,
-                                             w,
-                                             n,
-                                             lda,
-                                             w_count,
-                                             a_flipped,
-                                             lda_flipped,
-                                             w_flipped);
-    });
-}
-
-#define INSTANTIATE(F)                                                  \
-    template void sym_eigvals_impl(F*, std::int64_t, std::int64_t, F*); \
-    template void                                                       \
-    flip_eigvals_impl(F*, F*, std::int64_t, std::int64_t, std::int64_t, F*, std::int64_t, F*);
-
-INSTANTIATE(float)
-INSTANTIATE(double)
-
-} // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/lapack/eigen.hpp b/cpp/oneapi/dal/backend/primitives/lapack/eigen.hpp
deleted file mode 100644
index 3bef21dc882..00000000000
--- a/cpp/oneapi/dal/backend/primitives/lapack/eigen.hpp
+++ /dev/null
@@ -1,122 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#pragma once
-
-#include "oneapi/dal/backend/primitives/ndarray.hpp"
-
-namespace oneapi::dal::backend::primitives {
-
-/// Do not use this.
-template <typename Float>
-void sym_eigvals_impl(Float* a, std::int64_t n, std::int64_t lda, Float* w);
-
-/// Do not use this.
-template <typename Float>
-void flip_eigvals_impl(Float* a,
-                       Float* w,
-                       std::int64_t n,
-                       std::int64_t lda,
-                       std::int64_t w_count,
-                       Float* a_flipped,
-                       std::int64_t lda_flipped,
-                       Float* w_flipped);
-
-/// Do not use this.
-template <typename Cpu, typename Float>
-void flip_eigvals_impl_cpu(Float* a,
-                           Float* w,
-                           std::int64_t n,
-                           std::int64_t lda,
-                           std::int64_t w_count,
-                           Float* a_flipped,
-                           std::int64_t lda_flipped,
-                           Float* w_flipped);
-
-/// Computes eigenvectors and eigenvalues in-place.
-///
-/// @param[in, out] data_or_eigvecs The input parameter is interpreted as symmetric matrix of
-///                                 size [n x n]. The computed eigenvectors is written to that
-///                                 matrix. If `order == ndorder::c`, $i$-th row of the matrix
-///                                 contains $i$-th eigenvector. If `order == ndorder::f`, $i$-th
-///                                 column of the matrix contains $i$-th eigenvector.
-/// @param[out] eigvals             The output array of size [n] that stores computed eigenvalues.
-///                                 The eigenvalues are written in ascending order. $i$-th eigenvalue
-///                                 corrensponds to $i$-th eigenvector.
-template <typename Float, ndorder order>
-inline void sym_eigvals(ndview<Float, 2, order>& data_or_eigvecs, ndview<Float, 1>& eigvals) {
-    ONEDAL_ASSERT(data_or_eigvecs.get_dimension(0) == data_or_eigvecs.get_dimension(1),
-                  "Input matrix must be square");
-    ONEDAL_ASSERT(eigvals.get_dimension(0) >= data_or_eigvecs.get_dimension(0));
-    ONEDAL_ASSERT(data_or_eigvecs.has_mutable_data());
-    ONEDAL_ASSERT(eigvals.has_mutable_data());
-
-    sym_eigvals_impl(data_or_eigvecs.get_mutable_data(),
-                     data_or_eigvecs.get_dimension(0),
-                     data_or_eigvecs.get_leading_stride(),
-                     eigvals.get_mutable_data());
-}
-
-/// Computes eigenvectors and eigenvalues in-place. Eigenvectors and eigenvalues are written in
-/// descending order determined by eigenvalues. For more details, see `sym_eigvals`.
-template <typename Float, ndorder order>
-inline void sym_eigvals_descending(ndview<Float, 2, order>& data_or_eigvecs,
-                                   ndview<Float, 1>& eigvals) {
-    sym_eigvals(data_or_eigvecs, eigvals);
-    flip_eigvals_impl(data_or_eigvecs.get_mutable_data(),
-                      eigvals.get_mutable_data(),
-                      data_or_eigvecs.get_dimension(0),
-                      data_or_eigvecs.get_leading_stride(),
-                      data_or_eigvecs.get_dimension(0),
-                      data_or_eigvecs.get_mutable_data(),
-                      data_or_eigvecs.get_leading_stride(),
-                      eigvals.get_mutable_data());
-}
-
-/// Computes eigenvectors and eigenvalues in-place. `eigval_count` eigenvectors
-/// and eigenvalues are written in descending order determined by eigenvalues to
-/// `eigvecs` and `eigvals` arrays.
-///
-/// @param[in, out] data_or_scratchpad The input parameter is interpreted as symmetric matrix
-///                                    of size [n x n]. The memory is used as a storage for
-///                                    intermediate computations.
-/// @param[in] eigval_count            The number of eigenvalues and eigenvectors to store to
-///                                    the output buffers.
-/// @param[out] eigvecs                The output array of size [eigval_count x n] that stores
-///                                    eigenvectors. If `order == ndorder::c`, $i$-th row of the
-///                                    matrix contains $i$-th eigenvector. If `order == ndorder::f`,
-///                                    $i$-th column of the matrix contains $i$-th eigenvector.
-/// @param[out] eigvals                The output array of size [eigval_count] that stores computed
-///                                    eigenvalues. The eigenvalues are written in ascending order.
-///                                    $i$-th eigenvalue corrensponds to $i$-th eigenvector.
-template <typename Float, ndorder order>
-inline void sym_eigvals_descending(ndview<Float, 2, order>& data_or_scratchpad,
-                                   std::int64_t eigval_count,
-                                   ndview<Float, 2, order>& eigvecs,
-                                   ndview<Float, 1>& eigvals) {
-    auto eigvals_full = ndarray<Float, 1>::empty(data_or_scratchpad.get_dimension(0));
-    sym_eigvals(data_or_scratchpad, eigvals_full);
-    flip_eigvals_impl(data_or_scratchpad.get_mutable_data(),
-                      eigvals_full.get_mutable_data(),
-                      data_or_scratchpad.get_dimension(0),
-                      data_or_scratchpad.get_leading_stride(),
-                      eigval_count,
-                      eigvecs.get_mutable_data(),
-                      eigvecs.get_leading_stride(),
-                      eigvals.get_mutable_data());
-}
-
-} // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/lapack/eigen_cpu.cpp b/cpp/oneapi/dal/backend/primitives/lapack/eigen_cpu.cpp
deleted file mode 100644
index 7b0264c0c1f..00000000000
--- a/cpp/oneapi/dal/backend/primitives/lapack/eigen_cpu.cpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*******************************************************************************
-* Copyright 2021 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*     http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#include "oneapi/dal/backend/dispatcher.hpp"
-#include "oneapi/dal/backend/primitives/lapack/eigen.hpp"
-
-namespace oneapi::dal::backend::primitives {
-
-template <typename Cpu, typename Float>
-void flip_eigvals_impl_cpu(Float* a,
-                           Float* w,
-                           std::int64_t n,
-                           std::int64_t lda,
-                           std::int64_t w_count,
-                           Float* a_flipped,
-                           std::int64_t lda_flipped,
-                           Float* w_flipped) {
-    ONEDAL_ASSERT(a);
-    ONEDAL_ASSERT(w);
-    ONEDAL_ASSERT(a_flipped);
-    ONEDAL_ASSERT(w_flipped);
-    ONEDAL_ASSERT(n > 0);
-    ONEDAL_ASSERT(lda >= n);
-    ONEDAL_ASSERT(w_count > 0);
-    ONEDAL_ASSERT(w_count <= n);
-
-    if (a == a_flipped) {
-        ONEDAL_ASSERT(lda == lda_flipped);
-
-        for (std::int64_t i = 0; i < n / 2; i++) {
-            const std::int64_t src_i = i;
-            const std::int64_t dst_i = n - i - 1;
-            for (std::int64_t j = 0; j < n; j++) {
-                std::swap(a[src_i * lda + j], a[dst_i * lda + j]);
-            }
-        }
-    }
-    else {
-        PRAGMA_IVDEP
-        for (std::int64_t i = 0; i < w_count; i++) {
-            const std::int64_t src_i = n - i - 1;
-            const std::int64_t dst_i = i;
-            for (std::int64_t j = 0; j < n; j++) {
-                a_flipped[dst_i * lda_flipped + j] = a[src_i * lda + j];
-            }
-        }
-    }
-
-    if (w == w_flipped) {
-        ONEDAL_ASSERT(n == w_count);
-
-        for (std::int64_t i = 0; i < n / 2; i++) {
-            const std::int64_t src_i = i;
-            const std::int64_t dst_i = n - i - 1;
-            std::swap(w[src_i], w[dst_i]);
-        }
-    }
-    else {
-        PRAGMA_IVDEP
-        for (std::int64_t i = 0; i < w_count; i++) {
-            const std::int64_t src_i = n - i - 1;
-            const std::int64_t dst_i = i;
-            w_flipped[dst_i] = w[src_i];
-        }
-    }
-}
-
-#define INSTANTIATE(Cpu, Float)                                   \
-    template void flip_eigvals_impl_cpu<Cpu, Float>(Float*,       \
-                                                    Float*,       \
-                                                    std::int64_t, \
-                                                    std::int64_t, \
-                                                    std::int64_t, \
-                                                    Float*,       \
-                                                    std::int64_t, \
-                                                    Float*);
-
-INSTANTIATE(__CPU_TAG__, float)
-INSTANTIATE(__CPU_TAG__, double)
-
-} // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/lapack/gesvd.hpp b/cpp/oneapi/dal/backend/primitives/lapack/gesvd.hpp
index 0aba5f8edf9..311f2a5a3c2 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/gesvd.hpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/gesvd.hpp
@@ -24,7 +24,7 @@ namespace oneapi::dal::backend::primitives {
 
 #ifdef ONEDAL_DATA_PARALLEL
 
-namespace mkl = oneapi::fpk;
+namespace mkl = oneapi::mkl;
 
 template <mkl::jobsvd jobu, mkl::jobsvd jobvt, typename Float>
 sycl::event gesvd(sycl::queue& queue,
diff --git a/cpp/oneapi/dal/backend/primitives/lapack/gesvd_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/gesvd_dpc.cpp
index 8bb07ca2ed2..59b02c2191b 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/gesvd_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/gesvd_dpc.cpp
@@ -18,7 +18,7 @@
 #include "oneapi/dal/backend/primitives/lapack/gesvd.hpp"
 #include "oneapi/dal/backend/primitives/blas/misc.hpp"
 #include "oneapi/dal/backend/primitives/ndarray.hpp"
-#include <mkl_dal_sycl.hpp>
+#include <oneapi/mkl.hpp>
 
 namespace oneapi::dal::backend::primitives {
 
diff --git a/cpp/oneapi/dal/backend/primitives/lapack/misc.hpp b/cpp/oneapi/dal/backend/primitives/lapack/misc.hpp
index 62cf57fe6be..7893d7523b9 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/misc.hpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/misc.hpp
@@ -18,11 +18,11 @@
 
 #include "oneapi/dal/backend/primitives/ndarray.hpp"
 
-#include <mkl_dal_sycl.hpp>
+#include <oneapi/mkl.hpp>
 
 namespace oneapi::dal::backend::primitives {
 
-namespace mkl = oneapi::fpk;
+namespace mkl = oneapi::mkl;
 
 inline constexpr mkl::job ident_job(mkl::job order) {
     constexpr auto novec = mkl::job::novec;
diff --git a/cpp/oneapi/dal/backend/primitives/lapack/syevd.hpp b/cpp/oneapi/dal/backend/primitives/lapack/syevd.hpp
new file mode 100644
index 00000000000..dbed4c9f84c
--- /dev/null
+++ b/cpp/oneapi/dal/backend/primitives/lapack/syevd.hpp
@@ -0,0 +1,39 @@
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#pragma once
+
+#include "oneapi/dal/backend/primitives/ndarray.hpp"
+#include "oneapi/dal/backend/primitives/blas/misc.hpp"
+#include "oneapi/dal/backend/primitives/lapack/misc.hpp"
+
+namespace oneapi::dal::backend::primitives {
+
+#ifdef ONEDAL_DATA_PARALLEL
+
+namespace mkl = oneapi::mkl;
+
+template <mkl::job jobz, mkl::uplo uplo, typename Float>
+sycl::event syevd(sycl::queue& queue,
+                  std::int64_t column_count,
+                  ndview<Float, 2>& a,
+                  std::int64_t lda,
+                  ndview<Float, 1>& eigenvalues,
+                  const event_vector& deps = {});
+
+#endif
+
+} // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/lapack/syevd_dpc.cpp b/cpp/oneapi/dal/backend/primitives/lapack/syevd_dpc.cpp
new file mode 100644
index 00000000000..cbb3e06a779
--- /dev/null
+++ b/cpp/oneapi/dal/backend/primitives/lapack/syevd_dpc.cpp
@@ -0,0 +1,96 @@
+/*******************************************************************************
+* Copyright contributors to the oneDAL project
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "oneapi/dal/detail/profiler.hpp"
+#include "oneapi/dal/backend/primitives/lapack/syevd.hpp"
+#include "oneapi/dal/backend/primitives/blas/misc.hpp"
+#include "oneapi/dal/backend/primitives/ndarray.hpp"
+#include <oneapi/mkl.hpp>
+
+namespace oneapi::dal::backend::primitives {
+
+template <typename Float>
+static sycl::event syevd_wrapper(sycl::queue& queue,
+                                 mkl::job jobz,
+                                 mkl::uplo uplo,
+                                 std::int64_t column_count,
+                                 Float* data_ptr,
+                                 std::int64_t lda,
+                                 Float* eigenvalues,
+                                 Float* scratchpad,
+                                 std::int64_t scratchpad_size,
+                                 const event_vector& deps) {
+    ONEDAL_ASSERT(lda >= column_count);
+
+    return mkl::lapack::syevd(queue,
+                              jobz,
+                              uplo,
+                              column_count,
+                              data_ptr,
+                              lda,
+                              eigenvalues,
+                              scratchpad,
+                              scratchpad_size,
+                              deps);
+}
+
+template <mkl::job jobz, mkl::uplo uplo, typename Float>
+sycl::event syevd(sycl::queue& queue,
+                  std::int64_t column_count,
+                  ndview<Float, 2>& a,
+                  std::int64_t lda,
+                  ndview<Float, 1>& eigenvalues,
+                  const event_vector& deps) {
+    constexpr auto job = ident_job(jobz);
+    constexpr auto ul = ident_uplo(uplo);
+
+    const auto scratchpad_size =
+        mkl::lapack::syevd_scratchpad_size<Float>(queue, jobz, uplo, column_count, lda);
+    auto scratchpad =
+        ndarray<Float, 1>::empty(queue, { scratchpad_size }, sycl::usm::alloc::device);
+
+    return syevd_wrapper(queue,
+                         job,
+                         ul,
+                         column_count,
+                         a.get_mutable_data(),
+                         lda,
+                         eigenvalues.get_mutable_data(),
+                         scratchpad.get_mutable_data(),
+                         scratchpad_size,
+                         deps);
+}
+
+#define INSTANTIATE(jobz, uplo, F)                                               \
+    template ONEDAL_EXPORT sycl::event syevd<jobz, uplo, F>(sycl::queue & queue, \
+                                                            std::int64_t n,      \
+                                                            ndview<F, 2> & a,    \
+                                                            std::int64_t lda,    \
+                                                            ndview<F, 1> & w,    \
+                                                            const event_vector& deps);
+
+#define INSTANTIATE_FLOAT(jobz, uplo) \
+    INSTANTIATE(jobz, uplo, float)    \
+    INSTANTIATE(jobz, uplo, double)
+
+#define INSTANTIATE_JOB(uplo)                \
+    INSTANTIATE_FLOAT(mkl::job::novec, uplo) \
+    INSTANTIATE_FLOAT(mkl::job::vec, uplo)
+
+INSTANTIATE_JOB(mkl::uplo::upper)
+INSTANTIATE_JOB(mkl::uplo::lower)
+
+} // namespace oneapi::dal::backend::primitives
diff --git a/cpp/oneapi/dal/backend/primitives/lapack/test/eigen.cpp b/cpp/oneapi/dal/backend/primitives/lapack/test/syevd_dpc.cpp
similarity index 59%
rename from cpp/oneapi/dal/backend/primitives/lapack/test/eigen.cpp
rename to cpp/oneapi/dal/backend/primitives/lapack/test/syevd_dpc.cpp
index 6a6a0b44bbc..56484014a81 100644
--- a/cpp/oneapi/dal/backend/primitives/lapack/test/eigen.cpp
+++ b/cpp/oneapi/dal/backend/primitives/lapack/test/syevd_dpc.cpp
@@ -14,11 +14,11 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "oneapi/dal/backend/primitives/lapack/eigen.hpp"
+#include "oneapi/dal/backend/primitives/lapack/syevd.hpp"
 
 #include "oneapi/dal/test/engine/common.hpp"
 #include "oneapi/dal/test/engine/math.hpp"
-#include "oneapi/dal/test/engine/io.hpp"
+#include "oneapi/dal/test/engine/fixtures.hpp"
 
 namespace oneapi::dal::backend::primitives::test {
 
@@ -26,8 +26,9 @@ namespace te = dal::test::engine;
 namespace la = te::linalg;
 
 template <typename Float>
-class sym_eigvals_test {
+class syevd_test : public te::float_algo_fixture<Float> {
 public:
+    using float_t = Float;
     std::int64_t generate_dim() const {
         return GENERATE(3, 28, 125, 256);
     }
@@ -47,23 +48,6 @@ class sym_eigvals_test {
         return call_sym_eigvals_inplace_generic(symmetric_matrix, is_ascending);
     }
 
-    auto call_sym_eigvals_descending(const la::matrix<Float>& symmetric_matrix,
-                                     std::int64_t eigval_count) {
-        ONEDAL_ASSERT(symmetric_matrix.get_row_count() == symmetric_matrix.get_column_count());
-
-        const std::int64_t dim = symmetric_matrix.get_row_count();
-        const auto s_copy_flat = symmetric_matrix.copy().get_array();
-
-        auto data_or_scratchpad_nd = ndarray<Float, 2>::wrap_mutable(s_copy_flat, { dim, dim });
-        auto eigvecs_nd = ndarray<Float, 2>::empty({ eigval_count, dim });
-        auto eigvals_nd = ndarray<Float, 1>::empty(eigval_count);
-        sym_eigvals_descending(data_or_scratchpad_nd, eigval_count, eigvecs_nd, eigvals_nd);
-
-        const auto eigvecs = la::matrix<Float>::wrap_nd(eigvecs_nd);
-        const auto eigvals = la::matrix<Float>::wrap_nd(eigvals_nd);
-        return std::make_tuple(eigvecs, eigvals);
-    }
-
     auto call_sym_eigvals_inplace_generic(const la::matrix<Float>& symmetric_matrix,
                                           bool is_ascending) {
         ONEDAL_ASSERT(symmetric_matrix.get_row_count() == symmetric_matrix.get_column_count());
@@ -72,17 +56,51 @@ class sym_eigvals_test {
         const auto s_copy_flat = symmetric_matrix.copy().get_array();
 
         auto data_or_eigenvectors_nd = ndarray<Float, 2>::wrap_mutable(s_copy_flat, { dim, dim });
-        auto eigenvalues_nd = ndarray<Float, 1>::empty(dim);
+        data_or_eigenvectors_nd.to_device(this->get_queue());
+        auto eigenvalues_nd =
+            ndarray<Float, 1>::empty(this->get_queue(), { dim }, sycl::usm::alloc::device);
         if (is_ascending) {
-            sym_eigvals(data_or_eigenvectors_nd, eigenvalues_nd);
+            auto syevd_event = syevd<mkl::job::vec, mkl::uplo::upper>(this->get_queue(),
+                                                                      dim,
+                                                                      data_or_eigenvectors_nd,
+                                                                      dim,
+                                                                      eigenvalues_nd,
+                                                                      {});
+            syevd_event.wait_and_throw();
+            const auto eigenvectors =
+                la::matrix<Float>::wrap_nd(data_or_eigenvectors_nd.to_host(this->get_queue()));
+            const auto eigenvalues =
+                la::matrix<Float>::wrap_nd(eigenvalues_nd.to_host(this->get_queue()));
+            return std::make_tuple(eigenvectors, eigenvalues);
         }
         else {
-            sym_eigvals_descending(data_or_eigenvectors_nd, eigenvalues_nd);
+            auto syevd_event = syevd<mkl::job::vec, mkl::uplo::upper>(this->get_queue(),
+                                                                      dim,
+                                                                      data_or_eigenvectors_nd,
+                                                                      dim,
+                                                                      eigenvalues_nd,
+                                                                      {});
+            syevd_event.wait_and_throw();
+
+            auto data_ptr = eigenvalues_nd.get_data();
+            auto flipped_eigenvalues =
+                ndarray<Float, 1>::empty(this->get_queue(), { dim }, sycl::usm::alloc::device);
+            auto flipped_eigenvalues_ptr = flipped_eigenvalues.get_mutable_data();
+            auto queue = this->get_queue();
+            auto flip_event = queue.submit([&](sycl::handler& h) {
+                const auto range = make_range_1d(dim);
+                h.depends_on({ syevd_event });
+                h.parallel_for(range, [=](sycl::id<1> id) {
+                    const std::int64_t col = id[0];
+                    flipped_eigenvalues_ptr[col] = data_ptr[(dim - 1) - col];
+                });
+            });
+            const auto eigenvectors =
+                la::matrix<Float>::wrap_nd(data_or_eigenvectors_nd.to_host(this->get_queue()));
+            const auto eigenvalues =
+                la::matrix<Float>::wrap_nd(flipped_eigenvalues.to_host(this->get_queue()));
+            return std::make_tuple(eigenvectors, eigenvalues);
         }
-
-        const auto eigenvectors = la::matrix<Float>::wrap_nd(data_or_eigenvectors_nd);
-        const auto eigenvalues = la::matrix<Float>::wrap_nd(eigenvalues_nd);
-        return std::make_tuple(eigenvectors, eigenvalues);
     }
 
     void check_eigvals_definition(const la::matrix<Float>& s,
@@ -132,38 +150,21 @@ class sym_eigvals_test {
     static constexpr int seed_ = 7777;
 };
 
-using eigen_types = COMBINE_TYPES((float, double));
-
-#define SYM_EIGVALS_TEST(name) \
-    TEMPLATE_LIST_TEST_M(sym_eigvals_test, name, "[sym_eigvals]", eigen_types)
+using eigen_types = COMBINE_TYPES((float));
 
-SYM_EIGVALS_TEST("check inplace sym_eigvals on symmetric positive-definite matrix") {
+TEMPLATE_LIST_TEST_M(syevd_test, "test syevd with pos def matrix", "[sym_eigvals]", eigen_types) {
     const auto s = this->generate_symmetric_positive();
-
     const auto [eigenvectors, eigenvalues] = this->call_sym_eigvals_inplace(s);
 
     this->check_eigvals_definition(s, eigenvectors, eigenvalues);
     this->check_eigvals_are_ascending(eigenvalues);
 }
 
-SYM_EIGVALS_TEST("check inplace sym_eigvals_descending on symmetric positive-definite matrix") {
+TEMPLATE_LIST_TEST_M(syevd_test, "test syevd with pos def matrix 2", "[sym_eigvals]", eigen_types) {
     const auto s = this->generate_symmetric_positive();
 
     const auto [eigenvectors, eigenvalues] = this->call_sym_eigvals_inplace_descending(s);
 
-    this->check_eigvals_definition(s, eigenvectors, eigenvalues);
-    this->check_eigvals_are_descending(eigenvalues);
-}
-
-SYM_EIGVALS_TEST("check sym_eigvals_descending on symmetric positive-definite matrix") {
-    const auto s = this->generate_symmetric_positive();
-    const std::int64_t eigvals_count = GENERATE_COPY(1, s.get_row_count() / 2, s.get_row_count());
-
-    const auto [eigenvectors, eigenvalues] = this->call_sym_eigvals_descending(s, eigvals_count);
-
-    REQUIRE(eigenvectors.get_row_count() == eigvals_count);
-    REQUIRE(eigenvalues.get_count() == eigvals_count);
-    this->check_eigvals_definition(s, eigenvectors, eigenvalues);
     this->check_eigvals_are_descending(eigenvalues);
 }
 
diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/logloss_dpc.cpp b/cpp/oneapi/dal/backend/primitives/objective_function/logloss_dpc.cpp
index e3dce105dbc..6e9d726e998 100644
--- a/cpp/oneapi/dal/backend/primitives/objective_function/logloss_dpc.cpp
+++ b/cpp/oneapi/dal/backend/primitives/objective_function/logloss_dpc.cpp
@@ -44,7 +44,8 @@ sycl::event compute_probabilities(sycl::queue& q,
     auto fill_event = fill<Float>(q, probabilities, Float(1), deps);
     using oneapi::dal::backend::operator+;
 
-    Float w0 = fit_intercept ? parameters.get_slice(0, 1).at_device(q, 0l) : 0; // Poor perfomance
+    Float w0 =
+        fit_intercept ? parameters.get_slice(0, 1).at_device(q, 0l, deps) : 0; // Poor perfomance
     ndview<Float, 1> param_suf = fit_intercept ? parameters.get_slice(1, p + 1) : parameters;
 
     sycl::event gemv_event;
@@ -87,7 +88,8 @@ sycl::event compute_probabilities_sparse(sycl::queue& q,
     const std::int64_t p = parameters.get_dimension(0) - (fit_intercept ? 1 : 0);
 
     auto fill_event = fill<Float>(q, probabilities, Float(1), deps);
-    Float w0 = fit_intercept ? parameters.get_slice(0, 1).at_device(q, 0l) : 0; // Poor perfomance
+    Float w0 =
+        fit_intercept ? parameters.get_slice(0, 1).at_device(q, 0l, deps) : 0; // Poor perfomance
     ndview<Float, 1> param_suf = fit_intercept ? parameters.get_slice(1, p + 1) : parameters;
 
     sycl::event gemv_event;
diff --git a/cpp/oneapi/dal/backend/primitives/sparse_blas/misc.hpp b/cpp/oneapi/dal/backend/primitives/sparse_blas/misc.hpp
index 8a475db7cb1..8fe574a36be 100644
--- a/cpp/oneapi/dal/backend/primitives/sparse_blas/misc.hpp
+++ b/cpp/oneapi/dal/backend/primitives/sparse_blas/misc.hpp
@@ -18,11 +18,11 @@
 
 #include "oneapi/dal/table/common.hpp"
 
-#include <mkl_dal_sycl.hpp>
+#include <oneapi/mkl.hpp>
 
 namespace oneapi::dal::backend::primitives {
 
-namespace mkl = oneapi::fpk;
+namespace mkl = oneapi::mkl;
 
 /// Convert oneDAL `sparse_indexing` to oneMKL `index_base`
 inline constexpr mkl::index_base sparse_indexing_to_mkl(const sparse_indexing indexing) {
diff --git a/cpp/oneapi/dal/detail/sparse_matrix_handle_impl.hpp b/cpp/oneapi/dal/detail/sparse_matrix_handle_impl.hpp
index fb340e311a6..9f382b3dc2a 100644
--- a/cpp/oneapi/dal/detail/sparse_matrix_handle_impl.hpp
+++ b/cpp/oneapi/dal/detail/sparse_matrix_handle_impl.hpp
@@ -18,13 +18,13 @@
 
 #ifdef ONEDAL_DATA_PARALLEL
 
-#include <mkl_dal_sycl.hpp>
+#include <oneapi/mkl.hpp>
 
 namespace oneapi::dal::detail {
 
 namespace v1 {
 
-namespace mkl = oneapi::fpk;
+namespace mkl = oneapi::mkl;
 
 /// Class that hides the implementation details of the `backend::primitives::sparse_matrix_handle` class
 class sparse_matrix_handle_impl {
diff --git a/cpp/oneapi/dal/test/engine/mkl/BUILD b/cpp/oneapi/dal/test/engine/mkl/BUILD
index 6bf0e21fcde..92ead4e03e0 100644
--- a/cpp/oneapi/dal/test/engine/mkl/BUILD
+++ b/cpp/oneapi/dal/test/engine/mkl/BUILD
@@ -13,7 +13,7 @@ dal_test_module(
     extra_deps = [{
         "@config//:backend_ref": [ "@openblas//:openblas",
                                     ],
-        "//conditions:default": [ "@mkl//:mkl_seq", 
+        "//conditions:default": [ "@mkl//:mkl_thr", 
                                 ],
         }],
 )
diff --git a/dev/bazel/deps/micromkl.bzl b/dev/bazel/deps/micromkl.bzl
deleted file mode 100644
index e06ce773cf5..00000000000
--- a/dev/bazel/deps/micromkl.bzl
+++ /dev/null
@@ -1,67 +0,0 @@
-#===============================================================================
-# Copyright 2020 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#===============================================================================
-
-load("@onedal//dev/bazel:repos.bzl", "repos")
-
-micromkl_repo = repos.prebuilt_libs_repo_rule(
-    includes = [
-        "include",
-        "%{os}/include",
-    ],
-    libs = [
-        "%{os}/lib/libdaal_mkl_thread.a",
-        "%{os}/lib/libdaal_vmlipp_core.a",
-    ],
-    build_template = "@onedal//dev/bazel/deps:micromkl.tpl.BUILD",
-    download_mapping = {
-    # Required directory layout and layout in the downloaded
-    # archives may be different. Mapping helps to setup relations
-    # between required layout (LHS) and downloaded (RHS).
-    # In this case, files from `lib/*` will be copied to `lib/intel64/*`.
-    "lib/": "lib/intel64/",
-    },
-    local_mapping = {
-    # Required directory layout and layout in the downloaded
-    # archives may be different. Mapping helps to setup relations
-    # between required layout (LHS) and downloaded (RHS).
-    # In this case, files from `lib/*` will be copied to `lib/intel64/*`.
-    "lib/": "lib/intel64/",
-    },
-)
-
-micromkl_dpc_repo = repos.prebuilt_libs_repo_rule(
-    includes = [
-        "include",
-    ],
-    libs = [
-        "lib/libdaal_sycl.a",
-    ],
-    build_template = "@onedal//dev/bazel/deps:micromkldpc.tpl.BUILD",
-    download_mapping = {
-    # Required directory layout and layout in the downloaded
-    # archives may be different. Mapping helps to setup relations
-    # between required layout (LHS) and downloaded (RHS).
-    # In this case, files from `lib/*` will be copied to `lib/intel64/*`.
-    "lib/": "lib/intel64/",
-    },
-    local_mapping = {
-    # Required directory layout and layout in the downloaded
-    # archives may be different. Mapping helps to setup relations
-    # between required layout (LHS) and downloaded (RHS).
-    # In this case, files from `lib/*` will be copied to `lib/intel64/*`.
-    "lib/": "lib/intel64/",
-    },
-)
diff --git a/dev/bazel/deps/micromkl.tpl.BUILD b/dev/bazel/deps/micromkl.tpl.BUILD
deleted file mode 100644
index eef6d53297f..00000000000
--- a/dev/bazel/deps/micromkl.tpl.BUILD
+++ /dev/null
@@ -1,27 +0,0 @@
-package(default_visibility = ["//visibility:public"])
-
-cc_library(
-    name = "headers",
-    hdrs = glob(["include/*.h", "%{os}/include/*.h"]),
-    includes = [ "include", "%{os}/include" ],
-)
-
-cc_library(
-    name = "vml_ipp",
-    srcs = [
-        "%{os}/lib/libdaal_vmlipp_core.a",
-    ],
-    deps = [
-        ":headers",
-    ],
-)
-
-cc_library(
-    name = "mkl_thr",
-    srcs = [
-        "%{os}/lib/libdaal_mkl_thread.a",
-    ],
-    deps = [
-        ":headers",
-    ],
-)
diff --git a/dev/bazel/deps/micromkldpc.tpl.BUILD b/dev/bazel/deps/micromkldpc.tpl.BUILD
deleted file mode 100644
index 844e1150264..00000000000
--- a/dev/bazel/deps/micromkldpc.tpl.BUILD
+++ /dev/null
@@ -1,18 +0,0 @@
-package(default_visibility = ["//visibility:public"])
-
-cc_library(
-    name = "headers",
-    hdrs = glob(["include/*.h", "include/*.hpp"]),
-    includes = [ "include" ],
-)
-
-cc_library(
-    name = "mkl_dpc",
-    srcs = [
-        "lib/libdaal_sycl.a",
-    ],
-    deps = [
-        ":headers",
-        "@opencl//:opencl_binary",
-    ],
-)
diff --git a/dev/bazel/deps/mkl.bzl b/dev/bazel/deps/mkl.bzl
index e5e0cf15a1f..8fda1bbfa34 100644
--- a/dev/bazel/deps/mkl.bzl
+++ b/dev/bazel/deps/mkl.bzl
@@ -22,8 +22,9 @@ mkl_repo = repos.prebuilt_libs_repo_rule(
     ],
     libs = [
         "lib/libmkl_core.a",
-        "lib/libmkl_sequential.a",
         "lib/libmkl_intel_ilp64.a",
+        "lib/libmkl_tbb_thread.a",
+        "lib/libmkl_sycl.a",
     ],
     build_template = "@onedal//dev/bazel/deps:mkl.tpl.BUILD",
     download_mapping = {
@@ -33,11 +34,4 @@ mkl_repo = repos.prebuilt_libs_repo_rule(
     # In this case, files from `lib/*` will be copied to `lib/intel64/*`.
     "lib/intel64": "lib/",
     },
-    local_mapping = {
-    # Required directory layout and layout in the downloaded
-    # archives may be different. Mapping helps to setup relations
-    # between required layout (LHS) and downloaded (RHS).
-    # In this case, files from `lib/*` will be copied to `lib/intel64/*`.
-    "lib/": "lib/intel64/",
-    },
 )
diff --git a/dev/bazel/deps/mkl.tpl.BUILD b/dev/bazel/deps/mkl.tpl.BUILD
index 0d744544d4e..c179f5dfd41 100644
--- a/dev/bazel/deps/mkl.tpl.BUILD
+++ b/dev/bazel/deps/mkl.tpl.BUILD
@@ -2,8 +2,13 @@ package(default_visibility = ["//visibility:public"])
 
 cc_library(
     name = "headers",
-    hdrs = glob(["include/**/*.h"]),
-    includes = [ "include" ],
+    hdrs = glob([
+        "include/**/*.h",
+        "include/**/*.hpp",
+    ]),
+    includes = [
+        "include",
+    ],
     defines = [
         "MKL_ILP64"
     ],
@@ -13,38 +18,34 @@ cc_library(
     name = "mkl_core",
     srcs = [
         "lib/libmkl_core.a",
+        "lib/libmkl_intel_ilp64.a",
+        "lib/libmkl_tbb_thread.a",
     ],
     linkopts = [
         "-lpthread",
     ],
-)
-
-cc_library(
-    name = "mkl_intel_ilp64",
-    srcs = [
-        "lib/libmkl_intel_ilp64.a",
-    ],
     deps = [
-        ":mkl_core",
+        ":headers",
     ]
 )
 
 cc_library(
-    name = "libmkl_sequential",
-    srcs = [
-        "lib/libmkl_sequential.a",
+    name = "mkl_thr",
+    linkopts = [
+        "-lpthread",
     ],
     deps = [
+        ":headers",
         ":mkl_core",
     ]
 )
 
 cc_library(
-    name = "mkl_seq",
+    name = "mkl_dpc",
+    srcs = [
+        "lib/libmkl_sycl.a",
+    ],
     deps = [
         ":headers",
-        ":mkl_core",
-        ":mkl_intel_ilp64",
-        ":libmkl_sequential",
     ],
 )
diff --git a/dev/bazel/toolchains/cc_toolchain_lnx.bzl b/dev/bazel/toolchains/cc_toolchain_lnx.bzl
index 4dd36108503..e9c5b631be6 100644
--- a/dev/bazel/toolchains/cc_toolchain_lnx.bzl
+++ b/dev/bazel/toolchains/cc_toolchain_lnx.bzl
@@ -128,7 +128,8 @@ def _preapre_builtin_include_directory_paths(repo_ctx, tools):
             tools.dpcc,
             "-xc++",
             get_no_canonical_prefixes_opt(repo_ctx, tools.dpcc) +
-            _add_gcc_toolchain_if_needed(repo_ctx, tools.dpcc),
+            _add_gcc_toolchain_if_needed(repo_ctx, tools.dpcc) +
+            _add_sycl_linkage(repo_ctx, tools.dpcc) if tools.is_dpc_found else [],
         ) +
         required_tmp_includes,
     )
@@ -154,6 +155,12 @@ def _add_gcc_toolchain_if_needed(repo_ctx, cc):
     else:
         return []
 
+def _add_sycl_linkage(repo_ctx, cc):
+    if ("icx" in cc) or ("icpx" in cc):
+        return ["-fsycl"]
+    else:
+        return []
+
 def configure_cc_toolchain_lnx(repo_ctx, reqs):
     if reqs.os_id != "lnx":
         auto_configure_fail("Cannot configure Linux toolchain for '{}'".format(reqs.os_id))
diff --git a/dev/docker/README.md b/dev/docker/README.md
index df4ba32bd63..0b7438d20c0 100644
--- a/dev/docker/README.md
+++ b/dev/docker/README.md
@@ -16,13 +16,18 @@
 *******************************************************************************/-->
 
 # Docker Development Environment
+
 ## How To Use
+
 There is a simple docker dev environment for the oneDAL development and build process.
 It includes dependencies for building all oneDAL components with ``make`` and ``bazel``
 
-Note: The docker setup assumes that it is executed from the oneDAL repo and copies repo files inside the container
+Note: The docker setup assumes that it is executed from the oneDAL repo and copies repo files inside the container. In order to build the container locally from the root of the `oneDAL` repository, execute the following:
+```shell
+docker build -t onedal-dev -f dev/docker/onedal-dev.Dockerfile .
+```
 
-For that, run:
-   ```sh
-   docker run -it onedal-dev /bin/bash
-   ```
+Then, in order to use the container interactively, run:
+```shell
+docker run -it onedal-dev /bin/bash
+```
diff --git a/dev/docker/onedal-dev.Dockerfile b/dev/docker/onedal-dev.Dockerfile
index fb6c02394cc..224adb4dff9 100644
--- a/dev/docker/onedal-dev.Dockerfile
+++ b/dev/docker/onedal-dev.Dockerfile
@@ -17,43 +17,42 @@
 FROM ubuntu:22.04@sha256:adbb90115a21969d2fe6fa7f9af4253e16d45f8d4c1e930182610c4731962658
 
 ARG workdirectory="/sources/oneDAL"
-
-COPY . ${workdirectory}
-
 WORKDIR ${workdirectory}
 
 #Env setup
 RUN apt-get update && \
-      apt-get -y install sudo wget gnupg git make python3-setuptools doxygen
+      apt-get -y install sudo wget gnupg git make python3-setuptools doxygen software-properties-common
 
 # Install miniconda
-ENV CONDA_DIR /opt/conda
-RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
-    /bin/bash ~/miniconda.sh -b -p /opt/conda
+ENV CONDA_DIR=/opt/conda
+RUN wget --quiet \
+    "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" && \
+    bash Miniforge3* -b -p /opt/conda
 
 # Put conda in path to use conda activate
 ENV PATH=$CONDA_DIR/bin:$PATH
 
+# Installing environment for bazel
+RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.18.0/bazelisk-linux-amd64 && \
+    chmod 755 bazelisk-linux-amd64 && \
+    mv bazelisk-linux-amd64 /usr/bin/bazel
+
+COPY . ${workdirectory}
+
 # Installing environment for base development dependencies
 RUN .ci/env/apt.sh dev-base
 
 # Installing environment for DPCPP development dependencies
 RUN .ci/env/apt.sh dpcpp
 
+# Installing environment for MKL development dependencies
+RUN .ci/env/apt.sh mkl
+
 # Installing environment for clang-format
 RUN .ci/env/apt.sh clang-format
 
-# Installing environment for bazel
-RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.18.0/bazelisk-linux-amd64 && \
-    chmod 755 bazelisk-linux-amd64 && \
-    mv bazelisk-linux-amd64 /usr/bin/bazel
-
 # Installing openBLAS dependency
 RUN .ci/env/openblas.sh
 
-# Installing MKL dependency
-RUN ./dev/download_micromkl.sh
-
 # Installing oneTBB dependency
 RUN ./dev/download_tbb.sh
-
diff --git a/dev/download_micromkl.bat b/dev/download_micromkl.bat
deleted file mode 100755
index a38515735a5..00000000000
--- a/dev/download_micromkl.bat
+++ /dev/null
@@ -1,77 +0,0 @@
-@echo off
-rem ============================================================================
-rem Copyright 2018 Intel Corporation
-rem
-rem Licensed under the Apache License, Version 2.0 (the "License");
-rem you may not use this file except in compliance with the License.
-rem You may obtain a copy of the License at
-rem
-rem     http://www.apache.org/licenses/LICENSE-2.0
-rem
-rem Unless required by applicable law or agreed to in writing, software
-rem distributed under the License is distributed on an "AS IS" BASIS,
-rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-rem See the License for the specific language governing permissions and
-rem limitations under the License.
-rem ============================================================================
-
-rem req: PowerShell 3.0+
-powershell.exe -command "if ($PSVersionTable.PSVersion.Major -ge 3) {exit 1} else {Write-Host \"The script requires PowerShell 3.0 or above (current version: $($PSVersionTable.PSVersion.Major).$($PSVersionTable.PSVersion.Minor))\"}" && goto Error_load
-
-set MKLURLROOT=https://github.com/oneapi-src/oneDAL/releases/download/Dependencies/
-set MKLVERSION=20230413
-set MKLGPUVERSION=20240605
-
-set MKLPACKAGE=mklfpk_win_%MKLVERSION%
-set MKLGPUPACKAGE=mklgpufpk_win_%MKLGPUVERSION%
-
-set MKLURL=%MKLURLROOT%%MKLPACKAGE%.zip
-set MKLGPUURL=%MKLURLROOT%%MKLGPUPACKAGE%.zip
-if /i "%1"=="" (
-    set CPUCOND=%~dp0..\__deps\mklfpk
-    set GPUCOND=%~dp0..\__deps\mklgpufpk
-) else (
-    set CPUCOND=%1\..\__deps\mklfpk
-    set GPUCOND=%1\..\__deps\mklgpufpk
-)
-
-set CPUDST=%CPUCOND%
-set GPUDST="%GPUCOND%\win"
-
-CALL :Download_FPK %CPUDST% , %CPUCOND% , %MKLURL% , %MKLPACKAGE%
-CALL :Download_FPK %GPUDST% , %GPUCOND% , %MKLGPUURL% , %MKLGPUPACKAGE%
-
-exit /B 0
-
-:Download_FPK
-
-set DST=%~1
-set CONDITION=%~2
-set SRC=%~3
-set FILENAME=%~4
-
-if not exist %DST% mkdir %DST%
-
-
-if not exist "%CONDITION%\win\lib" (
-
-    powershell.exe -command "(New-Object System.Net.WebClient).DownloadFile('%SRC%', '%DST%\%FILENAME%.zip')" && goto Unpack || goto Error_load
-
-:Unpack
-    powershell.exe -command "if (Get-Command Add-Type -errorAction SilentlyContinue) {Add-Type -Assembly \"System.IO.Compression.FileSystem\"; try { [IO.Compression.zipfile]::ExtractToDirectory(\"%DST%\%FILENAME%.zip\", \"%DST%\")}catch{$_.exception ; exit 1}} else {exit 1}" && goto Exit || goto Error_unpack
-
-:Error_load
-    echo download_mklfpk.bat : Error: Failed to load %SRC% to %DST%, try to load it manually
-    exit /B 1
-
-:Error_unpack
-    echo download_mklfpk.bat : Error: Failed to unpack %DST%\%FILENAME%.zip to %DST%, try unpack the archive manually
-    exit /B 1
-
-:Exit
-    echo Downloaded and unpacked Intel^(R^) MKL small libraries to %DST%
-    exit /B 0
-) else (
-    echo Intel^(R^) MKL small libraries are already installed in %DST%
-    exit /B 0
-)
diff --git a/dev/download_micromkl.sh b/dev/download_micromkl.sh
deleted file mode 100755
index 6eb52ddca76..00000000000
--- a/dev/download_micromkl.sh
+++ /dev/null
@@ -1,106 +0,0 @@
-#!/bin/bash
-#===============================================================================
-# Copyright 2018 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#===============================================================================
-
-MKLFPK_URL_ROOT="https://github.com/oneapi-src/oneDAL/releases/download/Dependencies/"
-MKLFPK_VERSION="20230413"
-MKLFPK_VERSION_MAC="20210426"
-MKLGPUFPK_VERSION="20240605"
-WITH_GPU=true
-
-while true ; do
-    if [ "$1" = "--help" ] ; then
-        echo "Usage: $0 [with_gpu=true|false]"
-        echo "Usage example: $0 with_gpu=true"
-        exit 1
-    elif [ "${1:0:8}" = "with_gpu" ] ; then
-        WITH_GPU=${1:9}
-    elif [ -z "$1" ] ; then
-        break
-    else
-        echo "Error: unknown paramater $1!"
-        echo "type $0 --help"
-        exit 1
-    fi
-    shift
-done
-
-function download_fpk()
-{
-  SRC=$1
-  DST=$2
-  CONDITION=$3
-  FILENAME=$4
-
-  mkdir -p "${DST}"
-  DST=$(cd "${DST}" || exit 1;pwd)
-
-  if [ ! -e "${CONDITION}/${MKLFPK_OS}/lib/" ]; then
-    if [ -x "$(command -v curl)" ]; then
-      echo curl -L -o "${DST}/${FILENAME}" "${SRC}"
-      if curl -L -o "${DST}/${FILENAME}" "${SRC}";
-      then
-        DOWNLOAD_CODE=0
-      fi
-    elif [ -x "$(command -v wget)" ]; then
-      echo wget -O "${DST}/${FILENAME}" "${SRC}"
-      if wget -O "${DST}/${FILENAME}" "${SRC}";
-      then
-        DOWNLOAD_CODE=0
-      fi
-    else
-      echo "curl or wget not available"
-      exit 1
-    fi
-
-    if [ ${DOWNLOAD_CODE} -ne 0 ] || [ ! -e "${DST}/${FILENAME}" ]; then
-      echo "Download from ${SRC} to ${DST}/${FILENAME} failed"
-      exit 1
-    fi
-    set -x
-
-    echo tar -xf "${DST}/${FILENAME}" -C "${DST}"
-    tar -xf "${DST}/${FILENAME}" -C "${DST}"
-    echo "Downloaded and unpacked oneMKL small libraries to ${DST}"
-  else
-    echo "oneMKL small libraries are already installed in ${DST}"
-  fi
-}
-
-os=$(uname)
-if [ "$os" = "Linux" ]; then
-  MKLFPK_OS=lnx
-elif [ "$os" = "Darwin" ]; then
-  MKLFPK_OS=mac
-  MKLFPK_VERSION=${MKLFPK_VERSION_MAC}
-else
-  echo "Cannot identify operating system. Try downloading package manually."
-  exit 1
-fi
-
-MKLFPK_PACKAGE="mklfpk_${MKLFPK_OS}_${MKLFPK_VERSION}"
-MKLGPUFPK_PACKAGE="mklgpufpk_${MKLFPK_OS}_${MKLGPUFPK_VERSION}"
-MKLFPK_URL=${MKLFPK_URL_ROOT}${MKLFPK_PACKAGE}.tgz
-MKLGPUFPK_URL=${MKLFPK_URL_ROOT}${MKLGPUFPK_PACKAGE}.tgz
-CPUCOND=$(dirname "$0")/../__deps/mklfpk
-GPUCOND=$(dirname "$0")/../__deps/mklgpufpk
-CPUDST="${CPUCOND}"
-GPUDST="${GPUCOND}/${MKLFPK_OS}"
-
-download_fpk "${MKLFPK_URL}" "${CPUDST}" "${CPUCOND}" "${MKLFPK_PACKAGE}.tgz"
-if [ "${MKLFPK_OS}" != "mac" ] && [ "${WITH_GPU}" == "true" ]; then
-  download_fpk "${MKLGPUFPK_URL}" "${GPUDST}" "${GPUCOND}" "${MKLGPUFPK_PACKAGE}.tgz"
-fi
diff --git a/dev/make/deps.mkl.mk b/dev/make/deps.mkl.mk
index c533d9fbb78..2c490d1bb66 100644
--- a/dev/make/deps.mkl.mk
+++ b/dev/make/deps.mkl.mk
@@ -17,44 +17,44 @@
 #++
 #  Math backend (MKL) definitions for makefile
 #--
-MKLFPKDIR:= $(if $(wildcard $(DIR)/__deps/mklfpk/$(_OS)/*),$(DIR)/__deps/mklfpk,                        \
-            $(if $(wildcard $(MKLFPKROOT)/include/*),$(subst \,/,$(MKLFPKROOT)),                        \
-            $(error Can`t find MKLFPK libs nether in $(DIR)/__deps/mklfpk/$(_OS) not in MKLFPKROOT.)))
-MKLFPKDIR.include := $(MKLFPKDIR)/include $(MKLFPKDIR)/$(if $(OS_is_fbsd),lnx,$(_OS))/include
-MKLFPKDIR.libia   := $(MKLFPKDIR)/$(if $(OS_is_fbsd),lnx,$(_OS))/lib/$(_IA)
 
-RELEASEDIR.include.mklgpufpk := $(RELEASEDIR.include)/services/internal/sycl/math
+MKLDIR:= $(subst \,/,$(MKLROOT))
+MKLDIR.include := $(MKLDIR)/include
+MKLDIR.libia   := $(MKLDIR)/lib
+RELEASEDIR.include.mklgpu := $(RELEASEDIR.include)/services/internal/sycl/math
 
-MKLGPUFPKDIR:= $(if $(wildcard $(DIR)/__deps/mklgpufpk/$(_OS)/*),$(DIR)/__deps/mklgpufpk/$(_OS),$(subst \,/,$(MKLGPUFPKROOT)))
-MKLGPUFPKDIR.include := $(MKLGPUFPKDIR)/include
-MKLGPUFPKDIR.lib   := $(MKLGPUFPKDIR)/lib/
+MKLGPUDIR:= $(subst \,/,$(MKLROOT))
+MKLGPUDIR.include := $(MKLGPUDIR)/include/oneapi
+MKLGPUDIR.lib   := $(MKLGPUDIR)/lib
 
-mklgpufpk.LIBS_A := $(MKLGPUFPKDIR.lib)/$(plib)daal_sycl$d.$(a)
-mklgpufpk.HEADERS := $(MKLGPUFPKDIR.include)/mkl_dal_sycl.hpp $(MKLGPUFPKDIR.include)/mkl_dal_blas_sycl.hpp
+mklgpu.HEADERS := $(MKLGPUDIR.include)/mkl.hpp
 
-daaldep.math_backend.incdir := $(MKLFPKDIR.include) $(MKLGPUFPKDIR.include)
-daaldep.math_backend_oneapi.incdir := $(MKLFPKDIR.include) $(MKLGPUFPKDIR.include)
+daaldep.math_backend.incdir := $(MKLDIR.include)
+daaldep.math_backend_oneapi.incdir := $(MKLDIR.include) $(MKLGPUDIR.include)
 
-daaldep.lnx32e.mkl.thr := $(MKLFPKDIR.libia)/$(plib)daal_mkl_thread.$a
-daaldep.lnx32e.mkl.seq := $(MKLFPKDIR.libia)/$(plib)daal_mkl_sequential.$a
-daaldep.lnx32e.mkl := $(MKLFPKDIR.libia)/$(plib)daal_vmlipp_core.$a
+daaldep.lnx32e.mkl.thr := $(MKLDIR.libia)/$(plib)mkl_tbb_thread.$a
+daaldep.lnx32e.mkl.seq := $(MKLDIR.libia)/$(plib)mkl_sequential.$a
+daaldep.lnx32e.mkl.core := $(MKLDIR.libia)/$(plib)mkl_core.$a 
+daaldep.lnx32e.mkl.interfaces := $(MKLDIR.libia)/$(plib)mkl_intel_ilp64.$a
+daaldep.lnx32e.mkl.sycl := $(MKLGPUDIR.lib)/$(plib)mkl_sycl.$a
 
-daaldep.win32e.mkl.thr := $(MKLFPKDIR.libia)/daal_mkl_thread$d.$a
-daaldep.win32e.mkl.seq := $(MKLFPKDIR.libia)/daal_mkl_sequential.$a
-daaldep.win32e.mkl := $(MKLFPKDIR.libia)/$(plib)daal_vmlipp_core$d.$a
+daaldep.win32e.mkl.thr := $(MKLDIR.libia)/mkl_tbb_thread$d.$a
+daaldep.win32e.mkl.seq := $(MKLDIR.libia)/mkl_sequential.$a
+daaldep.win32e.mkl.interfaces := $(MKLDIR.libia)/mkl_intel_ilp64.$a
+daaldep.win32e.mkl.core := $(MKLDIR.libia)/mkl_core.$a
+daaldep.win32e.mkl.sycl := $(MKLGPUDIR.lib)/mkl_sycl$d.$a
 
-daaldep.mac32e.mkl.thr := $(MKLFPKDIR.libia)/$(plib)daal_mkl_thread.$a
-daaldep.mac32e.mkl.seq := $(MKLFPKDIR.libia)/$(plib)daal_mkl_sequential.$a
-daaldep.mac32e.mkl := $(MKLFPKDIR.libia)/$(plib)daal_vmlipp_core.$a
+daaldep.fbsd32e.mkl.thr := $(MKLDIR.libia)/$(plib)mkl_tbb_thread.$a
+daaldep.fbsd32e.mkl.seq := $(MKLDIR.libia)/$(plib)mkl_sequential.$a
+daaldep.fbsd32e.mkl.interfaces := $(MKLDIR.libia)/$(plib)mkl_intel_ilp64.$a
+daaldep.fbsd32e.mkl.core := $(MKLDIR.libia)/$(plib)mkl_core.$a
+daaldep.fbsd32e.mkl.sycl := $(MKLGPUDIR.lib)/$(plib)mkl_sycl.$a
 
-daaldep.fbsd32e.mkl.thr := $(MKLFPKDIR.libia)/$(plib)daal_mkl_thread.$a
-daaldep.fbsd32e.mkl.seq := $(MKLFPKDIR.libia)/$(plib)daal_mkl_sequential.$a
-daaldep.fbsd32e.mkl := $(MKLFPKDIR.libia)/$(plib)daal_vmlipp_core.$a
-
-
-daaldep.mkl     := $(daaldep.$(PLAT).mkl)
+daaldep.math_backend.core     := $(daaldep.$(PLAT).mkl.core)
+daaldep.math_backend.interfaces     := $(daaldep.$(PLAT).mkl.interfaces)
 daaldep.math_backend.thr := $(daaldep.$(PLAT).mkl.thr)
-daaldep.math_backend.seq := $(daaldep.$(PLAT).mkl.seq) $(daaldep.mkl)
+daaldep.math_backend.seq := $(daaldep.$(PLAT).mkl.seq)
+daaldep.math_backend.sycl := $(daaldep.$(PLAT).mkl.sycl)
 
 daaldep.lnx32e.vml :=
 daaldep.lnx32e.ipp := $(if $(COV.libia),$(COV.libia)/libcov.a)
@@ -71,4 +71,5 @@ daaldep.fbsd32e.ipp := $(if $(COV.libia),$(COV.libia)/libcov.a)
 daaldep.vml     := $(daaldep.$(PLAT).vml)
 daaldep.ipp     := $(daaldep.$(PLAT).ipp)
 
-daaldep.math_backend.ext := $(daaldep.ipp) $(daaldep.vml) $(daaldep.mkl)
+daaldep.math_backend.ext := $(daaldep.ipp) $(daaldep.vml) $(daaldep.math_backend.interfaces) $(daaldep.math_backend.thr) $(daaldep.math_backend.core)
+daaldep.math_backend.sycl := $(daaldep.math_backend.sycl)
diff --git a/dev/make/deps.ref.mk b/dev/make/deps.ref.mk
index 636e5c16268..9cfdcf719af 100644
--- a/dev/make/deps.ref.mk
+++ b/dev/make/deps.ref.mk
@@ -43,3 +43,7 @@ ifeq ($(RNG_OPENRNG), yes)
 
 	daaldep.math_backend.incdir += $(daaldep.rng_backend.incdir)
 endif
+
+daaldep.math_backend.ext := $(daaldep.math_backend.thr)
+daaldep.math_backend.sycl := $(daaldep.math_backend.thr)
+daaldep.math_backend.oneapi := $(daaldep.math_backend.thr)
diff --git a/docs/source/contribution/threading.rst b/docs/source/contribution/threading.rst
new file mode 100644
index 00000000000..cd1acd84e95
--- /dev/null
+++ b/docs/source/contribution/threading.rst
@@ -0,0 +1,166 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+.. highlight:: cpp
+
+Threading Layer
+^^^^^^^^^^^^^^^
+
+oneDAL uses Intel\ |reg|\  oneAPI Threading Building Blocks (Intel\ |reg|\  oneTBB) to do parallel
+computations on CPU.
+
+But oneTBB is not used in the code of oneDAL algorithms directly. The algorithms rather
+use custom primitives that either wrap oneTBB functionality or are in-house developed.
+Those primitives form oneDAL's threading layer.
+
+This is done in order not to be dependent on possible oneTBB API changes and even
+on the particular threading technology like oneTBB, C++11 standard threads, etc.
+
+The API of the layer is defined in
+`threading.h <https://github.com/oneapi-src/oneDAL/blob/main/cpp/daal/src/threading/threading.h>`_.
+Please be aware that the threading API is not a part of oneDAL product API.
+This is the product internal API that aimed to be used only by oneDAL developers, and can be changed at any time
+without any prior notification.
+
+This chapter describes common parallel patterns and primitives of the threading layer.
+
+threader_for
+************
+
+Consider a case where you need to compute an elementwise sum of two arrays and store the results
+into another array.
+Here is a variant of sequential implementation:
+
+.. include:: ../includes/threading/sum-sequential.rst
+
+There are several options available in the threading layer of oneDAL to let the iterations of this code
+run in parallel.
+One of the options is to use ``daal::threader_for`` as shown here:
+
+.. include:: ../includes/threading/sum-parallel.rst
+
+The iteration space here goes from ``0`` to ``n-1``.
+The last argument is a function object that performs a single iteration of the loop, given loop index ``i``.
+
+Blocking
+--------
+
+To have more control over the parallel execution and to increase
+`cache locality <https://en.wikipedia.org/wiki/Locality_of_reference>`_ oneDAL usually splits
+the data into blocks and then processes those blocks in parallel.
+
+This code shows how a typical parallel loop in oneDAL looks like:
+
+.. include:: ../includes/threading/sum-parallel-by-blocks.rst
+
+Thread-local Storage (TLS)
+**************************
+
+Consider you need to compute a dot product of two arrays.
+Here is a variant of sequential implementation:
+
+.. include:: ../includes/threading/dot-sequential.rst
+
+Parallel computations can be performed in two steps:
+
+    1. Compute partial dot product in each thread.
+    2. Perform a reduction: Add the partial results from all threads to compute the final dot product.
+
+``daal::tls`` provides a local storage where each thread can accumulate its local results.
+The following code allocates memory that would store partial dot products for each thread:
+
+.. include:: ../includes/threading/dot-parallel-init-tls.rst
+
+``SafeStatus`` in this code denotes a thread-safe counterpart of the ``Status`` class.
+``SafeStatus`` allows to collect errors from all threads and report them to the user using the
+``detach()`` method. An example will be shown later in the documentation.
+
+Checking the status right after the initialization code won't show the allocation errors,
+because oneTBB uses lazy evaluation and the lambda function passed to the constructor of the TLS
+is evaluated on first use of the thread-local storage (TLS).
+
+There are several options available in the threading layer of oneDAL to compute the partial
+dot product results at each thread.
+One of the options is to use the already mentioned ``daal::threader_for`` and blocking approach
+as shown here:
+
+.. include:: ../includes/threading/dot-parallel-partial-compute.rst
+
+To compute the final result it is required to reduce each thread's partial results
+as shown here:
+
+.. include:: ../includes/threading/dot-parallel-reduction.rst
+
+Local memory of the threads should be released when it is no longer needed.
+
+The complete parallel version of dot product computations would look like:
+
+.. include:: ../includes/threading/dot-parallel.rst
+
+Static Work Scheduling
+**********************
+
+By default, oneTBB uses
+`dynamic work scheduling <https://oneapi-src.github.io/oneTBB/main/tbb_userguide/How_Task_Scheduler_Works.html>`_
+and work stealing.
+It means that two different runs of the same parallel loop can produce different
+mappings of the loop's iteration space to the available threads.
+This strategy is beneficial when it is difficult to estimate the amount of work performed
+by each iteration.
+
+In the cases when it is known that the iterations perform an equal amount of work, it
+is more performant to use predefined mapping of the loop's iterations to threads.
+This is what static work scheduling does.
+
+``daal::static_threader_for`` and ``daal::static_tls`` allow implementation of static
+work scheduling within oneDAL.
+
+Here is a variant of parallel dot product computation with static scheduling:
+
+.. include:: ../includes/threading/dot-static-parallel.rst
+
+Nested Parallelism
+******************
+
+oneDAL supports nested parallel loops.
+It is important to know that:
+
+    "when a parallel construct calls another parallel construct, a thread can obtain a task
+     from the outer-level construct while waiting for completion of the inner-level one."
+
+    -- `oneTBB documentation <https://www.intel.com/content/www/us/en/docs/onetbb/developer-guide-api-reference/2021-13/work-isolation.html>`_
+
+In practice, this means that a thread-local variable might unexpectedly
+change its value after a nested parallel construct:
+
+.. include:: ../includes/threading/nested-parallel.rst
+
+In some scenarios this can lead to deadlocks, segmentation faults and other issues.
+
+oneTBB provides ways to isolate execution of a parallel construct, for its tasks
+to not interfere with other simultaneously running tasks.
+
+Those options are preferred when the parallel loops are initially written as nested.
+But in oneDAL there are cases when one parallel algorithm, the outer one,
+calls another parallel algorithm, the inner one, within a parallel region.
+
+The inner algorithm in this case can also be called solely, without additional nesting.
+And we do not always want to make it isolated.
+
+For the cases like that, oneDAL provides ``daal::ls``. Its ``local()`` method always
+returns the same value for the same thread, regardless of the nested execution:
+
+.. include:: ../includes/threading/nested-parallel-ls.rst
diff --git a/docs/source/includes/threading/dot-parallel-init-tls.rst b/docs/source/includes/threading/dot-parallel-init-tls.rst
new file mode 100644
index 00000000000..8e72646a7ca
--- /dev/null
+++ b/docs/source/includes/threading/dot-parallel-init-tls.rst
@@ -0,0 +1,30 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/algorithms/service_error_handling.h"
+   #include "src/threading/threading.h"
+
+   SafeStatus safeStat;
+   daal::tls<float *> dotProductTLS([=, &safeStat]() {
+      float * dotProductPtr = new (std::nothrow) float;
+      if (!dotProductPtr) {
+         safeStat.add(services::ErrorMemoryAllocationFailed);
+      }
+      dotProductPtr[0] = 0.0f;
+      return dotProductPtr;
+   });
diff --git a/docs/source/includes/threading/dot-parallel-partial-compute.rst b/docs/source/includes/threading/dot-parallel-partial-compute.rst
new file mode 100644
index 00000000000..0521e4bb2cf
--- /dev/null
+++ b/docs/source/includes/threading/dot-parallel-partial-compute.rst
@@ -0,0 +1,40 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   constexpr size_t blockSize = 1024;
+   const size_t nBlocks = (n + blockSize - 1) / blockSize;
+
+   daal::threader_for(nBlocks, nBlocks, [&](size_t iBlock) {
+      const size_t iStart = iBlock * blockSize;
+      const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
+
+      // Compute partial result for this block
+      float partialDotProduct = 0.0f;
+      for (size_t i = iStart; i < iEnd; ++i) {
+         partialDotProduct += a[i] * b[i];
+      }
+
+      // Update thread-local result
+      float * localDotProduct = dotProductTLS.local();
+      if (!localDotProduct) {
+         // Allocation error happened earlier
+         return;
+      }
+      localDotProduct[0] += partialDotProduct;
+   });
+   DAAL_CHECK_SAFE_STATUS();  // if (!safeStat) return safeStat.detach();
diff --git a/docs/source/includes/threading/dot-parallel-reduction.rst b/docs/source/includes/threading/dot-parallel-reduction.rst
new file mode 100644
index 00000000000..e86ca030246
--- /dev/null
+++ b/docs/source/includes/threading/dot-parallel-reduction.rst
@@ -0,0 +1,25 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+  float dotProduct = 0.0f;
+  tls.reduce([&](float * localDotProduct) {
+    if (localDotProduct) {
+      dotProduct += localDotProduct[0];
+      delete localDotProduct;
+    }
+  });
diff --git a/docs/source/includes/threading/dot-parallel.rst b/docs/source/includes/threading/dot-parallel.rst
new file mode 100644
index 00000000000..d0230715a01
--- /dev/null
+++ b/docs/source/includes/threading/dot-parallel.rst
@@ -0,0 +1,63 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/algorithms/service_error_handling.h"
+   #include "src/threading/threading.h"
+
+   services::Status dot(const size_t n, const float* a, const float* b, float &dotProduct) {
+      SafeStatus safeStat;
+      daal::tls<float *> dotProductTLS([=, &safeStat]() {
+         float * dotProductPtr = new (std::nothrow) float;
+         if (!dotProductPtr) {
+            safeStat.add(services::ErrorMemoryAllocationFailed);
+         }
+         dotProductPtr[0] = 0.0f;
+         return dotProductPtr;
+      });
+
+      constexpr size_t blockSize = 1024;
+      const size_t nBlocks = (n + blockSize - 1) / blockSize;
+
+      daal::threader_for(nBlocks, nBlocks, [&](size_t iBlock) {
+         const size_t iStart = iBlock * blockSize;
+         const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
+
+         // Compute partial result for this block
+         float partialDotProduct = 0.0f;
+         for (size_t i = iStart; i < iEnd; ++i) {
+            partialDotProduct += a[i] * b[i];
+         }
+
+         // Update thread-local result
+         float * localDotProduct = dotProductTLS.local();
+         if (!localDotProduct) {
+            // Allocation error happened earlier
+            return;
+         }
+         localDotProduct[0] += partialDotProduct;
+      });
+      DAAL_CHECK_SAFE_STATUS();
+
+      tls.reduce([&](float * localDotProduct) {
+         if (localDotProduct) {
+            dotProduct += localDotProduct[0];
+            delete localDotProduct;
+         }
+      });
+      return services::Status();
+   }
diff --git a/docs/source/includes/threading/dot-sequential.rst b/docs/source/includes/threading/dot-sequential.rst
new file mode 100644
index 00000000000..93300053c32
--- /dev/null
+++ b/docs/source/includes/threading/dot-sequential.rst
@@ -0,0 +1,25 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   float dot(const size_t n, const float* a, const float* b) {
+      float result = 0.0f;
+      for (size_t i = 0; i < n; ++i) {
+         result += a[i] * b[i];
+      }
+      return result;
+   }
diff --git a/docs/source/includes/threading/dot-static-parallel.rst b/docs/source/includes/threading/dot-static-parallel.rst
new file mode 100644
index 00000000000..dfec18b6d21
--- /dev/null
+++ b/docs/source/includes/threading/dot-static-parallel.rst
@@ -0,0 +1,64 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/algorithms/service_error_handling.h"
+   #include "src/threading/threading.h"
+
+   services::Status dot(const size_t n, const float* a, const float* b, float &dotProduct) {
+      SafeStatus safeStat;
+      daal::static_tls<float *> dotProductTLS([=, &safeStat]() {
+         float * dotProductPtr = new (std::nothrow) float;
+         if (!dotProductPtr) {
+            safeStat.add(services::ErrorMemoryAllocationFailed);
+         }
+         dotProductPtr[0] = 0.0f;
+         return dotProductPtr;
+      });
+
+      constexpr size_t blockSize = 1024;
+      const size_t nBlocks = (n + blockSize - 1) / blockSize;
+
+      daal::static_threader_for(nBlocks, [&](size_t iBlock, size_t threadId) {
+         const size_t iStart = iBlock * blockSize;
+         const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
+
+         // Compute partial result for this block
+         float partialDotProduct = 0.0f;
+         for (size_t i = iStart; i < iEnd; ++i) {
+            partialDotProduct += a[i] * b[i];
+         }
+
+         // Update thread-local result
+         // Note that exact thread index is used to get access to the thread's data
+         float * localDotProduct = dotProductTLS.local(threadId);
+         if (!localDotProduct) {
+            // Allocation error happened earlier
+            return;
+         }
+         localDotProduct[0] += partialDotProduct;
+      });
+      DAAL_CHECK_SAFE_STATUS();
+
+      tls.reduce([&](float * localDotProduct) {
+         if (localDotProduct) {
+            dotProduct += localDotProduct[0];
+            delete localDotProduct;
+         }
+      });
+      return services::Status();
+   }
diff --git a/docs/source/includes/threading/nested-parallel-ls.rst b/docs/source/includes/threading/nested-parallel-ls.rst
new file mode 100644
index 00000000000..1ceb0414c0b
--- /dev/null
+++ b/docs/source/includes/threading/nested-parallel-ls.rst
@@ -0,0 +1,53 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/algorithms/service_error_handling.h"
+   #include "src/threading/threading.h"
+
+   SafeStatus safeStat;
+   daal::ls<float *> ls([=, &safeStat]() {
+      float * localBuffer = new (std::nothrow) float[localSize];
+      if (!localBuffer) {
+         safeStat.add(services::ErrorMemoryAllocationFailed);
+      }
+      return localBuffer;
+   })
+   daal::threader_for(n, n, [&](size_t i) {
+      float * localBuffer = ls.local();
+      if (!localBuffer) {
+         // Allocation error happened earlier
+         return;
+      }
+
+      // Initialize localBuffer with some data here
+
+      daal::threader_for(m, m, [&](size_t j) {
+         /* Some work */
+      });
+
+      // The thread specific value always stays unchanged after the nested execution.
+      assert(localBuffer == ls.local()); // Assertion never fails!
+   });
+   DAAL_CHECK_SAFE_STATUS()
+
+   ls.reduce([&](float * localBuffer) {
+      if (localBuffer) {
+         /* Do reduction */
+         delete localBuffer;
+      }
+   });
diff --git a/docs/source/includes/threading/nested-parallel.rst b/docs/source/includes/threading/nested-parallel.rst
new file mode 100644
index 00000000000..fc41f70398b
--- /dev/null
+++ b/docs/source/includes/threading/nested-parallel.rst
@@ -0,0 +1,54 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/algorithms/service_error_handling.h"
+   #include "src/threading/threading.h"
+
+   SafeStatus safeStat;
+   daal::tls<float *> tls([=, &safeStat]() {
+      float * localBuffer = new (std::nothrow) float[localSize];
+      if (!localBuffer) {
+         safeStat.add(services::ErrorMemoryAllocationFailed);
+      }
+      return localBuffer;
+   })
+   daal::threader_for(n, n, [&](size_t i) {
+      float * localBuffer = tls.local();
+      if (!localBuffer) {
+         // Allocation error happened earlier
+         return;
+      }
+
+      // Initialize localBuffer with some data here
+
+      daal::threader_for(m, m, [&](size_t j) {
+         /* Some work */
+      });
+
+      // While executing the above parallel_for, the thread might have run iterations
+      // of the outer parallel_for, and so might have changed the thread specific value.
+      assert(localBuffer == tls.local()); // The assertion may fail!
+   });
+   DAAL_CHECK_SAFE_STATUS()
+
+   tls.reduce([&](float * localBuffer) {
+      if (localBuffer) {
+         /* Do reduction */
+         delete localBuffer;
+      }
+   });
diff --git a/docs/source/includes/threading/sum-parallel-by-blocks.rst b/docs/source/includes/threading/sum-parallel-by-blocks.rst
new file mode 100644
index 00000000000..0e05cae1008
--- /dev/null
+++ b/docs/source/includes/threading/sum-parallel-by-blocks.rst
@@ -0,0 +1,32 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/threading/threading.h"
+
+   void sum(const size_t n, const float* a, const float* b, float* c) {
+      constexpr size_t blockSize = 256;
+      const size_t nBlocks = (n + blockSize - 1) / blockSize;
+
+      daal::threader_for(nBlocks, nBlocks, [&](size_t iBlock) {
+         const size_t iStart = iBlock * blockSize;
+         const size_t iEnd = (iBlock < (nBlocks - 1)) ? iStart + blockSize : n;
+         for (size_t i = iStart; i < iEnd; ++i) {
+            c[i] = a[i] + b[i];
+         }
+      });
+   }
diff --git a/docs/source/includes/threading/sum-parallel.rst b/docs/source/includes/threading/sum-parallel.rst
new file mode 100644
index 00000000000..ba4ee4ae591
--- /dev/null
+++ b/docs/source/includes/threading/sum-parallel.rst
@@ -0,0 +1,26 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   #include "src/threading/threading.h"
+
+   void sum(const size_t n, const float* a, const float* b, float* c) {
+      daal::threader_for(n, n, [&](size_t i) {
+         c[i] = a[i] + b[i];
+      });
+   }
+
diff --git a/docs/source/includes/threading/sum-sequential.rst b/docs/source/includes/threading/sum-sequential.rst
new file mode 100644
index 00000000000..b91c7c2d836
--- /dev/null
+++ b/docs/source/includes/threading/sum-sequential.rst
@@ -0,0 +1,23 @@
+.. ******************************************************************************
+.. * Copyright contributors to the oneDAL project
+.. *
+.. * Licensed under the Apache License, Version 2.0 (the "License");
+.. * you may not use this file except in compliance with the License.
+.. * You may obtain a copy of the License at
+.. *
+.. *     http://www.apache.org/licenses/LICENSE-2.0
+.. *
+.. * Unless required by applicable law or agreed to in writing, software
+.. * distributed under the License is distributed on an "AS IS" BASIS,
+.. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. * See the License for the specific language governing permissions and
+.. * limitations under the License.
+.. *******************************************************************************/
+
+::
+
+   void sum(const size_t n, const float* a, const float* b, float* c) {
+      for (size_t i = 0; i < n; ++i) {
+        c[i] = a[i] + b[i];
+      }
+   }
diff --git a/docs/source/index-toc.rst b/docs/source/index-toc.rst
index 2c0ff1e6011..6ce2d33a458 100644
--- a/docs/source/index-toc.rst
+++ b/docs/source/index-toc.rst
@@ -22,7 +22,7 @@
 
    data-analytics-pipeline.rst
    system-requirements.rst
-   
+
 .. toctree::
    :caption: Get Started
    :maxdepth: 2
@@ -52,3 +52,10 @@
    :caption: Contributing Guide
 
    contribution/coding_guide.rst
+
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+   :caption: Custom Components
+
+   contribution/threading.rst
diff --git a/makefile b/makefile
index 817e165dc14..2984ec4eba9 100644
--- a/makefile
+++ b/makefile
@@ -120,6 +120,7 @@ y      := $(notdir $(filter $(_OS)/%,lnx/so win/dll mac/dylib))
 -Q     := $(if $(OS_is_win),$(if $(COMPILER_is_vc),-,-Q),-)
 -cxx17 := $(if $(COMPILER_is_vc),/std:c++17,$(-Q)std=c++17)
 -fPIC  := $(if $(OS_is_win),,-fPIC)
+-DMKL_ILP64 := $(if $(filter mkl,$(BACKEND_CONFIG)),-DMKL_ILP64)
 -Zl    := $(-Zl.$(COMPILER))
 -DEBC  := $(if $(REQDBG),$(-DEBC.$(COMPILER)) -DDEBUG_ASSERT -DONEDAL_ENABLE_ASSERT) -DTBB_SUPPRESS_DEPRECATED_MESSAGES -D__TBB_LEGACY_MODE
 -DEBJ  := $(if $(REQDBG),-g,-g:none)
@@ -271,15 +272,7 @@ releasetbb.LIBS_Y := $(TBBDIR.soia)/$(plib)tbb$(if $(OS_is_win),12$(dtbb),).$(y)
                                        $(if $(wildcard $(TBBDIR.soia)/libtbbmalloc.2.dylib),$(wildcard $(TBBDIR.soia)/libtbbmalloc.2.dylib)))
 
 
-#============================= Micromkl folders =====================================
-RELEASEDIR.include.mklgpufpk := $(RELEASEDIR.include)/services/internal/sycl/math
-
-MKLGPUFPKDIR:= $(if $(wildcard $(DIR)/__deps/mklgpufpk/$(_OS)/*),$(DIR)/__deps/mklgpufpk/$(_OS),$(subst \,/,$(MKLGPUFPKROOT)))
-MKLGPUFPKDIR.include := $(MKLGPUFPKDIR)/include
-MKLGPUFPKDIR.lib   := $(MKLGPUFPKDIR)/lib
-
-mklgpufpk.LIBS_A := $(MKLGPUFPKDIR.lib)/$(plib)daal_sycl$d.$(a)
-mklgpufpk.HEADERS := $(MKLGPUFPKDIR.include)/mkl_dal_sycl.hpp $(MKLGPUFPKDIR.include)/mkl_dal_blas_sycl.hpp
+#============================= Math backend folders =====================================
 
 ifeq ($(BACKEND_CONFIG), ref)
     ifeq ($(RNG_BACKEND), openrng)
@@ -421,7 +414,7 @@ THR.srcdir       := $(CPPDIR.daal)/src/threading
 CORE.srcdir      := $(CPPDIR.daal)/src/algorithms
 EXTERNALS.srcdir := $(CPPDIR.daal)/src/externals
 
-CORE.SERV.srcdir          := $(CPPDIR.daal)/src/services
+CORE.SERV.srcdir := $(subst \,/,$(CPPDIR.daal)/src/services)
 CORE.SERV.COMPILER.srcdir := $(CPPDIR.daal)/src/services/compiler/$(CORE.SERV.COMPILER.$(COMPILER))
 
 CORE.srcdirs  := $(CORE.SERV.srcdir) $(CORE.srcdir)                  \
@@ -490,7 +483,7 @@ $(WORKDIR.lib)/$(core_y):                   $(daaldep.math_backend.ext) \
                                             $(CORE.tmpdir_y)/$(core_y:%.$y=%_link.txt) ; $(LINK.DYNAMIC) ; $(LINK.DYNAMIC.POST)
 
 $(CORE.objs_a): $(CORE.tmpdir_a)/inc_a_folders.txt
-$(CORE.objs_a): COPT += $(-fPIC) $(-cxx17) $(-Zl) $(-DEBC)
+$(CORE.objs_a): COPT += $(-fPIC) $(-cxx17) $(-Zl) $(-DEBC) $(-DMKL_ILP64)
 $(CORE.objs_a): COPT += -D__TBB_NO_IMPLICIT_LINKAGE -DDAAL_NOTHROW_EXCEPTIONS \
                         -DDAAL_HIDE_DEPRECATED -DTBB_USE_ASSERT=0 -D_ENABLE_ATOMIC_ALIGNMENT_FIX \
                         $(if $(CHECK_DLL_SIG),-DDAAL_CHECK_DLL_SIG)
@@ -499,7 +492,7 @@ $(CORE.objs_a): COPT += @$(CORE.tmpdir_a)/inc_a_folders.txt
 $(eval $(call append_uarch_copt,$(CORE.objs_a)))
 
 $(CORE.objs_y): $(CORE.tmpdir_y)/inc_y_folders.txt
-$(CORE.objs_y): COPT += $(-fPIC) $(-cxx17) $(-Zl) $(-DEBC)
+$(CORE.objs_y): COPT += $(-fPIC) $(-cxx17) $(-Zl) $(-DEBC) $(-DMKL_ILP64)
 $(CORE.objs_y): COPT += -D__DAAL_IMPLEMENTATION \
                         -D__TBB_NO_IMPLICIT_LINKAGE -DDAAL_NOTHROW_EXCEPTIONS \
                         -DDAAL_HIDE_DEPRECATED -DTBB_USE_ASSERT=0 -D_ENABLE_ATOMIC_ALIGNMENT_FIX \
@@ -550,7 +543,7 @@ PARAMETERS.tmpdir_a.dpc := $(WORKDIR)/parameters_dpc_static
 PARAMETERS.tmpdir_y.dpc := $(WORKDIR)/parameters_dpc_dynamic
 
 ONEAPI.incdirs.common := $(CPPDIR)
-ONEAPI.incdirs.thirdp := $(CORE.incdirs.common) $(daaldep.math_backend.incdir) $(TBBDIR.include)
+ONEAPI.incdirs.thirdp := $(CORE.incdirs.common) $(daaldep.math_backend_oneapi.incdir) $(TBBDIR.include)
 ONEAPI.incdirs := $(ONEAPI.incdirs.common) $(CORE.incdirs.thirdp) $(ONEAPI.incdirs.thirdp)
 
 ONEAPI.dispatcher_cpu = $(WORKDIR)/oneapi/dal/_dal_cpu_dispatcher_gen.hpp
@@ -682,7 +675,7 @@ $(eval $(call update_copt_from_dispatcher_tag,$(ONEAPI.objs_a.dpc),.dpcpp))
 
 # Set compilation options to the object files which are part of DYNAMIC lib
 $(ONEAPI.objs_y): $(ONEAPI.dispatcher_cpu) $(ONEAPI.tmpdir_y)/inc_y_folders.txt
-$(ONEAPI.objs_y): COPT += $(-fPIC) $(-cxx17) $(-Zl) $(-DEBC) $(-EHsc) $(pedantic.opts) \
+$(ONEAPI.objs_y): COPT += $(-fPIC) $(-cxx17) $(-Zl) $(-DMKL_ILP64) $(-DEBC) $(-EHsc) $(pedantic.opts) \
                           -DDAAL_NOTHROW_EXCEPTIONS \
                           -DDAAL_HIDE_DEPRECATED \
                           -D_ENABLE_ATOMIC_ALIGNMENT_FIX \
@@ -695,7 +688,7 @@ $(ONEAPI.objs_y): COPT += $(-fPIC) $(-cxx17) $(-Zl) $(-DEBC) $(-EHsc) $(pedantic
 $(eval $(call update_copt_from_dispatcher_tag,$(ONEAPI.objs_y)))
 
 $(ONEAPI.objs_y.dpc): $(ONEAPI.dispatcher_cpu) $(ONEAPI.tmpdir_y.dpc)/inc_y_folders.txt
-$(ONEAPI.objs_y.dpc): COPT += $(-fPIC) $(-cxx17) $(-DEBC) $(-EHsc) $(pedantic.opts.dpcpp) \
+$(ONEAPI.objs_y.dpc): COPT += $(-fPIC) $(-cxx17) $(-DMKL_ILP64) $(-DEBC) $(-EHsc) $(pedantic.opts.dpcpp) \
                               -DDAAL_NOTHROW_EXCEPTIONS \
                               -DDAAL_HIDE_DEPRECATED \
                               -DONEDAL_DATA_PARALLEL \
@@ -788,7 +781,8 @@ $(WORKDIR.lib)/$(oneapi_y.dpc): LOPT += $(if $(REQDBG),-flink-huge-device-code,)
 $(WORKDIR.lib)/$(oneapi_y.dpc): LOPT += $(if $(OS_is_win),-IMPLIB:$(@:%.$(MAJORBINARY).dll=%_dll.lib),)
 $(WORKDIR.lib)/$(oneapi_y.dpc): LOPT += $(if $(OS_is_win),$(WORKDIR.lib)/$(core_y:%.$(MAJORBINARY).dll=%_dll.lib))
 $(WORKDIR.lib)/$(oneapi_y.dpc): LOPT += $(if $(OS_is_win),sycl$d.lib OpenCL.lib)
-$(WORKDIR.lib)/$(oneapi_y.dpc): LOPT += $(mklgpufpk.LIBS_A)
+$(WORKDIR.lib)/$(oneapi_y.dpc): LOPT += $(daaldep.math_backend.sycl)
+
 ifdef OS_is_win
 $(WORKDIR.lib)/$(oneapi_y.dpc:%.$(MAJORBINARY).dll=%_dll.lib): $(WORKDIR.lib)/$(oneapi_y.dpc)
 endif
@@ -831,14 +825,14 @@ THR_TBB.objs_y := $(addprefix $(THR.tmpdir_y)/,$(THR.srcs:%.cpp=%_tbb.$o))
 -include $(THR.tmpdir_y)/*.d
 
 $(WORKDIR.lib)/$(thr_tbb_a): LOPT:=
-$(WORKDIR.lib)/$(thr_tbb_a): $(THR_TBB.objs_a) $(daaldep.math_backend.thr) ; $(LINK.STATIC)
+$(WORKDIR.lib)/$(thr_tbb_a): $(THR_TBB.objs_a) ; $(LINK.STATIC)
 
 $(THR.tmpdir_y)/%_link.def: $(THR.srcdir)/$(daaldep.$(PLAT).threxport) | $(THR.tmpdir_y)/.
 	$(daaldep.$(_OS).threxport.create) > $@
 
 $(WORKDIR.lib)/$(thr_tbb_y): LOPT += $(-fPIC) $(daaldep.rt.thr)
 $(WORKDIR.lib)/$(thr_tbb_y): LOPT += $(if $(OS_is_win),-IMPLIB:$(@:%.dll=%_dll.lib),)
-$(WORKDIR.lib)/$(thr_tbb_y): $(THR_TBB.objs_y) $(daaldep.math_backend.thr) $(if $(OS_is_win),$(THR.tmpdir_y)/dll_tbb.res,) $(THR.tmpdir_y)/$(thr_tbb_y:%.$y=%_link.def) ; $(LINK.DYNAMIC) ; $(LINK.DYNAMIC.POST)
+$(WORKDIR.lib)/$(thr_tbb_y): $(THR_TBB.objs_y) $(if $(OS_is_win),$(THR.tmpdir_y)/dll_tbb.res,) $(THR.tmpdir_y)/$(thr_tbb_y:%.$y=%_link.def) ; $(LINK.DYNAMIC) ; $(LINK.DYNAMIC.POST)
 
 THR.objs_a := $(THR_TBB.objs_a)
 THR.objs_y := $(THR_TBB.objs_y)
@@ -983,17 +977,17 @@ $(foreach x,$(release.PARAMETERS.LIBS_Y.dpc),$(eval $(call .release.y_win,$x,$(R
 endif
 endif
 
-ifneq ($(MKLGPUFPKDIR),)
+ifneq ($(MKLGPUDIR),)
 # Copies the file to the destination directory and renames daal -> onedal
 # $1: Path to the file to be copied
 # $2: Destination directory
 define .release.sycl.old
-_release_common: $2/$(subst daal_sycl$d.$a,onedal_sycl$d.$a,$(notdir $1))
-$2/$(subst daal_sycl$d.$a,onedal_sycl$d.$a,$(notdir $1)): $(call frompf1,$1) | $2/. ; $(value cpy)
+_release_common: $2/$(subst mkl_sycl$d.$a,onedal_sycl$d.$a,$(notdir $1))
+$2/$(subst mkl_sycl$d.$a,onedal_sycl$d.$a,$(notdir $1)): $(call frompf1,$1) | $2/. ; $(value cpy)
 endef
 
-$(foreach t,$(mklgpufpk.HEADERS),$(eval $(call .release.sycl.old,$t,$(RELEASEDIR.include.mklgpufpk))))
-$(foreach t,$(mklgpufpk.LIBS_A), $(eval $(call .release.sycl.old,$t,$(RELEASEDIR.libia))))
+$(foreach t,$(mklgpu.HEADERS),$(eval $(call .release.sycl.old,$t,$(RELEASEDIR.include.mklgpu))))
+$(foreach t,$(daaldep.math_backend.sycl), $(eval $(call .release.sycl.old,$t,$(RELEASEDIR.libia))))
 endif
 
 _release_c: ./deploy/pkg-config/pkg-config.tpl
@@ -1117,6 +1111,4 @@ Flags:
 endef
 
 daal_dbg:
-	@echo "1" "!$(mklgpufpk.LIBS_A)!"
-	@echo "2" "!$(MKLGPUFPKDIR)!"
-	@echo "3" "!$(MKLGPUFPKROOT)!"
+	@echo "1" "!$(MKLDIR)!"