diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 0000000000..43646d05d7 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,8 @@ +--- +Checks: > + bugprone-reserved-identifier, + -clang-analyzer* + +WarningsAsErrors: '*' +FormatStyle: 'file' +HeaderFilterRegex: '.*Kokkos.*' diff --git a/.github/workflows/at2.yml b/.github/workflows/at2.yml index 042ad27a93..4a62e3b0c9 100644 --- a/.github/workflows/at2.yml +++ b/.github/workflows/at2.yml @@ -18,12 +18,27 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +env: + KOKKOS_VERSION: 4.4.01 + jobs: mi210: uses: ./.github/workflows/mi210.yml + with: + kokkos_version: 4.4.01 h100: uses: ./.github/workflows/h100.yml + with: + kokkos_version: 4.4.01 bdw: uses: ./.github/workflows/bdw.yml - #spr: - #uses: ./.github/workflows/spr.yml \ No newline at end of file + with: + kokkos_version: 4.4.01 + spr: + uses: ./.github/workflows/spr.yml + with: + kokkos_version: 4.4.01 + volta70: + uses: ./.github/workflows/volta70.yml + with: + kokkos_version: 4.4.01 diff --git a/.github/workflows/bdw.yml b/.github/workflows/bdw.yml index 3db73f06b4..14330e460d 100644 --- a/.github/workflows/bdw.yml +++ b/.github/workflows/bdw.yml @@ -1,7 +1,16 @@ name: Reusable BDW workflow +permissions: + contents: none + on: - workflow_call + workflow_call: + inputs: + kokkos_version: + description: 'The Kokkos Core version to build' + default: '' + required: true + type: string jobs: # PR_BDW_GNU1020_OPENMP_LEFT_REL_NOETI: @@ -10,15 +19,15 @@ jobs: # # steps: # - name: checkout_kokkos_kernels -# uses: actions/checkout@v3 +# uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 # with: # path: kokkos-kernels # # - name: checkout_kokkos -# uses: actions/checkout@v3 +# uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 # with: # repository: kokkos/kokkos -# ref: ${{ github.base_ref }} +# ref: ${{ inputs.kokkos_version }} # path: kokkos # # - name: configure_kokkos @@ -44,7 
+53,7 @@ jobs: # # - name: build_and_install_kokkos # working-directory: kokkos/build -# run: make -j12 install +# run: make -j8 install # # - name: configure_kokkos_kernels # run: | @@ -79,7 +88,7 @@ jobs: # # - name: build_kokkos_kernels # working-directory: kokkos-kernels/build -# run: make -j12 all +# run: make -j8 all # # - name: test # working-directory: kokkos-kernels/build @@ -91,15 +100,15 @@ jobs: # # steps: # - name: checkout_kokkos_kernels -# uses: actions/checkout@v3 +# uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 # with: # path: kokkos-kernels # # - name: checkout_kokkos -# uses: actions/checkout@v3 +# uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 # with: # repository: kokkos/kokkos -# ref: ${{ github.base_ref }} +# ref: ${{ inputs.kokkos_version }} # path: kokkos # # - name: configure_kokkos @@ -127,7 +136,7 @@ jobs: # # - name: build_and_install_kokkos # working-directory: kokkos/build -# run: make -j12 install +# run: make -j8 install # # - name: configure_kokkos_kernels # run: | @@ -161,7 +170,7 @@ jobs: # # - name: build_kokkos_kernels # working-directory: kokkos-kernels/build -# run: make -j12 all +# run: make -j8 all # # - name: test # working-directory: kokkos-kernels/build @@ -173,15 +182,15 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: kokkos/kokkos - ref: ${{ github.base_ref }} + ref: ${{ inputs.kokkos_version }} path: kokkos - name: configure_kokkos @@ -252,86 +261,86 @@ jobs: working-directory: kokkos-kernels/build run: ctest --output-on-failure -V --timeout 3600 -# PR_BDW_CLANG1001_THREADS_SERIAL_LEFT_REL: -# name: 
PR_BDW_CLANG1001_THREADS_SERIAL_LEFT_REL -# runs-on: [kk-env-llvm-10.0.1-latest] -# -# steps: -# - name: checkout_kokkos_kernels -# uses: actions/checkout@v3 -# with: -# path: kokkos-kernels -# -# - name: checkout_kokkos -# uses: actions/checkout@v3 -# with: -# repository: kokkos/kokkos -# ref: ${{ github.base_ref }} -# path: kokkos -# -# - name: configure_kokkos -# run: | -# mkdir -p kokkos/{build,install} -# cd kokkos/build -# cmake \ -# -DCMAKE_CXX_COMPILER=clang++ \ -# -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ -# -DCMAKE_EXE_LINKER_FLAGS= \ -# -DCMAKE_INSTALL_PREFIX=$PWD/../install \ -# -DKokkos_ENABLE_SERIAL=ON \ -# -DKokkos_ENABLE_THREADS=ON \ -# -DKokkos_ARCH_BDW=ON \ -# -DKokkos_ENABLE_TESTS=OFF \ -# -DKokkos_ENABLE_EXAMPLES=OFF \ -# -DCMAKE_VERBOSE_MAKEFILE=ON \ -# -DCMAKE_CXX_EXTENSIONS=OFF \ -# -DCMAKE_CXX_STANDARD=17 \ -# -DBUILD_SHARED_LIBS=OFF \ -# -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -# -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ -# -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -# .. 
-# -# - name: build_and_install_kokkos -# working-directory: kokkos/build -# run: make -j12 install -# -# - name: configure_kokkos_kernels -# run: | -# mkdir -p kokkos-kernels/{build,install} -# cd kokkos-kernels/build -# cmake \ -# -DCMAKE_CXX_COMPILER=clang++ \ -# -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ -# -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ -# -DCMAKE_INSTALL_PREFIX= \ -# -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ -# -DKokkosKernels_ENABLE_TESTS=ON \ -# -DKokkosKernels_ENABLE_PERFTESTS=ON \ -# -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ -# -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ -# -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ -# -DKokkosKernels_INST_DOUBLE=ON \ -# -DKokkosKernels_INST_ORDINAL_INT=ON \ -# -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ -# -DKokkosKernels_INST_OFFSET_INT=ON \ -# -DKokkosKernels_INST_LAYOUTLEFT=ON \ -# -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ -# -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ -# -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ -# -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ -# -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ -# -DCMAKE_EXE_LINKER_FLAGS="" \ -# -DBUILD_SHARED_LIBS=OFF \ -# -DKokkosKernels_ENABLE_DOCS=OFF \ -# .. 
-# -# - name: build_kokkos_kernels -# working-directory: kokkos-kernels/build -# run: make -j12 all -# -# - name: test -# working-directory: kokkos-kernels/build -# run: ctest --output-on-failure -V --timeout 3600 -# -# \ No newline at end of file + PR_BDW_CLANG1001_THREADS_SERIAL_LEFT_REL: + name: PR_BDW_CLANG1001_THREADS_SERIAL_LEFT_REL + runs-on: [kk-env-llvm-10.0.1-latest] + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: kokkos/kokkos + ref: ${{ inputs.kokkos_version }} + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + cmake \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ + -DCMAKE_EXE_LINKER_FLAGS= \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkos_ENABLE_SERIAL=ON \ + -DKokkos_ENABLE_THREADS=ON \ + -DKokkos_ARCH_BDW=ON \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_EXAMPLES=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + .. 
+ + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j8 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + cmake \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ + -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ + -DCMAKE_INSTALL_PREFIX= \ + -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_PERFTESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + -DKokkosKernels_INST_LAYOUTLEFT=ON \ + -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ + -DCMAKE_EXE_LINKER_FLAGS="" \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkosKernels_ENABLE_DOCS=OFF \ + .. 
+ + - name: build_kokkos_kernels + working-directory: kokkos-kernels/build + run: make -j8 all + + - name: test + working-directory: kokkos-kernels/build + run: ctest --output-on-failure -V --timeout 3600 + + \ No newline at end of file diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index c3ad5be106..374287a529 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -33,18 +33,18 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - name: checkout_kokkos_kernels - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: kokkos-kernels # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@afb54ba388a7dca6ecae48f608c4ff05ff4cc77a # v3.25.15 + uses: github/codeql-action/init@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 with: languages: c-cpp # If you wish to specify custom queries, you can do so here or in a config file. @@ -52,7 +52,7 @@ jobs: # Prefix the list here with "+" to use these queries and those in the config file. 
- name: checkout_kokkos - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: 'kokkos/kokkos' path: 'kokkos' @@ -100,6 +100,6 @@ jobs: run: make -j2 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@afb54ba388a7dca6ecae48f608c4ff05ff4cc77a # v3.25.15 + uses: github/codeql-action/analyze@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 with: category: "/language:c-cpp" diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index f072eabac8..955b3b3fb2 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -17,11 +17,11 @@ jobs: runs-on: ubuntu-latest steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - name: 'Checkout Repository' - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: 'Dependency Review' - uses: actions/dependency-review-action@5a2ce3f5b92ee19cbb1541a4984c76d921601d7c # v4.3.4 + uses: actions/dependency-review-action@4081bf99e2866ebe428fc0477b69eb4fcda7220a # v4.4.0 diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index a89f4480d9..02883a3013 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -13,24 +13,22 @@ jobs: docs-check: runs-on: [macos-latest] steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + path: kokkos-kernels + - name: Install Dependencies run: | brew install doxygen python3 -m venv .venv . 
.venv/bin/activate - pip install sphinx -v "sphinx==6.2.1" - pip install breathe - pip install sphinx-rtd-theme + pip install -r kokkos-kernels/.github/workflows/requirements_docs.txt sphinx-build --version doxygen --version - - name: checkout_kokkos_kernels - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 - with: - path: kokkos-kernels - - name: checkout_kokkos - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: kokkos/kokkos ref: 4.3.01 diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 2819fd1554..80f59cfdc0 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -13,7 +13,7 @@ jobs: clang-format-check: runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Install Dependencies run: sudo apt install clang-format-16 diff --git a/.github/workflows/h100.yml b/.github/workflows/h100.yml index 15665b673d..f1d2033be7 100644 --- a/.github/workflows/h100.yml +++ b/.github/workflows/h100.yml @@ -1,7 +1,16 @@ name: Reusable H100 workflow +permissions: + contents: none + on: - workflow_call + workflow_call: + inputs: + kokkos_version: + description: 'The Kokkos Core version to build' + default: '' + required: true + type: string jobs: PR_HOPPER90_CUDA1180_CUDA_LEFT_RIGHT_REL: @@ -10,15 +19,15 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: kokkos/kokkos - ref: ${{ github.base_ref }} + 
ref: ${{ inputs.kokkos_version }} path: kokkos - name: configure_kokkos diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml new file mode 100644 index 0000000000..3047830ea7 --- /dev/null +++ b/.github/workflows/linux.yml @@ -0,0 +1,100 @@ +name: github-Linux + +on: + pull_request: + paths-ignore: + - '**/*.rst' + - '**/*.md' + - '**/requirements.txt' + - '**/*.py' + - 'docs/**' + types: [ opened, reopened, synchronize ] + +permissions: + contents: none + +# Cancels any in progress 'workflow' associated with this PR +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + kokkos_version: 4.4.01 + +jobs: + check-pr-labels: + runs-on: [ubuntu-latest] + steps: + - uses: docker://agilepathway/pull-request-label-checker@sha256:ee57b0e1aedab22063ce6467a6e6358e254a9204693ca20d8a16b2d891db8d5f # v1.6.32 + with: + none_of: 'AT: WIP' + repo_token: ${{ secrets.GITHUB_TOKEN }} + sanitizers: + needs: check-pr-labels + # TODO: allow re-run via retest label if: ${{ github.event.label.name == 'AT: RETEST' }} + name: ubuntu-asan-ubsan-ci + runs-on: [ubuntu-latest] + + strategy: + matrix: + include: + - backend: "SERIAL" + cmake_build_type: "RelWithDebInfo" + debug_bounds_check: "OFF" + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: kokkos/kokkos + ref: ${{ env.kokkos_version }} + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cmake -S kokkos -B kokkos/build \ + -DKokkos_ENABLE_${{ matrix.backend }}=ON \ + -DCMAKE_CXX_STANDARD=17 \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEBUG_BOUNDS_CHECK:BOOL=${{ matrix.debug_bounds_check }} \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -DCMAKE_BUILD_TYPE=${{ 
matrix.cmake_build_type }} \ + -DCMAKE_CXX_FLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize=signed-integer-overflow -fno-sanitize=vptr -fno-sanitize-recover=all" \ + -DCMAKE_EXE_LINKER_FLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize=signed-integer-overflow -fno-sanitize=vptr -fno-sanitize-recover=all" \ + -DCMAKE_INSTALL_PREFIX=$(realpath kokkos/install) + + - name: build_and_install_kokkos + run: cmake --build kokkos/build --target install --parallel $(nproc) + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/build + cmake -S kokkos-kernels -B kokkos-kernels/build \ + -DKokkos_ROOT=$(realpath kokkos/install) \ + -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DKokkosKernels_INST_COMPLEX_DOUBLE=OFF \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_COMPLEX_FLOAT=OFF \ + -DKokkosKernels_INST_FLOAT=OFF \ + -DKokkosKernels_INST_LAYOUTLEFT:BOOL=ON \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=OFF \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + -DKokkosKernels_INST_OFFSET_SIZE_T=OFF \ + -DCMAKE_CXX_FLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize=signed-integer-overflow -fno-sanitize=vptr -fno-sanitize-recover=all" \ + -DCMAKE_EXE_LINKER_FLAGS="-fsanitize=address -fsanitize=undefined -fno-sanitize=signed-integer-overflow -fno-sanitize=vptr -fno-sanitize-recover=all" + + - name: build_kokkos_kernels + run: cmake --build kokkos-kernels/build --parallel $(nproc) + + - name: test + run: UBSAN_OPTIONS=print_stacktrace=1 ctest --test-dir kokkos-kernels/build -j$(nproc) --output-on-failure --timeout 7200 diff --git a/.github/workflows/mi210.yml b/.github/workflows/mi210.yml index c9fc4a6aed..4ec88cc2b9 100644 --- a/.github/workflows/mi210.yml +++ b/.github/workflows/mi210.yml @@ -1,106 +1,114 @@ name: Reusable MI210 workflow +permissions: + contents: none + on: - workflow_call + 
workflow_call: + inputs: + kokkos_version: + description: 'The Kokkos Core version to build' + default: '' + required: true + type: string jobs: -# PR_VEGA90A_ROCM561_HIP_SERIAL_LEFT_REL: -# name: PR_VEGA90A_ROCM561_HIP_SERIAL_LEFT_REL -# runs-on: [kk-env-hip-5.6.1-latest] -# -# steps: -# - name: checkout_kokkos_kernels -# uses: actions/checkout@v4 -# with: -# path: kokkos-kernels -# -# - name: checkout_kokkos -# uses: actions/checkout@v4 -# with: -# repository: kokkos/kokkos -# ref: ${{ github.base_ref }} -# path: kokkos -# -# - name: configure_kokkos -# run: | -# mkdir -p kokkos/{build,install} -# cd kokkos/build -# HIPCC=$(which hipcc) -# cmake -DCMAKE_CXX_COMPILER=$HIPCC \ -# -DCMAKE_CXX_FLAGS=-O3 \ -# -DCMAKE_EXE_LINKER_FLAGS= \ -# -DCMAKE_INSTALL_PREFIX=$PWD/../install \ -# -DKokkos_ENABLE_SERIAL=ON \ -# -DKokkos_ENABLE_HIP=ON \ -# -DKokkos_ARCH_VEGA90A=ON \ -# -DKokkos_ENABLE_TESTS=OFF \ -# -DKokkos_ENABLE_EXAMPLES=OFF \ -# -DCMAKE_VERBOSE_MAKEFILE=ON \ -# -DCMAKE_CXX_EXTENSIONS=OFF \ -# -DCMAKE_CXX_STANDARD=17 \ -# -DBUILD_SHARED_LIBS=OFF \ -# -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -# -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ -# -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -# .. 
-# -# - name: build_and_install_kokkos -# working-directory: kokkos/build -# run: make -j16 install -# -# - name: configure_kokkos_kernels -# run: | -# mkdir -p kokkos-kernels/{build,install} -# cd kokkos-kernels/build -# HIPCC=$(which hipcc) -# cmake -DCMAKE_CXX_COMPILER=$HIPCC \ -# -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ -# -DCMAKE_CXX_FLAGS="-O3 " \ -# -DCMAKE_INSTALL_PREFIX= \ -# -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ -# -DKokkosKernels_ENABLE_TESTS=ON \ -# -DKokkosKernels_ENABLE_PERFTESTS=ON \ -# -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ -# -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ -# -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ -# -DKokkosKernels_INST_DOUBLE=ON \ -# -DKokkosKernels_INST_ORDINAL_INT=ON \ -# -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ -# -DKokkosKernels_INST_OFFSET_INT=ON \ -# -DKokkosKernels_INST_LAYOUTLEFT=ON \ -# -DKokkosKernels_ENABLE_TPL_ROCSPARSE=OFF \ -# -DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF \ -# -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ -# -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ -# -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ -# -DCMAKE_EXE_LINKER_FLAGS="" \ -# -DBUILD_SHARED_LIBS=OFF \ -# -DKokkosKernels_ENABLE_DOCS=OFF \ -# .. 
-# -# - name: build -# working-directory: kokkos-kernels/build -# run: make -j12 all -# -# - name: test -# working-directory: kokkos-kernels/build -# run: ctest --output-on-failure -V --timeout 3600 - - PR_VEGA908_ROCM561_HIP_SERIAL_LEFT_OPENBLAS_REL: - name: PR_VEGA908_ROCM561_HIP_SERIAL_LEFT_OPENBLAS_REL + PR_VEGA90A_ROCM561_HIP_SERIAL_LEFT: + name: PR_VEGA90A_ROCM561_HIP_SERIAL_LEFT + runs-on: [kk-env-openblas-0.3.23-hip-5.6.1-latest] + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: kokkos/kokkos + ref: ${{ inputs.kokkos_version }} + path: kokkos + + - name: configure_kokkos + run: | + echo "GITHUB_WORKSPACE = $GITHUB_WORKSPACE" + mkdir "$GITHUB_WORKSPACE/kokkos/build" + mkdir "$GITHUB_WORKSPACE/kokkos/install" + cd "$GITHUB_WORKSPACE/kokkos/build" + HIPCC=$(which hipcc) + cmake -S "$GITHUB_WORKSPACE/kokkos" \ + -B "$GITHUB_WORKSPACE/kokkos/build" \ + -D CMAKE_CXX_COMPILER=$HIPCC \ + -D CMAKE_CXX_FLAGS="-O3" \ + -D CMAKE_EXE_LINKER_FLAGS= \ + -D CMAKE_INSTALL_PREFIX="$GITHUB_WORKSPACE/kokkos/install" \ + -D CMAKE_VERBOSE_MAKEFILE=ON \ + -D CMAKE_CXX_EXTENSIONS=OFF \ + -D CMAKE_CXX_STANDARD=17 \ + -D BUILD_SHARED_LIBS=OFF \ + -D Kokkos_ENABLE_SERIAL=ON \ + -D Kokkos_ENABLE_HIP=ON \ + -D Kokkos_ARCH_VEGA90A=ON \ + -D Kokkos_ENABLE_TESTS=OFF \ + -D Kokkos_ENABLE_EXAMPLES=OFF \ + -D Kokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -D Kokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -D Kokkos_ENABLE_DEPRECATION_WARNINGS=OFF + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j16 install + + - name: configure_kokkos_kernels + run: | + mkdir "$GITHUB_WORKSPACE/kokkos-kernels/build" + mkdir "$GITHUB_WORKSPACE/kokkos-kernels/install" + cd "$GITHUB_WORKSPACE/kokkos-kernels/build" + HIPCC=$(which hipcc) + cmake -S 
"$GITHUB_WORKSPACE/kokkos-kernels" \ + -B "$GITHUB_WORKSPACE/kokkos-kernels/build" \ + -D CMAKE_CXX_COMPILER=$HIPCC \ + -D CMAKE_CXX_FLAGS="-O3 " \ + -D CMAKE_INSTALL_PREFIX= \ + -D CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -D CMAKE_EXE_LINKER_FLAGS="" \ + -D BUILD_SHARED_LIBS=OFF \ + -D Kokkos_ROOT="$GITHUB_WORKSPACE/kokkos/install" \ + -D KokkosKernels_ENABLE_TESTS=ON \ + -D KokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -D KokkosKernels_ENABLE_PERFTESTS=ON \ + -D KokkosKernels_INST_COMPLEX_DOUBLE=ON \ + -D KokkosKernels_INST_DOUBLE=ON \ + -D KokkosKernels_INST_ORDINAL_INT=ON \ + -D KokkosKernels_INST_OFFSET_SIZE_T=ON \ + -D KokkosKernels_INST_OFFSET_INT=ON \ + -D KokkosKernels_INST_LAYOUTLEFT=ON \ + -D KokkosKernels_ENABLE_DOCS=OFF + + - name: build + working-directory: kokkos-kernels/build + run: make -j12 all + + - name: test + working-directory: kokkos-kernels/build + run: ctest --output-on-failure -V --timeout 3600 + + PR_VEGA90A_ROCM561_HIP_SERIAL_LEFT_TPLS: + name: PR_VEGA90A_ROCM561_HIP_SERIAL_LEFT_TPLS runs-on: [kk-env-openblas-0.3.23-hip-5.6.1-latest] steps: - name: checkout_kokkos_kernels - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: kokkos/kokkos - ref: ${{ github.base_ref }} + ref: ${{ inputs.kokkos_version }} path: kokkos - name: configure_kokkos @@ -108,23 +116,24 @@ jobs: mkdir -p kokkos/{build,install} cd kokkos/build HIPCC=$(which hipcc) - cmake -DCMAKE_CXX_COMPILER=$HIPCC \ - -DCMAKE_CXX_FLAGS=-O3 \ - -DCMAKE_EXE_LINKER_FLAGS= \ - -DCMAKE_INSTALL_PREFIX=$PWD/../install \ - -DKokkos_ENABLE_SERIAL=ON \ - -DKokkos_ENABLE_HIP=ON \ - -DKokkos_ARCH_VEGA90A=ON \ - -DKokkos_ENABLE_TESTS=OFF \ - -DKokkos_ENABLE_EXAMPLES=OFF \ - 
-DCMAKE_VERBOSE_MAKEFILE=ON \ - -DCMAKE_CXX_EXTENSIONS=OFF \ - -DCMAKE_CXX_STANDARD=17 \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ - -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ - .. + cmake -S "$GITHUB_WORKSPACE/kokkos" \ + -B "$GITHUB_WORKSPACE/kokkos/build" \ + -D CMAKE_CXX_COMPILER=$HIPCC \ + -D CMAKE_CXX_FLAGS=-O3 \ + -D CMAKE_EXE_LINKER_FLAGS= \ + -D CMAKE_INSTALL_PREFIX="$GITHUB_WORKSPACE/kokkos/install" \ + -D CMAKE_VERBOSE_MAKEFILE=ON \ + -D CMAKE_CXX_EXTENSIONS=OFF \ + -D CMAKE_CXX_STANDARD=17 \ + -D BUILD_SHARED_LIBS=OFF \ + -D Kokkos_ENABLE_SERIAL=ON \ + -D Kokkos_ENABLE_HIP=ON \ + -D Kokkos_ARCH_VEGA90A=ON \ + -D Kokkos_ENABLE_TESTS=OFF \ + -D Kokkos_ENABLE_EXAMPLES=OFF \ + -D Kokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -D Kokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + -D Kokkos_ENABLE_DEPRECATION_WARNINGS=OFF - name: build_and_install_kokkos working-directory: kokkos/build @@ -135,32 +144,30 @@ jobs: mkdir -p kokkos-kernels/{build,install} cd kokkos-kernels/build HIPCC=$(which hipcc) - cmake -DCMAKE_CXX_COMPILER=$HIPCC \ - -DKokkos_DIR=$PWD/../../kokkos/install/lib64/cmake/Kokkos \ - -DCMAKE_CXX_FLAGS="-O3 -I$ROCM_CORE_ROOT/include" \ - -DCMAKE_INSTALL_PREFIX= \ - -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ - -DKokkosKernels_ENABLE_TESTS=ON \ - -DKokkosKernels_ENABLE_PERFTESTS=ON \ - -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ - -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ - -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ - -DKokkosKernels_INST_DOUBLE=ON \ - -DKokkosKernels_INST_ORDINAL_INT=ON \ - -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ - -DKokkosKernels_INST_OFFSET_INT=ON \ - -DKokkosKernels_INST_LAYOUTLEFT=ON \ - -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ - -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ - -DKokkosKernels_ENABLE_TPL_ROCSOLVER=ON \ - -DKokkosKernels_ENABLE_TPL_ROCSPARSE=ON \ - -DKokkosKernels_ENABLE_TPL_ROCBLAS=ON \ - -DKokkosKernels_ENABLE_TPL_BLAS=ON \ - 
-DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ - -DCMAKE_EXE_LINKER_FLAGS="" \ - -DBUILD_SHARED_LIBS=OFF \ - -DKokkosKernels_ENABLE_DOCS=OFF \ - .. + cmake -S "$GITHUB_WORKSPACE/kokkos-kernels" \ + -B "$GITHUB_WORKSPACE/kokkos-kernels/build" \ + -D CMAKE_CXX_COMPILER=$HIPCC \ + -D CMAKE_CXX_FLAGS="-O3 -I$ROCM_CORE_ROOT/include" \ + -D CMAKE_INSTALL_PREFIX="$GITHUB_WORKSPACE/kokkos-kernels/install" \ + -D CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -D CMAKE_EXE_LINKER_FLAGS="" \ + -D BUILD_SHARED_LIBS=OFF \ + -D Kokkos_ROOT="$GITHUB_WORKSPACE/kokkos/install" \ + -D KokkosKernels_INST_COMPLEX_DOUBLE=ON \ + -D KokkosKernels_INST_DOUBLE=ON \ + -D KokkosKernels_INST_ORDINAL_INT=ON \ + -D KokkosKernels_INST_OFFSET_SIZE_T=ON \ + -D KokkosKernels_INST_OFFSET_INT=ON \ + -D KokkosKernels_INST_LAYOUTLEFT=ON \ + -D KokkosKernels_ENABLE_TPL_ROCSOLVER=ON \ + -D KokkosKernels_ENABLE_TPL_ROCSPARSE=ON \ + -D KokkosKernels_ENABLE_TPL_ROCBLAS=ON \ + -D KokkosKernels_ENABLE_TPL_BLAS=ON \ + -D KokkosKernels_ENABLE_TESTS=ON \ + -D KokkosKernels_ENABLE_PERFTESTS=ON \ + -D KokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -D KokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ + -D KokkosKernels_ENABLE_DOCS=OFF - name: build working-directory: kokkos-kernels/build diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index b2227d0394..67c0cb3795 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -18,6 +18,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +env: + kokkos_version: 4.4.01 + jobs: check-pr-labels: runs-on: [ubuntu-latest] @@ -50,15 +53,15 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 
v4.2.2 with: repository: kokkos/kokkos - ref: 4.3.01 + ref: ${{ env.kokkos_version }} path: kokkos - name: configure_kokkos diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f21120e376..6289799a95 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -13,7 +13,7 @@ jobs: hashes: ${{ steps.hash.outputs.hashes }} runs-on: ubuntu-latest steps: - - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Build artifacts run: | git archive --prefix=kokkos-kernels-${{ github.ref_name }}/ -o kokkos-kernels-${{ github.ref_name }}.zip HEAD @@ -29,7 +29,7 @@ jobs: echo "hashes=$(base64 -w0 kokkos-kernels-${{ github.ref_name }}-SHA-256.txt)" >> "$GITHUB_OUTPUT" - name: Upload artifacts - uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: release-artifacts path: kokkos-kernels-${{ github.ref_name }}* @@ -63,7 +63,7 @@ jobs: with: name: release-artifacts - name: Upload assets - uses: softprops/action-gh-release@c062e08bd532815e2082a85e87e3ef29c3e6d191 # v2.0.8 + uses: softprops/action-gh-release@e7a8f85e1c67a31e6ed99a94b41bd0b71bbee6b8 # v2.0.9 with: files: | kokkos-kernels-${{ github.ref_name }}.zip diff --git a/.github/workflows/requirements_docs.txt b/.github/workflows/requirements_docs.txt new file mode 100644 index 0000000000..7764703a39 --- /dev/null +++ b/.github/workflows/requirements_docs.txt @@ -0,0 +1,23 @@ +sphinx==6.2.1 +alabaster==0.7.16 +babel==2.16.0 +docutils==0.19 +imagesize==1.4.1 +jinja2==3.1.4 +packaging==24.1 +pygments==2.18.0 +requests==2.32.3 +snowballstemmer==2.2.0 +sphinxcontrib_htmlhelp==2.1.0 +sphinxcontrib_serializinghtml==2.0.0 +sphinxcontrib_applehelp==2.0.0 +sphinxcontrib_devhelp==2.0.0 +sphinxcontrib_jsmath==1.0.1 +sphinxcontrib_qthelp==2.0.0 
+charset_normalizer==3.3.2 +idna==3.10 +MarkupSafe==2.1.5 +urllib3==2.2.3 +breathe==4.35.0 +sphinx_rtd_theme==3.0.0 +sphinxcontrib_jquery==4.1 diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index a48b1e6f89..d7c6bbffe9 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -33,12 +33,12 @@ jobs: steps: - name: Harden Runner - uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # v2.9.0 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 with: egress-policy: audit - name: "Checkout code" - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: persist-credentials: false @@ -65,7 +65,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # v4.3.4 + uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4.4.3 with: name: SARIF file path: results.sarif @@ -73,6 +73,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@afb54ba388a7dca6ecae48f608c4ff05ff4cc77a # v3.25.15 + uses: github/codeql-action/upload-sarif@662472033e021d55d94146f66f6058822b0b39fd # v3.27.0 with: sarif_file: results.sarif diff --git a/.github/workflows/spr.yml b/.github/workflows/spr.yml index 7f9136a699..265c460fd9 100644 --- a/.github/workflows/spr.yml +++ b/.github/workflows/spr.yml @@ -1,7 +1,16 @@ name: Reusable SPR workflow +permissions: + contents: none + on: - workflow_call + workflow_call: + inputs: + kokkos_version: + description: 'The Kokkos Core version to build' + default: '' + required: true + type: string jobs: PR_SPR_ONEAPI202310_OPENMP_LEFT_MKLBLAS_MKLLAPACK_REL: @@ -10,15 +19,15 @@ jobs: steps: - name: checkout_kokkos_kernels - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: path: kokkos-kernels - name: checkout_kokkos - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: repository: kokkos/kokkos - ref: ${{ github.base_ref }} + ref: ${{ inputs.kokkos_version }} path: kokkos - name: configure @@ -31,13 +40,16 @@ jobs: --arch=SPR \ --compiler=icpx \ --cxxflags="-fp-model=precise" \ + --with-scalars=double,complex_double,float,complex_float \ + --with-ordinals=int,int64_t \ + --with-offsets=int,size_t \ --with-tpls=mkl \ --kokkos-cmake-flags=-DKokkos_ENABLE_ONEDPL=OFF \ --kokkos-path=$PWD/../kokkos - name: build working-directory: build - run: make -j16 + run: make -j8 - name: test working-directory: build diff --git a/.github/workflows/volta70.yml b/.github/workflows/volta70.yml new file mode 100644 index 0000000000..a6f992df8c --- /dev/null +++ b/.github/workflows/volta70.yml @@ -0,0 +1,157 @@ +name: Reusable VOLTA70 workflow + +permissions: + contents: none + +on: + workflow_call: + inputs: + kokkos_version: + 
description: 'The Kokkos Core version to build' + default: '' + required: true + type: string + +jobs: + PR_VOLTA70_CUDA1122_CUDA_LEFT_RIGHT_REL: + name: PR_VOLTA70_CUDA1122_CUDA_LEFT_RIGHT_REL + runs-on: [kk-env-cuda-11.2.2-gcc-8.4.0-latest] + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: kokkos/kokkos + ref: ${{ inputs.kokkos_version }} + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + cmake -DCMAKE_CXX_COMPILER=$PWD/../bin/nvcc_wrapper \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ARCH_VOLTA70=ON \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_EXAMPLES=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j12 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + cmake \ + -DCMAKE_CXX_COMPILER=$PWD/../../kokkos/bin/nvcc_wrapper \ + -DKokkos_ROOT="$PWD/../../kokkos/install" \ + -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_PERFTESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT=ON \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkosKernels_ENABLE_DOCS=OFF \ + .. 
+ + - name: build_kokkos_kernels + working-directory: kokkos-kernels/build + run: make -j12 all + + - name: test + working-directory: kokkos-kernels/build + run: ctest --output-on-failure --timeout 3600 + + PR_VOLTA70_GCC930_CLANG13_CUDA10_CUDA_LEFT_OPENBLAS_REL: + name: PR_VOLTA70_GCC930_CLANG13_CUDA10_CUDA_LEFT_OPENBLAS_REL + runs-on: [kk-env-cuda-10.1.243-openblas-0.3.20-llvm-13.0.0-latest] + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + repository: kokkos/kokkos + ref: ${{ inputs.kokkos_version }} + path: kokkos + + - name: configure_kokkos + run: | + mkdir -p kokkos/{build,install} + cd kokkos/build + NVCC=$(which clang++) + cmake -DCMAKE_CXX_COMPILER=$NVCC \ + -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkos_ENABLE_CUDA=ON \ + -DKokkos_ARCH_VOLTA70=ON \ + -DKokkos_ENABLE_TESTS=OFF \ + -DKokkos_ENABLE_EXAMPLES=OFF \ + -DCMAKE_VERBOSE_MAKEFILE=ON \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DCMAKE_CXX_STANDARD=17 \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF \ + .. 
+ + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j12 install + + - name: configure_kokkos_kernels + run: | + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + NVCC=$(which clang++) + cmake \ + -DCMAKE_CXX_COMPILER=$NVCC \ + -DKokkos_ROOT="$PWD/../../kokkos/install" \ + -DCMAKE_CXX_FLAGS="-O3 -Wall -Wunused-parameter -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized" \ + -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_PERFTESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF \ + -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + -DKokkosKernels_INST_LAYOUTLEFT=ON \ + -DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF \ + -DKokkosKernels_ENABLE_TPL_BLAS=ON \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=ON \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=ON \ + -DCMAKE_EXE_LINKER_FLAGS="-lgfortran -lm" \ + -DBUILD_SHARED_LIBS=OFF \ + -DKokkosKernels_ENABLE_DOCS=OFF \ + .. 
+ + - name: build_kokkos_kernels + working-directory: kokkos-kernels/build + run: make -j12 all + + - name: test EXCLUDE serial\."asum|nrm1"_complex_double + working-directory: kokkos-kernels/build + run: ctest --output-on-failure --timeout 3600 -E ".*(asum|nrm1).*" \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 343f815ed7..58695228e4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,83 @@ # Change Log +## [4.5.00](https://github.com/kokkos/kokkos-kernels/tree/4.5.00) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.4.01...4.5.00) + +### New Features + +#### Batched updates +- Implement batched serial laswp [\#2395](https://github.com/kokkos/kokkos-kernels/pull/2395) +- implement batched serial iamax [\#2399](https://github.com/kokkos/kokkos-kernels/pull/2399) +- Implement batched serial pbtrs [\#2330](https://github.com/kokkos/kokkos-kernels/pull/2330) +- Implement batched serial pbtrf [\#2322](https://github.com/kokkos/kokkos-kernels/pull/2322) +- Implement batched serial pttrs [\#2277](https://github.com/kokkos/kokkos-kernels/pull/2277) + +#### BLAS +- gemm perf_test: print matrix sizes [\#2362](https://github.com/kokkos/kokkos-kernels/pull/2362) + +#### LAPACK +- Modify validity checks for output views sizes in svd [\#2350](https://github.com/kokkos/kokkos-kernels/pull/2350) + +#### ODE +- Improved convergence and robustness of Runge-Kutta integrators [\#2229](https://github.com/kokkos/kokkos-kernels/pull/2229) + +#### Sparse +- Don't use bulk sort in KokkosSparse::sort_crs_matrix sometimes [\#2353](https://github.com/kokkos/kokkos-kernels/pull/2353) +- `OpenMPSmartStatic_SPMV.hpp`: throw if posix_memalign fails [\#2368](https://github.com/kokkos/kokkos-kernels/pull/2368) + +### Enhancements: +- Eti extern marking [\#2292](https://github.com/kokkos/kokkos-kernels/pull/2292) + +#### Common utilities +- Add KokkosKernels::eager_initialize() to common [\#2317](https://github.com/kokkos/kokkos-kernels/pull/2317) 
+- Put default types in KokkosKernels namespace [\#2341](https://github.com/kokkos/kokkos-kernels/pull/2341) + +#### TPL support +- Add MAGMA TPL support for GESV on HIP backend [\#2326](https://github.com/kokkos/kokkos-kernels/pull/2326) +- BLAS - gemv: using fallback when mode is 't' or 'c' and onemkl is used [\#2272](https://github.com/kokkos/kokkos-kernels/pull/2272) + +### Bug Fixes: +- SerialInverseLU: fix overflow in integer multiplication [\#2410](https://github.com/kokkos/kokkos-kernels/pull/2410) +- Fix potential overflow issue in spiluk [\#2409](https://github.com/kokkos/kokkos-kernels/pull/2409) +- Mult result conversion [\#2405](https://github.com/kokkos/kokkos-kernels/pull/2405) +- Blas1 asum: workaround for openblas error with short vectors [\#2384](https://github.com/kokkos/kokkos-kernels/pull/2384) +- Set `KokkosKernels_ENABLE_COMPONENT` variables to value instead of variable name [\#2380](https://github.com/kokkos/kokkos-kernels/pull/2380) +- Block Sptrsv fixes [\#2376](https://github.com/kokkos/kokkos-kernels/pull/2376) +- Fix set-but-unused in Test_ODE_BDF [\#2355](https://github.com/kokkos/kokkos-kernels/pull/2355) +- sparse_sort_crs: fix column shuffle indices [\#2346](https://github.com/kokkos/kokkos-kernels/pull/2346) +- Fix #2344: SVD hanging [\#2345](https://github.com/kokkos/kokkos-kernels/pull/2345) +- Some compilers throw shadow warnings in static functions [\#2297](https://github.com/kokkos/kokkos-kernels/pull/2297) +- A couple platforms do not correctly handle static complexes [\#2285](https://github.com/kokkos/kokkos-kernels/pull/2285) +- Help gcc/8.3 with ctad issue [\#2265](https://github.com/kokkos/kokkos-kernels/pull/2265) + +### Deprecations and Cleanup: +- Clean and replace forbidden names for macros and symbols (see [identifiers](https://en.cppreference.com/w/cpp/language/identifiers)) + - Rename reserved identifiers [\#2373](https://github.com/kokkos/kokkos-kernels/pull/2373) + - search/replace KOKKOS_-prefixed macros 
[\#2372](https://github.com/kokkos/kokkos-kernels/pull/2372) + - Deprecate `__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__` [\#2406](https://github.com/kokkos/kokkos-kernels/pull/2406) + - Deprecate `__KOKKOSBATCHED_ENABLE_INTEL_MKL__` [\#2403](https://github.com/kokkos/kokkos-kernels/pull/2403) + - Deprecate `__KOKKOSBATCHED_PROMOTION__` [\#2392](https://github.com/kokkos/kokkos-kernels/pull/2392) +- Update atomic function usage ahead of Kokkos deprecation and removal + - Prefer `expected == atomic_compare_exchange(ptr, expected, desired)` [\#2387](https://github.com/kokkos/kokkos-kernels/pull/2387) + - Prefer `atomic_assign(ptr, val) -> atomic_store(ptr, val)` [\#2383](https://github.com/kokkos/kokkos-kernels/pull/2383) + - Replace atomic_{inc, dec}[rement] [\#2386](https://github.com/kokkos/kokkos-kernels/pull/2386) + - Do not specify template argument when using Kokkos atomics [\#2382](https://github.com/kokkos/kokkos-kernels/pull/2382) +- Deprecate redundant team-level sort functions [\#2306](https://github.com/kokkos/kokkos-kernels/pull/2306) +- Free allocated `MatrixPrec` [\#2407](https://github.com/kokkos/kokkos-kernels/pull/2407) +- Reduce duplicated code in trsv [\#2388](https://github.com/kokkos/kokkos-kernels/pull/2388) +- perf_tests: remove false dependence on google test [\#2385](https://github.com/kokkos/kokkos-kernels/pull/2385) +- `kk_is_gpu_exec_space()` -> `is_gpu_exec_space_v` [\#2354](https://github.com/kokkos/kokkos-kernels/pull/2354) +- remove unneeded volatile qualifier for Kokkos::Single [\#2333](https://github.com/kokkos/kokkos-kernels/pull/2333) + +### Documentation and Testing: + +- CI: `address` sanitizer and most of `undefined` sanitizer [\#2408](https://github.com/kokkos/kokkos-kernels/pull/2408) +- Workflow volta70 [\#2356](https://github.com/kokkos/kokkos-kernels/pull/2356) +- AT-2: adding non-TPL build for HIP backend [\#2329](https://github.com/kokkos/kokkos-kernels/pull/2329) +- Workflows: Add remaining spr and bdw checks 
[\#2321](https://github.com/kokkos/kokkos-kernels/pull/2321) +- Remove review trigger and group github-{BDW,H100,MI201} under github-AT2 [\#2320](https://github.com/kokkos/kokkos-kernels/pull/2320) +- Don't error out if graph unit tests disabled [\#2305](https://github.com/kokkos/kokkos-kernels/pull/2305) + ## [4.4.01](https://github.com/kokkos/kokkos-kernels/tree/4.4.01) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.4.00...4.4.01) diff --git a/CMakeLists.txt b/CMakeLists.txt index fd3515e0c4..c766cdf187 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,8 +10,8 @@ SET(KOKKOSKERNELS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) SET(KokkosKernels_VERSION_MAJOR 4) -SET(KokkosKernels_VERSION_MINOR 4) -SET(KokkosKernels_VERSION_PATCH 1) +SET(KokkosKernels_VERSION_MINOR 5) +SET(KokkosKernels_VERSION_PATCH 0) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") #Set variables for config file @@ -136,13 +136,13 @@ ELSE() SET(CMAKE_HIP_ARCHITECTURES ${Kokkos_HIP_ARCHITECTURES}) ENDIF() ENDIF() - IF(${Kokkos_VERSION} VERSION_GREATER_EQUAL "4.3.01") + IF(${Kokkos_VERSION} VERSION_GREATER_EQUAL "4.4.01") MESSAGE(STATUS "Found Kokkos version ${Kokkos_VERSION} at ${Kokkos_DIR}") - IF((${Kokkos_VERSION} VERSION_GREATER "4.4.99")) + IF((${Kokkos_VERSION} VERSION_GREATER "4.5.99")) MESSAGE(WARNING "Configuring with Kokkos ${Kokkos_VERSION} which is newer than the expected develop branch - version check may need update") ENDIF() ELSE() - MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires Kokkos 4.3.01 or greater (found ${Kokkos_VERSION})") + MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires Kokkos 4.4.01 or greater (found ${Kokkos_VERSION})") ENDIF() ENDIF() diff --git a/batched/KokkosBatched_Util.hpp b/batched/KokkosBatched_Util.hpp index 8a1cb0e01b..520427e8c6 100644 --- 
a/batched/KokkosBatched_Util.hpp +++ b/batched/KokkosBatched_Util.hpp @@ -13,13 +13,27 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_UTIL_HPP__ -#define __KOKKOSBATCHED_UTIL_HPP__ +#ifndef KOKKOSBATCHED_UTIL_HPP +#define KOKKOSBATCHED_UTIL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) // no experimental name space guard for trilinos -#define __KOKKOSBATCHED_PROMOTION__ 1 + +#if defined(KOKKOS_COMPILER_MSVC) +#define KOKKOSBATCHED_IMPL_PROMOTION \ + (__pragma(message("warning: __KOKKOSBATCHED_PROMOTION__ is deprecated and will be removed in a future version")) 1) +#elif defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) +#define KOKKOSBATCHED_IMPL_PROMOTION \ + (__extension__({ \ + _Pragma("GCC warning \"__KOKKOSBATCHED_PROMOTION__ is deprecated and will be removed in a future version\""); \ + 1; \ + })) +#else +#define KOKKOSBATCHED_IMPL_PROMOTION 1 // no good way to deprecate? +#endif + +#define __KOKKOSBATCHED_PROMOTION__ KOKKOSBATCHED_IMPL_PROMOTION #include #include @@ -42,10 +56,41 @@ // TPL macros #if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) -#define __KOKKOSBATCHED_ENABLE_INTEL_MKL__ 1 + +#if defined(KOKKOS_COMPILER_MSVC) +#define KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL \ + (__pragma( \ + message("warning: __KOKKOSBATCHED_ENABLE_INTEL_MKL__ is deprecated and will be removed in a future version")) 1) +#elif defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) +#define KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL \ + (__extension__({ \ + _Pragma("GCC warning \"__KOKKOSBATCHED_ENABLE_INTEL_MKL__ is deprecated and will be removed in a future version\""); \ + 1; \ + })) +#else +#define KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL 1 // no good way to deprecate? 
+#endif +#define __KOKKOSBATCHED_ENABLE_INTEL_MKL__ KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL + #include "mkl_version.h" #if __INTEL_MKL__ >= 2018 -#define __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ 1 + +#if defined(KOKKOS_COMPILER_MSVC) +#define KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED \ + (__pragma(message( \ + "warning: __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is deprecated and will be removed in a future version")) 1) +#elif defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) +#define KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED \ + (__extension__({ \ + _Pragma( \ + "GCC warning \"__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is deprecated and will be removed in a future version\""); \ + 1; \ + })) +#else +#define KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED 1 // no good way to deprecate? +#endif +#define __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED + #define __KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__ 1 #include "mkl.h" // #include "mkl_types.h" @@ -671,4 +716,4 @@ KOKKOS_INLINE_FUNCTION void fma_bounds_check(ViewType v, SizeType m, SizeType n, } } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_UTIL_HPP__ +#endif // KOKKOSBATCHED_UTIL_HPP diff --git a/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp b/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp index d89a82ae2c..bc8c0ab772 100644 --- a/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_AddRadial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_ADD_RADIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_ADD_RADIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_ADD_RADIAL_IMPL_HPP +#define KOKKOSBATCHED_ADD_RADIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp b/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp index 634879530e..7b8220dcfe 100644 --- 
a/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_AddRadial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_ADD_RADIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_ADD_RADIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_ADD_RADIAL_INTERNAL_HPP +#define KOKKOSBATCHED_ADD_RADIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp index 2d3d2af915..095488ac16 100644 --- a/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyGivens_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_GIVENS_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_APPLY_GIVENS_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_GIVENS_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_APPLY_GIVENS_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp index db85d96680..b39fa3e5ad 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp index 
e129fef5a5..574280c82b 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_APPLY_HOUSEHOLDER_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp index b322574ad0..dc5cb104dd 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp index 2474a10fe3..b3cfc8b6de 100644 --- a/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyHouseholder_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP +#define 
KOKKOSBATCHED_APPLY_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp index 10455f65b6..adbce23d9a 100644 --- a/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyPivot_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_PIVOT_IMPL_HPP__ -#define __KOKKOSBATCHED_APPLY_PIVOT_IMPL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_PIVOT_IMPL_HPP +#define KOKKOSBATCHED_APPLY_PIVOT_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp index a301382108..e013252255 100644 --- a/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyPivot_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_PIVOT_INTERNAL_HPP__ -#define __KOKKOSBATCHED_APPLY_PIVOT_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_PIVOT_INTERNAL_HPP +#define KOKKOSBATCHED_APPLY_PIVOT_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp index ba9d85350f..f1f2d29089 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_Q_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_APPLY_Q_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_Q_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_APPLY_Q_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git 
a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp index dbb11df747..d91ab8c6f6 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_Q_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_APPLY_Q_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_Q_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_APPLY_Q_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp index d6abd61a78..9437062765 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp index 8fc6c8a78a..5a2c074b54 100644 --- a/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ApplyQ_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_APPLY_Q_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git 
a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp index 6d65ebc294..381a53e0a4 100644 --- a/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Axpy_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_AXPY_IMPL_HPP__ -#define __KOKKOSBATCHED_AXPY_IMPL_HPP__ +#ifndef KOKKOSBATCHED_AXPY_IMPL_HPP +#define KOKKOSBATCHED_AXPY_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp index e11106cc24..0096c91e66 100644 --- a/batched/dense/impl/KokkosBatched_Copy_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Copy_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_COPY_IMPL_HPP__ -#define __KOKKOSBATCHED_COPY_IMPL_HPP__ +#ifndef KOKKOSBATCHED_COPY_IMPL_HPP +#define KOKKOSBATCHED_COPY_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Copy_Internal.hpp b/batched/dense/impl/KokkosBatched_Copy_Internal.hpp index 004c62646a..c25215be6d 100644 --- a/batched/dense/impl/KokkosBatched_Copy_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Copy_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_COPY_INTERNAL_HPP__ -#define __KOKKOSBATCHED_COPY_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_COPY_INTERNAL_HPP +#define KOKKOSBATCHED_COPY_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp index 48d1b1f1ac..75f366829c 100644 --- a/batched/dense/impl/KokkosBatched_Dot_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Dot_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // 
//@HEADER -#ifndef __KOKKOSBATCHED_DOT_INTERNAL_HPP__ -#define __KOKKOSBATCHED_DOT_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_DOT_INTERNAL_HPP +#define KOKKOSBATCHED_DOT_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp index 8ca3b09e59..b76ad99d7c 100644 --- a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp index b1cfb6ef25..62852f7872 100644 --- a/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Eigendecomposition_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_EIGENDECOMPOSITION_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) @@ -338,7 +338,7 @@ struct SerialEigendecompositionInternal { inline static int host_invoke(const int m, RealType* A, const int as0, const int as1, RealType* er, const int ers, RealType* ei, const int eis, RealType* UL, const int uls0, const int uls1, RealType* UR, const int urs0, const int urs1, RealType* w, const int 
wlen) { -#if defined(__KOKKOSBATCHED_ENABLE_LAPACKE__) || defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) +#if defined(__KOKKOSBATCHED_ENABLE_LAPACKE__) || defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) int matrix_layout(0), lda(0), uls(0), urs(0); if (as0 == 1) { assert(uls0 == 1 && "UL is not column major"); diff --git a/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp index 97f68d63de..7ffc9f7bc0 100644 --- a/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp index 567bbd3ad5..95e9d1149d 100644 --- a/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_EIGENDECOMPOSITION_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp index 0ac8ed3859..408ba2f2ad 
100644 --- a/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Eigenvalue_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_EIGENVALUE_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_EIGENVALUE_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_EIGENVALUE_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_EIGENVALUE_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp b/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp index 42dc948014..50bea3fd10 100644 --- a/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_FIND_AMAX_INTERNAL_HPP__ -#define __KOKKOSBATCHED_FIND_AMAX_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_FIND_AMAX_INTERNAL_HPP +#define KOKKOSBATCHED_FIND_AMAX_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp index e303cafd1f..9a3be4ab56 100644 --- a/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Francis_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_FRANCIS_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_FRANCIS_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_FRANCIS_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_FRANCIS_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp index 82d6b1641b..fae44c8f83 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp +++ 
b/batched/dense/impl/KokkosBatched_Gemm_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_GEMM_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GEMM_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_GEMM_SERIAL_IMPL_HPP #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Gemm_Serial_Internal.hpp" @@ -36,7 +36,7 @@ namespace KokkosBatched { /// NT/NT /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> template @@ -95,7 +95,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemm template @@ -154,7 +154,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemm template @@ -213,7 +213,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemm template diff --git a/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp index eaa5b67ffa..1a83a27112 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_GEMM_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_GEMM_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_GEMM_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp index 64e65d62d8..3818cc4258 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef 
__KOKKOSBATCHED_GEMM_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_GEMM_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GEMM_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_GEMM_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp index 8ad7d570df..1b46270f5a 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_GEMM_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_GEMM_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_GEMM_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp index 0a9fb87b9e..32c28b4562 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_GEMM_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GEMM_TEAM_IMPL_HPP +#define KOKKOSBATCHED_GEMM_TEAM_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp index 1b77a25991..b8647f5205 100644 --- a/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_TEAM_INTERNAL_HPP__ -#define __KOKKOSBATCHED_GEMM_TEAM_INTERNAL_HPP__ +#ifndef 
KOKKOSBATCHED_GEMM_TEAM_INTERNAL_HPP +#define KOKKOSBATCHED_GEMM_TEAM_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) @@ -115,7 +115,7 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( Kokkos::parallel_for(Kokkos::TeamThreadRange(member, mq * nq), [&](const int &ij) { int i, j; // note: the condition is constexpr - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { i = ij % mq * mb; j = ij / mq * nb; } else { diff --git a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp index 4f54bf7f31..9245295113 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMV_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_GEMV_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GEMV_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_GEMV_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp index 8d9676b223..a10bfbaae1 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMV_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_GEMV_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_GEMV_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_GEMV_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp index 16f12529d4..028e52b859 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp +++ 
b/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMV_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_GEMV_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GEMV_TEAM_IMPL_HPP +#define KOKKOSBATCHED_GEMV_TEAM_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp index 8f63e24b27..8b534c9081 100644 --- a/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMV_TEAM_INTERNAL_HPP__ -#define __KOKKOSBATCHED_GEMV_TEAM_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_GEMV_TEAM_INTERNAL_HPP +#define KOKKOSBATCHED_GEMV_TEAM_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index ba18cbafd7..399eeea127 100644 --- a/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GESV_IMPL_HPP__ -#define __KOKKOSBATCHED_GESV_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GESV_IMPL_HPP +#define KOKKOSBATCHED_GESV_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp index 963862661b..2fb22d275d 100644 --- a/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Givens_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GIVENS_SERIAL_INTERNAL_HPP__ -#define 
__KOKKOSBATCHED_GIVENS_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_GIVENS_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_GIVENS_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp index 658acd6b60..4cd3a27eeb 100644 --- a/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HadamardProduct_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HADAMARDPRODUCT_IMPL_HPP__ -#define __KOKKOSBATCHED_HADAMARDPRODUCT_IMPL_HPP__ +#ifndef KOKKOSBATCHED_HADAMARDPRODUCT_IMPL_HPP +#define KOKKOSBATCHED_HADAMARDPRODUCT_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp index 8db5d40a98..a72c8c0077 100644 --- a/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HESSENBERG_FORM_Q_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_HESSENBERG_FORM_Q_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_HESSENBERG_FORM_Q_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_HESSENBERG_FORM_Q_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp index 3815a9e18e..92498c0ce6 100644 --- a/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_HessenbergQR_WithShift_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // 
//@HEADER -#ifndef __KOKKOSBATCHED_HESSENBERG_QR_WITH_SHIFT_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_HESSENBERG_QR_WITH_SHIFT_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_HESSENBERG_QR_WITH_SHIFT_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_HESSENBERG_QR_WITH_SHIFT_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp index 44c5b44373..99ab0171b6 100644 --- a/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Hessenberg_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HESSENBERG_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_HESSENBERG_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_HESSENBERG_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_HESSENBERG_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp index 7e814646a2..8191ac2997 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Armpl_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_ARMPL_IMPL_HPP__ -#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_ARMPL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_HOSTLEVEL_GEMM_ARMPL_IMPL_HPP +#define KOKKOSBATCHED_HOSTLEVEL_GEMM_ARMPL_IMPL_HPP #if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 #include "KokkosBatched_Util.hpp" #include "KokkosKernels_Error.hpp" diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp index 6888de725d..2e63da90e7 100644 --- 
a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_DblBuf_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_DBLBUF_IMPL_HPP__ -#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_DBLBUF_IMPL_HPP__ +#ifndef KOKKOSBATCHED_HOSTLEVEL_GEMM_DBLBUF_IMPL_HPP +#define KOKKOSBATCHED_HOSTLEVEL_GEMM_DBLBUF_IMPL_HPP #include "KokkosBatched_Util.hpp" #include "KokkosKernels_Error.hpp" @@ -249,8 +249,7 @@ class BatchedDblBufGemm { CViewType __C; ScalarType __alpha, __beta; int __k; - size_t __n_sub_tiles; - unsigned __tiles_per_col, __tiles_per_row; + size_t __n_sub_tiles, __tiles_per_col, __tiles_per_row; public: size_t get_n_sub_tiles() { return __n_sub_tiles; } @@ -283,8 +282,8 @@ class BatchedDblBufGemm { // with '!!'. This extra tile will hang off the edge of the 2-rank matrix. // For cases where tiles hang off the edge, we over-compute 0s within // registers via a conditional bounds check selected at compile-time. 
- __tiles_per_row = ei.__c_m / TILE_M + !!((unsigned)ei.__c_m % TILE_M); - __tiles_per_col = ei.__c_n / TILE_N + !!((unsigned)ei.__c_n % TILE_N); + __tiles_per_row = ei.__c_m / TILE_M + !!((size_t)ei.__c_m % TILE_M); + __tiles_per_col = ei.__c_n / TILE_N + !!((size_t)ei.__c_n % TILE_N); __n_sub_tiles = __tiles_per_row * __tiles_per_col; } diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index 6216aeb099..a248f4e14c 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP__ -#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP +#define KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP #include #include // Trans, BatchLayout #include @@ -128,7 +128,7 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, using layout_type = typename CViewType::array_layout; using exec_space = typename CViewType::execution_space; constexpr bool is_vector = KokkosBatched::is_vector::value; - constexpr bool on_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool on_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; constexpr bool on_x86_64 = KokkosKernels::Impl::kk_is_x86_64_mem_space(); constexpr bool on_a64fx = KokkosKernels::Impl::kk_is_a64fx_mem_space(); bool out_of_range = false; @@ -277,4 +277,4 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, } } // namespace Impl } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP__ +#endif // KOKKOSBATCHED_HOSTLEVEL_GEMM_IMPL_HPP diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp index 8da3c7acd1..c0d48da2f3 100644 --- 
a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_HOSTLEVEL_GEMM_SERIAL_IMPL_HPP #include "KokkosBatched_Gemm_Decl.hpp" namespace KokkosBatched { diff --git a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp index 6f06694f09..3b312533aa 100644 --- a/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp +++ b/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Spec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ -#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ +#ifndef KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP +#define KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP #include #include @@ -165,6 +165,72 @@ struct BatchedGemmSpec { } // namespace Impl } // namespace KokkosBatched +// ETI instantiation macros, consumed by *.cpp.in files +#define KOKKOSBATCHED_GEMM_ETI_SPEC_DECL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + extern template struct BatchedGemmSpec, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; + +#if defined(KOKKOSKERNELS_DECL_LAYOUTRIGHT) +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_DECL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_DECL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutRight, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_DECL_INNER(ARG_TRANS_A, ARG_TRANS_B, 
ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) +#endif + +#if defined(KOKKOSKERNELS_DECL_LAYOUTLEFT) +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_DECL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_ETI_SPEC_DECL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, Kokkos::LayoutLeft, \ + EXEC_SPACE, MEM_SPACE) +#else +#define KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_DECL_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) +#endif + +///////////////// BatchLayout::Left Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_DECL_INNER(Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Left, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_DECL_INNER(Trans::NoTranspose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_DECL_INNER(Trans::Transpose, Trans::NoTranspose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLL_ETI_SPEC_DECL_INNER(Trans::Transpose, Trans::Transpose, BatchLayout::Left, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +///////////////// BatchLayout::Right Permutations ///////////////// +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_DECL_INNER(Trans::NoTranspose, Trans::NoTranspose, BatchLayout::Right, SCALAR, \ + LAYOUT, EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + 
KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_DECL_INNER(Trans::NoTranspose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_DECL_INNER(Trans::Transpose, Trans::NoTranspose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_DECL_INNER(Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ + EXEC_SPACE, MEM_SPACE) + // ETI instantiation macros, consumed by *.cpp.in files #define KOKKOSBATCHED_GEMM_ETI_SPEC_INST_INNER(ARG_TRANS_A, ARG_TRANS_B, ARG_BATCH_LAYOUT, SCALAR, LAYOUT, EXEC_SPACE, \ MEM_SPACE) \ @@ -230,4 +296,15 @@ struct BatchedGemmSpec { #define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ KOKKOSBATCHED_GEMM_BLR_ETI_SPEC_INST_INNER(Trans::Transpose, Trans::Transpose, BatchLayout::Right, SCALAR, LAYOUT, \ EXEC_SPACE, MEM_SPACE) -#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP__ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#endif // KOKKOSBATCHED_HOSTLEVEL_GEMM_SPEC_HPP diff --git a/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp index c8f5c7a20e..e4274d3352 100644 --- a/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_HOUSEHOLDER_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_HOUSEHOLDER_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git 
a/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp index 0257ff4d9b..15e20a5e57 100644 --- a/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_HOUSEHOLDER_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_HOUSEHOLDER_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_HOUSEHOLDER_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp index bc55a646bc..3bcde1964c 100644 --- a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp index 1074dc4280..efd046e232 100644 --- a/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Householder_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP +#define 
KOKKOSBATCHED_HOUSEHOLDER_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Iamax_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Iamax_Serial_Impl.hpp new file mode 100644 index 0000000000..9c0f99028b --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Iamax_Serial_Impl.hpp @@ -0,0 +1,37 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_IAMAX_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_IAMAX_SERIAL_IMPL_HPP_ + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +#include "KokkosBatched_Iamax_Serial_Internal.hpp" + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION typename XViewType::size_type SerialIamax::invoke(const XViewType &x) { + static_assert(Kokkos::is_view_v, "KokkosBatched::iamax: XViewType is not a Kokkos::View."); + if (x.extent(0) <= 1) return 0; + using size_type = typename XViewType::size_type; + using value_type = typename XViewType::non_const_value_type; + return KokkosBatched::Impl::SerialIamaxInternal::invoke(x.extent(0), x.data(), x.stride(0)); +} + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_IAMAX_SERIAL_IMPL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Iamax_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Iamax_Serial_Internal.hpp new file mode 100644 index 0000000000..89aed299ae --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Iamax_Serial_Internal.hpp @@ -0,0 +1,60 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_IAMAX_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_IAMAX_SERIAL_INTERNAL_HPP_ + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +#include +#include "KokkosBatched_Util.hpp" + +namespace KokkosBatched { +namespace Impl { + +/// +/// Serial Internal Impl +/// ======================== + +struct SerialIamaxInternal { + template + KOKKOS_INLINE_FUNCTION static IndexType invoke(const int n, const ValueType *KOKKOS_RESTRICT x, const int xs0); +}; + +template +KOKKOS_INLINE_FUNCTION IndexType SerialIamaxInternal::invoke(const int n, const ValueType *KOKKOS_RESTRICT x, + const int xs0) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + + RealType amax = Kokkos::abs(x[0 * xs0]); + IndexType imax = 0; + + for (IndexType i = 1; i < static_cast(n); ++i) { + const RealType abs_x_i = Kokkos::abs(x[i * xs0]); + if (abs_x_i > amax) { + amax = abs_x_i; + imax = i; + } + } + + return imax; +}; + +} // namespace Impl +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_IAMAX_SERIAL_INTERNAL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp index eb576f1dff..771cd8ade9 100644 --- a/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerGemmFixA_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER 
-#ifndef __KOKKOSBATCHED_INNER_GEMM_FIX_A_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_INNER_GEMM_FIX_A_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_INNER_GEMM_FIX_A_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_INNER_GEMM_FIX_A_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp index 6912c285a6..8a5f17eaea 100644 --- a/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerGemmFixB_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_GEMM_FIX_B_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_INNER_GEMM_FIX_B_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_INNER_GEMM_FIX_B_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_INNER_GEMM_FIX_B_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp index 9ad08549cb..e090ce57bd 100644 --- a/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerGemmFixC_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_GEMM_FIX_C_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_INNER_GEMM_FIX_C_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_INNER_GEMM_FIX_C_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_INNER_GEMM_FIX_C_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp index a3d6dece58..ae552ddb91 100644 --- a/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerGemmFixC_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: 
Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_GEMM_FIX_C_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_INNER_GEMM_FIX_C_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_INNER_GEMM_FIX_C_TEAM_IMPL_HPP +#define KOKKOSBATCHED_INNER_GEMM_FIX_C_TEAM_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp index 0d74598b24..fb8409afab 100644 --- a/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerLU_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_LU_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_INNER_LU_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_INNER_LU_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_INNER_LU_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp index 04825ac61c..35062c8db7 100644 --- a/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InnerTrsm_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_TRSM_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_INNER_TRSM_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_INNER_TRSM_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_INNER_TRSM_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp index 215c62e9f2..216b5f9652 100644 --- a/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_InverseLU_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef 
__KOKKOSBATCHED_INVERSELU_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_INVERSELU_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_INVERSELU_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_INVERSELU_SERIAL_IMPL_HPP /// \author Vinh Dang (vqdang@sandia.gov) @@ -32,7 +32,7 @@ namespace KokkosBatched { /// InverseLU no piv /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> template diff --git a/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp index e2acd012cb..b25b9bbc2e 100644 --- a/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_LU_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_LU_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_LU_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_LU_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) @@ -31,7 +31,7 @@ namespace KokkosBatched { /// SerialLU no piv /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template <> template diff --git a/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp index 6555a16d93..52002ad473 100644 --- a/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_LU_SERIAL_INTERNAL_HPP__ -#define 
__KOKKOSBATCHED_LU_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_LU_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_LU_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp index 9ed5e244d2..4bcfdbe29c 100644 --- a/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_LU_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_LU_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_LU_TEAM_IMPL_HPP +#define KOKKOSBATCHED_LU_TEAM_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp index dacfb02ed4..75516d6f6a 100644 --- a/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_LU_Team_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_LU_TEAM_INTERNAL_HPP__ -#define __KOKKOSBATCHED_LU_TEAM_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_LU_TEAM_INTERNAL_HPP +#define KOKKOSBATCHED_LU_TEAM_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Laswp_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Laswp_Serial_Impl.hpp new file mode 100644 index 0000000000..445251a647 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Laswp_Serial_Impl.hpp @@ -0,0 +1,104 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_LASWP_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_LASWP_SERIAL_IMPL_HPP_ + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Laswp_Serial_Internal.hpp" + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { +namespace Impl { + +template +KOKKOS_INLINE_FUNCTION static int checkLaswpInput(const PivViewType &piv, const AViewType &A) { + static_assert(Kokkos::is_view_v, "KokkosBatched::laswp: PivViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view_v, "KokkosBatched::laswp: AViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 1 || AViewType::rank == 2, "KokkosBatched::laswp: AViewType must have rank 1 or 2."); + static_assert(PivViewType::rank == 1, "KokkosBatched::laswp: PivViewType must have rank 1."); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + const int npiv = piv.extent(0); + const int lda = A.extent(0); + if (npiv > lda) { + Kokkos::printf( + "KokkosBatched::laswp: the dimension of the ipiv array must " + "satisfy ipiv.extent(0) <= A.extent(0): ipiv: %d, A: " + "%d \n", + npiv, lda); + return 1; + } +#endif + return 0; +} +} // namespace Impl + +/// +/// Serial Internal Impl +/// ======================== + +/// +//// Forward pivot apply +/// + +template <> +struct SerialLaswp { + template + KOKKOS_INLINE_FUNCTION static int invoke(const PivViewType &piv, const AViewType &A) { + auto info = KokkosBatched::Impl::checkLaswpInput(piv, A); + if (info) return info; + + if constexpr (AViewType::rank == 1) { + const int plen = piv.extent(0), ps0 = piv.stride(0), as0 = A.stride(0); + return KokkosBatched::Impl::SerialLaswpVectorForwardInternal::invoke(plen, piv.data(), ps0, A.data(), as0); + } else if constexpr (AViewType::rank == 2) { + // row permutation + const int plen = 
piv.extent(0), ps0 = piv.stride(0), n = A.extent(1), as0 = A.stride(0), as1 = A.stride(1); + return KokkosBatched::Impl::SerialLaswpMatrixForwardInternal::invoke(n, plen, piv.data(), ps0, A.data(), as0, + as1); + } + return 0; + } +}; + +/// +/// Backward pivot apply +/// + +template <> +struct SerialLaswp { + template + KOKKOS_INLINE_FUNCTION static int invoke(const PivViewType piv, const AViewType &A) { + auto info = KokkosBatched::Impl::checkLaswpInput(piv, A); + if (info) return info; + + if constexpr (AViewType::rank == 1) { + const int plen = piv.extent(0), ps0 = piv.stride(0), as0 = A.stride(0); + return KokkosBatched::Impl::SerialLaswpVectorBackwardInternal::invoke(plen, piv.data(), ps0, A.data(), as0); + } else if constexpr (AViewType::rank == 2) { + // row permutation + const int plen = piv.extent(0), ps0 = piv.stride(0), n = A.extent(1), as0 = A.stride(0), as1 = A.stride(1); + return KokkosBatched::Impl::SerialLaswpMatrixBackwardInternal::invoke(n, plen, piv.data(), ps0, A.data(), as0, + as1); + } + return 0; + } +}; +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_LASWP_SERIAL_IMPL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp new file mode 100644 index 0000000000..dc49f367b1 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Laswp_Serial_Internal.hpp @@ -0,0 +1,150 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_LASWP_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_LASWP_SERIAL_INTERNAL_HPP_ + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +#include "KokkosBatched_Util.hpp" + +namespace KokkosBatched { +namespace Impl { + +/// +/// Serial Internal Impl +/// ======================== + +/// +//// Forward pivot apply +/// + +struct SerialLaswpVectorForwardInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int plen, const IntType *KOKKOS_RESTRICT p, const int ps0, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { + for (int i = 0; i < plen; ++i) { + const int piv = p[i * ps0]; + if (piv != i) { + const int idx_i = i * as0, idx_p = piv * as0; + const ValueType tmp = A[idx_i]; + A[idx_i] = A[idx_p]; + A[idx_p] = tmp; + } + } + return 0; + } +}; + +struct SerialLaswpMatrixForwardInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int n, const int plen, const IntType *KOKKOS_RESTRICT p, const int ps0, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { + if (as0 <= as1) { + // LayoutLeft like + for (int j = 0; j < n; j++) { + ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; + for (int i = 0; i < plen; ++i) { + const int piv = p[i * ps0]; + if (piv != i) { + const int idx_i = i * as0, idx_p = piv * as0; + const ValueType tmp = A_at_j[idx_i]; + A_at_j[idx_i] = A_at_j[idx_p]; + A_at_j[idx_p] = tmp; + } + } + } + } else { + // LayoutRight like + for (int i = 0; i < plen; ++i) { + const int piv = p[i * ps0]; + if (piv != i) { + const int idx_i = i * as0, idx_p = piv * as0; + for (int j = 0; j < n; j++) { + ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; + const ValueType tmp = A_at_j[idx_i]; + A_at_j[idx_i] = A_at_j[idx_p]; + A_at_j[idx_p] = tmp; + } + } + } + } + return 0; + } +}; + +/// +/// Backward pivot apply +/// + +struct SerialLaswpVectorBackwardInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const 
int plen, const IntType *KOKKOS_RESTRICT p, const int ps0, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0) { + for (int i = (plen - 1); i >= 0; --i) { + const int piv = p[i * ps0]; + if (piv != i) { + const int idx_i = i * as0, idx_p = piv * as0; + const ValueType tmp = A[idx_i]; + A[idx_i] = A[idx_p]; + A[idx_p] = tmp; + } + } + return 0; + } +}; + +struct SerialLaswpMatrixBackwardInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int n, const int plen, const IntType *KOKKOS_RESTRICT p, const int ps0, + /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { + if (as0 <= as1) { + // LayoutLeft like + for (int j = 0; j < n; j++) { + ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; + for (int i = (plen - 1); i >= 0; --i) { + const int piv = p[i * ps0]; + if (piv != i) { + const int idx_i = i * as0, idx_p = piv * as0; + const ValueType tmp = A_at_j[idx_i]; + A_at_j[idx_i] = A_at_j[idx_p]; + A_at_j[idx_p] = tmp; + } + } + } + } else { + // LayoutRight like + for (int i = (plen - 1); i >= 0; --i) { + const int piv = p[i * ps0]; + if (piv != i) { + const int idx_i = i * as0, idx_p = piv * as0; + for (int j = 0; j < n; j++) { + ValueType *KOKKOS_RESTRICT A_at_j = A + j * as1; + const ValueType tmp = A_at_j[idx_i]; + A_at_j[idx_i] = A_at_j[idx_p]; + A_at_j[idx_p] = tmp; + } + } + } + } + return 0; + } +}; + +} // namespace Impl +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_LASWP_SERIAL_INTERNAL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp index c266d65c54..159317d438 100644 --- a/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_LeftEigenvectorFromSchur_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef 
__KOKKOSBATCHED_LEFT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_LEFT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_LEFT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_LEFT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp b/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp index af6832940b..e1e19c0194 100644 --- a/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Normalize_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_NORMALIZE_INTERNAL_HPP__ -#define __KOKKOSBATCHED_NORMALIZE_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_NORMALIZE_INTERNAL_HPP +#define KOKKOSBATCHED_NORMALIZE_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Impl.hpp new file mode 100644 index 0000000000..1687d82333 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Impl.hpp @@ -0,0 +1,81 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PBTRF_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_PBTRF_SERIAL_IMPL_HPP_ + +#include +#include "KokkosBatched_Pbtrf_Serial_Internal.hpp" + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION static int checkPbtrfInput([[maybe_unused]] const ABViewType &Ab) { + static_assert(Kokkos::is_view_v, "KokkosBatched::pbtrf: ABViewType is not a Kokkos::View."); + static_assert(ABViewType::rank == 2, "KokkosBatched::pbtrf: ABViewType must have rank 2."); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + const int kd = Ab.extent(0) - 1; + if (kd < 0) { + Kokkos::printf( + "KokkosBatched::pbtrf: input parameter kd must not be less than 0: kd " + "= " + "%d\n", + kd); + return 1; + } +#endif + return 0; +} + +//// Lower //// +template <> +struct SerialPbtrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &Ab) { + // Quick return if possible + const int n = Ab.extent(1); + if (n == 0) return 0; + + auto info = checkPbtrfInput(Ab); + if (info) return info; + + const int kd = Ab.extent(0) - 1; + return SerialPbtrfInternalLower::invoke(n, Ab.data(), Ab.stride_0(), Ab.stride_1(), kd); + } +}; + +//// Upper //// +template <> +struct SerialPbtrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &Ab) { + // Quick return if possible + const int n = Ab.extent(1); + if (n == 0) return 0; + + auto info = checkPbtrfInput(Ab); + if (info) return info; + + const int kd = Ab.extent(0) - 1; + return SerialPbtrfInternalUpper::invoke(n, Ab.data(), Ab.stride_0(), Ab.stride_1(), kd); + } +}; + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PBTRF_SERIAL_IMPL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Internal.hpp new file mode 100644 index 0000000000..0a4ed7d697 --- /dev/null +++ 
b/batched/dense/impl/KokkosBatched_Pbtrf_Serial_Internal.hpp @@ -0,0 +1,272 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_PBTRF_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_PBTRF_SERIAL_INTERNAL_HPP_ + +#include "KokkosBatched_Util.hpp" +#include "KokkosBlas1_serial_scal_impl.hpp" + +namespace KokkosBatched { + +/// +/// Serial Internal Impl +/// ==================== + +/// +/// Lower +/// + +template +struct SerialPbtrfInternalLower { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, + /**/ ValueType *KOKKOS_RESTRICT AB, const int as0, const int as1, + const int kd); + + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, + /**/ Kokkos::complex *KOKKOS_RESTRICT AB, const int as0, + const int as1, const int kd); +}; + +/// +/// Real matrix +/// + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrfInternalLower::invoke(const int an, + /**/ ValueType *KOKKOS_RESTRICT AB, + const int as0, const int as1, + const int kd) { + // Compute the Cholesky factorization A = L*L'. + for (int j = 0; j < an; ++j) { + auto a_jj = AB[0 * as0 + j * as1]; + + // Check if L (j, j) is positive definite +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (a_jj <= 0) { + return j + 1; + } +#endif + + a_jj = Kokkos::sqrt(a_jj); + AB[0 * as0 + j * as1] = a_jj; + + // Compute elements J+1:J+KN of column J and update the + // trailing submatrix within the band. 
+ int kn = Kokkos::min(an - j - 1, kd); + if (kn > 0) { + // scale to diagonal elements + const ValueType alpha = 1.0 / a_jj; + KokkosBlas::Impl::SerialScaleInternal::invoke(kn, alpha, &(AB[1 * as0 + j * as1]), 1); + + // syr (lower) with alpha = -1.0 to diagonal elements + for (int k = 0; k < kn; ++k) { + auto x_k = AB[(k + 1) * as0 + j * as1]; + if (x_k != 0) { + auto temp = -1.0 * x_k; + for (int i = k; i < kn; ++i) { + auto x_i = AB[(i + 1) * as0 + j * as1]; + AB[i * as0 + (j + 1 + k - i) * as1] += x_i * temp; + } + } + } + } + } + + return 0; +} + +/// +/// Complex matrix +/// +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrfInternalLower::invoke( + const int an, + /**/ Kokkos::complex *KOKKOS_RESTRICT AB, const int as0, const int as1, const int kd) { + // Compute the Cholesky factorization A = L*L**H + for (int j = 0; j < an; ++j) { + auto a_jj = AB[0 * as0 + j * as1].real(); + + // Check if L (j, j) is positive definite +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (a_jj <= 0) { + AB[0 * as0 + j * as1] = a_jj; + return j + 1; + } +#endif + + a_jj = Kokkos::sqrt(a_jj); + AB[0 * as0 + j * as1] = a_jj; + + // Compute elements J+1:J+KN of column J and update the + // trailing submatrix within the band. 
+ int kn = Kokkos::min(kd, an - j - 1); + if (kn > 0) { + // scale to diagonal elements + const ValueType alpha = 1.0 / a_jj; + KokkosBlas::Impl::SerialScaleInternal::invoke(kn, alpha, &(AB[1 * as0 + j * as1]), 1); + + // zher (lower) with alpha = -1.0 to diagonal elements + for (int k = 0; k < kn; ++k) { + auto x_k = AB[(k + 1) * as0 + j * as1]; + if (x_k != 0) { + auto temp = -1.0 * Kokkos::conj(x_k); + AB[k * as0 + (j + 1) * as1] = AB[k * as0 + (j + 1) * as1].real() + (temp * x_k).real(); + for (int i = k + 1; i < kn; ++i) { + auto x_i = AB[(i + 1) * as0 + j * as1]; + AB[i * as0 + (j + 1 + k - i) * as1] += x_i * temp; + } + } else { + AB[k * as0 + (j + 1) * as1] = AB[k * as0 + (j + 1) * as1].real(); + } + } + } + } + + return 0; +} + +/// +/// Upper +/// + +template +struct SerialPbtrfInternalUpper { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, + /**/ ValueType *KOKKOS_RESTRICT AB, const int as0, const int as1, + const int kd); + + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, + /**/ Kokkos::complex *KOKKOS_RESTRICT AB, const int as0, + const int as1, const int kd); +}; + +/// +/// Real matrix +/// +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrfInternalUpper::invoke(const int an, + /**/ ValueType *KOKKOS_RESTRICT AB, + const int as0, const int as1, + const int kd) { + // Compute the Cholesky factorization A = U'*U. + for (int j = 0; j < an; ++j) { + auto a_jj = AB[kd * as0 + j * as1]; + + // Check if U (j,j) is positive definite +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (a_jj <= 0) { + return j + 1; + } +#endif + a_jj = Kokkos::sqrt(a_jj); + AB[kd * as0 + j * as1] = a_jj; + + // Compute elements J+1:J+KN of row J and update the + // trailing submatrix within the band. 
+ int kn = Kokkos::min(kd, an - j - 1); + int kld = Kokkos::max(1, as0 - 1); + if (kn > 0) { + // scale to diagonal elements + const ValueType alpha = 1.0 / a_jj; + KokkosBlas::Impl::SerialScaleInternal::invoke(kn, alpha, &(AB[(kd - 1) * as0 + (j + 1) * as1]), kld); + + // syr (upper) with alpha = -1.0 to diagonal elements + for (int k = 0; k < kn; ++k) { + auto x_k = AB[(k + kd - 1) * as0 + (j + 1 - k) * as1]; + if (x_k != 0) { + auto temp = -1.0 * x_k; + for (int i = 0; i < k + 1; ++i) { + auto x_i = AB[(i + kd - 1) * as0 + (j + 1 - i) * as1]; + AB[(kd + i) * as0 + (j + 1 + k - i) * as1] += x_i * temp; + } + } + } + } + } + + return 0; +} + +/// +/// Complex matrix +/// +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrfInternalUpper::invoke( + const int an, + /**/ Kokkos::complex *KOKKOS_RESTRICT AB, const int as0, const int as1, const int kd) { + // Compute the Cholesky factorization A = U**H * U. + for (int j = 0; j < an; ++j) { + auto a_jj = AB[kd * as0 + j * as1].real(); + + // Check if U (j,j) is positive definite +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + if (a_jj <= 0) { + AB[kd * as0 + j * as1] = a_jj; + return j + 1; + } +#endif + + a_jj = Kokkos::sqrt(a_jj); + AB[kd * as0 + j * as1] = a_jj; + + // Compute elements J+1:J+KN of row J and update the + // trailing submatrix within the band. 
+ int kn = Kokkos::min(kd, an - j - 1); + int kld = Kokkos::max(1, as0 - 1); + if (kn > 0) { + // scale to diagonal elements + const ValueType alpha = 1.0 / a_jj; + KokkosBlas::Impl::SerialScaleInternal::invoke(kn, alpha, &(AB[(kd - 1) * as0 + (j + 1) * as1]), kld); + + // zlacgv to diagonal elements + for (int i = 0; i < kn; ++i) { + AB[(i + kd - 1) * as0 + (j + 1 - i) * as1] = Kokkos::conj(AB[(i + kd - 1) * as0 + (j + 1 - i) * as1]); + } + + // zher (upper) with alpha = -1.0 to diagonal elements + for (int k = 0; k < kn; ++k) { + auto x_k = AB[(k + kd - 1) * as0 + (j + 1 - k) * as1]; + if (x_k != 0) { + auto temp = -1.0 * Kokkos::conj(x_k); + for (int i = 0; i < k + 1; ++i) { + auto x_i = AB[(i + kd - 1) * as0 + (j + 1 - i) * as1]; + AB[(kd + i) * as0 + (j + 1 + k - i) * as1] += x_i * temp; + } + } + } + + // zlacgv to diagonal elements + for (int i = 0; i < kn; ++i) { + AB[(i + kd - 1) * as0 + (j + 1 - i) * as1] = Kokkos::conj(AB[(i + kd - 1) * as0 + (j + 1 - i) * as1]); + } + } + } + + return 0; +} + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PBTRF_SERIAL_INTERNAL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Pbtrs_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Pbtrs_Serial_Impl.hpp new file mode 100644 index 0000000000..931e878054 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Pbtrs_Serial_Impl.hpp @@ -0,0 +1,95 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PBTRS_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_PBTRS_SERIAL_IMPL_HPP_ + +#include +#include "KokkosBatched_Pbtrs_Serial_Internal.hpp" + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { +namespace Impl { + +template +KOKKOS_INLINE_FUNCTION static int checkPbtrsInput([[maybe_unused]] const AViewType &A, + [[maybe_unused]] const XViewType &x) { + static_assert(Kokkos::is_view_v, "KokkosBatched::pbtrs: AViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view_v, "KokkosBatched::pbtrs: XViewType is not a Kokkos::View."); + static_assert(AViewType::rank == 2, "KokkosBatched::pbtrs: AViewType must have rank 2."); + static_assert(XViewType::rank == 1, "KokkosBatched::pbtrs: XViewType must have rank 1."); + +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + const int ldb = x.extent(0); + const int lda = A.extent(0), n = A.extent(1); + const int kd = lda - 1; + if (kd < 0) { + Kokkos::printf( + "KokkosBatched::pbtrs: leading dimension of A must not be less than 1: %d, A: " + "%d " + "x %d \n", + lda, n); + return 1; + } + if (ldb < Kokkos::max(1, n)) { + Kokkos::printf( + "KokkosBatched::pbtrs: Dimensions of x and A do not match: x: %d, A: " + "%d " + "x %d \n" + "x.extent(0) must be larger or equal to A.extent(1) \n", + ldb, lda, n); + return 1; + } +#endif + return 0; +} +} // namespace Impl + +//// Lower //// +template <> +struct SerialPbtrs { + template + KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x) { + // Quick return if possible + if (A.extent(1) == 0) return 0; + auto info = KokkosBatched::Impl::checkPbtrsInput(A, x); + if (info) return info; + + const int kd = A.extent(0) - 1; + return KokkosBatched::Impl::SerialPbtrsInternalLower::invoke( + A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), kd); + } +}; + +//// Upper //// +template <> +struct SerialPbtrs { + template + 
KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, const XViewType &x) { + // Quick return if possible + if (A.extent(1) == 0) return 0; + auto info = KokkosBatched::Impl::checkPbtrsInput(A, x); + if (info) return info; + + const int kd = A.extent(0) - 1; + return KokkosBatched::Impl::SerialPbtrsInternalUpper::invoke( + A.extent(1), A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), kd); + } +}; + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PBTRS_SERIAL_IMPL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Pbtrs_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Pbtrs_Serial_Internal.hpp new file mode 100644 index 0000000000..f380525ae3 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Pbtrs_Serial_Internal.hpp @@ -0,0 +1,91 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_PBTRS_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_PBTRS_SERIAL_INTERNAL_HPP_ + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Tbsv_Serial_Internal.hpp" + +namespace KokkosBatched { +namespace Impl { + +/// +/// Serial Internal Impl +/// ==================== + +/// +/// Lower +/// + +template +struct SerialPbtrsInternalLower { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int kd); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrsInternalLower::invoke(const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int kd) { + // Solve L*X = B, overwriting B with X. + SerialTbsvInternalLower::invoke(false, an, A, as0, as1, x, xs0, kd); + + // Solve L**T *X = B, overwriting B with X. + constexpr bool do_conj = Kokkos::ArithTraits::is_complex; + SerialTbsvInternalLowerTranspose::invoke(false, do_conj, an, A, as0, as1, x, xs0, kd); + + return 0; +} + +/// +/// Upper +/// + +template +struct SerialPbtrsInternalUpper { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int an, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, const int xs0, const int kd); +}; + +template <> +template +KOKKOS_INLINE_FUNCTION int SerialPbtrsInternalUpper::invoke(const int an, + const ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1, + /**/ ValueType *KOKKOS_RESTRICT x, + const int xs0, const int kd) { + // Solve U**T *X = B, overwriting B with X. + constexpr bool do_conj = Kokkos::ArithTraits::is_complex; + SerialTbsvInternalUpperTranspose::invoke(false, do_conj, an, A, as0, as1, x, xs0, kd); + + // Solve U*X = B, overwriting B with X. 
+ SerialTbsvInternalUpper::invoke(false, an, A, as0, as1, x, xs0, kd); + + return 0; +} + +} // namespace Impl +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PBTRS_SERIAL_INTERNAL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Pttrs_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Pttrs_Serial_Impl.hpp new file mode 100644 index 0000000000..3876e03918 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Pttrs_Serial_Impl.hpp @@ -0,0 +1,91 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRS_SERIAL_IMPL_HPP_ +#define KOKKOSBATCHED_PTTRS_SERIAL_IMPL_HPP_ + +#include +#include +#include "KokkosBatched_Pttrs_Serial_Internal.hpp" + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +template +KOKKOS_INLINE_FUNCTION static int checkPttrsInput([[maybe_unused]] const DViewType &d, + [[maybe_unused]] const EViewType &e, + [[maybe_unused]] const BViewType &b) { + static_assert(Kokkos::is_view_v, "KokkosBatched::pttrs: DViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view_v, "KokkosBatched::pttrs: EViewType is not a Kokkos::View."); + static_assert(Kokkos::is_view_v, "KokkosBatched::pttrs: BViewType is not a Kokkos::View."); + + static_assert(DViewType::rank == 1, "KokkosBatched::pttrs: DViewType must have rank 1."); + static_assert(EViewType::rank == 1, "KokkosBatched::pttrs: EViewType must have rank 1."); + static_assert(BViewType::rank == 1, "KokkosBatched::pttrs: BViewType must have rank 1."); + +#if 
(KOKKOSKERNELS_DEBUG_LEVEL > 0) + const int nd = d.extent(0); + const int ne = e.extent(0); + const int ldb = b.extent(0); + + if (ne + 1 != nd) { + Kokkos::printf( + "KokkosBatched::pttrs: Dimensions of d and e do not match: d: %d, e: " + "%d \n" + "e.extent(0) must be equal to d.extent(0) - 1\n", + nd, ne); + return 1; + } + + if (ldb < Kokkos::max(1, nd)) { + Kokkos::printf( + "KokkosBatched::pttrs: Dimensions of d and b do not match: d: %d, b: " + "%d \n" + "b.extent(0) must be larger or equal to d.extent(0) \n", + ldb, nd); + return 1; + } +#endif + return 0; +} + +template +struct SerialPttrs { + template + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, const EViewType &e, const BViewType &b) { + // Quick return if possible + if (d.extent(0) == 0) return 0; + + auto info = checkPttrsInput(d, e, b); + if (info) return info; + + using ScalarType = typename DViewType::non_const_value_type; + int n = d.extent(0); + + if (n == 1) { + const ScalarType alpha = 1.0 / d(0); + return KokkosBlas::SerialScale::invoke(alpha, b); + } + + // Solve A * X = B using the factorization A = L*D*L**T, + // overwriting each right hand side vector with its solution. + return SerialPttrsInternal::invoke(n, d.data(), d.stride(0), e.data(), e.stride(0), + b.data(), b.stride(0)); + } +}; +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PTTRS_SERIAL_IMPL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_Pttrs_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Pttrs_Serial_Internal.hpp new file mode 100644 index 0000000000..356722d110 --- /dev/null +++ b/batched/dense/impl/KokkosBatched_Pttrs_Serial_Internal.hpp @@ -0,0 +1,88 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRS_SERIAL_INTERNAL_HPP_ +#define KOKKOSBATCHED_PTTRS_SERIAL_INTERNAL_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +template +struct SerialPttrsInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int n, const ValueType *KOKKOS_RESTRICT d, const int ds0, + const ValueType *KOKKOS_RESTRICT e, const int es0, + ValueType *KOKKOS_RESTRICT b, const int bs0); + + template + KOKKOS_INLINE_FUNCTION static int invoke(const int n, const ValueType *KOKKOS_RESTRICT d, const int ds0, + const Kokkos::complex *KOKKOS_RESTRICT e, const int es0, + Kokkos::complex *KOKKOS_RESTRICT b, const int bs0); +}; + +/// +/// Real matrix +/// + +template +template +KOKKOS_INLINE_FUNCTION int SerialPttrsInternal::invoke( + const int n, const ValueType *KOKKOS_RESTRICT d, const int ds0, const ValueType *KOKKOS_RESTRICT e, const int es0, + ValueType *KOKKOS_RESTRICT b, const int bs0) { + // Solve A * X = B using the factorization L * D * L**T + for (int i = 1; i < n; i++) { + b[i * bs0] -= e[(i - 1) * es0] * b[(i - 1) * bs0]; + } + + b[(n - 1) * bs0] /= d[(n - 1) * ds0]; + + for (int i = n - 2; i >= 0; i--) { + b[i * bs0] = b[i * bs0] / d[i * ds0] - b[(i + 1) * bs0] * e[i * es0]; + } + + return 0; +} + +/// +/// Complex matrix +/// + +template +template +KOKKOS_INLINE_FUNCTION int SerialPttrsInternal::invoke( + const int n, const ValueType *KOKKOS_RESTRICT d, const int ds0, const Kokkos::complex *KOKKOS_RESTRICT e, + const int es0, Kokkos::complex *KOKKOS_RESTRICT b, const int bs0) { + // Solve A * X = B using the factorization L * D * L**H + for (int i = 1; i < n; i++) { + auto tmp_e = std::is_same_v ? 
Kokkos::conj(e[(i - 1) * es0]) : e[(i - 1) * es0]; + b[i * bs0] -= tmp_e * b[(i - 1) * bs0]; + } + + b[(n - 1) * bs0] /= d[(n - 1) * ds0]; + + for (int i = n - 2; i >= 0; i--) { + auto tmp_e = std::is_same_v ? Kokkos::conj(e[i * es0]) : e[i * es0]; + b[i * bs0] = b[i * bs0] / d[i * ds0] - b[(i + 1) * bs0] * tmp_e; + } + + return 0; +} + +} // namespace KokkosBatched + +#endif // KOKKOSBATCHED_PTTRS_SERIAL_INTERNAL_HPP_ diff --git a/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp index 7c717c2eed..aaacb45ede 100644 --- a/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_FORM_Q_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_QR_FORM_Q_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_QR_FORM_Q_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_QR_FORM_Q_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp index af7f458898..9f575d2953 100644 --- a/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_FORM_Q_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_QR_FORM_Q_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_QR_FORM_Q_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_QR_FORM_Q_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp index 1083e6af2a..1da8535628 100644 --- a/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp 
+++ b/batched/dense/impl/KokkosBatched_QR_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_QR_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_QR_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_QR_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp index 95ca1c4340..8aa4a6361c 100644 --- a/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_QR_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_QR_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_QR_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp index 2497e5adf5..0f37f2d8db 100644 --- a/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_QR_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_QR_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_QR_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_QR_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp index e3dde67986..9ed2ac3a22 100644 --- a/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef 
__KOKKOSBATCHED_QR_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_QR_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_QR_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_QR_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp index ed9ccd8cce..790afd31ad 100644 --- a/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp index 280bfa434b..89e68e1757 100644 --- a/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp index 
029875f810..a5c2a2074e 100644 --- a/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_RightEigenvectorFromSchur_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_RIGHT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_RIGHT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_RIGHT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_RIGHT_EIGENVECTOR_FROM_SCHUR_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp index e0c25c2ce7..e1d024ef39 100644 --- a/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_SVD_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SVD_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_SVD_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_SVD_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_SVD_SERIAL_IMPL_HPP /// \author Brian Kelley (bmkelle@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp index 0b85b1e28e..56a5619e06 100644 --- a/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SVD_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SVD_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_SVD_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_SVD_SERIAL_INTERNAL_HPP /// \author Brian Kelley (bmkelle@sandia.gov) @@ -51,11 +51,10 @@ struct SerialSVDInternal { template KOKKOS_INLINE_FUNCTION static void symEigen2x2(value_type a11, value_type a21, value_type a22, 
value_type& e1, value_type& e2) { - value_type a = Kokkos::ArithTraits::one(); - value_type b = -a11 - a22; - value_type c = a11 * a22 - a21 * a21; - using Kokkos::sqrt; - value_type sqrtDet = sqrt(b * b - 4 * a * c); + value_type a = Kokkos::ArithTraits::one(); + value_type b = -a11 - a22; + value_type c = a11 * a22 - a21 * a21; + value_type sqrtDet = Kokkos::sqrt(b * b - 4 * a * c); e1 = (-b + sqrtDet) / (2 * a); e2 = (-b - sqrtDet) / (2 * a); } @@ -78,7 +77,7 @@ struct SerialSVDInternal { value_type e1, e2, mu; symEigen2x2(dm * dm + fmm1 * fmm1, dm * fm, target, e1, e2); // the shift is the eigenvalue closer to the last diagonal entry of B^T*B - if (fabs(e1 - target) < fabs(e2 - target)) + if (Kokkos::abs(e1 - target) < Kokkos::abs(e2 - target)) mu = e1; else mu = e2; @@ -124,7 +123,7 @@ struct SerialSVDInternal { // Assumes i is not the last row. // U is m*m, B is n*n template - KOKKOS_INLINE_FUNCTION static void svdZeroRow(int i, value_type* B, int n, int Bs0, int Bs1, value_type* U, int m, + KOKKOS_INLINE_FUNCTION static void svdZeroRow(int i, value_type* B, int n, int Bs0, int Bs1, value_type* U, int Um, int Us0, int Us1) { Kokkos::pair G; for (int j = i + 1; j < n; j++) { @@ -138,17 +137,16 @@ struct SerialSVDInternal { &SVDIND(B, j, j + 1), Bs1); } if (U) { - KokkosBatched::SerialApplyRightGivensInternal::invoke(G, m, &SVDIND(U, 0, i), Us0, &SVDIND(U, 0, j), - Us0); + KokkosBatched::SerialApplyRightGivensInternal::invoke(G, Um, &SVDIND(U, 0, i), Us0, + &SVDIND(U, 0, j), Us0); } } } template - KOKKOS_INLINE_FUNCTION static void svdZeroLastColumn(value_type* B, int n, int Bs0, int Bs1, value_type* Vt, int Vts0, - int Vts1) { - // Deal with B(n-1, n-1) = 0, by chasing the superdiagonal nonzero up the - // last column. + KOKKOS_INLINE_FUNCTION static void svdZeroLastColumn(value_type* B, int n, int Bs0, int Bs1, int vn, value_type* Vt, + int Vts0, int Vts1) { + // Deal with B(n-1, n-1) = 0, by chasing the superdiagonal nonzero up the last column. 
Kokkos::pair G; for (int j = n - 2; j >= 0; j--) { KokkosBatched::SerialGivensInternal::invoke(SVDIND(B, j, j), SVDIND(B, j, n - 1), &G, @@ -159,7 +157,7 @@ struct SerialSVDInternal { &SVDIND(B, j - 1, j), Bs0); } if (Vt) { - KokkosBatched::SerialApplyLeftGivensInternal::invoke(G, n, &SVDIND(Vt, n - 1, 0), Vts1, + KokkosBatched::SerialApplyLeftGivensInternal::invoke(G, vn, &SVDIND(Vt, n - 1, 0), Vts1, &SVDIND(Vt, j, 0), Vts1); } } @@ -224,8 +222,9 @@ struct SerialSVDInternal { while (true) { // Zero out tiny superdiagonal entries for (int i = 0; i < n - 1; i++) { - if (fabs(SVDIND(B, i, i + 1)) < eps * (fabs(SVDIND(B, i, i)) + fabs(SVDIND(B, i + 1, i + 1))) || - fabs(SVDIND(B, i, i + 1)) < tol) { + if (Kokkos::abs(SVDIND(B, i, i + 1)) < + eps * (Kokkos::abs(SVDIND(B, i, i)) + Kokkos::abs(SVDIND(B, i + 1, i + 1))) || + Kokkos::abs(SVDIND(B, i, i + 1)) < tol) { SVDIND(B, i, i + 1) = KAT::zero(); } } @@ -246,25 +245,32 @@ struct SerialSVDInternal { for (p = q - 1; p > 0; p--) { if (SVDIND(B, p - 1, p) == KAT::zero()) break; } + value_type* Bsub = &SVDIND(B, p, p); + value_type* Usub = &SVDIND(U, 0, p); + value_type* Vtsub = &SVDIND(Vt, p, 0); + int nsub = q - p; // If there are zero diagonals in this range, eliminate the entire row //(effectively decoupling into two subproblems) for (int i = q - 1; i >= p; i--) { if (SVDIND(B, i, i) == KAT::zero()) { - if (i == n - 1) { + if (i == q - 1) { // Last diagonal entry being 0 is a special case. // Zero out the superdiagonal above it. - // Deal with B(n-1, n-1) = 0, by chasing the superdiagonal nonzero - // up the last column. - svdZeroLastColumn(B, n, Bs0, Bs1, Vt, Vts0, Vts1); + // Deal with B(q-1, q-1) = 0, by chasing the superdiagonal nonzero + // B(q-2, q-1) up the last column. + // + // Once that nonzero reaches B(p, q-1), we are either at the top of B + // (if p == 0) or the superdiag above B(p, p) is zero. 
+ // In either case, the chase stops after eliminating B(p, q-1) because no + // new entry is introduced by the Givens. + svdZeroLastColumn(Bsub, nsub, Bs0, Bs1, n, Vtsub, Vts0, Vts1); } else if (SVDIND(B, i, i + 1) != KAT::zero()) { - svdZeroRow(i, B, n, Bs0, Bs1, U, m, Us0, Us1); + svdZeroRow(i - p, Bsub, nsub, Bs0, Bs1, Usub, m, Us0, Us1); } } - continue; } - int nsub = q - p; // B22 is nsub * nsub, Usub is m * nsub, and Vtsub is nsub * n - svdStep(&SVDIND(B, p, p), &SVDIND(U, 0, p), &SVDIND(Vt, p, 0), m, n, nsub, Bs0, Bs1, Us0, Us1, Vts0, Vts1); + svdStep(Bsub, Usub, Vtsub, m, n, nsub, Bs0, Bs1, Us0, Us1, Vts0, Vts1); } for (int i = 0; i < n; i++) { sigma[i * ss] = SVDIND(B, i, i); diff --git a/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp index 41e525d2ba..369153c3c5 100644 --- a/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Schur2x2_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SCHUR2X2_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SCHUR2X2_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_SCHUR2X2_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_SCHUR2X2_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp index c6d55b301b..c619d5da26 100644 --- a/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Schur_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SCHUR_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SCHUR_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_SCHUR_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_SCHUR_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff 
--git a/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp b/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp index 9219f3a9ec..96fb20c77c 100644 --- a/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_SetIdentity_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SET_IDENTITY_IMPL_HPP__ -#define __KOKKOSBATCHED_SET_IDENTITY_IMPL_HPP__ +#ifndef KOKKOSBATCHED_SET_IDENTITY_IMPL_HPP +#define KOKKOSBATCHED_SET_IDENTITY_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp b/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp index f5afb5c79c..f1432b9231 100644 --- a/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SetIdentity_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SET_IDENTITY_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SET_IDENTITY_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_SET_IDENTITY_INTERNAL_HPP +#define KOKKOSBATCHED_SET_IDENTITY_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp b/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp index 09e94ab5f3..2d31e14c83 100644 --- a/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SetTriangular_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SET_TRIANGULAR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SET_TRIANGULAR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_SET_TRIANGULAR_INTERNAL_HPP +#define KOKKOSBATCHED_SET_TRIANGULAR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp 
b/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp index c0963447c4..8c6baecee1 100644 --- a/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SHIFTED_TRSV_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SHIFTED_TRSV_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_SHIFTED_TRSV_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_SHIFTED_TRSV_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp index 3b85a26294..f707b03a31 100644 --- a/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp index 18440745eb..c1ca8dc7cc 100644 --- a/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_SOLVE_UTV_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git 
a/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp index 6313d817c6..e2624269b6 100644 --- a/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trmm_Serial_Impl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_TRMM_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_TRMM_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRMM_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_TRMM_SERIAL_IMPL_HPP #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Trmm_Serial_Internal.hpp" @@ -137,4 +137,4 @@ struct SerialTrmm:: return 0; } } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_TRMM_SERIAL_INTERNAL_HPP__ +#endif // KOKKOSBATCHED_TRMM_SERIAL_INTERNAL_HPP diff --git a/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp index 694ac36fa0..dc459d23d0 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSM_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_TRSM_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRSM_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_TRSM_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) @@ -29,7 +29,7 @@ namespace KokkosBatched { /// B := inv(tril(A)) (alpha*B) /// A(m x m), B(m x n) -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template struct SerialTrsm { @@ -88,7 +88,7 @@ struct SerialTrsm struct SerialTrsm { @@ -167,7 +167,7 @@ struct SerialTrsm struct SerialTrsm { @@ -227,7 +227,7 @@ struct SerialTrsm struct SerialTrsm { @@ -285,7 +285,7 @@ struct SerialTrsm struct SerialTrsm { diff --git 
a/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp index 0e65d269f0..a797357f7e 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSM_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_TRSM_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_TRSM_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_TRSM_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp index 145f8e0c2d..e89b114c10 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSM_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_TRSM_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRSM_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_TRSM_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp index c1781a001c..916fa53225 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSM_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_TRSM_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_TRSM_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_TRSM_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp 
b/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp index 371dbb483c..20426f088e 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSM_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_TRSM_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRSM_TEAM_IMPL_HPP +#define KOKKOSBATCHED_TRSM_TEAM_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp index a1a7062809..1adedad593 100644 --- a/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSM_TEAM_INTERNAL_HPP__ -#define __KOKKOSBATCHED_TRSM_TEAM_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_TRSM_TEAM_INTERNAL_HPP +#define KOKKOSBATCHED_TRSM_TEAM_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) @@ -192,7 +192,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsmInternalLeftUpper::inv Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, iend * jend), [&](const int &ij) { int i, j; - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { i = ij % iend; j = ij / iend; } else { diff --git a/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp index 073970caa6..d7db47375a 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSV_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_TRSV_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRSV_SERIAL_IMPL_HPP +#define 
KOKKOSBATCHED_TRSV_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) @@ -38,7 +38,7 @@ namespace KokkosBatched { /// L/NT /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template struct SerialTrsv { @@ -94,7 +94,7 @@ struct SerialTrsv /// L/T /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template struct SerialTrsv { @@ -150,7 +150,7 @@ struct SerialTrsv { /// U/NT /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template struct SerialTrsv { @@ -206,7 +206,7 @@ struct SerialTrsv /// U/T /// -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) template struct SerialTrsv { diff --git a/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp index 43d95377d4..861c72eec9 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSV_SERIAL_INTERNAL_HPP__ -#define 
__KOKKOSBATCHED_TRSV_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_TRSV_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_TRSV_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp index 42c242414c..6fb7294693 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSV_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_TRSV_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRSV_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_TRSV_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp index 894e684ef2..6b2dac17b6 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSV_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_TRSV_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_TRSV_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_TRSV_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp b/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp index c658080dc2..8b4be66cc4 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSV_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_TRSV_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRSV_TEAM_IMPL_HPP +#define 
KOKKOSBATCHED_TRSV_TEAM_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp b/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp index ba3b2ff7b5..f0fe7ca64a 100644 --- a/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSV_TEAM_INTERNAL_HPP__ -#define __KOKKOSBATCHED_TRSV_TEAM_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_TRSV_TEAM_INTERNAL_HPP +#define KOKKOSBATCHED_TRSV_TEAM_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp b/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp index 1068bf9e54..c234edc08e 100644 --- a/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Trtri_Serial_Impl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_TRTRI_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_TRTRI_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_TRTRI_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_TRTRI_SERIAL_IMPL_HPP #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Trtri_Serial_Internal.hpp" @@ -39,4 +39,4 @@ struct SerialTrtri { }; } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_TRTRI_SERIAL_IMPL_HPP__ +#endif // KOKKOSBATCHED_TRTRI_SERIAL_IMPL_HPP diff --git a/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp b/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp index f6b0b4bf6d..e548524648 100644 --- a/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_TRTRI_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_TRTRI_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_TRTRI_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_TRTRI_SERIAL_INTERNAL_HPP #include 
"KokkosBatched_Util.hpp" #include "KokkosBatched_Trmm_Serial_Internal.hpp" @@ -130,4 +130,4 @@ KOKKOS_INLINE_FUNCTION int SerialTrtriInternalUpper::inv return 0; } } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_TRTRI_SERIAL_INTERNAL_HPP__ +#endif // KOKKOSBATCHED_TRTRI_SERIAL_INTERNAL_HPP diff --git a/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp b/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp index de5ecebf94..12856514a5 100644 --- a/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_UTV_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_UTV_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_UTV_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_UTV_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_UTV_TEAMVECTOR_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp b/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp index e39dba9a40..575eba64c4 100644 --- a/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_UTV_TeamVector_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_UTV_TEAMVECTOR_INTERNAL_HPP__ -#define __KOKKOSBATCHED_UTV_TEAMVECTOR_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_UTV_TEAMVECTOR_INTERNAL_HPP +#define KOKKOSBATCHED_UTV_TEAMVECTOR_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp b/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp index 3f56e71422..dcc0da4cef 100644 --- a/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_UpdateGivens_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef 
__KOKKOSBATCHED_UPDATE_GIVENS_INTERNAL_HPP__ -#define __KOKKOSBATCHED_UPDATE_GIVENS_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_UPDATE_GIVENS_INTERNAL_HPP +#define KOKKOSBATCHED_UPDATE_GIVENS_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp index 08628729bc..ad10b39c26 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Arith.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_SIMD_ARITH_HPP__ -#define __KOKKOSBATCHED_VECTOR_SIMD_ARITH_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_SIMD_ARITH_HPP +#define KOKKOSBATCHED_VECTOR_SIMD_ARITH_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp index f289d5be09..8da9ac7f15 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Logical.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_SIMD_LOGICAL_HPP__ -#define __KOKKOSBATCHED_VECTOR_SIMD_LOGICAL_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_SIMD_LOGICAL_HPP +#define KOKKOSBATCHED_VECTOR_SIMD_LOGICAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp index eefaf4ce0d..eec5c1092f 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Math.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_SIMD_MATH_HPP__ -#define __KOKKOSBATCHED_VECTOR_SIMD_MATH_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_SIMD_MATH_HPP +#define 
KOKKOSBATCHED_VECTOR_SIMD_MATH_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp index 02f717d458..be980b986d 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Misc.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_SIMD_MISC_HPP__ -#define __KOKKOSBATCHED_VECTOR_SIMD_MISC_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_SIMD_MISC_HPP +#define KOKKOSBATCHED_VECTOR_SIMD_MISC_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp index c956780192..cce4543498 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_Relation.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_SIMD_RELATION_HPP__ -#define __KOKKOSBATCHED_VECTOR_SIMD_RELATION_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_SIMD_RELATION_HPP +#define KOKKOSBATCHED_VECTOR_SIMD_RELATION_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp b/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp index 60e5e43e57..a9818493cd 100644 --- a/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp +++ b/batched/dense/impl/KokkosBatched_Vector_SIMD_View.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_SIMD_VIEW_HPP__ -#define __KOKKOSBATCHED_VECTOR_SIMD_VIEW_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_SIMD_VIEW_HPP +#define KOKKOSBATCHED_VECTOR_SIMD_VIEW_HPP #include diff --git a/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp 
b/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp index a23a9ea4d0..1f726baf53 100644 --- a/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp +++ b/batched/dense/impl/KokkosBatched_WilkinsonShift_Serial_Internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_WILKINSON_SHIFT_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBATCHED_WILKINSON_SHIFT_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBATCHED_WILKINSON_SHIFT_SERIAL_INTERNAL_HPP +#define KOKKOSBATCHED_WILKINSON_SHIFT_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp index 988bd30c93..fe142f4389 100644 --- a/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp +++ b/batched/dense/impl/KokkosBatched_Xpay_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_XPAY_IMPL_HPP__ -#define __KOKKOSBATCHED_XPAY_IMPL_HPP__ +#ifndef KOKKOSBATCHED_XPAY_IMPL_HPP +#define KOKKOSBATCHED_XPAY_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp b/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp index 7eadc43269..c906fac7af 100644 --- a/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp +++ b/batched/dense/src/KokkosBatched_AddRadial_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_ADD_RADIAL_DECL_HPP__ -#define __KOKKOSBATCHED_ADD_RADIAL_DECL_HPP__ +#ifndef KOKKOSBATCHED_ADD_RADIAL_DECL_HPP +#define KOKKOSBATCHED_ADD_RADIAL_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp b/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp index bee7d3a645..21a34963fe 100644 --- a/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp +++ 
b/batched/dense/src/KokkosBatched_ApplyHouseholder_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_HOUSEHOLDER_DECL_HPP__ -#define __KOKKOSBATCHED_APPLY_HOUSEHOLDER_DECL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_HOUSEHOLDER_DECL_HPP +#define KOKKOSBATCHED_APPLY_HOUSEHOLDER_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp b/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp index 2aa00bf8c2..f4795f863d 100644 --- a/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp +++ b/batched/dense/src/KokkosBatched_ApplyPivot_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_PIVOT_DECL_HPP__ -#define __KOKKOSBATCHED_APPLY_PIVOT_DECL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_PIVOT_DECL_HPP +#define KOKKOSBATCHED_APPLY_PIVOT_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp b/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp index 7f78e31700..9d17a8b435 100644 --- a/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp +++ b/batched/dense/src/KokkosBatched_ApplyQ_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_APPLY_Q_DECL_HPP__ -#define __KOKKOSBATCHED_APPLY_Q_DECL_HPP__ +#ifndef KOKKOSBATCHED_APPLY_Q_DECL_HPP +#define KOKKOSBATCHED_APPLY_Q_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Axpy.hpp b/batched/dense/src/KokkosBatched_Axpy.hpp index 5b89c0862e..dcc02eda22 100644 --- a/batched/dense/src/KokkosBatched_Axpy.hpp +++ b/batched/dense/src/KokkosBatched_Axpy.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_AXPY_HPP__ -#define __KOKKOSBATCHED_AXPY_HPP__ +#ifndef KOKKOSBATCHED_AXPY_HPP +#define 
KOKKOSBATCHED_AXPY_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Copy_Decl.hpp b/batched/dense/src/KokkosBatched_Copy_Decl.hpp index 0e2b24e91d..88ed46d564 100644 --- a/batched/dense/src/KokkosBatched_Copy_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Copy_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_COPY_DECL_HPP__ -#define __KOKKOSBATCHED_COPY_DECL_HPP__ +#ifndef KOKKOSBATCHED_COPY_DECL_HPP +#define KOKKOSBATCHED_COPY_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Dot.hpp b/batched/dense/src/KokkosBatched_Dot.hpp index 545a4954ce..50242bba8c 100644 --- a/batched/dense/src/KokkosBatched_Dot.hpp +++ b/batched/dense/src/KokkosBatched_Dot.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_DOT_HPP__ -#define __KOKKOSBATCHED_DOT_HPP__ +#ifndef KOKKOSBATCHED_DOT_HPP +#define KOKKOSBATCHED_DOT_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp b/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp index 39ead9e26c..23c1add77b 100644 --- a/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Eigendecomposition_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_EIGENDECOMPOSITION_DECL_HPP__ -#define __KOKKOSBATCHED_EIGENDECOMPOSITION_DECL_HPP__ +#ifndef KOKKOSBATCHED_EIGENDECOMPOSITION_DECL_HPP +#define KOKKOSBATCHED_EIGENDECOMPOSITION_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Gemm_Decl.hpp b/batched/dense/src/KokkosBatched_Gemm_Decl.hpp index 9f4b745561..eabd5c42c2 100644 --- a/batched/dense/src/KokkosBatched_Gemm_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Gemm_Decl.hpp @@ 
-13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMM_DECL_HPP__ -#define __KOKKOSBATCHED_GEMM_DECL_HPP__ +#ifndef KOKKOSBATCHED_GEMM_DECL_HPP +#define KOKKOSBATCHED_GEMM_DECL_HPP #include "KokkosBatched_Vector.hpp" @@ -75,4 +75,4 @@ struct Gemm { #include "KokkosBatched_Gemm_Team_Impl.hpp" #include "KokkosBatched_Gemm_TeamVector_Impl.hpp" -#endif // __KOKKOSBATCHED_GEMM_DECL_HPP__ +#endif // KOKKOSBATCHED_GEMM_DECL_HPP diff --git a/batched/dense/src/KokkosBatched_Gemv_Decl.hpp b/batched/dense/src/KokkosBatched_Gemv_Decl.hpp index 9ab86d9e07..907f684a74 100644 --- a/batched/dense/src/KokkosBatched_Gemv_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Gemv_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GEMV_DECL_HPP__ -#define __KOKKOSBATCHED_GEMV_DECL_HPP__ +#ifndef KOKKOSBATCHED_GEMV_DECL_HPP +#define KOKKOSBATCHED_GEMV_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Gesv.hpp b/batched/dense/src/KokkosBatched_Gesv.hpp index 77922e4da0..c0cceb3819 100644 --- a/batched/dense/src/KokkosBatched_Gesv.hpp +++ b/batched/dense/src/KokkosBatched_Gesv.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GESV_HPP__ -#define __KOKKOSBATCHED_GESV_HPP__ +#ifndef KOKKOSBATCHED_GESV_HPP +#define KOKKOSBATCHED_GESV_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_HadamardProduct.hpp b/batched/dense/src/KokkosBatched_HadamardProduct.hpp index f21aa8bae2..d8076dbe3c 100644 --- a/batched/dense/src/KokkosBatched_HadamardProduct.hpp +++ b/batched/dense/src/KokkosBatched_HadamardProduct.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HADAMARDPRODUCT_HPP__ -#define __KOKKOSBATCHED_HADAMARDPRODUCT_HPP__ +#ifndef 
KOKKOSBATCHED_HADAMARDPRODUCT_HPP +#define KOKKOSBATCHED_HADAMARDPRODUCT_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp index 0741b5b41e..2f3cf273f4 100644 --- a/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp +++ b/batched/dense/src/KokkosBatched_HostLevel_Gemm.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ -#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ +#ifndef KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP +#define KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP // Include explicit specializations of BatchedGemm. // If ETI_ONLY is disabled, the primary template will @@ -101,4 +101,4 @@ inline int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alp UnifiedBVT, UnifiedCVT>::run(handle, alpha, A, B, beta, C); } } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP__ +#endif // KOKKOSBATCHED_HOSTLEVEL_GEMM_DECL_HPP diff --git a/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp b/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp index 2aa6f47cb0..87b605dc1e 100644 --- a/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp +++ b/batched/dense/src/KokkosBatched_HostLevel_Gemm_Handle.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP__ -#define __KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP__ +#ifndef KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP +#define KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP #include "KokkosBatched_Kernel_Handle.hpp" @@ -151,4 +151,4 @@ class BatchedGemmHandle : public BatchedKernelHandle { } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP__ +#endif // KOKKOSBATCHED_HOSTLEVEL_GEMM_HANDLE_DECL_HPP diff --git a/batched/dense/src/KokkosBatched_Householder_Decl.hpp 
b/batched/dense/src/KokkosBatched_Householder_Decl.hpp index 0a48457551..3e26d6982a 100644 --- a/batched/dense/src/KokkosBatched_Householder_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Householder_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_HOUSEHOLDER_DECL_HPP__ -#define __KOKKOSBATCHED_HOUSEHOLDER_DECL_HPP__ +#ifndef KOKKOSBATCHED_HOUSEHOLDER_DECL_HPP +#define KOKKOSBATCHED_HOUSEHOLDER_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Iamax.hpp b/batched/dense/src/KokkosBatched_Iamax.hpp new file mode 100644 index 0000000000..c388ca943f --- /dev/null +++ b/batched/dense/src/KokkosBatched_Iamax.hpp @@ -0,0 +1,43 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_IAMAX_HPP_ +#define KOKKOSBATCHED_IAMAX_HPP_ + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Iamax: +/// Iamax finds the index of the first element having maximum absolute value. +/// +/// \tparam XViewType: Input view type, needs to be a 1D view +/// +/// \param X [in]: Input view type +/// +/// \return The index of the first element having maximum absolute value +/// As well as Blas, this returns 0 (0 in Fortran) for an empty vector +/// No nested parallel_for is used inside of the function. 
+/// + +struct SerialIamax { + template + KOKKOS_INLINE_FUNCTION static typename XViewType::size_type invoke(const XViewType &x); +}; +} // namespace KokkosBatched + +#include "KokkosBatched_Iamax_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_IAMAX_HPP_ diff --git a/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp b/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp index 757a92ca21..9aa7ac614b 100644 --- a/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InnerGemmFixA_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_GEMM_FIX_A_DECL_HPP__ -#define __KOKKOSBATCHED_INNER_GEMM_FIX_A_DECL_HPP__ +#ifndef KOKKOSBATCHED_INNER_GEMM_FIX_A_DECL_HPP +#define KOKKOSBATCHED_INNER_GEMM_FIX_A_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp b/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp index b2f885970f..9ca74934dc 100644 --- a/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InnerGemmFixB_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_GEMM_FIX_B_DECL_HPP__ -#define __KOKKOSBATCHED_INNER_GEMM_FIX_B_DECL_HPP__ +#ifndef KOKKOSBATCHED_INNER_GEMM_FIX_B_DECL_HPP +#define KOKKOSBATCHED_INNER_GEMM_FIX_B_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp b/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp index c61d966f77..31ba2a03d9 100644 --- a/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InnerGemmFixC_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_GEMM_FIX_C_DECL_HPP__ -#define __KOKKOSBATCHED_INNER_GEMM_FIX_C_DECL_HPP__ +#ifndef 
KOKKOSBATCHED_INNER_GEMM_FIX_C_DECL_HPP +#define KOKKOSBATCHED_INNER_GEMM_FIX_C_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp b/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp index c355185b74..1d7c9879d3 100644 --- a/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InnerLU_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_LU_DECL_HPP__ -#define __KOKKOSBATCHED_INNER_LU_DECL_HPP__ +#ifndef KOKKOSBATCHED_INNER_LU_DECL_HPP +#define KOKKOSBATCHED_INNER_LU_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp b/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp index 5b5b9bb147..b9e62aa392 100644 --- a/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InnerTrsm_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INNER_TRSM_DECL_HPP__ -#define __KOKKOSBATCHED_INNER_TRSM_DECL_HPP__ +#ifndef KOKKOSBATCHED_INNER_TRSM_DECL_HPP +#define KOKKOSBATCHED_INNER_TRSM_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp b/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp index 930bc790b0..141e2b9722 100644 --- a/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp +++ b/batched/dense/src/KokkosBatched_InverseLU_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_INVERSELU_DECL_HPP__ -#define __KOKKOSBATCHED_INVERSELU_DECL_HPP__ +#ifndef KOKKOSBATCHED_INVERSELU_DECL_HPP +#define KOKKOSBATCHED_INVERSELU_DECL_HPP /// \author Vinh Dang (vqdang@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_LU_Decl.hpp b/batched/dense/src/KokkosBatched_LU_Decl.hpp index 363193c147..b4cc5d35e1 100644 --- 
a/batched/dense/src/KokkosBatched_LU_Decl.hpp +++ b/batched/dense/src/KokkosBatched_LU_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_LU_DECL_HPP__ -#define __KOKKOSBATCHED_LU_DECL_HPP__ +#ifndef KOKKOSBATCHED_LU_DECL_HPP +#define KOKKOSBATCHED_LU_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Laswp.hpp b/batched/dense/src/KokkosBatched_Laswp.hpp new file mode 100644 index 0000000000..1818d456a8 --- /dev/null +++ b/batched/dense/src/KokkosBatched_Laswp.hpp @@ -0,0 +1,52 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_LASWP_HPP_ +#define KOKKOSBATCHED_LASWP_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Laswp: +/// +/// performs a series of row interchanges on the matrix A. +/// One row interchange is initiated for each of rows K1 through K2 of A. +/// +/// \tparam PivViewType: Input type for the pivot indices, needs to +/// be a 1D view +/// \tparam AViewType: Input type for the vector or matrix, needs to be a 1D or +/// 2D view +/// +/// \param piv [in]: The pivot indices; for 0 <= i < N, row i of the +/// matrix was interchanged with row piv(i). +/// \param A [inout]: A is an lda by n matrix. The matrix of column dimension N +/// to which the row interchanges will be applied. +/// +/// No nested parallel_for is used inside of the function.
+/// + +template +struct SerialLaswp { + template + KOKKOS_INLINE_FUNCTION static int invoke(const PivViewType &piv, const AViewType &A); +}; +} // namespace KokkosBatched + +#include "KokkosBatched_Laswp_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_LASWP_HPP_ diff --git a/batched/dense/src/KokkosBatched_Pbtrf.hpp b/batched/dense/src/KokkosBatched_Pbtrf.hpp new file mode 100644 index 0000000000..879dfc0db3 --- /dev/null +++ b/batched/dense/src/KokkosBatched_Pbtrf.hpp @@ -0,0 +1,54 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PBTRF_HPP_ +#define KOKKOSBATCHED_PBTRF_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Pbtrf: +/// Compute the Cholesky factorization U**H * U (or L * L**H) of a real +/// symmetric (or complex Hermitian) positive definite banded matrix A_l +/// for all l = 0, ..., N +/// The factorization has the form +/// A = U**H * U , if ArgUplo = KokkosBatched::Uplo::Upper, or +/// A = L * L**H, if ArgUplo = KokkosBatched::Uplo::Lower, +/// where U is an upper triangular matrix, U**H is the conjugate transpose of U, and +/// L is lower triangular. +/// This is the unblocked version of the algorithm, calling Level 2 BLAS. +/// +/// \tparam ABViewType: Input type for a banded matrix, needs to be a 2D +/// view +/// +/// \param ab [inout]: ab is a ldab by n banded matrix, with ( kd + 1 ) diagonals +/// +/// No nested parallel_for is used inside of the function.
+/// + +template +struct SerialPbtrf { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &ab); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Pbtrf_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_PBTRF_HPP_ diff --git a/batched/dense/src/KokkosBatched_Pbtrs.hpp b/batched/dense/src/KokkosBatched_Pbtrs.hpp new file mode 100644 index 0000000000..cdacb52b00 --- /dev/null +++ b/batched/dense/src/KokkosBatched_Pbtrs.hpp @@ -0,0 +1,56 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PBTRS_HPP_ +#define KOKKOSBATCHED_PBTRS_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Pbtrs: +/// Solve Ab_l x_l = b_l for all l = 0, ..., N +/// using the Cholesky factorization A = U**H * U or A = L * L**H computed by +/// Pbtrf. +/// The matrix has the form +/// A = U**H * U , if ArgUplo = KokkosBatched::Uplo::Upper, or +/// A = L * L**H, if ArgUplo = KokkosBatched::Uplo::Lower, +/// where U is an upper triangular matrix, U**H is the conjugate transpose of U, and +/// L is a lower triangular matrix, L**H is the conjugate transpose of L.
+/// +/// \tparam ABViewType: Input type for a banded matrix, needs to be a 2D +/// view +/// \tparam BViewType: Input type for a right-hand side and the solution, +/// needs to be a 1D view +/// +/// \param ab [in]: ab is a ldab by n banded matrix, with ( kd + 1 ) diagonals +/// \param b [inout]: right-hand side and the solution, a rank 1 view +/// +/// No nested parallel_for is used inside of the function. +/// + +template +struct SerialPbtrs { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ABViewType &ab, const BViewType &b); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Pbtrs_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_PBTRS_HPP_ diff --git a/batched/dense/src/KokkosBatched_Pttrs.hpp b/batched/dense/src/KokkosBatched_Pttrs.hpp new file mode 100644 index 0000000000..17b7d45f96 --- /dev/null +++ b/batched/dense/src/KokkosBatched_Pttrs.hpp @@ -0,0 +1,54 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBATCHED_PTTRS_HPP_ +#define KOKKOSBATCHED_PTTRS_HPP_ + +#include + +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) + +namespace KokkosBatched { + +/// \brief Serial Batched Pttrs: +/// Solve Ab_l x_l = b_l for all l = 0, ..., N +/// using the factorization A = U**H * D * U or A = L * D * L**H computed by +/// Pttrf. 
+/// +/// \tparam DViewType: Input type for the diagonal matrix, needs to be a 1D +/// view +/// \tparam EViewType: Input type for the upper/lower diagonal matrix, +/// needs to be a 1D view +/// \tparam BViewType: Input type for the right-hand side and the solution, +/// needs to be a 1D view +/// +/// \param d [in]: n diagonal elements of the diagonal matrix D +/// \param e [in]: n-1 upper/lower diagonal elements of the diagonal matrix E +/// \param b [inout]: right-hand side and the solution, a rank 1 view +/// +/// No nested parallel_for is used inside of the function. +/// + +template +struct SerialPttrs { + template + KOKKOS_INLINE_FUNCTION static int invoke(const DViewType &d, const EViewType &e, const BViewType &b); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Pttrs_Serial_Impl.hpp" + +#endif // KOKKOSBATCHED_PTTRS_HPP_ diff --git a/batched/dense/src/KokkosBatched_QR_Decl.hpp b/batched/dense/src/KokkosBatched_QR_Decl.hpp index 78bdcd4d4b..4f76d6e702 100644 --- a/batched/dense/src/KokkosBatched_QR_Decl.hpp +++ b/batched/dense/src/KokkosBatched_QR_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_DECL_HPP__ -#define __KOKKOSBATCHED_QR_DECL_HPP__ +#ifndef KOKKOSBATCHED_QR_DECL_HPP +#define KOKKOSBATCHED_QR_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp b/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp index b08e5277a0..3c663f237e 100644 --- a/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp +++ b/batched/dense/src/KokkosBatched_QR_WithColumnPivoting_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_DECL_HPP__ -#define __KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_DECL_HPP__ +#ifndef KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_DECL_HPP +#define
KOKKOSBATCHED_QR_WITH_COLUMNPIVOTING_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_SVD_Decl.hpp b/batched/dense/src/KokkosBatched_SVD_Decl.hpp index efade8029b..a022d826cc 100644 --- a/batched/dense/src/KokkosBatched_SVD_Decl.hpp +++ b/batched/dense/src/KokkosBatched_SVD_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SVD_DECL_HPP__ -#define __KOKKOSBATCHED_SVD_DECL_HPP__ +#ifndef KOKKOSBATCHED_SVD_DECL_HPP +#define KOKKOSBATCHED_SVD_DECL_HPP /// \author Brian Kelley (bmkelle@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Scale_Decl.hpp b/batched/dense/src/KokkosBatched_Scale_Decl.hpp index 94453a5ede..188c1374af 100644 --- a/batched/dense/src/KokkosBatched_Scale_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Scale_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SCALE_DECL_HPP__ -#define __KOKKOSBATCHED_SCALE_DECL_HPP__ +#ifndef KOKKOSBATCHED_SCALE_DECL_HPP +#define KOKKOSBATCHED_SCALE_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp b/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp index 27c2b22ed7..26d8ec3932 100644 --- a/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp +++ b/batched/dense/src/KokkosBatched_SetIdentity_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SET_IDENTITY_DECL_HPP__ -#define __KOKKOSBATCHED_SET_IDENTITY_DECL_HPP__ +#ifndef KOKKOSBATCHED_SET_IDENTITY_DECL_HPP +#define KOKKOSBATCHED_SET_IDENTITY_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Set_Decl.hpp b/batched/dense/src/KokkosBatched_Set_Decl.hpp index d33d186275..21dbd8b91a 100644 --- a/batched/dense/src/KokkosBatched_Set_Decl.hpp +++ 
b/batched/dense/src/KokkosBatched_Set_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SET_DECL_HPP__ -#define __KOKKOSBATCHED_SET_DECL_HPP__ +#ifndef KOKKOSBATCHED_SET_DECL_HPP +#define KOKKOSBATCHED_SET_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp b/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp index 119f5c6916..1d07980016 100644 --- a/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp +++ b/batched/dense/src/KokkosBatched_SolveLU_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SOLVELU_DECL_HPP__ -#define __KOKKOSBATCHED_SOLVELU_DECL_HPP__ +#ifndef KOKKOSBATCHED_SOLVELU_DECL_HPP +#define KOKKOSBATCHED_SOLVELU_DECL_HPP /// \author Vinh Dang (vqdang@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp b/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp index c881a0b0f7..344755bbf8 100644 --- a/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp +++ b/batched/dense/src/KokkosBatched_SolveUTV_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SOLVE_UTV_DECL_HPP__ -#define __KOKKOSBATCHED_SOLVE_UTV_DECL_HPP__ +#ifndef KOKKOSBATCHED_SOLVE_UTV_DECL_HPP +#define KOKKOSBATCHED_SOLVE_UTV_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Trmm_Decl.hpp b/batched/dense/src/KokkosBatched_Trmm_Decl.hpp index c284ed63b2..1444dad486 100644 --- a/batched/dense/src/KokkosBatched_Trmm_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Trmm_Decl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_TRMM_DECL_HPP__ -#define __KOKKOSBATCHED_TRMM_DECL_HPP__ +#ifndef KOKKOSBATCHED_TRMM_DECL_HPP +#define KOKKOSBATCHED_TRMM_DECL_HPP #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Vector.hpp" @@ -28,4 +28,4 @@ struct 
SerialTrmm { KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A, const BViewType &B); }; } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_TRMM_DECL_HPP__ +#endif // KOKKOSBATCHED_TRMM_DECL_HPP diff --git a/batched/dense/src/KokkosBatched_Trsm_Decl.hpp b/batched/dense/src/KokkosBatched_Trsm_Decl.hpp index d2220953cc..f06cb6245e 100644 --- a/batched/dense/src/KokkosBatched_Trsm_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Trsm_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSM_DECL_HPP__ -#define __KOKKOSBATCHED_TRSM_DECL_HPP__ +#ifndef KOKKOSBATCHED_TRSM_DECL_HPP +#define KOKKOSBATCHED_TRSM_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Trsv_Decl.hpp b/batched/dense/src/KokkosBatched_Trsv_Decl.hpp index e3da43a95d..d711085c9b 100644 --- a/batched/dense/src/KokkosBatched_Trsv_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Trsv_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_TRSV_DECL_HPP__ -#define __KOKKOSBATCHED_TRSV_DECL_HPP__ +#ifndef KOKKOSBATCHED_TRSV_DECL_HPP +#define KOKKOSBATCHED_TRSV_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Trtri_Decl.hpp b/batched/dense/src/KokkosBatched_Trtri_Decl.hpp index 8c8e6121a3..f650fd1cce 100644 --- a/batched/dense/src/KokkosBatched_Trtri_Decl.hpp +++ b/batched/dense/src/KokkosBatched_Trtri_Decl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_TRTRI_DECL_HPP__ -#define __KOKKOSBATCHED_TRTRI_DECL_HPP__ +#ifndef KOKKOSBATCHED_TRTRI_DECL_HPP +#define KOKKOSBATCHED_TRTRI_DECL_HPP #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Vector.hpp" @@ -28,4 +28,4 @@ struct SerialTrtri { KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A); }; } // namespace KokkosBatched -#endif // __KOKKOSBATCHED_TRTRI_DECL_HPP__ 
+#endif // KOKKOSBATCHED_TRTRI_DECL_HPP diff --git a/batched/dense/src/KokkosBatched_UTV_Decl.hpp b/batched/dense/src/KokkosBatched_UTV_Decl.hpp index bae2780e10..318f5ddc98 100644 --- a/batched/dense/src/KokkosBatched_UTV_Decl.hpp +++ b/batched/dense/src/KokkosBatched_UTV_Decl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_UTV_DECL_HPP__ -#define __KOKKOSBATCHED_UTV_DECL_HPP__ +#ifndef KOKKOSBATCHED_UTV_DECL_HPP +#define KOKKOSBATCHED_UTV_DECL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Vector.hpp b/batched/dense/src/KokkosBatched_Vector.hpp index e44af7bc04..1eafbfc9ad 100644 --- a/batched/dense/src/KokkosBatched_Vector.hpp +++ b/batched/dense/src/KokkosBatched_Vector.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_HPP__ -#define __KOKKOSBATCHED_VECTOR_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_HPP +#define KOKKOSBATCHED_VECTOR_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Vector_SIMD.hpp b/batched/dense/src/KokkosBatched_Vector_SIMD.hpp index 52a73deda4..82c6e72cb6 100644 --- a/batched/dense/src/KokkosBatched_Vector_SIMD.hpp +++ b/batched/dense/src/KokkosBatched_Vector_SIMD.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_VECTOR_SIMD_HPP__ -#define __KOKKOSBATCHED_VECTOR_SIMD_HPP__ +#ifndef KOKKOSBATCHED_VECTOR_SIMD_HPP +#define KOKKOSBATCHED_VECTOR_SIMD_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/batched/dense/src/KokkosBatched_Xpay.hpp b/batched/dense/src/KokkosBatched_Xpay.hpp index 51418fd81a..c8d7c374f3 100644 --- a/batched/dense/src/KokkosBatched_Xpay.hpp +++ b/batched/dense/src/KokkosBatched_Xpay.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_XPAY_HPP__ -#define 
__KOKKOSBATCHED_XPAY_HPP__ +#ifndef KOKKOSBATCHED_XPAY_HPP +#define KOKKOSBATCHED_XPAY_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/dense/unit_test/Test_Batched_Dense.hpp b/batched/dense/unit_test/Test_Batched_Dense.hpp index 76215b58f8..2378e5ff01 100644 --- a/batched/dense/unit_test/Test_Batched_Dense.hpp +++ b/batched/dense/unit_test/Test_Batched_Dense.hpp @@ -52,6 +52,17 @@ #include "Test_Batched_SerialPttrf.hpp" #include "Test_Batched_SerialPttrf_Real.hpp" #include "Test_Batched_SerialPttrf_Complex.hpp" +#include "Test_Batched_SerialPttrs.hpp" +#include "Test_Batched_SerialPttrs_Real.hpp" +#include "Test_Batched_SerialPttrs_Complex.hpp" +#include "Test_Batched_SerialPbtrf.hpp" +#include "Test_Batched_SerialPbtrf_Real.hpp" +#include "Test_Batched_SerialPbtrf_Complex.hpp" +#include "Test_Batched_SerialPbtrs.hpp" +#include "Test_Batched_SerialPbtrs_Real.hpp" +#include "Test_Batched_SerialPbtrs_Complex.hpp" +#include "Test_Batched_SerialLaswp.hpp" +#include "Test_Batched_SerialIamax.hpp" // Team Kernels #include "Test_Batched_TeamAxpy.hpp" diff --git a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp index f536f220d3..eccdba50d1 100644 --- a/batched/dense/unit_test/Test_Batched_DenseUtils.hpp +++ b/batched/dense/unit_test/Test_Batched_DenseUtils.hpp @@ -145,6 +145,170 @@ void create_diagonal_matrix(InViewType& in, OutViewType& out, int k = 0) { Kokkos::deep_copy(out, h_out); } +/// \brief Creates a positive definite symmetric (PDS) matrix. +/// Takes a full random matrix and converts it to a full pds matrix. 
+/// +/// \tparam InViewType: Input type for the matrix, needs to be a 3D view +/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view +/// +/// \param in [in]: Input batched full random matrix, a rank 3 view +/// \param out [out]: Output batched full PDS matrix, a rank 3 view +/// +template +void random_to_pds(InViewType& in, OutViewType& out) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int N = in.extent(0), BlkSize = in.extent(1); + using value_type = typename InViewType::non_const_value_type; + + for (std::size_t i = 0; i < InViewType::rank(); i++) { + assert(out.extent(i) == in.extent(i)); + } + + Kokkos::deep_copy(h_in, in); + Kokkos::deep_copy(h_out, 0.0); + + for (int i0 = 0; i0 < N; i0++) { + // Make a Hermitian matrix + for (int i1 = 0; i1 < BlkSize; i1++) { + for (int i2 = i1; i2 < BlkSize; i2++) { + if (i1 == i2) { + // Diagonal elements must be real + h_out(i0, i1, i2) = Kokkos::ArithTraits::real(h_in(i0, i1, i2)); + } else { + // Off-diagonal elements are complex and Hermitian + h_out(i0, i1, i2) = h_in(i0, i1, i2); + h_out(i0, i2, i1) = Kokkos::ArithTraits::conj(h_in(i0, i1, i2)); + } + } + } + + // Make the matrix diagonally dominant + for (int i1 = 0; i1 < BlkSize; i1++) { + value_type sum = 0; + for (int i2 = 0; i2 < BlkSize; i2++) { + if (i1 != i2) { + sum += Kokkos::abs(h_out(i0, i1, i2)); + } + } + h_out(i0, i1, i1) = sum + 1.0; + } + } + Kokkos::deep_copy(out, h_out); +} + +/// \brief Creates a banded positive definite symmetric (PDS) matrix. +/// Takes a full diagonally dominant matrix and converts it to a banded pds matrix either +/// in banded or full storage. 
+/// +/// \tparam InViewType: Input type for the matrix, needs to be a 3D view +/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view +/// \tparam UploType: Type indicating whether the matrix is upper or lower triangular +/// +/// \param in [in]: Input batched full matrix, a rank 3 view +/// \param out [out]: Output batched banded matrix (in banded or full storage), a rank 3 view +/// \param k [in]: Number of sub/super-diagonals for lower/upper triangular (default is 1) +/// \param band_storage [in]: Boolean flag indicating whether the output should be in banded storage format (default is +/// true) +template +void create_banded_pds_matrix(InViewType& in, OutViewType& out, int k = 1, bool band_storage = true) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + using value_type = typename InViewType::non_const_value_type; + const int N = in.extent(0), BlkSize = in.extent(1); + + Kokkos::deep_copy(h_in, in); + + if (band_storage) { + assert(out.extent(0) == in.extent(0)); + assert(out.extent(1) == static_cast(k + 1)); + assert(out.extent(2) == in.extent(2)); + if constexpr (std::is_same_v) { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = i1; i2 < BlkSize; i2++) { + h_out(i0, k - i1, i2) = h_in(i0, i2 - i1, i2); + } + } + } + } else { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = 0; i2 < BlkSize - i1; i2++) { + h_out(i0, i1, i2) = h_in(i0, i2 + i1, i2); + } + } + } + } + } else { + for (std::size_t i = 0; i < InViewType::rank(); i++) { + assert(out.extent(i) == in.extent(i)); + } + + if constexpr (std::is_same_v) { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < BlkSize; i1++) { + for (int i2 = i1; i2 < Kokkos::min(i1 + k + 1, BlkSize); i2++) { + h_out(i0, i1, i2) = h_in(i0, i1, i2); + h_out(i0, i2, i1) = Kokkos::ArithTraits::conj(h_in(i0, i1, i2)); + } + } + } + } else { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < 
BlkSize; i1++) { + for (int i2 = Kokkos::max(0, i1 - k); i2 <= i1; i2++) { + h_out(i0, i1, i2) = h_in(i0, i1, i2); + h_out(i0, i2, i1) = Kokkos::ArithTraits::conj(h_in(i0, i1, i2)); + } + } + } + } + } + Kokkos::deep_copy(out, h_out); +} + +/// \brief Converts a banded matrix to a full matrix. +/// Takes a banded matrix in banded storage and converts it to a full matrix. +/// +/// \tparam InViewType: Input type for the matrix, needs to be a 3D view +/// \tparam OutViewType: Output type for the matrix, needs to be a 3D view +/// \tparam UploType: Type indicating whether the matrix is upper or lower triangular +/// +/// \param in [in]: Input batched banded matrix, a rank 3 view +/// \param out [out]: Output batched full matrix, a rank 3 view +/// \param k [in]: Number of sub/super-diagonals for lower/upper triangular (default is 1) +/// +template +void banded_to_full(InViewType& in, OutViewType& out, int k = 1) { + auto h_in = Kokkos::create_mirror_view(in); + auto h_out = Kokkos::create_mirror_view(out); + const int N = in.extent(0), BlkSize = in.extent(2); + + Kokkos::deep_copy(h_in, in); + assert(in.extent(0) == out.extent(0)); + assert(in.extent(1) == static_cast(k + 1)); + assert(in.extent(2) == out.extent(2)); + if constexpr (std::is_same_v) { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = i1; i2 < BlkSize; i2++) { + h_out(i0, i2 - i1, i2) = h_in(i0, k - i1, i2); + } + } + } + } else { + for (int i0 = 0; i0 < N; i0++) { + for (int i1 = 0; i1 < k + 1; i1++) { + for (int i2 = 0; i2 < BlkSize - i1; i2++) { + h_out(i0, i2 + i1, i2) = h_in(i0, i1, i2); + } + } + } + } + Kokkos::deep_copy(out, h_out); +} + } // namespace KokkosBatched #endif // TEST_BATCHED_DENSE_HELPER_HPP diff --git a/batched/dense/unit_test/Test_Batched_SerialGemm.hpp b/batched/dense/unit_test/Test_Batched_SerialGemm.hpp index 144bb2251e..0b2ed4a162 100644 --- a/batched/dense/unit_test/Test_Batched_SerialGemm.hpp +++ 
b/batched/dense/unit_test/Test_Batched_SerialGemm.hpp @@ -25,6 +25,7 @@ #include "KokkosBatched_Gemm_Serial_Impl.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestVanilla.hpp" using namespace KokkosBatched; diff --git a/batched/dense/unit_test/Test_Batched_SerialIamax.hpp b/batched/dense/unit_test/Test_Batched_SerialIamax.hpp new file mode 100644 index 0000000000..38e9e78e04 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialIamax.hpp @@ -0,0 +1,277 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Iamax.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Iamax { + +template +struct Functor_BatchedSerialIamax { + using execution_space = typename DeviceType::execution_space; + XViewType m_x; + RViewType m_r; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialIamax(const XViewType &x, const RViewType &r) : m_x(x), m_r(r) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto sub_x = Kokkos::subview(m_x, k, Kokkos::ALL()); + auto iamax = KokkosBatched::SerialIamax::invoke(sub_x); + m_r(k) = static_cast(iamax); + } + + inline void run() { + using value_type = typename XViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialIamax"); + std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + 
Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, m_x.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } +}; + +/// \brief Implementation details of batched iamax analytical test +/// A0: [1, 2, 0] -> 1 +/// A1: [-5, 4, 3] -> 0 +/// A2: [0, 0, 0] -> 0 +/// A3: [0, -1, -1] -> 1 +/// +/// \param N [in] Batch size of A +template +void impl_test_batched_iamax_analytical(const std::size_t N) { + using View2DType = Kokkos::View; + using StridedView2DType = Kokkos::View; + using MaxView1DType = Kokkos::View; + + View2DType A0("A0", N, 3), A1("A1", N, 3), A2("A2", N, 3), A3("A3", N, 3); + MaxView1DType iamax0("iamax0", N), iamax_ref0("iamax_ref0", N), iamax1("iamax1", N), iamax_ref1("iamax_ref1", N), + iamax2("iamax2", N), iamax_ref2("iamax_ref2", N), iamax3("iamax3", N), iamax_ref3("iamax_ref3", N); + + // Testing incx argument with strided views + constexpr std::size_t incx = 2; + Kokkos::LayoutStride layout{N, incx, 3, N * incx}; + StridedView2DType A0_s("A0_s", layout), A1_s("A1_s", layout), A2_s("A2_s", layout), A3_s("A3_s", layout); + MaxView1DType iamax_s0("iamax_s0", N), iamax_s1("iamax_s1", N), iamax_s2("iamax_s2", N), iamax_s3("iamax_s3", N); + + // Initialize A0, A1, A2, A3 + auto h_A0 = Kokkos::create_mirror_view(A0); + auto h_A1 = Kokkos::create_mirror_view(A1); + auto h_A2 = Kokkos::create_mirror_view(A2); + auto h_A3 = Kokkos::create_mirror_view(A3); + + auto h_iamax_ref0 = Kokkos::create_mirror_view(iamax_ref0); + auto h_iamax_ref1 = Kokkos::create_mirror_view(iamax_ref1); + auto h_iamax_ref2 = Kokkos::create_mirror_view(iamax_ref2); + auto h_iamax_ref3 = Kokkos::create_mirror_view(iamax_ref3); + for (std::size_t k = 0; k < N; k++) { + h_A0(k, 0) = 1; + h_A0(k, 1) = 2; + h_A0(k, 2) = 0; + + h_A1(k, 0) = -5; + h_A1(k, 1) = 4; + h_A1(k, 2) = 3; + + h_A2(k, 0) = 0; + h_A2(k, 1) = 0; + h_A2(k, 2) = 0; + + h_A3(k, 0) = 0; + h_A3(k, 1) = -1; + h_A3(k, 2) = -1; + + h_iamax_ref0(k) = 1; 
+ h_iamax_ref1(k) = 0; + h_iamax_ref2(k) = 0; + h_iamax_ref3(k) = 1; + } + Kokkos::deep_copy(A0, h_A0); + Kokkos::deep_copy(A1, h_A1); + Kokkos::deep_copy(A2, h_A2); + Kokkos::deep_copy(A3, h_A3); + + // Strided view can be copied only on the same device + Kokkos::deep_copy(A0_s, A0); + Kokkos::deep_copy(A1_s, A1); + Kokkos::deep_copy(A2_s, A2); + Kokkos::deep_copy(A3_s, A3); + + Functor_BatchedSerialIamax(A0, iamax0).run(); + Functor_BatchedSerialIamax(A1, iamax1).run(); + Functor_BatchedSerialIamax(A2, iamax2).run(); + Functor_BatchedSerialIamax(A3, iamax3).run(); + + // For strided views + Functor_BatchedSerialIamax(A0_s, iamax_s0).run(); + Functor_BatchedSerialIamax(A1_s, iamax_s1).run(); + Functor_BatchedSerialIamax(A2_s, iamax_s2).run(); + Functor_BatchedSerialIamax(A3_s, iamax_s3).run(); + + Kokkos::fence(); + + // Copy to host for comparison + auto h_iamax0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax0); + auto h_iamax1 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax1); + auto h_iamax2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax2); + auto h_iamax3 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax3); + auto h_iamax_s0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax_s0); + auto h_iamax_s1 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax_s1); + auto h_iamax_s2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax_s2); + auto h_iamax_s3 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax_s3); + + // Check if max index is correct + for (std::size_t k = 0; k < N; k++) { + EXPECT_EQ(h_iamax0(k), h_iamax_ref0(k)); + EXPECT_EQ(h_iamax1(k), h_iamax_ref1(k)); + EXPECT_EQ(h_iamax2(k), h_iamax_ref2(k)); + EXPECT_EQ(h_iamax3(k), h_iamax_ref3(k)); + EXPECT_EQ(h_iamax_s0(k), h_iamax_ref0(k)); + EXPECT_EQ(h_iamax_s1(k), h_iamax_ref1(k)); + EXPECT_EQ(h_iamax_s2(k), h_iamax_ref2(k)); + EXPECT_EQ(h_iamax_s3(k), h_iamax_ref3(k)); + } +} + +/// \brief 
Implementation details of batched iamax test +/// +/// \param N [in] Batch size of vectors +/// \param BlkSize [in] Length of each vector in A +template +void impl_test_batched_iamax(const std::size_t N, const std::size_t BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View2DType = Kokkos::View; + using StridedView2DType = Kokkos::View; + using MaxView1DType = Kokkos::View; + + View2DType A("A", N, BlkSize); + MaxView1DType iamax("iamax", N), iamax_ref("iamax_ref", N); + + // Testing incx argument with strided views + constexpr std::size_t incx = 2; + Kokkos::LayoutStride layout{N, incx, BlkSize, N * incx}; + StridedView2DType A_s("A_s", layout); + MaxView1DType iamax_s("iamax_s", N); + + // Initialize A with random values + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + ScalarType randStart, randEnd; + + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + + // Strided view can be copied only on the same device + Kokkos::deep_copy(A_s, A); + + Functor_BatchedSerialIamax(A, iamax).run(); + + // For strided views + Functor_BatchedSerialIamax(A_s, iamax_s).run(); + + Kokkos::fence(); + + // Reference + auto h_iamax_ref = Kokkos::create_mirror_view(iamax_ref); + if (BlkSize == 0) { + // As in BLAS, we store 0 (0 in Fortran) for an empty vector + for (std::size_t k = 0; k < N; k++) { + h_iamax_ref(k) = 0; + } + } else { + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + for (std::size_t k = 0; k < N; k++) { + RealType amax = Kokkos::abs(h_A(k, 0)); + int iamax_tmp = 0; + for (std::size_t i = 1; i < BlkSize; i++) { + const RealType abs_A_i = Kokkos::abs(h_A(k, i)); + if (abs_A_i > amax) { + amax = abs_A_i; + iamax_tmp = static_cast(i); + } + } + h_iamax_ref(k) = iamax_tmp; + } + } + + // Copy to host for comparison + 
auto h_iamax = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax); + auto h_iamax_s = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), iamax_s); + + // Check if max index is correct + for (std::size_t k = 0; k < N; k++) { + EXPECT_EQ(h_iamax(k), h_iamax_ref(k)); + EXPECT_EQ(h_iamax_s(k), h_iamax_ref(k)); + } +} + +} // namespace Iamax +} // namespace Test + +template +int test_batched_iamax() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + Test::Iamax::impl_test_batched_iamax_analytical(1); + Test::Iamax::impl_test_batched_iamax_analytical(2); + for (std::size_t i = 0; i < 10; i++) { + Test::Iamax::impl_test_batched_iamax(1, i); + Test::Iamax::impl_test_batched_iamax(2, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + Test::Iamax::impl_test_batched_iamax_analytical(1); + Test::Iamax::impl_test_batched_iamax_analytical(2); + for (std::size_t i = 0; i < 10; i++) { + Test::Iamax::impl_test_batched_iamax(1, i); + Test::Iamax::impl_test_batched_iamax(2, i); + } + } +#endif + + return 0; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_iamax_float) { test_batched_iamax(); } +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_iamax_double) { test_batched_iamax(); } +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_iamax_fcomplex) { test_batched_iamax>(); } +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_iamax_dcomplex) { test_batched_iamax>(); } +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp b/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp index 6f11154471..fd0ca336bf 100644 --- a/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialInverseLU.hpp @@ -143,7 +143,7 @@ void impl_test_batched_inverselu(const 
int N, const int BlkSize) { /// randomized input testing views AViewType a0("a0", N, BlkSize, BlkSize); AViewType a1("a1", N, BlkSize, BlkSize); - WViewType w("w", N, BlkSize * BlkSize); + WViewType w("w", N, BlkSize * static_cast(BlkSize)); AViewType c0("c0", N, BlkSize, BlkSize); Kokkos::Random_XorShift64_Pool random(13718); diff --git a/batched/dense/unit_test/Test_Batched_SerialLaswp.hpp b/batched/dense/unit_test/Test_Batched_SerialLaswp.hpp new file mode 100644 index 0000000000..c515ad4088 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialLaswp.hpp @@ -0,0 +1,638 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Laswp.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Laswp { + +template +struct Functor_BatchedSerialLaswp { + using execution_space = typename DeviceType::execution_space; + PivViewType m_ipiv; + AViewType m_a; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialLaswp(const PivViewType &ipiv, const AViewType &a) : m_ipiv(ipiv), m_a(a) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ArgDirect &, const int k, int &info) const { + auto sub_ipiv = Kokkos::subview(m_ipiv, k, Kokkos::ALL); + if constexpr (AViewType::rank == 3) { + auto sub_a = Kokkos::subview(m_a, k, Kokkos::ALL, Kokkos::ALL); + info += KokkosBatched::SerialLaswp::invoke(sub_ipiv, sub_a); + } else { + auto sub_a = Kokkos::subview(m_a, k, Kokkos::ALL); + info += KokkosBatched::SerialLaswp::invoke(sub_ipiv, sub_a); + } + } + + inline int run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialLaswp"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + int info_sum = 0; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, m_a.extent(0)); + Kokkos::parallel_reduce(name.c_str(), policy, *this, info_sum); + Kokkos::Profiling::popRegion(); + return info_sum; + } +}; + +/// \brief Implementation details of batched laswp analytical test +/// Confirm A = Ref (permuted), where +/// A0: [[4], +/// [1], +/// [2]] +/// p0: [1, 2, 0] +/// Initial 0<->1 1<->2 2<->0 +/// Forward: [4,1,2] -> [1,4,2] -> [1,2,4] -> [4,2,1] +/// Initial 2<->0 1<->2 0<->1 +/// Backward: [4,1,2] -> [2,1,4] -> [2,4,1] -> [4,2,1] +/// +/// A1: [[4, 1, 5], +/// [2, 3, 7], +/// [6, 0, 8]] +/// p1: [1, 2, 0] +/// Initial 0<->1 1<->2 2<->0 +/// 
Forward: [[4, 1, 5], -> [[2, 3, 7], -> [[2, 3, 7], -> [[4, 1, 5], +/// [2, 3, 7], [4, 1, 5], [6, 0, 8], [6, 0, 8], +/// [6, 0, 8]] [6, 0, 8]] [4, 1, 5]] [2, 3, 7]] +/// Initial 2<->0 1<->2 0<->1 +/// Backward: [[4, 1, 5], -> [[6, 0, 8], -> [[6, 0, 8], -> [[4, 1, 5], +/// [2, 3, 7], [2, 3, 7], [4, 1, 5], [6, 0, 8], +/// [6, 0, 8]] [4, 1, 5]] [2, 3, 7]] [2, 3, 7]] +/// +/// A2: [[5, 1], +/// [2, 4], +/// [3, 0]] +/// p2: [2, 0, 1] +/// Initial 0<->2 1<->0 2<->1 +/// Forward: [[5, 1], -> [[3, 0], -> [[2, 4], -> [[2, 4], +/// [2, 4], [2, 4], [3, 0], [5, 1], +/// [3, 0]] [5, 1]] [5, 1]] [3, 0]] +/// Initial 2<->1 1<->0 0<->2 +/// Backward: [[5, 1], -> [[5, 1], -> [[3, 0], -> [[2, 4], +/// [2, 4], [3, 0], [5, 1], [5, 1], +/// [3, 0]] [2, 4]] [2, 4]] [3, 0]] +/// +/// \param N [in] Batch size of matrices +template +void impl_test_batched_laswp_analytical(const std::size_t N) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + using StridedView2DType = Kokkos::View; + using StridedView3DType = Kokkos::View; + using PivView2DType = Kokkos::View; + + View2DType A0("A0", N, 3), Ref0("Ref0", N, 3), A0_identity("A0_identity", N, 3), Ref0_identity("Ref0_identity", N, 3); + View3DType A1("A1", N, 3, 3), Ref1("Ref1", N, 3, 3), A1_identity("A1_identity", N, 3, 3), + Ref1_identity("Ref1_identity", N, 3, 3); + View3DType A2("A2", N, 3, 2), Ref2("Ref2", N, 3, 2), A2_identity("A2_identity", N, 3, 2), + Ref2_identity("Ref2_identity", N, 3, 2); + PivView2DType ipiv0("ipiv0", N, 3), ipiv1("ipiv1", N, 3), ipiv2("ipiv2", N, 3); + + // Testing incx argument with strided views + constexpr std::size_t incx = 2; + Kokkos::LayoutStride layout0{N, incx, 3, N * incx}; + StridedView2DType A0_s("A0_s", layout0), A0_s_identity("A0_s_identity", layout0); + + Kokkos::LayoutStride layout1{N, incx, 3, N * incx, 3, N * incx * 3}; + StridedView3DType A1_s("A1_s", layout1), 
A1_s_identity("A1_s_identity", layout1); + + Kokkos::LayoutStride layout2{N, incx, 3, N * incx, 2, N * incx * 3}; + StridedView3DType A2_s("A2_s", layout2), A2_s_identity("A2_s_identity", layout2); + + // Initialize A0, A1, and A2 and their references with the analytical values + auto h_A0 = Kokkos::create_mirror_view(A0); + auto h_A1 = Kokkos::create_mirror_view(A1); + auto h_A2 = Kokkos::create_mirror_view(A2); + auto h_Ref0 = Kokkos::create_mirror_view(Ref0); + auto h_Ref1 = Kokkos::create_mirror_view(Ref1); + auto h_Ref2 = Kokkos::create_mirror_view(Ref2); + + for (std::size_t ib = 0; ib < N; ib++) { + h_A0(ib, 0) = 4.0; + h_A0(ib, 1) = 1.0; + h_A0(ib, 2) = 2.0; + + h_Ref0(ib, 0) = 4.0; + h_Ref0(ib, 1) = 2.0; + h_Ref0(ib, 2) = 1.0; + + h_A1(ib, 0, 0) = 4.0; + h_A1(ib, 0, 1) = 1.0; + h_A1(ib, 0, 2) = 5.0; + h_A1(ib, 1, 0) = 2.0; + h_A1(ib, 1, 1) = 3.0; + h_A1(ib, 1, 2) = 7.0; + h_A1(ib, 2, 0) = 6.0; + h_A1(ib, 2, 1) = 0.0; + h_A1(ib, 2, 2) = 8.0; + + h_Ref1(ib, 0, 0) = 4.0; + h_Ref1(ib, 0, 1) = 1.0; + h_Ref1(ib, 0, 2) = 5.0; + h_Ref1(ib, 1, 0) = 6.0; + h_Ref1(ib, 1, 1) = 0.0; + h_Ref1(ib, 1, 2) = 8.0; + h_Ref1(ib, 2, 0) = 2.0; + h_Ref1(ib, 2, 1) = 3.0; + h_Ref1(ib, 2, 2) = 7.0; + + h_A2(ib, 0, 0) = 5.0; + h_A2(ib, 0, 1) = 1.0; + h_A2(ib, 1, 0) = 2.0; + h_A2(ib, 1, 1) = 4.0; + h_A2(ib, 2, 0) = 3.0; + h_A2(ib, 2, 1) = 0.0; + + h_Ref2(ib, 0, 0) = 2.0; + h_Ref2(ib, 0, 1) = 4.0; + h_Ref2(ib, 1, 0) = 5.0; + h_Ref2(ib, 1, 1) = 1.0; + h_Ref2(ib, 2, 0) = 3.0; + h_Ref2(ib, 2, 1) = 0.0; + } + Kokkos::deep_copy(A0, h_A0); + Kokkos::deep_copy(A1, h_A1); + Kokkos::deep_copy(A2, h_A2); + + // Strided view can be copied only on the same device + Kokkos::deep_copy(A0_s, A0); + Kokkos::deep_copy(A1_s, A1); + Kokkos::deep_copy(A2_s, A2); + + // Copy A to Ref_identity + Kokkos::deep_copy(Ref0_identity, A0); + Kokkos::deep_copy(Ref1_identity, A1); + Kokkos::deep_copy(Ref2_identity, A2); + + // Permute ipiv + auto h_ipiv0 = Kokkos::create_mirror_view(ipiv0); + auto h_ipiv1 = Kokkos::create_mirror_view(ipiv1); + 
auto h_ipiv2 = Kokkos::create_mirror_view(ipiv2); + + for (std::size_t ib = 0; ib < N; ib++) { + h_ipiv0(ib, 0) = 1; + h_ipiv0(ib, 1) = 2; + h_ipiv0(ib, 2) = 0; + + h_ipiv1(ib, 0) = 1; + h_ipiv1(ib, 1) = 2; + h_ipiv1(ib, 2) = 0; + + h_ipiv2(ib, 0) = 2; + h_ipiv2(ib, 1) = 0; + h_ipiv2(ib, 2) = 1; + } + + Kokkos::deep_copy(ipiv0, h_ipiv0); + Kokkos::deep_copy(ipiv1, h_ipiv1); + Kokkos::deep_copy(ipiv2, h_ipiv2); + + auto info0 = Functor_BatchedSerialLaswp(ipiv0, A0).run(); + auto info1 = Functor_BatchedSerialLaswp(ipiv1, A1).run(); + auto info2 = Functor_BatchedSerialLaswp(ipiv2, A2).run(); + + Kokkos::fence(); + EXPECT_EQ(info0, 0); + EXPECT_EQ(info1, 0); + EXPECT_EQ(info2, 0); + + // For strided views + info0 = Functor_BatchedSerialLaswp(ipiv0, A0_s).run(); + info1 = Functor_BatchedSerialLaswp(ipiv1, A1_s).run(); + info2 = Functor_BatchedSerialLaswp(ipiv2, A2_s).run(); + + Kokkos::fence(); + EXPECT_EQ(info0, 0); + EXPECT_EQ(info1, 0); + EXPECT_EQ(info2, 0); + + // Copy permuted A to A_identity which is permuted back to original A + Kokkos::deep_copy(A0_identity, A0); + Kokkos::deep_copy(A1_identity, A1); + Kokkos::deep_copy(A2_identity, A2); + + Kokkos::deep_copy(A0_s_identity, A0_s); + Kokkos::deep_copy(A1_s_identity, A1_s); + Kokkos::deep_copy(A2_s_identity, A2_s); + + using InvDirect = + typename std::conditional_t, Direct::Backward, Direct::Forward>; + + // Permute A_identity in inverse direction to get original A + info0 = Functor_BatchedSerialLaswp(ipiv0, A0_identity).run(); + info1 = Functor_BatchedSerialLaswp(ipiv1, A1_identity).run(); + info2 = Functor_BatchedSerialLaswp(ipiv2, A2_identity).run(); + + Kokkos::fence(); + EXPECT_EQ(info0, 0); + EXPECT_EQ(info1, 0); + EXPECT_EQ(info2, 0); + + // For strided views + info0 = + Functor_BatchedSerialLaswp(ipiv0, A0_s_identity).run(); + info1 = + Functor_BatchedSerialLaswp(ipiv1, A1_s_identity).run(); + info2 = + Functor_BatchedSerialLaswp(ipiv2, A2_s_identity).run(); + + Kokkos::fence(); + EXPECT_EQ(info0, 0); + 
EXPECT_EQ(info1, 0); + EXPECT_EQ(info2, 0); + + RealType eps = 1.0e1 * ats::epsilon(); + + Kokkos::deep_copy(h_A0, A0); + Kokkos::deep_copy(h_A1, A1); + Kokkos::deep_copy(h_A2, A2); + auto h_A0_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A0_identity); + auto h_A1_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A1_identity); + auto h_A2_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A2_identity); + + auto h_Ref0_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ref0_identity); + auto h_Ref1_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ref1_identity); + auto h_Ref2_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ref2_identity); + // Check if A is permuted correctly and A_identity is restored + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t i = 0; i < 3; i++) { + EXPECT_NEAR_KK_REL(h_A0(ib, i), h_Ref0(ib, i), eps); + EXPECT_NEAR_KK_REL(h_A0_identity(ib, i), h_Ref0_identity(ib, i), eps); + for (std::size_t j = 0; j < 2; j++) { + EXPECT_NEAR_KK_REL(h_A2(ib, i, j), h_Ref2(ib, i, j), eps); + EXPECT_NEAR_KK_REL(h_A2_identity(ib, i, j), h_Ref2_identity(ib, i, j), eps); + } + for (std::size_t j = 0; j < 3; j++) { + EXPECT_NEAR_KK_REL(h_A1(ib, i, j), h_Ref1(ib, i, j), eps); + EXPECT_NEAR_KK_REL(h_A1_identity(ib, i, j), h_Ref1_identity(ib, i, j), eps); + } + } + } + + // Testing for strided views, reusing A0, A1, A2, A0_identity, A1_identity, A2_identity + Kokkos::deep_copy(A0, A0_s); + Kokkos::deep_copy(h_A0, A0); + Kokkos::deep_copy(A1, A1_s); + Kokkos::deep_copy(h_A1, A1); + Kokkos::deep_copy(A2, A2_s); + Kokkos::deep_copy(h_A2, A2); + Kokkos::deep_copy(A0_identity, A0_s_identity); + Kokkos::deep_copy(h_A0_identity, A0_identity); + Kokkos::deep_copy(A1_identity, A1_s_identity); + Kokkos::deep_copy(h_A1_identity, A1_identity); + Kokkos::deep_copy(A2_identity, A2_s_identity); + Kokkos::deep_copy(h_A2_identity, A2_identity); + for 
(std::size_t ib = 0; ib < N; ib++) { + for (std::size_t i = 0; i < 3; i++) { + EXPECT_NEAR_KK_REL(h_A0(ib, i), h_Ref0(ib, i), eps); + EXPECT_NEAR_KK_REL(h_A0_identity(ib, i), h_Ref0_identity(ib, i), eps); + for (std::size_t j = 0; j < 2; j++) { + EXPECT_NEAR_KK_REL(h_A2(ib, i, j), h_Ref2(ib, i, j), eps); + EXPECT_NEAR_KK_REL(h_A2_identity(ib, i, j), h_Ref2_identity(ib, i, j), eps); + } + for (std::size_t j = 0; j < 3; j++) { + EXPECT_NEAR_KK_REL(h_A1(ib, i, j), h_Ref1(ib, i, j), eps); + EXPECT_NEAR_KK_REL(h_A1_identity(ib, i, j), h_Ref1_identity(ib, i, j), eps); + } + } + } +} + +/// \brief Implementation details of batched laswp test on vectors +/// Apply pivot to vector +/// +/// \param N [in] Batch size of vectors +/// \param BlkSize [in] Length of vector b +template +void impl_test_batched_laswp_vector(const std::size_t N, const std::size_t BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View2DType = Kokkos::View; + using StridedView2DType = Kokkos::View; + using PivView2DType = Kokkos::View; + + View2DType b("b", N, BlkSize), Ref("Ref", N, BlkSize), b_identity("b_identity", N, BlkSize), + Ref_identity("Ref_identity", N, BlkSize); + PivView2DType ipiv("ipiv", N, BlkSize); + + // Testing incx argument with strided views + constexpr std::size_t incx = 2; + Kokkos::LayoutStride layout{N, incx, BlkSize, N * incx}; + StridedView2DType b_s("b_s", layout), b_s_identity("b_s_identity", layout); + + // Initialize b with random numbers + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + ScalarType randStart, randEnd; + + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b, rand_pool, randStart, randEnd); + Kokkos::deep_copy(Ref, b); // This Ref is used to store permuted b + Kokkos::deep_copy(Ref_identity, b); + Kokkos::deep_copy(b_s, b); + + // Permute ipiv + auto h_ipiv = Kokkos::create_mirror_view(ipiv); + 
std::vector ipiv_vec(BlkSize); + for (int i = 0; i < static_cast(BlkSize); i++) { + ipiv_vec[i] = i; + } + auto rng = std::default_random_engine{}; + std::shuffle(ipiv_vec.begin(), ipiv_vec.end(), rng); + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t i = 0; i < BlkSize; i++) { + h_ipiv(ib, i) = ipiv_vec[i]; + } + } + Kokkos::deep_copy(ipiv, h_ipiv); + + auto info = Functor_BatchedSerialLaswp(ipiv, b).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // For strided views + info = Functor_BatchedSerialLaswp(ipiv, b_s).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // Copy permuted b to b_identity which is permuted back to original b + Kokkos::deep_copy(b_identity, b); + Kokkos::deep_copy(b_s_identity, b_s); + + // Permute b_identity in inverse direction to get original b + using InvDirect = + typename std::conditional_t, Direct::Backward, Direct::Forward>; + Functor_BatchedSerialLaswp(ipiv, b_identity).run(); + Functor_BatchedSerialLaswp(ipiv, b_s_identity).run(); + + // Make a reference + auto h_Ref = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ref); + for (std::size_t ib = 0; ib < N; ib++) { + if constexpr (std::is_same_v) { + // Permute Ref by forward pivoting + for (int i = 0; i < static_cast(BlkSize); i++) { + if (h_ipiv(ib, i) != i) { + Kokkos::kokkos_swap(h_Ref(ib, h_ipiv(ib, i)), h_Ref(ib, i)); + } + } + } else { + // Permute Ref by backward pivoting + for (int i = (static_cast(BlkSize) - 1); i >= 0; --i) { + if (h_ipiv(ib, i) != i) { + Kokkos::kokkos_swap(h_Ref(ib, h_ipiv(ib, i)), h_Ref(ib, i)); + } + } + } + } + + RealType eps = 1.0e1 * ats::epsilon(); + + auto h_b = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b); + auto h_b_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_identity); + auto h_Ref_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ref_identity); + // Check b is permuted correctly and b_identity is restored + for (std::size_t ib = 0; ib < N; ib++) { + for 
(std::size_t i = 0; i < BlkSize; i++) { + EXPECT_NEAR_KK_REL(h_b(ib, i), h_Ref(ib, i), eps); + EXPECT_NEAR_KK_REL(h_b_identity(ib, i), h_Ref_identity(ib, i), eps); + } + } + + // Testing for strided views, reusing b and b_identity + Kokkos::deep_copy(b, b_s); + Kokkos::deep_copy(h_b, b); + Kokkos::deep_copy(b_identity, b_s_identity); + Kokkos::deep_copy(h_b_identity, b_identity); + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t i = 0; i < BlkSize; i++) { + EXPECT_NEAR_KK_REL(h_b(ib, i), h_Ref(ib, i), eps); + EXPECT_NEAR_KK_REL(h_b_identity(ib, i), h_Ref_identity(ib, i), eps); + } + } +} + +/// \brief Implementation details of batched laswp test on matrices +/// Apply pivot to matrix +/// +/// \param N [in] Batch size of vectors +/// \param BlkSize [in] Row size of matrix A +template +void impl_test_batched_laswp_matrix(const std::size_t N, const std::size_t BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View3DType = Kokkos::View; + using StridedView3DType = Kokkos::View; + using PivView2DType = Kokkos::View; + + // In order for the tests on non-square matrices, we fix the column size to 5 + // and scan with the row size + constexpr std::size_t M = 5; + View3DType A("A", N, BlkSize, M), Ref("Ref", N, BlkSize, M), A_identity("A_identity", N, BlkSize, M), + Ref_identity("Ref_identity", N, BlkSize, M); + PivView2DType ipiv("ipiv", N, BlkSize); + + // Testing incx argument with strided views + constexpr std::size_t incx = 2; + Kokkos::LayoutStride layout{N, incx, BlkSize, N * incx, M, N * incx * BlkSize}; + StridedView3DType A_s("A_s", layout), A_s_identity("A_s_identity", layout); + + // Initialize A with random numbers + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + ScalarType randStart, randEnd; + + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + 
Kokkos::deep_copy(Ref, A); // This Ref is used to store permuted A + Kokkos::deep_copy(Ref_identity, A); + Kokkos::deep_copy(A_s, A); + + // Permute ipiv + auto h_ipiv = Kokkos::create_mirror_view(ipiv); + std::vector ipiv_vec(BlkSize); + for (int i = 0; i < static_cast(BlkSize); i++) { + ipiv_vec[i] = i; + } + auto rng = std::default_random_engine{}; + std::shuffle(ipiv_vec.begin(), ipiv_vec.end(), rng); + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t i = 0; i < BlkSize; i++) { + h_ipiv(ib, i) = ipiv_vec[i]; + } + } + Kokkos::deep_copy(ipiv, h_ipiv); + + auto info = Functor_BatchedSerialLaswp(ipiv, A).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // For strided views + info = Functor_BatchedSerialLaswp(ipiv, A_s).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // Copy permuted A to A_identity which is permuted back to original A + Kokkos::deep_copy(A_identity, A); + Kokkos::deep_copy(A_s_identity, A_s); + + // Permute A_identity in inverse direction to get original A + using InvDirect = + typename std::conditional_t, Direct::Backward, Direct::Forward>; + info = Functor_BatchedSerialLaswp(ipiv, A_identity).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + info = Functor_BatchedSerialLaswp(ipiv, A_s_identity).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // permute Ref by ipiv + auto h_Ref = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ref); + for (std::size_t ib = 0; ib < N; ib++) { + if constexpr (std::is_same_v) { + // Permute Ref by forward pivoting + for (int i = 0; i < static_cast(BlkSize); i++) { + if (h_ipiv(ib, i) != i) { + for (int j = 0; j < static_cast(M); j++) { + Kokkos::kokkos_swap(h_Ref(ib, h_ipiv(ib, i), j), h_Ref(ib, i, j)); + } + } + } + } else { + // Permute Ref by backward pivoting + for (int i = (static_cast(BlkSize) - 1); i >= 0; --i) { + if (h_ipiv(ib, i) != i) { + for (int j = 0; j < static_cast(M); j++) { + Kokkos::kokkos_swap(h_Ref(ib, h_ipiv(ib, i), j), h_Ref(ib, i, j)); + } + } + } + } 
+ } + + RealType eps = 1.0e1 * ats::epsilon(); + + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_identity); + auto h_Ref_identity = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ref_identity); + // Check A is permuted correctly and A_identity is restored + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t i = 0; i < BlkSize; i++) { + for (std::size_t j = 0; j < M; j++) { + EXPECT_NEAR_KK_REL(h_A(ib, i, j), h_Ref(ib, i, j), eps); + EXPECT_NEAR_KK_REL(h_A_identity(ib, i, j), h_Ref_identity(ib, i, j), eps); + } + } + } + + // Testing for strided views, reusing A and A_identity + Kokkos::deep_copy(A, A_s); + Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(A_identity, A_s_identity); + Kokkos::deep_copy(h_A_identity, A_identity); + for (std::size_t ib = 0; ib < N; ib++) { + for (std::size_t i = 0; i < BlkSize; i++) { + for (std::size_t j = 0; j < M; j++) { + EXPECT_NEAR_KK_REL(h_A(ib, i, j), h_Ref(ib, i, j), eps); + EXPECT_NEAR_KK_REL(h_A_identity(ib, i, j), h_Ref_identity(ib, i, j), eps); + } + } + } +} + +} // namespace Laswp +} // namespace Test + +template +int test_batched_laswp() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + Test::Laswp::impl_test_batched_laswp_analytical(1); + Test::Laswp::impl_test_batched_laswp_analytical(2); + for (int i = 0; i < 10; i++) { + Test::Laswp::impl_test_batched_laswp_vector(1, i); + Test::Laswp::impl_test_batched_laswp_vector(2, i); + Test::Laswp::impl_test_batched_laswp_matrix(1, i); + Test::Laswp::impl_test_batched_laswp_matrix(2, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + Test::Laswp::impl_test_batched_laswp_analytical(1); + Test::Laswp::impl_test_batched_laswp_analytical(2); + for (int i = 0; i < 10; i++) { + Test::Laswp::impl_test_batched_laswp_vector(1, i); + 
Test::Laswp::impl_test_batched_laswp_vector(2, i); + Test::Laswp::impl_test_batched_laswp_matrix(1, i); + Test::Laswp::impl_test_batched_laswp_matrix(2, i); + } + } +#endif + + return 0; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_laswp_f_float) { test_batched_laswp(); } +TEST_F(TestCategory, test_batched_laswp_b_float) { test_batched_laswp(); } +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_laswp_f_double) { test_batched_laswp(); } +TEST_F(TestCategory, test_batched_laswp_b_double) { test_batched_laswp(); } +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_laswp_f_fcomplex) { + test_batched_laswp, Direct::Forward>(); +} +TEST_F(TestCategory, test_batched_laswp_b_fcomplex) { + test_batched_laswp, Direct::Backward>(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_laswp_f_dcomplex) { + test_batched_laswp, Direct::Forward>(); +} +TEST_F(TestCategory, test_batched_laswp_b_dcomplex) { + test_batched_laswp, Direct::Backward>(); +} +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialPbtrf.hpp b/batched/dense/unit_test/Test_Batched_SerialPbtrf.hpp new file mode 100644 index 0000000000..0b16fab242 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPbtrf.hpp @@ -0,0 +1,322 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Pbtrf.hpp" +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Pbtrf { + +template +struct ParamTag { + using uplo = U; +}; + +template +struct Functor_BatchedSerialPbtrf { + using execution_space = typename DeviceType::execution_space; + ABViewType _ab; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialPbtrf(const ABViewType &ab) : _ab(ab) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k, int &info) const { + auto sub_ab = Kokkos::subview(_ab, k, Kokkos::ALL(), Kokkos::ALL()); + + info += KokkosBatched::SerialPbtrf::invoke(sub_ab); + } + + inline int run() { + using value_type = typename ABViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPbtrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + int info_sum = 0; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _ab.extent(0)); + Kokkos::parallel_reduce(name.c_str(), policy, *this, info_sum); + Kokkos::Profiling::popRegion(); + return info_sum; + } +}; + +template +struct Functor_BatchedSerialGemm { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + BViewType _b; + CViewType _c; + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGemm(const ScalarType alpha, const AViewType &a, const BViewType &b, const ScalarType beta, + const CViewType &c) + : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, 
Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke(_alpha, aa, bb, _beta, cc); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPbtrf"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +/// \brief Implementation details of batched pbtrf test +/// Confirm A = U**H * U or L * L**H, where +/// For full storage, +/// A: [[4, 1], +/// [1, 4]] +/// L: [[sqrt(4), 0], +/// [1/sqrt(4), sqrt(4 - (1/sqrt(4))**2)] +/// U: [[sqrt(4), 1/sqrt(4)], +/// [0, sqrt(4 - (1/sqrt(4))**2)] +/// +/// For lower banded storage, Ab = Lb * Lb**H +/// Ab: [[4, 4], +/// [1, 0]] +/// Lb: [[sqrt(4), sqrt(4 - (1/sqrt(4))**2)], +/// [1/sqrt(4), 0]] +/// +/// For upper banded storage, Ab = Ub**H * Ub +/// Ab: [[0, 1], +/// [4, 4]] +/// Ub: [[0, 1/sqrt(4)], +/// [sqrt(4), sqrt(4 - (1/sqrt(4))**2)]] +/// \param N [in] Batch size of AB +void impl_test_batched_pbtrf_analytical(const int N) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View3DType = Kokkos::View; + + constexpr int BlkSize = 2, k = 1; + View3DType A("A", N, BlkSize, BlkSize), Ab("Ab", N, k + 1, BlkSize), + Ab_ref("Ab_ref", N, k + 1, BlkSize); // Banded matrix + + auto h_A = Kokkos::create_mirror_view(A); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + h_A(ib, i, j) = i == j ? 
4.0 : 1.0; + } + } + } + + Kokkos::deep_copy(A, h_A); + + // Create banded triangluar matrix in normal and banded storage + using ArgUplo = typename ParamTagType::uplo; + create_banded_triangular_matrix(A, Ab, k, true); + + // Make a reference using the naive Cholesky decomposition + // Cholesky decomposition for full storage + // l_kk = np.sqrt( a_kk - sum_{i=1}^{k-1}( l_ik^2 ) ) + // l_ik = 1/l_kk * ( a_ik - sum_{j=1}^{k-1}( l_ij * l_kj ) ) + auto h_Ab_ref = Kokkos::create_mirror_view(Ab_ref); + auto h_Ab = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Ab); + if (std::is_same_v) { + // A = U**H * U + for (int ib = 0; ib < N; ib++) { + h_Ab_ref(ib, 1, 0) = Kokkos::sqrt(h_Ab(ib, 1, 0)); + h_Ab_ref(ib, 0, 1) = 1.0 / h_Ab_ref(ib, 1, 0); + h_Ab_ref(ib, 1, 1) = Kokkos::sqrt(h_Ab(ib, 1, 1) - h_Ab_ref(ib, 0, 1) * h_Ab_ref(ib, 0, 1)); + } + } else { + // A = L * L**H + for (int ib = 0; ib < N; ib++) { + h_Ab_ref(ib, 0, 0) = Kokkos::sqrt(h_Ab(ib, 0, 0)); + h_Ab_ref(ib, 1, 0) = 1.0 / h_Ab_ref(ib, 0, 0); + h_Ab_ref(ib, 0, 1) = Kokkos::sqrt(h_Ab(ib, 0, 1) - h_Ab_ref(ib, 1, 0) * h_Ab_ref(ib, 1, 0)); + } + } + + // Factorize with Pbtrf: A = U**H * U or A = L * L**H + auto info = Functor_BatchedSerialPbtrf(Ab).run(); + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + // Check if Ab == Ub or Lb + Kokkos::deep_copy(h_Ab, Ab); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < k + 1; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_Ab(ib, i, j), h_Ab_ref(ib, i, j), eps); + } + } + } +} + +template +/// \brief Implementation details of batched pbtrs test +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \param k [in] Number of superdiagonals or subdiagonals of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pbtrf(const int N, const int k, const int BlkSize) { + using View3DType = Kokkos::View; + View3DType A("A", 
N, BlkSize, BlkSize), A_reconst("A_reconst", N, BlkSize, BlkSize), + Ab("Ab", N, k + 1, BlkSize); // Banded matrix + + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + ScalarType randStart, randEnd; + + // Initialize A_reconst with random matrix + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + + // Make the matrix Positive Definite Symmetric and Diagonal dominant + random_to_pds(A, A_reconst); + Kokkos::deep_copy(A, 0.0); + + // Create banded triangluar matrix in normal and banded storage + using ArgUplo = typename ParamTagType::uplo; + create_banded_pds_matrix(A_reconst, A, k, false); + + create_banded_triangular_matrix(A_reconst, Ab, k, true); + + // Clear matrix + Kokkos::deep_copy(A_reconst, 0.0); + + // Factorize with Pbtrf: A = U**H * U or A = L * L**H + auto info = Functor_BatchedSerialPbtrf(Ab).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + if (std::is_same_v) { + // A = U**H * U + View3DType U("U", N, BlkSize, BlkSize), Uc("Uc", N, BlkSize, BlkSize); + banded_to_full(Ab, U, k); + + // Compute the complex conjugate of U + // U -> conj(U) + auto h_U = Kokkos::create_mirror_view(U); + auto h_Uc = Kokkos::create_mirror_view(Uc); + Kokkos::deep_copy(h_U, U); + Kokkos::deep_copy(h_Uc, Uc); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + h_Uc(ib, i, j) = Kokkos::ArithTraits::conj(h_U(ib, i, j)); + } + } + } + Kokkos::deep_copy(Uc, h_Uc); + + // Create conjugate of U + Functor_BatchedSerialGemm(1.0, Uc, U, 0.0, A_reconst) + .run(); + } else { + // A = L * L**H + View3DType L("L", N, BlkSize, BlkSize), Lc("Lc", N, BlkSize, BlkSize); + banded_to_full(Ab, L, k); + + // Compute the complex conjugate of L + // L -> conj(L) + auto h_L = Kokkos::create_mirror_view(L); + auto h_Lc = Kokkos::create_mirror_view(Lc); + Kokkos::deep_copy(h_L, L); + 
Kokkos::deep_copy(h_Lc, Lc); + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + h_Lc(ib, i, j) = Kokkos::ArithTraits::conj(h_L(ib, i, j)); + } + } + } + Kokkos::deep_copy(Lc, h_Lc); + + // Create conjugate of L + Functor_BatchedSerialGemm(1.0, L, Lc, 0.0, A_reconst) + .run(); + } + + // this eps is about 10^-14 + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_A = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto h_A_reconst = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A_reconst); + + // Check if A = U**H * U or A = L * L**H + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + EXPECT_NEAR_KK(h_A_reconst(ib, i, j), h_A(ib, i, j), eps); + } + } + } +} + +} // namespace Pbtrf +} // namespace Test + +template +int test_batched_pbtrf() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + Test::Pbtrf::impl_test_batched_pbtrf_analytical(1); + Test::Pbtrf::impl_test_batched_pbtrf_analytical(2); + for (int i = 0; i < 10; i++) { + int k = 1; + Test::Pbtrf::impl_test_batched_pbtrf(1, k, i); + Test::Pbtrf::impl_test_batched_pbtrf(2, k, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + Test::Pbtrf::impl_test_batched_pbtrf_analytical(1); + Test::Pbtrf::impl_test_batched_pbtrf_analytical(2); + for (int i = 0; i < 10; i++) { + int k = 1; + Test::Pbtrf::impl_test_batched_pbtrf(1, k, i); + Test::Pbtrf::impl_test_batched_pbtrf(2, k, i); + } + } +#endif + + return 0; +} diff --git a/batched/dense/unit_test/Test_Batched_SerialPbtrf_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialPbtrf_Complex.hpp new file mode 100644 index 0000000000..3aebb8ffa5 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPbtrf_Complex.hpp @@ -0,0 
+1,45 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_pbtrf_l_fcomplex) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf, param_tag_type, algo_tag_type>(); +} +TEST_F(TestCategory, test_batched_pbtrf_u_fcomplex) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf, param_tag_type, algo_tag_type>(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_pbtrf_l_dcomplex) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf, param_tag_type, algo_tag_type>(); +} +TEST_F(TestCategory, test_batched_pbtrf_u_dcomplex) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf, param_tag_type, algo_tag_type>(); +} +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialPbtrf_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialPbtrf_Real.hpp new file mode 100644 index 0000000000..e1b77416f5 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPbtrf_Real.hpp @@ -0,0 +1,45 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_pbtrf_l_float) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf(); +} +TEST_F(TestCategory, test_batched_pbtrf_u_float) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_pbtrf_l_double) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf(); +} +TEST_F(TestCategory, test_batched_pbtrf_u_double) { + using algo_tag_type = typename Algo::Pbtrf::Unblocked; + using param_tag_type = ::Test::Pbtrf::ParamTag; + + test_batched_pbtrf(); +} +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialPbtrs.hpp b/batched/dense/unit_test/Test_Batched_SerialPbtrs.hpp new file mode 100644 index 0000000000..01654a4dd5 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPbtrs.hpp @@ -0,0 +1,295 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include +#include +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Pbtrf.hpp" +#include "KokkosBatched_Pbtrs.hpp" +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Pbtrs { + +template +struct ParamTag { + using uplo = U; +}; + +template +struct Functor_BatchedSerialPbtrf { + using execution_space = typename DeviceType::execution_space; + ABViewType _ab; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialPbtrf(const ABViewType &ab) : _ab(ab) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k) const { + auto sub_ab = Kokkos::subview(_ab, k, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialPbtrf::invoke(sub_ab); + } + + inline void run() { + using value_type = typename ABViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPbtrs"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _ab.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +struct Functor_BatchedSerialPbtrs { + using execution_space = typename DeviceType::execution_space; + ABViewType _ab; + BViewType _b; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialPbtrs(const ABViewType &ab, const BViewType &b) : _ab(ab), _b(b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k, int &info) const { + auto sub_ab = Kokkos::subview(_ab, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL()); + + info += KokkosBatched::SerialPbtrs::invoke(sub_ab, bb); + } + + inline int run() { + using value_type = typename ABViewType::non_const_value_type; + std::string 
name_region("KokkosBatched::Test::SerialPbtrs"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + int info_sum = 0; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _b.extent(0)); + Kokkos::parallel_reduce(name.c_str(), policy, *this, info_sum); + Kokkos::Profiling::popRegion(); + return info_sum; + } +}; + +template +struct Functor_BatchedSerialGemv { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + xViewType _x; + yViewType _y; + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGemv(const ScalarType alpha, const AViewType &a, const xViewType &x, const ScalarType beta, + const yViewType &y) + : _a(a), _x(x), _y(y), _alpha(alpha), _beta(beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(_x, k, Kokkos::ALL()); + auto yy = Kokkos::subview(_y, k, Kokkos::ALL()); + + KokkosBlas::SerialGemv::invoke(_alpha, aa, xx, _beta, yy); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPbtrs"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _x.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +/// \brief Implementation details of batched pbtrs test +/// Confirm A * x = b, where +/// A: [[4, 1, 0], +/// [1, 4, 1], +/// [0, 1, 4]] +/// b: [1, 1, 1] +/// x: [3/14, 1/7, 3/14] +/// +/// This corresponds to the following system of equations: +/// 4 x0 + x1 = 1 +/// x0 + 4 x1 + x2 = 1 +/// x1 + 4 x2 = 1 +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \param k [in] Number of superdiagonals or subdiagonals of matrix A +/// \param BlkSize [in] Block size of 
matrix A +void impl_test_batched_pbtrs_analytical(const int N) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + constexpr int BlkSize = 3, k = 1; + View3DType A("A", N, BlkSize, BlkSize), A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType Ab("Ab", N, k + 1, BlkSize); // Banded matrix + View2DType x0("x0", N, BlkSize), x_ref("x_ref", N, BlkSize), y0("y0", N, BlkSize); // Solutions + + auto h_A_reconst = Kokkos::create_mirror_view(A_reconst); + auto h_x_ref = Kokkos::create_mirror_view(x_ref); + + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + for (int j = 0; j < BlkSize; j++) { + h_A_reconst(ib, i, j) = i == j ? 4.0 : 1.0; + } + } + + h_x_ref(ib, 0) = 3.0 / 14.0; + h_x_ref(ib, 1) = 1.0 / 7.0; + h_x_ref(ib, 2) = 3.0 / 14.0; + } + + Kokkos::fence(); + + Kokkos::deep_copy(x0, ScalarType(1.0)); + Kokkos::deep_copy(A_reconst, h_A_reconst); + + // Create banded triangluar matrix in normal and banded storage + using ArgUplo = typename ParamTagType::uplo; + create_banded_pds_matrix(A_reconst, A, k, false); + create_banded_triangular_matrix(A_reconst, Ab, k, true); + + // Factorize with Pbtrf: A = U**H * U or A = L * L**H + Functor_BatchedSerialPbtrf(Ab).run(); + + // pbtrs (Note, Ab is a factorized matrix of A) + auto info = Functor_BatchedSerialPbtrs(Ab, x0).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + auto h_x0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x0); + + // Check x0 = x1 + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + Test::EXPECT_NEAR_KK_REL(h_x0(ib, i), h_x_ref(ib, i), eps); + } + } +} + +template +/// \brief Implementation details of batched pbtrs test +/// Confirm A * x = b, where +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \param k [in] Number of 
superdiagonals or subdiagonals of matrix A +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pbtrs(const int N, const int k, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + View3DType A("A", N, BlkSize, BlkSize), A_reconst("A_reconst", N, BlkSize, BlkSize); + View3DType Ab("Ab", N, k + 1, BlkSize); // Banded matrix + View2DType x0("x0", N, BlkSize), x_ref("x_ref", N, BlkSize), y0("y0", N, BlkSize); // Solutions + + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + ScalarType randStart, randEnd; + + // Initialize A_reconst with random matrix + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + Kokkos::fill_random(x0, rand_pool, randStart, randEnd); + Kokkos::deep_copy(x_ref, x0); + + // Make the matrix Positive Definite Symmetric and Diagonal dominant + random_to_pds(A, A_reconst); + Kokkos::deep_copy(A, ScalarType(0.0)); + + // Create banded triangluar matrix in normal and banded storage + using ArgUplo = typename ParamTagType::uplo; + create_banded_pds_matrix(A_reconst, A, k, false); + + create_banded_triangular_matrix(A_reconst, Ab, k, true); + + Kokkos::fence(); + + // Factorize with Pbtrf: A = U**H * U or A = L * L**H + Functor_BatchedSerialPbtrf(Ab).run(); + + // pbtrs (Note, Ab is a factorized matrix of A) + auto info = Functor_BatchedSerialPbtrs(Ab, x0).run(); + + Kokkos::fence(); + EXPECT_EQ(info, 0); + + // Gemv to compute A*x0, this should be identical to x_ref + Functor_BatchedSerialGemv(1.0, A, x0, 0.0, y0).run(); + + Kokkos::fence(); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_y0 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), y0); + auto h_x_ref = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x_ref); + + 
// Check A * x0 = x_ref + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + Test::EXPECT_NEAR_KK_REL(h_y0(ib, i), h_x_ref(ib, i), eps); + } + } +} + +} // namespace Pbtrs +} // namespace Test + +template +int test_batched_pbtrs() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + Test::Pbtrs::impl_test_batched_pbtrs_analytical(1); + Test::Pbtrs::impl_test_batched_pbtrs_analytical(2); + for (int i = 0; i < 10; i++) { + int k = 1; + Test::Pbtrs::impl_test_batched_pbtrs(1, k, i); + Test::Pbtrs::impl_test_batched_pbtrs(2, k, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + Test::Pbtrs::impl_test_batched_pbtrs_analytical(1); + Test::Pbtrs::impl_test_batched_pbtrs_analytical(2); + for (int i = 0; i < 10; i++) { + int k = 1; + Test::Pbtrs::impl_test_batched_pbtrs(1, k, i); + Test::Pbtrs::impl_test_batched_pbtrs(2, k, i); + } + } +#endif + + return 0; +} diff --git a/batched/dense/unit_test/Test_Batched_SerialPbtrs_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialPbtrs_Complex.hpp new file mode 100644 index 0000000000..258805d950 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPbtrs_Complex.hpp @@ -0,0 +1,45 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_pbtrs_l_fcomplex) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs, param_tag_type, algo_tag_type>(); +} +TEST_F(TestCategory, test_batched_pbtrs_u_fcomplex) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs, param_tag_type, algo_tag_type>(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_pbtrs_l_dcomplex) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs, param_tag_type, algo_tag_type>(); +} +TEST_F(TestCategory, test_batched_pbtrs_u_dcomplex) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs, param_tag_type, algo_tag_type>(); +} +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialPbtrs_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialPbtrs_Real.hpp new file mode 100644 index 0000000000..fbad59bbc6 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPbtrs_Real.hpp @@ -0,0 +1,45 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_pbtrs_l_float) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs(); +} +TEST_F(TestCategory, test_batched_pbtrs_u_float) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_pbtrs_l_double) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs(); +} +TEST_F(TestCategory, test_batched_pbtrs_u_double) { + using algo_tag_type = typename Algo::Pbtrs::Unblocked; + using param_tag_type = ::Test::Pbtrs::ParamTag; + + test_batched_pbtrs(); +} +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrs.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrs.hpp new file mode 100644 index 0000000000..6e501b6d97 --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPttrs.hpp @@ -0,0 +1,430 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +/// \author Yuuichi Asahi (yuuichi.asahi@cea.fr) +#include +#include +#include + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Pttrs.hpp" +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Pttrs { + +template +struct ParamTag { + using uplo = U; +}; + +template +struct Functor_BatchedSerialPttrf { + using execution_space = typename DeviceType::execution_space; + DViewType _d; + EViewType _e; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialPttrf(const DViewType &d, const EViewType &e) : _d(d), _e(e) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto dd = Kokkos::subview(_d, k, Kokkos::ALL()); + auto ee = Kokkos::subview(_e, k, Kokkos::ALL()); + + KokkosBatched::SerialPttrf::invoke(dd, ee); + } + + inline void run() { + using value_type = typename EViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPttrs"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _d.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +struct Functor_BatchedSerialPttrs { + using execution_space = typename DeviceType::execution_space; + DViewType _d; + EViewType _e; + BViewType _b; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialPttrs(const DViewType &d, const EViewType &e, const BViewType &b) : _d(d), _e(e), _b(b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const ParamTagType &, const int k, int &info) const { + auto dd = Kokkos::subview(_d, k, Kokkos::ALL()); + auto ee = Kokkos::subview(_e, k, Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL()); + + info += KokkosBatched::SerialPttrs::invoke(dd, ee, bb); + } + + inline int run() { + using value_type = typename BViewType::non_const_value_type; + std::string 
name_region("KokkosBatched::Test::SerialPttrs"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + int info_sum = 0; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _d.extent(0)); + Kokkos::parallel_reduce(name.c_str(), policy, *this, info_sum); + Kokkos::Profiling::popRegion(); + return info_sum; + } +}; + +template +struct Functor_BatchedSerialGemv { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + xViewType _x; + yViewType _y; + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGemv(const ScalarType alpha, const AViewType &a, const xViewType &x, const ScalarType beta, + const yViewType &y) + : _a(a), _x(x), _y(y), _alpha(alpha), _beta(beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(_x, k, Kokkos::ALL()); + auto yy = Kokkos::subview(_y, k, Kokkos::ALL()); + + KokkosBlas::SerialGemv::invoke(_alpha, aa, xx, _beta, yy); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPttrs"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +struct Functor_BatchedSerialGemm { + using execution_space = typename DeviceType::execution_space; + AViewType _a; + BViewType _b; + CViewType _c; + ScalarType _alpha, _beta; + + KOKKOS_INLINE_FUNCTION + Functor_BatchedSerialGemm(const ScalarType alpha, const AViewType &a, const BViewType &b, const ScalarType beta, + const CViewType &c) + : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto aa = Kokkos::subview(_a, k, 
Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke(_alpha, aa, bb, _beta, cc); + } + + inline void run() { + using value_type = typename AViewType::non_const_value_type; + std::string name_region("KokkosBatched::Test::SerialPttrs"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +/// \brief Implementation details of batched pttrs test +/// Confirm A * x = b, where +/// A: [[4, 1], +/// [1, 4]] +/// b: [1, 1] +/// x: [1/5, 1/5] +/// +/// This corresponds to the following system of equations: +/// 4 x0 + x1 = 1 +/// x0 + 4 x1 = 1 +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \note A is stored as its diagonal d and its single off-diagonal e +/// \note The block size of A is fixed to 2 inside this test +void impl_test_batched_pttrs_analytical(const int N) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + + constexpr int BlkSize = 2; + RealView2DType d(Kokkos::view_alloc("d", Kokkos::WithoutInitializing), N, + BlkSize); // Diagonal components + View2DType e(Kokkos::view_alloc("e", Kokkos::WithoutInitializing), N, + BlkSize - 1); // Upper and lower diagonal components (identical) + View2DType x(Kokkos::view_alloc("x", Kokkos::WithoutInitializing), N, + BlkSize); // Solutions + + Kokkos::deep_copy(d, RealType(4.0)); + Kokkos::deep_copy(e, ScalarType(1.0)); + Kokkos::deep_copy(x, ScalarType(1.0)); // This initially stores b + + // Factorize matrix A -> L * D * L**H + // d and e are updated by pttrf + Functor_BatchedSerialPttrf(d, e).run(); + + // pttrs (Note, d and e must be factorized by pttrf) + 
auto info = + Functor_BatchedSerialPttrs(d, e, x) + .run(); + + Kokkos::fence(); + + EXPECT_EQ(info, 0); + + // eps is 1000x the machine epsilon of RealType + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_x = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x); + + // Check x = [1/5, 1/5] + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + EXPECT_NEAR_KK(h_x(ib, i), ScalarType(1.0 / 5.0), eps); + } + } +} + +template +/// \brief Implementation details of batched pttrs test +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \note A is tridiagonal: diagonal d plus one off-diagonal (e_upper / e_lower) +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrs(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + using View3DType = Kokkos::View; + + View3DType A("A", N, BlkSize, BlkSize), EL("EL", N, BlkSize, BlkSize), EU("EU", N, BlkSize, BlkSize), + D("D", N, BlkSize, BlkSize), I("I", N, BlkSize, BlkSize); + RealView2DType d(Kokkos::view_alloc("d", Kokkos::WithoutInitializing), N, + BlkSize), // Diagonal components + ones(Kokkos::view_alloc("ones", Kokkos::WithoutInitializing), N, BlkSize); + View2DType e_upper("e_upper", N, BlkSize - 1), e_lower("e_lower", N, + BlkSize - 1); // upper and lower diagonal components + View2DType x("x", N, BlkSize), b_ref("x_ref", N, BlkSize), b("b", N, BlkSize); // Solutions + + using execution_space = typename DeviceType::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + RealType realRandStart, realRandEnd; + ScalarType randStart, randEnd; + + KokkosKernels::Impl::getRandomBounds(1.0, realRandStart, realRandEnd); + KokkosKernels::Impl::getRandomBounds(1.0, randStart, randEnd); + + // Add BlkSize to ensure positive definiteness + Kokkos::fill_random(d, rand_pool, realRandStart + BlkSize, realRandEnd + BlkSize); + 
Kokkos::fill_random(e_upper, rand_pool, randStart, randEnd); + Kokkos::fill_random(x, rand_pool, randStart, randEnd); + + auto h_e_upper = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), e_upper); + auto h_e_lower = Kokkos::create_mirror_view(e_lower); + + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize - 1; i++) { + // Fill the lower diagonal with conjugate of the upper diagonal + h_e_lower(ib, i) = Kokkos::ArithTraits::conj(h_e_upper(ib, i)); + } + } + + Kokkos::deep_copy(e_lower, h_e_lower); + Kokkos::deep_copy(b_ref, x); + Kokkos::deep_copy(ones, RealType(1.0)); + + // Reconstruct Tridiagonal matrix A + // A = D + EL + EU + create_diagonal_matrix(e_lower, EL, -1); + create_diagonal_matrix(e_upper, EU, 1); + create_diagonal_matrix(d, D); + create_diagonal_matrix(ones, I); + + // Matrix matrix addition by Gemm + // D + EU by D * I + EU (result stored in EU) + Functor_BatchedSerialGemm(1.0, D, I, + 1.0, EU) + .run(); + + // Copy EL to A + Kokkos::deep_copy(A, EL); + + // EU + EL by EU * I + A (result stored in A) + Functor_BatchedSerialGemm(1.0, EU, I, + 1.0, A) + .run(); + + Kokkos::fence(); + + int info = 0; + if (std::is_same_v) { + // Factorize matrix A -> U**H * D * U + // d and e are updated by pttrf + Functor_BatchedSerialPttrf(d, e_upper).run(); + + // pttrs (Note, d and e must be factorized by pttrf) + info = Functor_BatchedSerialPttrs( + d, e_upper, x) + .run(); + } else { + // Factorize matrix A -> L * D * L**H + // d and e are updated by pttrf + Functor_BatchedSerialPttrf(d, e_lower).run(); + + // pttrs (Note, d and e must be factorized by pttrf) + info = Functor_BatchedSerialPttrs( + d, e_lower, x) + .run(); + } + + Kokkos::fence(); + + EXPECT_EQ(info, 0); + + // Gemv to compute b = A * x, this should be identical to b_ref + Functor_BatchedSerialGemv(1.0, A, x, 0.0, b).run(); + + // this eps is about 10^-14 + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_b = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b); + 
auto h_b_ref = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_ref); + + // Check A * x = b_ref + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + EXPECT_NEAR_KK(h_b(ib, i), h_b_ref(ib, i), eps); + } + } +} + +template +/// \brief Implementation details of batched pttrs test for early return +/// BlkSize must be 0 or 1 +/// +/// \param N [in] Batch size of RHS (banded matrix can also be batched matrix) +/// \note The test is skipped (immediate return) when BlkSize > 1 +/// \param BlkSize [in] Block size of matrix A +void impl_test_batched_pttrs_quick_return(const int N, const int BlkSize) { + using ats = typename Kokkos::ArithTraits; + using RealType = typename ats::mag_type; + using RealView2DType = Kokkos::View; + using View2DType = Kokkos::View; + + if (BlkSize > 1) return; + + const int BlkSize_minus_1 = BlkSize > 0 ? BlkSize - 1 : 0; + + RealView2DType d(Kokkos::view_alloc("d", Kokkos::WithoutInitializing), N, + BlkSize); // Diagonal components + View2DType e(Kokkos::view_alloc("e", Kokkos::WithoutInitializing), N, + BlkSize_minus_1); // lower diagonal components + View2DType x("x", N, BlkSize); // Solutions + + const RealType reference_value = 4.0; + + Kokkos::deep_copy(d, reference_value); + Kokkos::deep_copy(e, ScalarType(1.0)); + Kokkos::deep_copy(x, ScalarType(1.0)); + + // Factorize matrix A -> U**H * D * U or L * D * L**H + // d and e are updated by pttrf + Functor_BatchedSerialPttrf(d, e).run(); + + // pttrs (Note, d and e must be factorized by pttrf) + auto info = + Functor_BatchedSerialPttrs(d, e, x) + .run(); + + Kokkos::fence(); + + EXPECT_EQ(info, 0); + + // eps is 1000x the machine epsilon of RealType + RealType eps = 1.0e3 * ats::epsilon(); + + auto h_x = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), x); + + // Check x = 1 / reference_value + for (int ib = 0; ib < N; ib++) { + for (int i = 0; i < BlkSize; i++) { + EXPECT_NEAR_KK(h_x(ib, i), ScalarType(1.0 / reference_value), eps); + } + } +} + +} // namespace Pttrs +} // 
namespace Test + +template +int test_batched_pttrs() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using LayoutType = Kokkos::LayoutLeft; + Test::Pttrs::impl_test_batched_pttrs_analytical(1); + Test::Pttrs::impl_test_batched_pttrs_analytical(2); + for (int i = 0; i < 2; i++) { + Test::Pttrs::impl_test_batched_pttrs_quick_return( + 1, i); + Test::Pttrs::impl_test_batched_pttrs_quick_return( + 2, i); + } + for (int i = 2; i < 10; i++) { + Test::Pttrs::impl_test_batched_pttrs(1, i); + Test::Pttrs::impl_test_batched_pttrs(2, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using LayoutType = Kokkos::LayoutRight; + Test::Pttrs::impl_test_batched_pttrs_analytical(1); + Test::Pttrs::impl_test_batched_pttrs_analytical(2); + for (int i = 0; i < 2; i++) { + Test::Pttrs::impl_test_batched_pttrs_quick_return( + 1, i); + Test::Pttrs::impl_test_batched_pttrs_quick_return( + 2, i); + } + for (int i = 2; i < 10; i++) { + Test::Pttrs::impl_test_batched_pttrs(1, i); + Test::Pttrs::impl_test_batched_pttrs(2, i); + } + } +#endif + + return 0; +} diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrs_Complex.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrs_Complex.hpp new file mode 100644 index 0000000000..78ca1b1f7c --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPttrs_Complex.hpp @@ -0,0 +1,43 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, test_batched_pttrs_l_fcomplex) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + + test_batched_pttrs, param_tag_type, algo_tag_type>(); +} +TEST_F(TestCategory, test_batched_pttrs_u_fcomplex) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + test_batched_pttrs, param_tag_type, algo_tag_type>(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, test_batched_pttrs_l_dcomplex) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + + test_batched_pttrs, param_tag_type, algo_tag_type>(); +} +TEST_F(TestCategory, test_batched_pttrs_u_dcomplex) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + test_batched_pttrs, param_tag_type, algo_tag_type>(); +} +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialPttrs_Real.hpp b/batched/dense/unit_test/Test_Batched_SerialPttrs_Real.hpp new file mode 100644 index 0000000000..6bf4fadb6d --- /dev/null +++ b/batched/dense/unit_test/Test_Batched_SerialPttrs_Real.hpp @@ -0,0 +1,43 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, test_batched_pttrs_l_float) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + + test_batched_pttrs(); +} +TEST_F(TestCategory, test_batched_pttrs_u_float) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + test_batched_pttrs(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, test_batched_pttrs_l_double) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + + test_batched_pttrs(); +} +TEST_F(TestCategory, test_batched_pttrs_u_double) { + using algo_tag_type = typename Algo::Pttrs::Unblocked; + using param_tag_type = ::Test::Pttrs::ParamTag; + test_batched_pttrs(); +} +#endif diff --git a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp index 9bf9d43578..586b69dc7f 100644 --- a/batched/dense/unit_test/Test_Batched_SerialSVD.hpp +++ b/batched/dense/unit_test/Test_Batched_SerialSVD.hpp @@ -18,6 +18,8 @@ #include "KokkosBatched_SVD_Decl.hpp" //For testing overall kernel #include "KokkosBatched_SVD_Serial_Internal.hpp" //For unit testing individual components #include "KokkosBatched_SetIdentity_Decl.hpp" +#include "KokkosKernels_TestMatrixUtils.hpp" +#include "KokkosKernels_TestVanilla.hpp" namespace Test { template @@ -103,40 +105,6 @@ void verifySVD(const AView& A, const UView& U, const VtView& Vt, const SigmaView } } -template -Matrix createRandomMatrix(int m, int n, int deficiency, double maxval = 1.0) { - using Scalar = typename Matrix::non_const_value_type; - Matrix mat("A", m, n); - auto mhost = Kokkos::create_mirror_view(mat); - // Fill mat with random values first - if (maxval != 0.0) { - Kokkos::Random_XorShift64_Pool rand_pool(13718); - Scalar minrand, maxrand; - 
Test::getRandomBounds(maxval, minrand, maxrand); - Kokkos::fill_random(mhost, rand_pool, minrand, maxrand); - } - // Apply the rank deficiency. - // If m < n, make some rows a multiple of the first row. - // Otherwise, make some columns a multiple of the first column. - if (m < n) { - for (int i = 0; i < deficiency; i++) { - // make row i + 1 a multiple of row 0 - for (int j = 0; j < n; j++) { - mhost(i + 1, j) = (double)(i + 2) * mhost(0, j); - } - } - } else { - for (int i = 0; i < deficiency; i++) { - // make col i + 1 a multiple of col 0 - for (int j = 0; j < m; j++) { - mhost(j, i + 1) = (double)(i + 2) * mhost(j, 0); - } - } - } - Kokkos::deep_copy(mat, mhost); - return mat; -} - template struct SerialSVDFunctor_Full { SerialSVDFunctor_Full(const Matrix& A_, const Matrix& U_, const Matrix& Vt_, const Vector& sigma_, @@ -172,12 +140,39 @@ struct SerialSVDFunctor_SingularValuesOnly { Vector work; }; +template +Matrix randomMatrixWithRank(int m, int n, int rank) { + Matrix A("A", m, n); + if (rank == Kokkos::min(m, n)) { + // A is full-rank so as a shortcut, fill it with random values directly. 
+ Kokkos::Random_XorShift64_Pool rand_pool(13318); + Kokkos::fill_random(A, rand_pool, -1.0, 1.0); + } else { + // A is rank-deficient, so compute it as a product of two random matrices + using MatrixHost = typename Matrix::HostMirror; + auto Ahost = Kokkos::create_mirror_view(A); + Kokkos::Random_XorShift64_Pool rand_pool(13318); + MatrixHost U("U", m, rank); + MatrixHost Vt("Vt", rank, n); + Kokkos::fill_random(U, rand_pool, -1.0, 1.0); + Kokkos::fill_random(Vt, rand_pool, -1.0, 1.0); + Test::vanillaGEMM(1.0, U, Vt, 0.0, Ahost); + Kokkos::deep_copy(A, Ahost); + } + return A; +} + +template +Matrix randomMatrixWithRank(int m, int n) { + return randomMatrixWithRank(m, n, Kokkos::min(m, n)); +} + template -void testSerialSVD(int m, int n, int deficiency, double maxval = 1.0) { +void testSerialSVD(int m, int n, int rank) { using Matrix = Kokkos::View; using Vector = Kokkos::View; using ExecSpace = typename Device::execution_space; - Matrix A = createRandomMatrix(m, n, deficiency, maxval); + Matrix A = randomMatrixWithRank(m, n, rank); // Fill U, Vt, sigma with nonzeros as well to make sure they are properly // overwritten Matrix U("U", m, m); @@ -185,6 +180,8 @@ void testSerialSVD(int m, int n, int deficiency, double maxval = 1.0) { int maxrank = std::min(m, n); Vector sigma("sigma", maxrank); Vector work("work", std::max(m, n)); + // Fill these views with an arbitrary value, to make sure SVD + // doesn't rely on them being zero initialized. 
Kokkos::deep_copy(U, -5.0); Kokkos::deep_copy(Vt, -5.0); Kokkos::deep_copy(sigma, -5.0); @@ -204,12 +201,17 @@ void testSerialSVD(int m, int n, int deficiency, double maxval = 1.0) { verifySVD(Acopy, Uhost, Vthost, sigmaHost); } +template +void testSerialSVD(int m, int n) { + testSerialSVD(m, n, Kokkos::min(m, n)); +} + template void testSerialSVDSingularValuesOnly(int m, int n) { using Matrix = Kokkos::View; using Vector = Kokkos::View; using ExecSpace = typename Device::execution_space; - Matrix A = createRandomMatrix(m, n, 0); + Matrix A = randomMatrixWithRank(m, n); // Fill U, Vt, sigma with nonzeros as well to make sure they are properly // overwritten Matrix U("U", m, m); @@ -248,7 +250,7 @@ void testSerialSVDZeroLastRow(int n) { // Generate a bidiagonal matrix using Matrix = Kokkos::View; using KAT = Kokkos::ArithTraits; - Matrix B = createRandomMatrix(n, n, 0, 1.0); + Matrix B = randomMatrixWithRank(n, n); // Zero out entries to make B bidiagonal for (int i = 0; i < n; i++) { for (int j = 0; j < n; j++) { @@ -265,7 +267,7 @@ void testSerialSVDZeroLastRow(int n) { Matrix BVt("UBVt", n, n); Test::vanillaGEMM(1.0, B, Vt, 0.0, BVt); // Run the routine (just on host) - KokkosBatched::SerialSVDInternal::svdZeroLastColumn(B.data(), n, B.stride(0), B.stride(1), Vt.data(), + KokkosBatched::SerialSVDInternal::svdZeroLastColumn(B.data(), n, B.stride(0), B.stride(1), n, Vt.data(), Vt.stride(0), Vt.stride(1)); // Check that B is still bidiagonal (to a tight tolerance, but not exactly // zero) @@ -298,7 +300,7 @@ void testSerialSVDZeroDiagonal(int n, int row) { using KAT = Kokkos::ArithTraits; int m = n + 2; // Make U somewhat bigger to make sure the Givens transforms // are applied correctly - Matrix B = createRandomMatrix(m, n, 0, 1.0); + Matrix B = randomMatrixWithRank(m, n); // Zero out entries to make B bidiagonal for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { @@ -342,19 +344,18 @@ void testSerialSVDZeroDiagonal(int n, int row) { template void testSVD() 
{ - testSerialSVD(0, 0, 0); - testSerialSVD(1, 0, 0); - testSerialSVD(0, 1, 0); - testSerialSVD(2, 2, 0); + testSerialSVD(0, 0); + testSerialSVD(1, 0); + testSerialSVD(0, 1); + testSerialSVD(2, 2); testSerialSVD(2, 2, 1); - testSerialSVD(10, 8, 0); - testSerialSVD(8, 10, 0); - testSerialSVD(10, 1, 0); + testSerialSVD(10, 8); + testSerialSVD(8, 10); + testSerialSVD(10, 1); testSerialSVD(1, 10, 0); testSerialSVD(10, 8, 3); testSerialSVD(8, 10, 4); - // Test with all-zero matrix - testSerialSVD(8, 10, 0, 0.0); + testSerialSVD(8, 10, 7); // Test some important internal routines which are not called often testSerialSVDZeroLastRow(10); testSerialSVDZeroDiagonal(10, 3); @@ -425,6 +426,119 @@ void testIssue1786() { } } +// Generate specific test cases +template +Kokkos::View getTestCase(int testCase) { + using MatrixHost = Kokkos::View; + MatrixHost Ahost; + int m, n; + switch (testCase) { + case 0: + // Issue #2344 case 1 + m = 3; + n = 3; + Ahost = MatrixHost("A0", m, n); + Ahost(1, 0) = 3.58442287931538747e-02; + Ahost(1, 1) = 3.81743062695684907e-02; + Ahost(2, 2) = -5.55555555555555733e-02; + break; + case 1: + // Test a matrix that is strictly lower triangular (so the diagonal + // is zero) + m = 8; + n = 8; + Ahost = MatrixHost("A1", m, n); + for (int i = 0; i < m; i++) { + for (int j = 0; j < i; j++) { + Ahost(i, j) = 1; + } + } + break; + case 2: + // Test a matrix that's already diagonal, except for one superdiagonal in the middle + m = 10; + n = 5; + Ahost = MatrixHost("A2", m, n); + for (int i = 0; i < n; i++) Ahost(i, i) = 1.0; + Ahost(2, 3) = 2.2; + break; + case 3: + // Test a matrix that is already bidiagonal, and has a zero diagonal in the middle + m = 10; + n = 7; + Ahost = MatrixHost("A3", m, n); + for (int i = 0; i < n; i++) Ahost(i, i) = 1.0; + for (int i = 0; i < n - 1; i++) Ahost(i, i + 1) = 0.7; + Ahost(4, 4) = 0; + break; + case 4: { + // Issue #2344 case 2 + m = 3; + n = 4; + Ahost = MatrixHost("A4", m, n); + Ahost(0, 0) = -2.0305040121856084e-02; 
+ Ahost(1, 0) = 0.0000000000000000e+00; + Ahost(2, 0) = 0.0000000000000000e+00; + Ahost(0, 1) = -0.0000000000000000e+00; + Ahost(1, 1) = -0.0000000000000000e+00; + Ahost(2, 1) = 1.9506119814028472e-02; + Ahost(0, 2) = -2.0305040121856091e-02; + Ahost(1, 2) = 0.0000000000000000e+00; + Ahost(2, 2) = 0.0000000000000000e+00; + Ahost(0, 3) = -0.0000000000000000e+00; + Ahost(1, 3) = -0.0000000000000000e+00; + Ahost(2, 3) = 1.9506119814028472e-02; + break; + } + case 5: { + // Test with all-zero matrix + m = 17; + n = 19; + Ahost = MatrixHost("A5", m, n); + break; + } + default: throw std::runtime_error("Test case out of bounds."); + } + Kokkos::View A(Ahost.label(), m, n); + Kokkos::deep_copy(A, Ahost); + return A; +} + +template +void testSpecialCases() { + using Matrix = Kokkos::View; + using Vector = Kokkos::View; + using ExecSpace = typename Device::execution_space; + for (int i = 0; i < 6; i++) { + Matrix A = getTestCase(i); + int m = A.extent(0); + int n = A.extent(1); + Matrix U("U", m, m); + Matrix Vt("Vt", n, n); + int maxrank = std::min(m, n); + Vector sigma("sigma", maxrank); + Vector work("work", std::max(m, n)); + Kokkos::deep_copy(U, -5.0); + Kokkos::deep_copy(Vt, -5.0); + Kokkos::deep_copy(sigma, -5.0); + Kokkos::deep_copy(work, -5.0); + // Make a copy of A (before SVD) for verification, since the original will be + // overwritten + typename Matrix::HostMirror Acopy("Acopy", m, n); + Kokkos::deep_copy(Acopy, A); + // Run the SVD + Kokkos::parallel_for(Kokkos::RangePolicy(0, 1), + SerialSVDFunctor_Full(A, U, Vt, sigma, work)); + // Get the results back + auto Uhost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), U); + auto Vthost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), Vt); + auto sigmaHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), sigma); + + // Verify the SVD is correct + verifySVD(Acopy, Uhost, Vthost, sigmaHost); + } +} + #if defined(KOKKOSKERNELS_INST_DOUBLE) TEST_F(TestCategory, 
batched_scalar_serial_svd_double) { // Test general SVD on a few different input sizes (full rank randomized) @@ -432,6 +546,8 @@ TEST_F(TestCategory, batched_scalar_serial_svd_double) { testSVD(); testIssue1786(); testIssue1786(); + testSpecialCases(); + testSpecialCases(); } #endif @@ -442,5 +558,7 @@ TEST_F(TestCategory, batched_scalar_serial_svd_float) { testSVD(); testIssue1786(); testIssue1786(); + testSpecialCases(); + testSpecialCases(); } #endif diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..b1cef3592c --- /dev/null +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_bll_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_NT_BLL_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_NT_BLL_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..9f163222d5 --- /dev/null +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_nt_blr_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_NT_BLR_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_NT_BLR_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..eaaa44d6c0 --- /dev/null +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_bll_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_T_BLL_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_T_BLL_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..616d4f56e1 --- /dev/null +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_nt_t_blr_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_NT_T_BLR_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_NT_T_BLR_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..ceeec3e8c1 --- /dev/null +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_bll_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_T_NT_BLL_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_NT_BLL_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..142720514e --- /dev/null +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_nt_blr_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_T_NT_BLR_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_NT_BLR_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..6c11f79400 --- /dev/null +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_bll_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_T_T_BLL_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_T_BLL_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_decl.hpp.in b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..c84b3d0db6 --- /dev/null +++ b/batched/eti/generated_specializations_hpp/KokkosBatched_Gemm_t_t_blr_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_DECL_HPP_ +#define KOKKOSBATCHED_GEMM_T_T_BLR_ETI_SPEC_DECL_HPP_ +namespace KokkosBatched { +namespace Impl { +using KokkosBlas::Trans; +@BATCHED_GEMM_T_T_BLR_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp index 9aa4b95f2c..513e16b5e1 100644 --- a/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_CG_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_CG_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_CG_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_CG_TEAMVECTOR_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp index 82c62624c1..b77a0daf20 100644 --- a/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_CG_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_CG_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_CG_TEAM_IMPL_HPP +#define KOKKOSBATCHED_CG_TEAM_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp index 2d8c0cae00..4095090e4d 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GMRES_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_GMRES_SERIAL_IMPL_HPP__ 
+#ifndef KOKKOSBATCHED_GMRES_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_GMRES_SERIAL_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp index 8d37b2ac5e..89f062dd7a 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GMRES_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_GMRES_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GMRES_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_GMRES_TEAMVECTOR_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp index 9fd9e09bd9..601a18c997 100644 --- a/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GMRES_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_GMRES_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_GMRES_TEAM_IMPL_HPP +#define KOKKOSBATCHED_GMRES_TEAM_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp index 3f76ee3d9f..88dac8799e 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Serial_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SPMV_SERIAL_IMPL_HPP__ -#define __KOKKOSBATCHED_SPMV_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBATCHED_SPMV_SERIAL_IMPL_HPP +#define KOKKOSBATCHED_SPMV_SERIAL_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git 
a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp index 4df4b95e2c..d28aba16b8 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_TeamVector_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SPMV_TEAMVECTOR_IMPL_HPP__ -#define __KOKKOSBATCHED_SPMV_TEAMVECTOR_IMPL_HPP__ +#ifndef KOKKOSBATCHED_SPMV_TEAMVECTOR_IMPL_HPP +#define KOKKOSBATCHED_SPMV_TEAMVECTOR_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp index 9e32861612..274ee65203 100644 --- a/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp +++ b/batched/sparse/impl/KokkosBatched_Spmv_Team_Impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SPMV_TEAM_IMPL_HPP__ -#define __KOKKOSBATCHED_SPMV_TEAM_IMPL_HPP__ +#ifndef KOKKOSBATCHED_SPMV_TEAM_IMPL_HPP +#define KOKKOSBATCHED_SPMV_TEAM_IMPL_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/sparse/src/KokkosBatched_CG.hpp b/batched/sparse/src/KokkosBatched_CG.hpp index cabf2eae98..0a15d32811 100644 --- a/batched/sparse/src/KokkosBatched_CG.hpp +++ b/batched/sparse/src/KokkosBatched_CG.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_CG_HPP__ -#define __KOKKOSBATCHED_CG_HPP__ +#ifndef KOKKOSBATCHED_CG_HPP +#define KOKKOSBATCHED_CG_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/sparse/src/KokkosBatched_CrsMatrix.hpp b/batched/sparse/src/KokkosBatched_CrsMatrix.hpp index 0d880cd880..b2477d7efc 100644 --- a/batched/sparse/src/KokkosBatched_CrsMatrix.hpp +++ b/batched/sparse/src/KokkosBatched_CrsMatrix.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: 
Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_CRSMATRIX_HPP__ -#define __KOKKOSBATCHED_CRSMATRIX_HPP__ +#ifndef KOKKOSBATCHED_CRSMATRIX_HPP +#define KOKKOSBATCHED_CRSMATRIX_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/sparse/src/KokkosBatched_GMRES.hpp b/batched/sparse/src/KokkosBatched_GMRES.hpp index a3f4eda8d3..7982fa30f2 100644 --- a/batched/sparse/src/KokkosBatched_GMRES.hpp +++ b/batched/sparse/src/KokkosBatched_GMRES.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_GMRES_HPP__ -#define __KOKKOSBATCHED_GMRES_HPP__ +#ifndef KOKKOSBATCHED_GMRES_HPP +#define KOKKOSBATCHED_GMRES_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/sparse/src/KokkosBatched_Identity.hpp b/batched/sparse/src/KokkosBatched_Identity.hpp index 311ec09d5c..421ad2c210 100644 --- a/batched/sparse/src/KokkosBatched_Identity.hpp +++ b/batched/sparse/src/KokkosBatched_Identity.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_IDENTITY_HPP__ -#define __KOKKOSBATCHED_IDENTITY_HPP__ +#ifndef KOKKOSBATCHED_IDENTITY_HPP +#define KOKKOSBATCHED_IDENTITY_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp index 580f85158b..973794d68d 100644 --- a/batched/sparse/src/KokkosBatched_JacobiPrec.hpp +++ b/batched/sparse/src/KokkosBatched_JacobiPrec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_JACOBIPREC_HPP__ -#define __KOKKOSBATCHED_JACOBIPREC_HPP__ +#ifndef KOKKOSBATCHED_JACOBIPREC_HPP +#define KOKKOSBATCHED_JACOBIPREC_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp b/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp index c8e8392e11..5cead801b8 100644 --- 
a/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp +++ b/batched/sparse/src/KokkosBatched_Krylov_Handle.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_KRYLOV_HANDLE_HPP__ -#define __KOKKOSBATCHED_KRYLOV_HANDLE_HPP__ +#ifndef KOKKOSBATCHED_KRYLOV_HANDLE_HPP +#define KOKKOSBATCHED_KRYLOV_HANDLE_HPP #include #include diff --git a/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp b/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp index b07ed2b973..2f25da35bf 100644 --- a/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp +++ b/batched/sparse/src/KokkosBatched_Krylov_Solvers.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBATCHED_KRYLOV_SOLVERS_HPP__ -#define __KOKKOSBATCHED_KRYLOV_SOLVERS_HPP__ +#ifndef KOKKOSBATCHED_KRYLOV_SOLVERS_HPP +#define KOKKOSBATCHED_KRYLOV_SOLVERS_HPP namespace KokkosBatched { diff --git a/batched/sparse/src/KokkosBatched_Spmv.hpp b/batched/sparse/src/KokkosBatched_Spmv.hpp index a93d0775be..72b923eeb8 100644 --- a/batched/sparse/src/KokkosBatched_Spmv.hpp +++ b/batched/sparse/src/KokkosBatched_Spmv.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBATCHED_SPMV_HPP__ -#define __KOKKOSBATCHED_SPMV_HPP__ +#ifndef KOKKOSBATCHED_SPMV_HPP +#define KOKKOSBATCHED_SPMV_HPP /// \author Kim Liegeois (knliege@sandia.gov) diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index 5bc7217cfd..08263826ab 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -36,7 +36,7 @@ IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNE ENDIF() # Include cuda blas TPL source file -IF (KOKKOSKERNELS_ENABLE_TPL_CUBLAS) +IF (KOKKOSKERNELS_ENABLE_TPL_CUBLAS OR KOKKOSKERNELS_ENABLE_TPL_MAGMA) LIST(APPEND SOURCES blas/tpls/KokkosBlas_Cuda_tpl.cpp ) diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_eti_spec_decl.hpp.in new file mode 100644 index 
0000000000..2780dee8ff --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_ABS_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_ABS_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { + +@BLAS1_ABS_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosBlas +#endif // KOKKOSBLAS1_ABS_ETI_SPEC_DECL_HPP_ diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_mv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..c7af4806be --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_abs_mv_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { + +@BLAS1_ABS_MV_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosBlas +#endif // KOKKOSBLAS1_ABS_MV_ETI_SPEC_DECL_HPP_ diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_avail.hpp.in index 66fae7ebd3..bebe667933 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_avail.hpp.in +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_AXPBY_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..cb42d1c5fd --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_AXPBY_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_AXPBY_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_AXPBY_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_avail.hpp.in index 6a46224605..3262fa2485 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_avail.hpp.in +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_AXPBY_MV_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..94033316cf --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_axpby_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_AXPBY_MV_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_avail.hpp.in index 525031c93f..01ecc16288 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_avail.hpp.in +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_DOT_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..827abeef6f --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_DOT_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_DOT_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_DOT_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_avail.hpp.in index 9145ec9461..ca52819bc3 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_avail.hpp.in +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_DOT_MV_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..96acefb108 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_dot_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_DOT_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_DOT_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_DOT_MV_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..17b61a8857 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_IAMAX_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_IAMAX_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_mv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..35d654012e --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_iamax_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_IAMAX_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_IAMAX_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_avail.hpp.in index 0fa727b8ba..9fe55baf2a 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_avail.hpp.in +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_MULT_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..8a64db6ba6 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_MULT_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_MULT_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_MULT_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_avail.hpp.in index 56f7771956..b1483c8fb0 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_avail.hpp.in +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_MULT_MV_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..945d3a837a --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_mult_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_MULT_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_MULT_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_MULT_MV_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..1c9a088122 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRM1_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRM1_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRM1_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_mv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..d2a322a0ad --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm1_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRM1_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRM1_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRM1_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_avail.hpp.in index 5398eb1c34..659c575afa 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_avail.hpp.in +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_NRM2_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..444776c725 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRM2_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRM2_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRM2_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_avail.hpp.in index b0446fc62e..1b1c8f3dba 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_avail.hpp.in +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS1_NRM2_MV_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..a75bb102dc --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRM2_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRM2_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRM2_MV_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..bd7d1b11b8 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRM2W_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRM2W_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRM2W_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_mv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..0a0aadc87a --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrm2w_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRM2W_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRM2W_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..3f1e874724 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRMINF_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRMINF_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRMINF_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_mv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..17559306bf --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_nrminf_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_NRMINF_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_NRMINF_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..7ac4b74ea4 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_RECIPROCAL_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_RECIPROCAL_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_mv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..f40958465f --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_reciprocal_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_RECIPROCAL_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_RECIPROCAL_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_rot_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_rot_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..5e6b197460 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_rot_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_ROT_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_ROT_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_ROT_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_rotg_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_rotg_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..214dcd75c2 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_rotg_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_ROTG_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_ROTG_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS1_ROTG_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_rotm_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_rotm_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..3fdb009574 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_rotm_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_ROTM_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_ROTM_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS1_ROTM_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_rotmg_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_rotmg_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..83465d83cf --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_rotmg_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_ROTMG_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_ROTMG_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_ROTMG_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..56d40c9883 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_SCAL_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_SCAL_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_SCAL_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_mv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..953f8e6954 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_scal_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_SCAL_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_SCAL_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..bdac3456e8 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_SUM_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_SUM_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_SUM_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_mv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..5182f61985 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_sum_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_SUM_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_SUM_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_SUM_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_swap_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_swap_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..e67c630a98 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_swap_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +*/ +#ifndef KOKKOSBLAS1_SWAP_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_SWAP_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS1_SWAP_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_update_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_update_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..cff04c9fbe --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_update_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_UPDATE_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_UPDATE_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_UPDATE_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas1_update_mv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas1_update_mv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..deec84712b --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas1_update_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS1_UPDATE_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS1_UPDATE_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_avail.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_avail.hpp.in index 1a7cd9acce..aa52a7d4b5 100644 --- a/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_avail.hpp.in +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_avail.hpp.in @@ -19,6 +19,6 @@ namespace KokkosBlas { namespace Impl { @BLAS2_GEMV_ETI_AVAIL_BLOCK@ - } //IMPL +} //IMPL } //Kokkos #endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..606ec52ae2 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_gemv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GEMV_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_GEMV_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS2_GEMV_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..3ca1a64a8e --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_ger_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_GER_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_GER_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS2_GER_ETI_DECL_BLOCK@ +} // namespace Impl +} // namespace KokkosBlas +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..315f62fdfb --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_SYR2_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS2_SYR2_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..370c6bf96c --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas2_syr_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_SYR_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { +@BLAS2_SYR_ETI_DECL_BLOCK@ +} //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas3_gemm_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas3_gemm_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..22ea9a1ed1 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas3_gemm_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS3_GEMM_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { +@BLAS3_GEMM_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas3_trmm_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas3_trmm_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..e802ccf4fc --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas3_trmm_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_HPP_ +namespace KokkosBlas { +namespace Impl { + +@BLAS3_TRMM_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosBlas +#endif // KOKKOSBLAS3_TRMM_ETI_SPEC_DECL_HPP_ diff --git a/blas/eti/generated_specializations_hpp/KokkosBlas3_trsm_eti_spec_decl.hpp.in b/blas/eti/generated_specializations_hpp/KokkosBlas3_trsm_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..d1b6384e78 --- /dev/null +++ b/blas/eti/generated_specializations_hpp/KokkosBlas3_trsm_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_HPP_ +#define KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_HPP_ + +namespace KokkosBlas { +namespace Impl { + +@BLAS3_TRSM_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosBlas +#endif // KOKKOSBLAS3_TRSM_ETI_SPEC_DECL_HPP_ diff --git a/blas/impl/KokkosBlas1_abs_impl.hpp b/blas/impl/KokkosBlas1_abs_impl.hpp index 0c674f25f5..181708e94f 100644 --- a/blas/impl/KokkosBlas1_abs_impl.hpp +++ b/blas/impl/KokkosBlas1_abs_impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_IMPL_ABS_HPP_ -#define KOKKOS_BLAS1_IMPL_ABS_HPP_ +#ifndef KOKKOSBLAS1_IMPL_ABS_HPP_ +#define KOKKOSBLAS1_IMPL_ABS_HPP_ #include #include @@ -201,4 +201,4 @@ void V_Abs_Generic(const execution_space& space, const RV& R, const XV& X) { } // namespace Impl } // namespace KokkosBlas -#endif // KOKKOS_BLAS1_MV_IMPL_ABS_HPP_ +#endif // KOKKOSBLAS1_IMPL_ABS_HPP_ diff --git a/blas/impl/KokkosBlas1_abs_spec.hpp b/blas/impl/KokkosBlas1_abs_spec.hpp index fb6357b38e..a00daf0683 100644 --- a/blas/impl/KokkosBlas1_abs_spec.hpp +++ b/blas/impl/KokkosBlas1_abs_spec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_IMPL_ABS_SPEC_HPP_ -#define KOKKOS_BLAS1_IMPL_ABS_SPEC_HPP_ +#ifndef KOKKOSBLAS1_IMPL_ABS_SPEC_HPP_ +#define KOKKOSBLAS1_IMPL_ABS_SPEC_HPP_ #include #include @@ -229,5 +229,7 @@ struct Abs; #include +#include +#include -#endif // KOKKOS_BLAS1_MV_IMPL_ABS_HPP_ +#endif // KOKKOSBLAS1_IMPL_ABS_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_axpby_spec.hpp b/blas/impl/KokkosBlas1_axpby_spec.hpp index f4f85c8f6b..37d3ccd562 100644 --- a/blas/impl/KokkosBlas1_axpby_spec.hpp +++ b/blas/impl/KokkosBlas1_axpby_spec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_AXPBY_SPEC_HPP_ -#define KOKKOS_BLAS1_AXPBY_SPEC_HPP_ +#ifndef 
KOKKOSBLAS1_AXPBY_SPEC_HPP_ +#define KOKKOSBLAS1_AXPBY_SPEC_HPP_ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" @@ -513,6 +513,8 @@ struct Axpby, Kokkos::MemoryTraits >, \ 1, false, true>; +#include + #define KOKKOSBLAS1_AXPBY_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Axpby< \ EXEC_SPACE, SCALAR, \ @@ -559,6 +561,8 @@ struct Axpby, Kokkos::MemoryTraits >, \ 2, false, true>; +#include + #define KOKKOSBLAS1_AXPBY_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Axpby< \ EXEC_SPACE, SCALAR, \ @@ -580,4 +584,4 @@ struct Axpby -#endif // KOKKOS_BLAS1_MV_IMPL_AXPBY_HPP_ +#endif // KOKKOSBLAS1_AXPBY_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp index 0a03007801..84d91a106a 100644 --- a/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp +++ b/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ -#define KOKKOS_BLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ +#ifndef KOKKOSBLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ +#define KOKKOSBLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ #include #include @@ -103,7 +103,7 @@ struct AxpbyUnificationAttemptTraits { // - type names begin with upper case letters // ******************************************************************** public: - static constexpr bool onDevice = KokkosKernels::Impl::kk_is_gpu_exec_space(); + static constexpr bool onDevice = KokkosKernels::Impl::is_gpu_exec_space_v; private: static constexpr bool onHost = !onDevice; @@ -816,4 +816,4 @@ static void populateRank1Stride1ViewWithScalarOrNonStrideView(T_in const& coeff_ } // namespace Impl } // namespace KokkosBlas -#endif // KOKKOS_BLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ +#endif // KOKKOSBLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ diff --git 
a/blas/impl/KokkosBlas1_dot_spec.hpp b/blas/impl/KokkosBlas1_dot_spec.hpp index 982e2eaa0c..e2b2cc28c8 100644 --- a/blas/impl/KokkosBlas1_dot_spec.hpp +++ b/blas/impl/KokkosBlas1_dot_spec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_IMPL_DOT_SPEC_HPP_ -#define KOKKOS_BLAS1_IMPL_DOT_SPEC_HPP_ +#ifndef KOKKOSBLAS1_IMPL_DOT_SPEC_HPP_ +#define KOKKOSBLAS1_IMPL_DOT_SPEC_HPP_ #include #include @@ -410,6 +410,8 @@ struct Dot>, \ true>; +#include + #define KOKKOSBLAS1_DOT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Dot>, \ @@ -477,6 +479,8 @@ struct Dot>, \ 1, 2, false, true>; +#include + #define KOKKOSBLAS1_DOT_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Dot< \ EXEC_SPACE, \ @@ -508,4 +512,4 @@ struct Dot -#endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ +#endif // KOKKOSBLAS1_IMPL_DOT_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_iamax_spec.hpp b/blas/impl/KokkosBlas1_iamax_spec.hpp index 80e4cb6036..28978c034e 100644 --- a/blas/impl/KokkosBlas1_iamax_spec.hpp +++ b/blas/impl/KokkosBlas1_iamax_spec.hpp @@ -294,5 +294,7 @@ struct Iamax +#include +#include #endif // KOKKOSBLAS1_IAMAX_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_mult_spec.hpp b/blas/impl/KokkosBlas1_mult_spec.hpp index 3cd847dc1d..3b8904b1df 100644 --- a/blas/impl/KokkosBlas1_mult_spec.hpp +++ b/blas/impl/KokkosBlas1_mult_spec.hpp @@ -229,6 +229,8 @@ struct Mult >, \ 1, false, true>; +#include + #define KOKKOSBLAS1_MULT_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Mult< \ EXEC_SPACE, \ @@ -257,6 +259,8 @@ struct Mult >, \ 2, false, true>; +#include + #define KOKKOSBLAS1_MULT_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Mult< \ EXEC_SPACE, \ diff --git a/blas/impl/KokkosBlas1_nrm1_spec.hpp b/blas/impl/KokkosBlas1_nrm1_spec.hpp index 3977c5225c..25eeae5ee2 100644 --- a/blas/impl/KokkosBlas1_nrm1_spec.hpp +++ 
b/blas/impl/KokkosBlas1_nrm1_spec.hpp @@ -244,5 +244,7 @@ struct Nrm1; #include +#include +#include #endif // KOKKOSBLAS1_NRM1_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_nrm2_spec.hpp b/blas/impl/KokkosBlas1_nrm2_spec.hpp index 4d0b2e1396..b13290cc1e 100644 --- a/blas/impl/KokkosBlas1_nrm2_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm2_spec.hpp @@ -199,6 +199,8 @@ struct Nrm2 >, \ 1, false, true>; +#include + // // Macro for definition of full specialization of // KokkosBlas::Impl::Nrm2 for rank == 2. This is NOT for users!!! We @@ -229,6 +231,8 @@ struct Nrm2 >, \ 2, false, true>; +#include + // // Macro for definition of full specialization of // KokkosBlas::Impl::Nrm2 for rank == 2. This is NOT for users!!! We diff --git a/blas/impl/KokkosBlas1_nrm2w_spec.hpp b/blas/impl/KokkosBlas1_nrm2w_spec.hpp index 5660832139..99f82c535b 100644 --- a/blas/impl/KokkosBlas1_nrm2w_spec.hpp +++ b/blas/impl/KokkosBlas1_nrm2w_spec.hpp @@ -97,15 +97,15 @@ struct Nrm2w::value, "KokkosBlas::Impl::" - "Nrm2w<1-D>: RMV is not a Kokkos::View."); + "Nrm2w: RMV is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" - "Nrm2w<1-D>: XMV is not a Kokkos::View."); + "Nrm2w: XMV is not a Kokkos::View."); static_assert(RMV::rank == 0, - "KokkosBlas::Impl::Nrm2w<1-D>: " + "KokkosBlas::Impl::Nrm2w: " "RMV is not rank 0."); static_assert(XMV::rank == 1, - "KokkosBlas::Impl::Nrm2w<1-D>: " + "KokkosBlas::Impl::Nrm2w: " "XMV is not rank 1."); Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? 
"KokkosBlas::nrm2w[ETI]" : "KokkosBlas::nrm2w[noETI]"); @@ -135,15 +135,15 @@ struct Nrm2w::value, "KokkosBlas::Impl::" - "Nrm2w<2-D>: RV is not a Kokkos::View."); + "Nrm2w_mv: RV is not a Kokkos::View."); static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" - "Nrm2w<2-D>: XMV is not a Kokkos::View."); + "Nrm2w_mv: XMV is not a Kokkos::View."); static_assert(RV::rank == 1, - "KokkosBlas::Impl::Nrm2w<2-D>: " + "KokkosBlas::Impl::Nrm2w_mv: " "RV is not rank 1."); static_assert(XMV::rank == 2, - "KokkosBlas::Impl::Nrm2w<2-D>: " + "KokkosBlas::Impl::Nrm2w_mv: " "XMV is not rank 2."); Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::nrm2w[ETI]" : "KokkosBlas::nrm2w[noETI]"); @@ -191,7 +191,8 @@ struct Nrm2w::mag_type, \ + extern template struct Nrm2w::mag_type, \ LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -240,5 +241,7 @@ struct Nrm2w; #include +#include +#include #endif // KOKKOSBLAS1_NRM2W_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_nrminf_spec.hpp b/blas/impl/KokkosBlas1_nrminf_spec.hpp index e7b365ce85..eee326830b 100644 --- a/blas/impl/KokkosBlas1_nrminf_spec.hpp +++ b/blas/impl/KokkosBlas1_nrminf_spec.hpp @@ -233,5 +233,7 @@ struct NrmInf; #include +#include +#include #endif // KOKKOSBLAS1_NRMINF_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_reciprocal_impl.hpp b/blas/impl/KokkosBlas1_reciprocal_impl.hpp index 7ad6ab95db..b86ef116fb 100644 --- a/blas/impl/KokkosBlas1_reciprocal_impl.hpp +++ b/blas/impl/KokkosBlas1_reciprocal_impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_IMPL_RECIPROCAL_HPP_ -#define KOKKOS_BLAS1_IMPL_RECIPROCAL_HPP_ +#ifndef KOKKOSBLAS1_IMPL_RECIPROCAL_HPP_ +#define KOKKOSBLAS1_IMPL_RECIPROCAL_HPP_ #include #include @@ -203,4 +203,4 @@ void V_Reciprocal_Generic(const execution_space& space, const RV& R, const XV& X } // namespace Impl } // namespace KokkosBlas -#endif // 
KOKKOS_BLAS1_MV_IMPL_RECIPROCAL_HPP_ +#endif // KOKKOSBLAS1_IMPL_RECIPROCAL_HPP_ diff --git a/blas/impl/KokkosBlas1_reciprocal_spec.hpp b/blas/impl/KokkosBlas1_reciprocal_spec.hpp index 988043511b..efd6f5a9fe 100644 --- a/blas/impl/KokkosBlas1_reciprocal_spec.hpp +++ b/blas/impl/KokkosBlas1_reciprocal_spec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_IMPL_RECIPROCAL_SPEC_HPP_ -#define KOKKOS_BLAS1_IMPL_RECIPROCAL_SPEC_HPP_ +#ifndef KOKKOSBLAS1_IMPL_RECIPROCAL_SPEC_HPP_ +#define KOKKOSBLAS1_IMPL_RECIPROCAL_SPEC_HPP_ #include #include @@ -230,5 +230,7 @@ struct Reciprocal; #include +#include +#include -#endif // KOKKOS_BLAS1_MV_IMPL_RECIPROCAL_HPP_ +#endif // KOKKOSBLAS1_IMPL_RECIPROCAL_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_rot_spec.hpp b/blas/impl/KokkosBlas1_rot_spec.hpp index 4ca4d8d1ef..493cd648cf 100644 --- a/blas/impl/KokkosBlas1_rot_spec.hpp +++ b/blas/impl/KokkosBlas1_rot_spec.hpp @@ -124,5 +124,6 @@ struct Rot; #include +#include #endif // KOKKOSBLAS1_ROT_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_rotg_spec.hpp b/blas/impl/KokkosBlas1_rotg_spec.hpp index 87618f12c9..12752ab71c 100644 --- a/blas/impl/KokkosBlas1_rotg_spec.hpp +++ b/blas/impl/KokkosBlas1_rotg_spec.hpp @@ -123,5 +123,6 @@ struct Rotg; #include +#include #endif // KOKKOSBLAS1_ROTG_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_rotm_spec.hpp b/blas/impl/KokkosBlas1_rotm_spec.hpp index 5000b35fc3..8442a7b3e4 100644 --- a/blas/impl/KokkosBlas1_rotm_spec.hpp +++ b/blas/impl/KokkosBlas1_rotm_spec.hpp @@ -120,5 +120,6 @@ struct Rotm; #include +#include #endif // KOKKOSBLAS1_ROTM_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_rotmg_spec.hpp b/blas/impl/KokkosBlas1_rotmg_spec.hpp index caa44dda5d..631cee114e 100644 --- a/blas/impl/KokkosBlas1_rotmg_spec.hpp +++ b/blas/impl/KokkosBlas1_rotmg_spec.hpp @@ -124,5 +124,6 @@ struct Rotmg; #include +#include #endif // KOKKOSBLAS1_ROTMG_SPEC_HPP_ diff --git 
a/blas/impl/KokkosBlas1_scal_spec.hpp b/blas/impl/KokkosBlas1_scal_spec.hpp index 70a95d33e2..a52e9a3b61 100644 --- a/blas/impl/KokkosBlas1_scal_spec.hpp +++ b/blas/impl/KokkosBlas1_scal_spec.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS1_IMPL_SCAL_SPEC_HPP_ -#define KOKKOS_BLAS1_IMPL_SCAL_SPEC_HPP_ +#ifndef KOKKOSBLAS1_IMPL_SCAL_SPEC_HPP_ +#define KOKKOSBLAS1_IMPL_SCAL_SPEC_HPP_ #include #include @@ -292,6 +292,8 @@ struct Scal >, \ 1, false, true>; +#include + #define KOKKOSBLAS1_SCAL_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Scal< \ EXEC_SPACE, \ @@ -324,6 +326,8 @@ struct Scal >, \ 2, false, true>; +#include + #define KOKKOSBLAS1_SCAL_MV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct Scal< \ EXEC_SPACE, \ @@ -343,4 +347,4 @@ struct Scal -#endif // KOKKOS_BLAS1_MV_IMPL_SCAL_HPP_ +#endif // KOKKOSBLAS1_IMPL_SCAL_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_set_impl.hpp b/blas/impl/KokkosBlas1_set_impl.hpp index 037720253b..c676246b68 100644 --- a/blas/impl/KokkosBlas1_set_impl.hpp +++ b/blas/impl/KokkosBlas1_set_impl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef __KOKKOSBLAS_SET_IMPL_HPP__ -#define __KOKKOSBLAS_SET_IMPL_HPP__ +#ifndef KOKKOSBLAS_SET_IMPL_HPP +#define KOKKOSBLAS_SET_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/blas/impl/KokkosBlas1_sum_spec.hpp b/blas/impl/KokkosBlas1_sum_spec.hpp index 6df41e0309..2ade8b49a1 100644 --- a/blas/impl/KokkosBlas1_sum_spec.hpp +++ b/blas/impl/KokkosBlas1_sum_spec.hpp @@ -240,5 +240,7 @@ struct Sum; #include +#include +#include #endif // KOKKOSBLAS1_SUM_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas1_swap_spec.hpp b/blas/impl/KokkosBlas1_swap_spec.hpp index 749552a81c..314795125b 100644 --- a/blas/impl/KokkosBlas1_swap_spec.hpp +++ b/blas/impl/KokkosBlas1_swap_spec.hpp @@ -120,5 +120,6 @@ struct Swap; #include +#include #endif // KOKKOSBLAS1_SWAP_SPEC_HPP_ diff --git 
a/blas/impl/KokkosBlas1_update_spec.hpp b/blas/impl/KokkosBlas1_update_spec.hpp index b031a529b8..131ace8c25 100644 --- a/blas/impl/KokkosBlas1_update_spec.hpp +++ b/blas/impl/KokkosBlas1_update_spec.hpp @@ -335,5 +335,7 @@ struct Update; #include +#include +#include #endif // KOKKOSBLAS1_UPDATE_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas2_gemv_impl.hpp b/blas/impl/KokkosBlas2_gemv_impl.hpp index b1976e2622..6ae3c90582 100644 --- a/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_BLAS2_MV_IMPL_GEMV_HPP_ -#define KOKKOS_BLAS2_MV_IMPL_GEMV_HPP_ +#ifndef KOKKOSBLAS2_MV_IMPL_GEMV_HPP_ +#define KOKKOSBLAS2_MV_IMPL_GEMV_HPP_ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" @@ -623,7 +623,7 @@ void twoLevelGemv(const ExecutionSpace& space, const char trans[], typename AVie // depending on whether execution space is CPU or GPU. enable_if makes sure // unused kernels are not instantiated. 
template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { @@ -631,7 +631,7 @@ void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename A } template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { @@ -641,4 +641,4 @@ void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename A } // namespace Impl } // namespace KokkosBlas -#endif // KOKKOS_BLAS2_MV_IMPL_GEMV_HPP_ +#endif // KOKKOSBLAS2_MV_IMPL_GEMV_HPP_ diff --git a/blas/impl/KokkosBlas2_gemv_spec.hpp b/blas/impl/KokkosBlas2_gemv_spec.hpp index 05e2d28bc7..b56ac4c1f6 100644 --- a/blas/impl/KokkosBlas2_gemv_spec.hpp +++ b/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -121,6 +121,8 @@ struct GEMV { Kokkos::View, Kokkos::MemoryTraits >, \ false, true>; +#include + #define KOKKOSBLAS2_GEMV_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ template struct GEMV< \ EXEC_SPACE, \ diff --git a/blas/impl/KokkosBlas2_ger_impl.hpp b/blas/impl/KokkosBlas2_ger_impl.hpp index 94eb1868f9..bfcff1fb0e 100644 --- a/blas/impl/KokkosBlas2_ger_impl.hpp +++ b/blas/impl/KokkosBlas2_ger_impl.hpp @@ -208,14 +208,14 @@ void teamParallelGer(const ExecutionSpace& space, const char trans[], const type // The 'enable_if' makes sure unused kernels are not instantiated. 
template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalGerImpl(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) { threadParallelGer(space, trans, alpha, x, y, A); } template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalGerImpl(const ExecutionSpace& space, const char trans[], const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) { teamParallelGer(space, trans, alpha, x, y, A); diff --git a/blas/impl/KokkosBlas2_ger_spec.hpp b/blas/impl/KokkosBlas2_ger_spec.hpp index 04e25ab422..806678fb25 100644 --- a/blas/impl/KokkosBlas2_ger_spec.hpp +++ b/blas/impl/KokkosBlas2_ger_spec.hpp @@ -125,5 +125,6 @@ struct GER { false, true>; #include +#include #endif // KOKKOSBLAS2_GER_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas2_serial_gemv_impl.hpp b/blas/impl/KokkosBlas2_serial_gemv_impl.hpp index 79f49fdd0e..d9fcb09815 100644 --- a/blas/impl/KokkosBlas2_serial_gemv_impl.hpp +++ b/blas/impl/KokkosBlas2_serial_gemv_impl.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBLAS_GEMV_SERIAL_IMPL_HPP__ -#define __KOKKOSBLAS_GEMV_SERIAL_IMPL_HPP__ +#ifndef KOKKOSBLAS_GEMV_SERIAL_IMPL_HPP +#define KOKKOSBLAS_GEMV_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp b/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp index 1b70413119..1a41ff4db3 100644 --- a/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp +++ b/blas/impl/KokkosBlas2_serial_gemv_inner_multiple_dot.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBLAS_INNER_MULTIPLE_DOT_PRODUCT_SERIAL_IMPL_HPP__ -#define __KOKKOSBLAS_INNER_MULTIPLE_DOT_PRODUCT_SERIAL_IMPL_HPP__ +#ifndef 
KOKKOSBLAS_INNER_MULTIPLE_DOT_PRODUCT_SERIAL_IMPL_HPP +#define KOKKOSBLAS_INNER_MULTIPLE_DOT_PRODUCT_SERIAL_IMPL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) @@ -356,4 +356,4 @@ KOKKOS_INLINE_FUNCTION int InnerMultipleDotProduct<1>::serial_invoke(const Scala } // namespace Impl } // namespace KokkosBlas -#endif // __KOKKOSBLAS_INNER_MULTIPLE_DOT_PRODUCT_SERIAL_IMPL_HPP__ +#endif // KOKKOSBLAS_INNER_MULTIPLE_DOT_PRODUCT_SERIAL_IMPL_HPP diff --git a/blas/impl/KokkosBlas2_serial_gemv_internal.hpp b/blas/impl/KokkosBlas2_serial_gemv_internal.hpp index 912972c7ee..d0be91e84b 100644 --- a/blas/impl/KokkosBlas2_serial_gemv_internal.hpp +++ b/blas/impl/KokkosBlas2_serial_gemv_internal.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef __KOKKOSBLAS_GEMV_SERIAL_INTERNAL_HPP__ -#define __KOKKOSBLAS_GEMV_SERIAL_INTERNAL_HPP__ +#ifndef KOKKOSBLAS_GEMV_SERIAL_INTERNAL_HPP +#define KOKKOSBLAS_GEMV_SERIAL_INTERNAL_HPP /// \author Kyungjoo Kim (kyukim@sandia.gov) diff --git a/blas/impl/KokkosBlas2_syr2_impl.hpp b/blas/impl/KokkosBlas2_syr2_impl.hpp index 7bcb0069ab..5f134c097d 100644 --- a/blas/impl/KokkosBlas2_syr2_impl.hpp +++ b/blas/impl/KokkosBlas2_syr2_impl.hpp @@ -292,7 +292,7 @@ void teamParallelSyr2(const ExecutionSpace& space, const typename AViewType::con template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalSyr2Impl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) { threadParallelSyr2(space, alpha, @@ -301,7 +301,7 @@ void generalSyr2Impl(const ExecutionSpace& space, const typename AViewType::cons template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalSyr2Impl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const YViewType& y, const AViewType& A) { teamParallelSyr2(space, alpha, x, diff --git 
a/blas/impl/KokkosBlas2_syr2_spec.hpp b/blas/impl/KokkosBlas2_syr2_spec.hpp index a8ae741ede..7e4b1bcab2 100644 --- a/blas/impl/KokkosBlas2_syr2_spec.hpp +++ b/blas/impl/KokkosBlas2_syr2_spec.hpp @@ -154,5 +154,6 @@ struct SYR2 { false, true>; #include +#include #endif // KOKKOSBLAS2_SYR2_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas2_syr_impl.hpp b/blas/impl/KokkosBlas2_syr_impl.hpp index 7685fd4b4b..11aa3a5052 100644 --- a/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/blas/impl/KokkosBlas2_syr_impl.hpp @@ -206,14 +206,14 @@ void teamParallelSyr(const ExecutionSpace& space, const typename AViewType::cons // The 'enable_if' makes sure unused kernels are not instantiated. template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalSyrImpl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { threadParallelSyr(space, alpha, x, A); } template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> void generalSyrImpl(const ExecutionSpace& space, const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { teamParallelSyr(space, alpha, x, A); diff --git a/blas/impl/KokkosBlas2_syr_spec.hpp b/blas/impl/KokkosBlas2_syr_spec.hpp index 58c7753618..74112c30cd 100644 --- a/blas/impl/KokkosBlas2_syr_spec.hpp +++ b/blas/impl/KokkosBlas2_syr_spec.hpp @@ -146,5 +146,6 @@ struct SYR { false, true>; #include +#include #endif // KOKKOSBLAS2_SYR_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp b/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp index 15c3c74ecd..72a08ec749 100644 --- a/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_BLAS3_GEMM_DOTBASED_IMPL_HPP_ -#define KOKKOS_BLAS3_GEMM_DOTBASED_IMPL_HPP_ +#ifndef KOKKOSBLAS3_GEMM_DOTBASED_IMPL_HPP_ +#define KOKKOSBLAS3_GEMM_DOTBASED_IMPL_HPP_ #include "KokkosBlas_util.hpp" @@ -141,4 +141,4 @@ 
struct DotBasedGEMM { } // namespace Impl } // namespace KokkosBlas -#endif +#endif // KOKKOSBLAS3_GEMM_DOTBASED_IMPL_HPP_ diff --git a/blas/impl/KokkosBlas3_gemm_impl.hpp b/blas/impl/KokkosBlas3_gemm_impl.hpp index 675ef5d3a4..38f3fe5f8d 100644 --- a/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_BLAS3_GEMM_IMPL_HPP_ -#define KOKKOS_BLAS3_GEMM_IMPL_HPP_ +#ifndef KOKKOSBLAS3_GEMM_IMPL_HPP_ +#define KOKKOSBLAS3_GEMM_IMPL_HPP_ #include #include "KokkosKernels_Macros.hpp" @@ -23,7 +23,7 @@ #ifdef KOKKOS_ENABLE_CXX14 #ifdef KOKKOS_COMPILER_GNU #if KOKKOS_COMPILER_GNU <= 740 -#define KOKKOS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND +#define KOKKOSKERNELS_IMPL_BATCHED_GEMM_GCC_CXX14_WORKAROUND #endif #endif #endif @@ -69,11 +69,11 @@ struct impl_deep_copy_matrix_block(C.extent(0)); const int N = static_cast(C.extent(1)); - const bool is_device_space = KokkosKernels::Impl::kk_is_gpu_exec_space(); + const bool is_device_space = KokkosKernels::Impl::is_gpu_exec_space_v; const bool A_is_lr = std::is_same::value; const bool A_is_tr = ((transA[0] == 'T') || (transA[0] == 't') || (transA[0] == 'C') || (transA[0] == 'c')); const bool B_is_tr = ((transB[0] == 'T') || (transB[0] == 't') || (transB[0] == 'C') || (transB[0] == 'c')); @@ -272,5 +272,6 @@ struct GEMM { MEM_SPACE) #include +#include #endif // KOKKOSBLAS3_GEMM_SPEC_HPP_ diff --git a/blas/impl/KokkosBlas3_trmm_spec.hpp b/blas/impl/KokkosBlas3_trmm_spec.hpp index 6399f9e57e..f5520656ad 100644 --- a/blas/impl/KokkosBlas3_trmm_spec.hpp +++ b/blas/impl/KokkosBlas3_trmm_spec.hpp @@ -138,6 +138,8 @@ struct TRMM + #define KOKKOSBLAS3_TRMM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ KOKKOSBLAS3_TRMM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, MEM_SPACE) diff --git a/blas/impl/KokkosBlas3_trsm_spec.hpp b/blas/impl/KokkosBlas3_trsm_spec.hpp index 8c9088e970..7c5c6fc3eb 100644 --- a/blas/impl/KokkosBlas3_trsm_spec.hpp +++ 
b/blas/impl/KokkosBlas3_trsm_spec.hpp @@ -138,6 +138,8 @@ struct TRSM + #define KOKKOSBLAS3_TRSM_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ KOKKOSBLAS3_TRSM_ETI_SPEC_INST_LAYOUTS(SCALAR, LAYOUT, LAYOUT, EXEC_SPACE, MEM_SPACE) diff --git a/blas/impl/KokkosBlas_util.hpp b/blas/impl/KokkosBlas_util.hpp index 885625673f..c0777ac9ea 100644 --- a/blas/impl/KokkosBlas_util.hpp +++ b/blas/impl/KokkosBlas_util.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_BLAS_UTIL_HPP -#define KOKKOS_BLAS_UTIL_HPP +#ifndef KOKKOSBLAS_UTIL_HPP +#define KOKKOSBLAS_UTIL_HPP #include "Kokkos_ArithTraits.hpp" @@ -86,6 +86,7 @@ struct Algo { using QR = Level3; using UTV = Level3; using Pttrf = Level3; + using Pttrs = Level3; struct Level2 { struct Unblocked {}; @@ -118,6 +119,8 @@ struct Algo { using Trsv = Level2; using ApplyQ = Level2; using Tbsv = Level2; + using Pbtrf = Level2; + using Pbtrs = Level2; }; namespace Impl { @@ -174,4 +177,4 @@ struct TakeSqrtFunctor { } // namespace Impl } // namespace KokkosBlas -#endif +#endif // KOKKOSBLAS_UTIL_HPP diff --git a/blas/src/KokkosBlas1_dot.hpp b/blas/src/KokkosBlas1_dot.hpp index 6e1a428b51..477dac864c 100644 --- a/blas/src/KokkosBlas1_dot.hpp +++ b/blas/src/KokkosBlas1_dot.hpp @@ -77,9 +77,9 @@ typename Kokkos::Details::InnerProductSpaceTraits::type; using RVector_Internal = - Kokkos::View>; - using RVector_Result = - Kokkos::View>; + Kokkos::View>; + using RVector_Result = Kokkos::View>; XVector_Internal X = x; YVector_Internal Y = y; diff --git a/blas/src/KokkosBlas1_nrm1.hpp b/blas/src/KokkosBlas1_nrm1.hpp index bf7119a585..807475d8f4 100644 --- a/blas/src/KokkosBlas1_nrm1.hpp +++ b/blas/src/KokkosBlas1_nrm1.hpp @@ -49,8 +49,8 @@ typename Kokkos::Details::InnerProductSpaceTraits::array_layout, typename XVector::device_type, Kokkos::MemoryTraits >; - using RVector_Internal = - Kokkos::View >; + using RVector_Internal = Kokkos::View >; mag_type result; RVector_Internal R = RVector_Internal(&result); diff --git 
a/blas/src/KokkosBlas1_nrm2_squared.hpp b/blas/src/KokkosBlas1_nrm2_squared.hpp index 748ece3663..e8b0a63d04 100644 --- a/blas/src/KokkosBlas1_nrm2_squared.hpp +++ b/blas/src/KokkosBlas1_nrm2_squared.hpp @@ -55,7 +55,8 @@ typename Kokkos::Details::InnerProductSpaceTraits > XVector_Internal; - typedef Kokkos::View > + typedef Kokkos::View > RVector_Internal; mag_type result; diff --git a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp index 5ab29e632f..34852fa5b8 100644 --- a/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_axpby_tpl_spec_decl.hpp @@ -222,9 +222,9 @@ namespace Impl { const int N = static_cast(numElems); \ constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasDaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else \ Axpby::axpby(space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ @@ -258,89 +258,89 @@ namespace Impl { const int N = static_cast(numElems); \ constexpr int one = 1; \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSaxpy(s.handle, N, &alpha, X.data(), one, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSaxpy(s.handle, N, 
&alpha, X.data(), one, Y.data(), one)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else \ Axpby::axpby(space, alpha, X, beta, Y); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type size_type; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZaxpy(s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef 
Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZaxpy(s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_CAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Axpby, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::complex, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex AV; \ - typedef Kokkos::complex BV; \ - typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > \ - YV; \ - typedef typename XV::size_type size_type; \ - \ - static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if 
((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ - axpby_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCaxpy(s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else \ - Axpby::axpby(space, alpha, X, beta, Y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CAXPBY_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ + template \ + struct Axpby, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::complex, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex AV; \ + typedef Kokkos::complex BV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits > \ + YV; \ + typedef typename XV::size_type size_type; \ + \ + static void axpby(const ExecSpace& space, const AV& alpha, const XV& X, const BV& beta, const YV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::axpby[TPL_CUBLAS,complex]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (beta == 1.0f)) { \ + axpby_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCaxpy(s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one)); \ + 
KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + } else \ + Axpby::axpby(space, alpha, X, beta, Y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DAXPBY_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp index fa9d5fafce..8533ca6f85 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp @@ -117,11 +117,11 @@ namespace Impl { dot_print_specialization(); \ const int N = static_cast(numElems); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(TPL_DOT(s.handle, N, reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1, \ - reinterpret_cast(&R()))); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(TPL_DOT(s.handle, N, reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ Dot::dot(space, R, X, Y); \ } \ @@ -175,11 +175,11 @@ namespace Impl { dot_print_specialization(); \ const rocblas_int N = static_cast(numElems); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(TPL_DOT(s.handle, N, reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1, \ - reinterpret_cast(&R()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(TPL_DOT(s.handle, N, 
reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ Dot::dot(space, R, X, Y); \ } \ diff --git a/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp index c85de4d186..c9c765215c 100644 --- a/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_iamax_tpl_spec_decl.hpp @@ -146,17 +146,17 @@ using CUBLASUVM_DEVICE_TYPE = Kokkos::Device const int XST = X.stride(0); \ const int LDX = (XST == 0) ? 1 : XST; \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ cublasPointerMode_t prevPtrMode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &prevPtrMode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(s.handle, &prevPtrMode)); \ if (prevPtrMode == CUBLAS_PTR_MODE_2) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_1)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_1)); \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN(s.handle, N, reinterpret_cast(X.data()), LDX, \ - reinterpret_cast(R.data()))); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(CUBLAS_FN(s.handle, N, reinterpret_cast(X.data()), \ + LDX, reinterpret_cast(R.data()))); \ if (prevPtrMode == CUBLAS_PTR_MODE_2) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_2)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, CUBLAS_PTR_MODE_2)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } \ } else { \ Iamax::iamax(space, R, X); \ @@ -280,52 +280,53 @@ namespace Impl { using ROCBLAS_DEVICE_TYPE = 
Kokkos::Device; -#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, \ - LAYOUT, MEMSPACE, ETI_SPEC_AVAIL, RET_DEVICE_TYPE, \ - ROCBLAS_PTR_MODE_1, ROCBLAS_PTR_MODE_2) \ - template <> \ - struct Iamax >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = Kokkos::HIP; \ - typedef Kokkos::View > RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void iamax(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::iamax[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { \ - Kokkos::deep_copy(R, 0); \ - return; \ - } \ - if (numElems < static_cast(INT_MAX)) { \ - iamax_print_specialization(); \ - const int N = static_cast(numElems); \ - const int XST = X.stride(0); \ - const int LDX = (XST == 0) ? 1 : XST; \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode prevPtrMode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &prevPtrMode)); \ - if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_1)); \ - } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, N, reinterpret_cast(X.data()), \ - LDX, reinterpret_cast(R.data()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_2)); \ - } \ - } else { \ - Iamax::iamax(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS_WRAPPER(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, \ + LAYOUT, 
MEMSPACE, ETI_SPEC_AVAIL, RET_DEVICE_TYPE, \ + ROCBLAS_PTR_MODE_1, ROCBLAS_PTR_MODE_2) \ + template <> \ + struct Iamax >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = Kokkos::HIP; \ + typedef Kokkos::View > RV; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void iamax(const execution_space& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::iamax[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems == 0) { \ + Kokkos::deep_copy(R, 0); \ + return; \ + } \ + if (numElems < static_cast(INT_MAX)) { \ + iamax_print_specialization(); \ + const int N = static_cast(numElems); \ + const int XST = X.stride(0); \ + const int LDX = (XST == 0) ? 1 : XST; \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode prevPtrMode; \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(s.handle, &prevPtrMode)); \ + if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_1)); \ + } \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(ROCBLAS_FN(s.handle, N, \ + reinterpret_cast(X.data()), LDX, \ + reinterpret_cast(R.data()))); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + if (prevPtrMode == ROCBLAS_PTR_MODE_2) { \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, ROCBLAS_PTR_MODE_2)); \ + } \ + } else { \ + Iamax::iamax(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_XIAMAX_TPL_SPEC_DECL_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, INDEX_TYPE, LAYOUT, \ diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index 
37876d0129..6de384380e 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -95,7 +95,7 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLe // oneMKL #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#if defined(KOKKOS_ENABLE_SYCL) #define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(SCALAR, LAYOUT, MEMSPACE) \ template \ diff --git a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index 1bf740b3fb..5503445227 100644 --- a/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -119,22 +119,22 @@ void cublasAsumWrapper(const ExecutionSpace& space, RViewType& R, const XViewTyp constexpr int one = 1; KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); if constexpr (std::is_same_v) { - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSasum(s.handle, N, X.data(), one, R.data())); + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v) { - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDasum(s.handle, N, X.data(), one, R.data())); + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasDasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUBLAS_SAFE_CALL_IMPL( + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( cublasScasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUBLAS_SAFE_CALL_IMPL( + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( cublasDzasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); } 
#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ @@ -201,22 +201,22 @@ void rocblasAsumWrapper(const ExecutionSpace& space, RViewType& R, const XViewTy constexpr int one = 1; KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sasum(s.handle, N, X.data(), one, R.data())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_sasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dasum(s.handle, N, X.data(), one, R.data())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_dasum(s.handle, N, X.data(), one, R.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( rocblas_scasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( rocblas_dzasum(s.handle, N, reinterpret_cast(X.data()), one, R.data())); } - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); } #define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ @@ -263,7 +263,7 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, Kokkos::LayoutLe // oneMKL #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#if defined(KOKKOS_ENABLE_SYCL) #include #include diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index 4d1a238740..b7b70b5edb 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -75,8 +75,7 @@ 
KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSp KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) #endif -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) && \ - defined(KOKKOS_ENABLE_SYCL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace) #endif diff --git a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp index dfd6150914..3fba6d03e7 100644 --- a/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_nrm2_tpl_spec_decl.hpp @@ -175,37 +175,37 @@ KOKKOSBLAS1_CNRM2_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, fals namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_NRM2, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm2::mag_type, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using RT = Kokkos::ArithTraits::mag_type; \ - using RV = Kokkos::View >; \ - using XV = Kokkos::View, \ - Kokkos::MemoryTraits >; \ - using size_type = typename XV::size_type; \ - \ - static void nrm2(const EXECSPACE& space, RV& R, const XV& X, const bool& take_sqrt) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS," + Kokkos::ArithTraits::name() + "]"); \ - const size_type numElems = X.extent(0); \ - if (numElems <= static_cast(std::numeric_limits::max())) { \ - nrm2_print_specialization(); \ - const int N = static_cast(numElems); \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(TPL_NRM2(s.handle, N, 
reinterpret_cast(X.data()), 1, &R())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - if (!take_sqrt) R() = R() * R(); \ - } else { \ - Nrm2::nrm2(space, R, X, take_sqrt); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, EXECSPACE, MEMSPACE, TPL_NRM2, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct Nrm2::mag_type, LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using RT = Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View >; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using size_type = typename XV::size_type; \ + \ + static void nrm2(const EXECSPACE& space, RV& R, const XV& X, const bool& take_sqrt) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm2[TPL_CUBLAS," + Kokkos::ArithTraits::name() + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems <= static_cast(std::numeric_limits::max())) { \ + nrm2_print_specialization(); \ + const int N = static_cast(numElems); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), 1, &R())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + if (!take_sqrt) R() = R() * R(); \ + } else { \ + Nrm2::nrm2(space, R, X, take_sqrt); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ @@ -254,9 +254,10 @@ namespace Impl { nrm2_print_specialization(); \ const rocblas_int N = static_cast(numElems); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL(TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), 1, &R())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + TPL_NRM2(s.handle, N, reinterpret_cast(X.data()), 1, &R())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ if (!take_sqrt) R() = R() * R(); \ } else { \ Nrm2::nrm2(space, R, X, take_sqrt); \ @@ -283,8 +284,7 @@ KOKKOSBLAS1_NRM2_TPL_SPEC_DECL_ROCBLAS_EXT(false) #endif -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) && \ - defined(KOKKOS_ENABLE_SYCL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) #include #include #include diff --git a/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp index 404c5c0e3b..dfe747bf88 100644 --- a/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_rot_tpl_spec_decl.hpp @@ -167,12 +167,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,double]"); \ rot_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ cublasDrot(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1, c.data(), 
s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -193,12 +193,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,float]"); \ rot_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ cublasSrot(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1, c.data(), s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -220,13 +220,13 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,complex]"); \ rot_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - 
KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ cublasZdrot(singleton.handle, X.extent_int(0), reinterpret_cast(X.data()), 1, \ reinterpret_cast(Y.data()), 1, c.data(), s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -248,13 +248,13 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rot[TPL_CUBLAS,complex]"); \ rot_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ cublasCsrot(singleton.handle, X.extent_int(0), reinterpret_cast(X.data()), 1, \ reinterpret_cast(Y.data()), 1, c.data(), s.data()); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp index e6583d5ae3..33e855fdf6 
100644 --- a/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_rotg_tpl_spec_decl.hpp @@ -193,12 +193,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,double]"); \ rotg_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasDrotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -219,12 +219,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,float]"); \ rotg_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - 
KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSrotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -246,14 +246,14 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,complex]"); \ rotg_print_specialization, EXECSPACE>(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZrotg(singleton.handle, reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZrotg(singleton.handle, reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + 
reinterpret_cast(s.data()))); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -275,14 +275,14 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_CUBLAS,complex]"); \ rotg_print_specialization, EXECSPACE>(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCrotg(singleton.handle, reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCrotg(singleton.handle, reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + reinterpret_cast(s.data()))); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -351,12 +351,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,double]"); \ rotg_print_specialization(); \ KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, 
space.hip_stream())); \ rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_drotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_drotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -377,44 +377,44 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,float]"); \ rotg_print_specialization(); \ KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_srotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, 
rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_srotg(singleton.handle, a.data(), b.data(), c.data(), s.data())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Rotg< \ - EXECSPACE, \ - Kokkos::View, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using SViewType = Kokkos::View, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using MViewType = \ - Kokkos::View, Kokkos::MemoryTraits>; \ - static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ - SViewType const& s) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,complex]"); \ - rotg_print_specialization, EXECSPACE>(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zrotg(singleton.handle, \ - reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Rotg< \ + EXECSPACE, \ + Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + 
using SViewType = Kokkos::View, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using MViewType = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void rotg(EXECSPACE const& space, SViewType const& a, SViewType const& b, MViewType const& c, \ + SViewType const& s) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,complex]"); \ + rotg_print_specialization, EXECSPACE>(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_zrotg(singleton.handle, \ + reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + reinterpret_cast(s.data()))); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_CROTG_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ @@ -434,15 +434,15 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotg[TPL_ROCBLAS,complex]"); \ rotg_print_specialization, EXECSPACE>(); \ KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ - 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_crotg(singleton.handle, \ - reinterpret_cast(a.data()), \ - reinterpret_cast(b.data()), c.data(), \ - reinterpret_cast(s.data()))); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(singleton.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_crotg(singleton.handle, \ + reinterpret_cast(a.data()), \ + reinterpret_cast(b.data()), c.data(), \ + reinterpret_cast(s.data()))); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(singleton.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp index 7bde6d0835..41b1719e71 100644 --- a/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_rotm_tpl_spec_decl.hpp @@ -108,12 +108,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_CUBLAS,double]"); \ rotm_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotm(s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, 
CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasDrotm(s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -140,12 +140,12 @@ KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_CUBLAS,float]"); \ rotm_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotm(s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSrotm(s.handle, X.extent(0), X.data(), 1, Y.data(), 1, param.data())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -184,13 +184,13 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_ROCBLAS,double]"); \ rotm_print_specialization(); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ rocblas_pointer_mode 
pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_drotm(s.handle, static_cast(X.extent(0)), X.data(), 1, Y.data(), 1, param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -217,13 +217,13 @@ KOKKOSBLAS1_DROTM_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos Kokkos::Profiling::pushRegion("KokkosBlas::rotm[TPL_ROCBLAS,float]"); \ rotm_print_specialization(); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_srotm(s.handle, static_cast(X.extent(0)), X.data(), 1, Y.data(), 1, param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, 
pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp index 0271cfd981..9ededf9916 100644 --- a/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_rotmg_tpl_spec_decl.hpp @@ -115,12 +115,13 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_CUBLAS,double]"); \ rotmg_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDrotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ + cublasDrotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -151,12 +152,13 @@ KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokko Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_CUBLAS,float]"); \ rotmg_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, 
space.cuda_stream())); \ cublasPointerMode_t pointer_mode; \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasGetPointerMode(s.handle, &pointer_mode)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSrotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetPointerMode(s.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasGetPointerMode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, CUBLAS_POINTER_MODE_DEVICE)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ + cublasSrotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetPointerMode(s.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -199,13 +201,13 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_ROCBLAS,double]"); \ rotmg_print_specialization(); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_drotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + 
KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -236,13 +238,13 @@ KOKKOSBLAS1_DROTMG_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokko Kokkos::Profiling::pushRegion("KokkosBlas::rotmg[TPL_ROCBLAS,float]"); \ rotmg_print_specialization(); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_device)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_srotmg(s.handle, d1.data(), d2.data(), x1.data(), y1.data(), param.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index 7083e28730..5ed846e1c7 100644 --- a/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -108,42 +108,42 @@ KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, fals namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Scal, \ - Kokkos::MemoryTraits >, \ - SCALAR_TYPE, \ - 
Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef SCALAR_TYPE AS; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void scal(const ExecSpace& space, const RV& R, const AS& alpha, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_CUBLAS," #SCALAR_TYPE "]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (R.data() == X.data())) { \ - scal_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN(s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(R.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - Scal::scal(space, R, alpha, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_CUBLAS(SCALAR_TYPE, CUDA_SCALAR_TYPE, CUBLAS_FN, LAYOUT, MEMSPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct Scal, \ + Kokkos::MemoryTraits >, \ + SCALAR_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + RV; \ + typedef SCALAR_TYPE AS; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void scal(const ExecSpace& space, const RV& R, const AS& alpha, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_CUBLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (R.data() == X.data())) { \ + scal_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ 
+ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(CUBLAS_FN(s.handle, N, reinterpret_cast(&alpha), \ + reinterpret_cast(R.data()), one)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + } else { \ + Scal::scal(space, R, alpha, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ @@ -196,46 +196,47 @@ KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, EXECSPACE, \ - MEMSPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Scal, \ - Kokkos::MemoryTraits >, \ - SCALAR_TYPE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef SCALAR_TYPE AS; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void scal(const execution_space& space, const RV& R, const AS& alpha, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ - const size_type numElems = X.extent(0); \ - if ((numElems < static_cast(INT_MAX)) && (R.data() == X.data())) { \ - scal_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - rocblas_pointer_mode pointer_mode; \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ - 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, N, reinterpret_cast(&alpha), \ - reinterpret_cast(R.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ - } else { \ - Scal::scal(space, R, alpha, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_XSCAL_TPL_SPEC_DECL_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, EXECSPACE, \ + MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Scal, \ + Kokkos::MemoryTraits >, \ + SCALAR_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, true, ETI_SPEC_AVAIL> { \ + using execution_space = EXECSPACE; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + RV; \ + typedef SCALAR_TYPE AS; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XV; \ + typedef typename XV::size_type size_type; \ + \ + static void scal(const execution_space& space, const RV& R, const AS& alpha, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::scal[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ + const size_type numElems = X.extent(0); \ + if ((numElems < static_cast(INT_MAX)) && (R.data() == X.data())) { \ + scal_print_specialization(); \ + const int N = static_cast(numElems); \ + constexpr int one = 1; \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + rocblas_pointer_mode pointer_mode; \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_get_pointer_mode(s.handle, &pointer_mode)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(ROCBLAS_FN(s.handle, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(R.data()), one)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, pointer_mode)); \ + } else { \ + 
Scal::scal(space, R, alpha, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ diff --git a/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp index e74b498c33..0b475c722f 100644 --- a/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_swap_tpl_spec_decl.hpp @@ -174,8 +174,8 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,double]"); \ swap_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasDswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -195,8 +195,8 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,float]"); \ swap_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -217,10 +217,10 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,complex]"); \ swap_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = 
KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZswap(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZswap(singleton.handle, X.extent_int(0), \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -241,10 +241,10 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_CUBLAS,complex]"); \ swap_print_specialization(); \ KokkosBlas::Impl::CudaBlasSingleton& singleton = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(singleton.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCswap(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(singleton.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCswap(singleton.handle, X.extent_int(0), \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -310,79 +310,79 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,double]"); \ swap_print_specialization(); \ KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_dswap(singleton.handle, X.extent_int(0), X.data(), 
1, Y.data(), 1)); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Swap< \ - EXECSPACE, \ - Kokkos::View, Kokkos::MemoryTraits>, \ - Kokkos::View, Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = \ - Kokkos::View, Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View, Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,float]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_sswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_SSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View, Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,float]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_sswap(singleton.handle, X.extent_int(0), X.data(), 1, Y.data(), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ - 
template <> \ - struct Swap*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,complex_double]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zswap(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_ZSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,complex_double]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_zswap(singleton.handle, X.extent_int(0), \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; 
-#define KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct Swap*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - true, ETI_SPEC_AVAIL> { \ - using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>; \ - static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,complex_float]"); \ - swap_print_specialization(); \ - KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cswap(singleton.handle, X.extent_int(0), \ - reinterpret_cast(X.data()), 1, \ - reinterpret_cast(Y.data()), 1)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS1_CSWAP_TPL_SPEC_DECL_ROCBLAS(LAYOUT, EXECSPACE, MEMSPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct Swap*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using XVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + static void swap(EXECSPACE const& space, XVector const& X, YVector const& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::swap[TPL_ROCBLAS,complex_float]"); \ + swap_print_specialization(); \ + KokkosBlas::Impl::RocBlasSingleton& singleton = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(singleton.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_cswap(singleton.handle, X.extent_int(0), \ + 
reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS1_DSWAP_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index 709f261b63..679a5ddace 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -129,7 +129,7 @@ KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, Kokkos::LayoutRi #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#if defined(KOKKOS_ENABLE_SYCL) #define KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, LAYOUT) \ template \ diff --git a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 4234afbd77..fdfd17e454 100644 --- a/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -253,10 +253,10 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,double]"); \ KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasDgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -288,10 +288,10 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,float]"); \ KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ 
KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasSgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -323,12 +323,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgemv( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZgemv( \ s.handle, transa, M, N, reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, reinterpret_cast(X.data()), \ one, reinterpret_cast(&beta), reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -360,12 +360,12 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ 
cublasCgemv(s.handle, transa, M, N, reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, reinterpret_cast(X.data()), \ one, reinterpret_cast(&beta), reinterpret_cast(Y.data()), one)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -448,10 +448,10 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,double]"); \ KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_dgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -483,90 +483,90 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,float]"); \ KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_sgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), one, &beta, Y.data(), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS2_ZGEMV_ROCBLAS(LAYOUT, 
MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgemv(s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void 
gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_zgemv(s.handle, transa, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgemv(s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGEMV_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const ExecSpace& space, const char trans[], typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_cgemv(s.handle, transa, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS2_DGEMV_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, true) @@ -594,8 +594,7 @@ KOKKOSBLAS2_CGEMV_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIPSpace, false) #endif // 
KOKKOSKERNELS_ENABLE_TPL_ROCBLAS // ONEMKL -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) && \ - defined(KOKKOS_ENABLE_SYCL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) #include #include #include diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp index fdb09d1c91..6bc5eaef62 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp @@ -56,52 +56,56 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,double]"); \ KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ + cublasDger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ + cublasDger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct GER< \ - EXEC_SPACE, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - 
Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void ger(const EXEC_SPACE& space, const char /*trans*/[], typename AViewType::const_value_type& alpha, \ - const XViewType& X, const YViewType& Y, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct GER< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void ger(const EXEC_SPACE& space, const char /*trans*/[], typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + 
KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ + cublasSger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ + } else { \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ + cublasSger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ + } \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS2_ZGER_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ @@ -131,31 +135,34 @@ namespace Impl { KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (A_is_ll) { \ if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru(s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZgeru(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgerc(s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZgerc(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } \ } else { \ if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgeru(s.handle, M, N, reinterpret_cast(&alpha), \ - 
reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZgeru(s.handle, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ /* cublasZgerc() + ~A_ll => call kokkos-kernels' implementation */ \ GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -187,31 +194,31 @@ namespace Impl { KOKKOSBLAS2_GER_CUBLAS_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (A_is_ll) { \ if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru(s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgerc(s.handle, M, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCgerc(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } \ } else { \ if (justTranspose) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgeru(s.handle, M, N, 
reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ /* cublasCgerc() + ~A_ll => call kokkos-kernels' implementation */ \ GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp index 26a0da5864..d1171208c5 100644 --- a/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp @@ -56,15 +56,15 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,double]"); \ KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ if (A_is_ll) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_dger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_dger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -95,15 +95,15 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::ger[TPL_ROCBLAS,float]"); \ 
KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ if (A_is_ll) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_sger(s.handle, M, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_sger(s.handle, M, N, &alpha, Y.data(), one, X.data(), one, A.data(), LDA)); \ } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -135,34 +135,34 @@ namespace Impl { KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ if (A_is_ll) { \ if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru(s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_zgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgerc(s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + 
rocblas_zgerc(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } \ } else { \ if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgeru(s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_zgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ /* rocblas_zgerc() + ~A_ll => call k-kernels' implementation */ \ GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -194,34 +194,34 @@ namespace Impl { KOKKOSBLAS2_GER_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ if (A_is_ll) { \ if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru(s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_cgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgerc(s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - 
reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_cgerc(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } \ } else { \ if (justTranspose) { \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgeru(s.handle, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_cgeru(s.handle, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ } else { \ /* rocblas_cgerc() + ~A_ll => call k-kernels' implementation */ \ GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp index d894433540..54c86491e6 100644 --- a/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp @@ -35,7 +35,7 @@ namespace Impl { // Note: using GEMM because there is no GEMV in MKL compact routines -#define __IMPL_KK_MKL_DGEMM_COMPACT(SCALAR, MKL_ROUTINE) \ +#define KOKKOSBLAS_IMPL_MKL_DGEMM_COMPACT(SCALAR, MKL_ROUTINE) \ inline void kk_mkl_gemm_compact(MKL_LAYOUT layout, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, MKL_INT m, MKL_INT n, \ MKL_INT k, SCALAR alpha, const SCALAR *a, MKL_INT ldap, const SCALAR *b, \ MKL_INT ldbp, SCALAR beta, SCALAR *c, MKL_INT ldcp, MKL_COMPACT_PACK format, \ @@ -43,12 +43,12 @@ namespace Impl { MKL_ROUTINE(layout, transa, transb, m, n, k, alpha, a, ldap, b, ldbp, beta, c, ldcp, format, nm); \ } -__IMPL_KK_MKL_DGEMM_COMPACT(double, mkl_dgemm_compact) 
-__IMPL_KK_MKL_DGEMM_COMPACT(float, mkl_sgemm_compact) +KOKKOSBLAS_IMPL_MKL_DGEMM_COMPACT(double, mkl_dgemm_compact) +KOKKOSBLAS_IMPL_MKL_DGEMM_COMPACT(float, mkl_sgemm_compact) // Note: MKL compact format packs real and imaginary components separately // which makes it not directly compatible with our Vector types -#undef __IMPL_KK_MKL_DGEMM_COMPACT +#undef KOKKOSBLAS_IMPL_MKL_DGEMM_COMPACT template inline MKL_COMPACT_PACK mkl_compact_format() { diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp index 4dd95aa79a..6b3df91afe 100644 --- a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp @@ -58,10 +58,10 @@ namespace Impl { KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasDsyr2(s.handle, fillMode, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -99,10 +99,10 @@ namespace Impl { KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasSsyr2(s.handle, fillMode, 
N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -142,13 +142,13 @@ namespace Impl { if (justTranspose) { \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZsyr2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasZsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -157,13 +157,13 @@ namespace Impl { } else { \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + 
KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZher2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -204,12 +204,12 @@ namespace Impl { if (justTranspose) { \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCsyr2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCsyr2( \ + s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasCsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -218,12 +218,12 @@ namespace Impl { } else { \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCher2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + 
KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCher2( \ + s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ diff --git a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp index 84085224ac..72f0115b07 100644 --- a/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp @@ -58,10 +58,10 @@ namespace Impl { KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_dsyr2(s.handle, fillMode, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_dsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -99,10 +99,10 @@ namespace Impl { KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + 
KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ rocblas_ssyr2(s.handle, fillMode, N, &alpha, X.data(), one, Y.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_ssyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -142,13 +142,13 @@ namespace Impl { if (justTranspose) { \ if (A_is_ll) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_zsyr2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_zsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -157,13 +157,13 @@ namespace Impl { } else { \ if (A_is_ll && (alpha.imag() == 0.)) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); 
\ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_zher2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_zher2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -204,13 +204,13 @@ namespace Impl { if (justTranspose) { \ if (A_is_ll) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_csyr2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_csyr2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ @@ -219,13 +219,13 @@ namespace Impl { } else { \ if (A_is_ll && (alpha.imag() == 0.)) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher2(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ 
- reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_cher2(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_cher2() + ~A_ll => call kokkos-kernels' implementation */ \ SYR2::syr2(space, trans, uplo, alpha, X, \ diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp index 43b177d9a5..0843d6c50f 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_cublas.hpp @@ -52,9 +52,9 @@ namespace Impl { KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasDsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasDsyr() + ~A_ll => call kokkos-kernels' implementation */ \ SYR::syr(space, trans, uplo, alpha, X, A); \ @@ -85,9 +85,9 @@ namespace Impl { KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, 
space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } else { \ /* cublasSsyr() + ~A_ll => call kokkos-kernels' implementation */ \ SYR::syr(space, trans, uplo, alpha, X, A); \ @@ -96,109 +96,109 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasZsyr() + ~A_ll => call 
kokkos-kernels' implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const double alpha_val = alpha.real(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher(s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasZher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZsyr(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + 
reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const double alpha_val = alpha.real(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZher(s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - 
KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCsyr(s.handle, fillMode, N, reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasCsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const float alpha_val = alpha.real(); \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCher(s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - /* cublasCher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + 
if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCsyr( \ + s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const float alpha_val = alpha.real(); \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCher(s.handle, fillMode, N, &alpha_val, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS2_DSYR_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, true) diff --git a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp index 59c99c1225..3780abefef 100644 --- a/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp +++ b/blas/tpls/KokkosBlas2_syr_tpl_spec_decl_rocblas.hpp @@ -52,10 +52,10 @@ namespace Impl { KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_dsyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_dsyr() + ~A_ll => call kokkos-kernels' implementation */ \ SYR::syr(space, trans, uplo, alpha, X, A); \ @@ -86,10 +86,10 @@ namespace Impl { KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ if (A_is_ll) { \ KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_ssyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_ssyr(s.handle, fillMode, N, &alpha, X.data(), one, A.data(), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ } else { \ /* rocblas_ssyr() + ~A_ll => call kokkos-kernels' implementation */ \ SYR::syr(space, trans, uplo, alpha, X, A); \ @@ -98,114 +98,114 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> 
\ - struct SYR*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_zsyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const double alpha_val = alpha.real(); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher(s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_zher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_zsyr(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zsyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const double alpha_val = alpha.real(); \ + KokkosBlas::Impl::RocBlasSingleton& s = 
KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_zher( \ + s.handle, fillMode, N, &alpha_val, reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ - template <> \ - struct SYR*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - \ - static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ - typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ - KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ - bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ - if (justTranspose) { \ - if (A_is_ll) { \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr(s.handle, fillMode, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(X.data()), one, \ - 
reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_csyr() + ~A_ll => call kokkos-kernels' implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } else { \ - if (A_is_ll && (alpha.imag() == 0.)) { \ - const float alpha_val = alpha.real(); \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher(s.handle, fillMode, N, &alpha_val, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(A.data()), LDA)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - /* rocblas_cher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ - * implementation */ \ - SYR::syr(space, trans, uplo, alpha, X, A); \ - } \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CSYR_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr(const typename AViewType::execution_space& space, const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, const XViewType& X, const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = 
KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( \ + rocblas_csyr(s.handle, fillMode, N, reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_csyr() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + const float alpha_val = alpha.real(); \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_pointer_mode(s.handle, rocblas_pointer_mode_host)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_cher( \ + s.handle, fillMode, N, &alpha_val, reinterpret_cast(X.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_cher() + [~A_ll or ~real alpha]=> call kokkos-kernels' \ + * implementation */ \ + SYR::syr(space, trans, uplo, alpha, X, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS2_DSYR_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, true) diff --git a/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp index 52123a9daf..010411ef80 100644 --- a/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp @@ -175,20 +175,20 @@ namespace Impl { gemm.run(space, conjT); \ } else { \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - 
KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (!A_is_lr && !B_is_lr && !C_is_lr) \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(CUBLAS_FN( \ s.handle, transa, transb, M, N, K, reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, \ reinterpret_cast(B.data()), LDB, \ reinterpret_cast(&beta), reinterpret_cast(C.data()), LDC)); \ if (A_is_lr && B_is_lr && C_is_lr) \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(CUBLAS_FN( \ s.handle, transb, transa, N, M, K, reinterpret_cast(&alpha), \ reinterpret_cast(B.data()), LDB, \ reinterpret_cast(A.data()), LDA, \ reinterpret_cast(&beta), reinterpret_cast(C.data()), LDC)); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ } \ Kokkos::Profiling::popRegion(); \ } \ @@ -260,78 +260,78 @@ KOKKOSBLAS3_CGEMM_CUBLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::Layou namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_XGEMM_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GEMM, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - CViewType; \ - \ - static void gemm(const typename CViewType::execution_space& space, const char transA[], const char transB[], \ - typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, \ - typename CViewType::const_value_type& beta, const CViewType& C) { \ - 
Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ - \ - const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ - const int M = static_cast(C.extent(0)); \ - const int N = static_cast(C.extent(1)); \ - const int K = static_cast(A.extent(A_t ? 0 : 1)); \ - \ - bool is_lr = std::is_same::value; \ - \ - const int AST = is_lr ? A.stride(0) : A.stride(1), LDA = AST == 0 ? 1 : AST; \ - const int BST = is_lr ? B.stride(0) : B.stride(1), LDB = BST == 0 ? 1 : BST; \ - const int CST = is_lr ? C.stride(0) : C.stride(1), LDC = CST == 0 ? 1 : CST; \ - \ - rocblas_operation transa = trans_mode_kk_to_rocblas(transA); \ - rocblas_operation transb = trans_mode_kk_to_rocblas(transB); \ - \ - constexpr int numDotsLayoutLeftThreshold = 1600; \ - constexpr int numDotsLayoutRightThreshold = 100; \ - if ((!is_lr && transa != rocblas_operation_none && transb == rocblas_operation_none && \ - M * N < numDotsLayoutLeftThreshold) || \ - (is_lr && transa != rocblas_operation_none && transb == rocblas_operation_none && \ - M * N < numDotsLayoutRightThreshold)) { \ - DotBasedGEMM gemm(alpha, A, B, beta, C); \ - bool conjT = (std::is_same::value || std::is_same::value) \ - ? false \ - : (transa == rocblas_operation_conjugate_transpose ? 
true : false); \ - gemm.run(space, conjT); \ - } else { \ - KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); \ - if (!is_lr) \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, transa, transb, M, N, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC)); \ - else \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(ROCBLAS_FN(s.handle, transb, transa, N, M, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC)); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_XGEMM_ROCBLAS(SCALAR_TYPE, ROCBLAS_SCALAR_TYPE, ROCBLAS_FN, LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct GEMM, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef SCALAR_TYPE SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + CViewType; \ + \ + static void gemm(const typename CViewType::execution_space& space, const char transA[], const char transB[], \ + typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, \ + typename CViewType::const_value_type& beta, const CViewType& C) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_ROCBLAS," #SCALAR_TYPE "]"); \ + \ + const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ + const int M = static_cast(C.extent(0)); \ + const int N = static_cast(C.extent(1)); \ + const int K = static_cast(A.extent(A_t ? 
0 : 1)); \ + \ + bool is_lr = std::is_same::value; \ + \ + const int AST = is_lr ? A.stride(0) : A.stride(1), LDA = AST == 0 ? 1 : AST; \ + const int BST = is_lr ? B.stride(0) : B.stride(1), LDB = BST == 0 ? 1 : BST; \ + const int CST = is_lr ? C.stride(0) : C.stride(1), LDC = CST == 0 ? 1 : CST; \ + \ + rocblas_operation transa = trans_mode_kk_to_rocblas(transA); \ + rocblas_operation transb = trans_mode_kk_to_rocblas(transB); \ + \ + constexpr int numDotsLayoutLeftThreshold = 1600; \ + constexpr int numDotsLayoutRightThreshold = 100; \ + if ((!is_lr && transa != rocblas_operation_none && transb == rocblas_operation_none && \ + M * N < numDotsLayoutLeftThreshold) || \ + (is_lr && transa != rocblas_operation_none && transb == rocblas_operation_none && \ + M * N < numDotsLayoutRightThreshold)) { \ + DotBasedGEMM gemm(alpha, A, B, beta, C); \ + bool conjT = (std::is_same::value || std::is_same::value) \ + ? false \ + : (transa == rocblas_operation_conjugate_transpose ? true : false); \ + gemm.run(space, conjT); \ + } else { \ + KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); \ + if (!is_lr) \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(ROCBLAS_FN(s.handle, transa, transb, M, N, K, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(&beta), \ + reinterpret_cast(C.data()), LDC)); \ + else \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(ROCBLAS_FN(s.handle, transb, transa, N, M, K, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(&beta), \ + reinterpret_cast(C.data()), LDC)); \ + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS3_DGEMM_ROCBLAS(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ diff --git 
a/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp index 4e68c08dec..669477e3aa 100644 --- a/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_trmm_tpl_spec_decl.hpp @@ -199,19 +199,19 @@ namespace Impl { diag_ = CUBLAS_DIAG_NON_UNIT; \ \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (A_is_layout_left) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(CUBLAS_FN( \ s.handle, side_, uplo_, trans_, diag_, M, N, reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), \ LDB, reinterpret_cast(B.data()), LDB)); \ } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(CUBLAS_FN( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(CUBLAS_FN( \ s.handle, side_, uplo_, trans_, diag_, N, M, reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), \ LDB, reinterpret_cast(B.data()), LDB)); \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp index 7074a4e0e2..b0b8f0de73 100644 --- a/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas3_trsm_tpl_spec_decl.hpp @@ -370,15 +370,15 @@ namespace Impl { diag_ = CUBLAS_DIAG_NON_UNIT; \ \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ 
cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, A.data(), LDA, B.data(), LDB)); \ } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasDtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, A.data(), LDA, B.data(), LDB)); \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -449,102 +449,102 @@ namespace Impl { diag_ = CUBLAS_DIAG_NON_UNIT; \ \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasStrsm(s.handle, side_, uplo_, trans_, diag_, M, N, &alpha, A.data(), LDA, B.data(), LDB)); \ } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL( \ cublasStrsm(s.handle, side_, uplo_, trans_, diag_, N, M, &alpha, A.data(), LDA, B.data(), LDB)); \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS3_ZTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct TRSM**, LAYOUTA, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUTB, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - \ - static void trsm(const ExecSpace& space, const char side[], const char uplo[], const char trans[], \ - const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ - const 
BViewType& B) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,complex]"); \ - const int M = static_cast(B.extent(0)); \ - const int N = static_cast(B.extent(1)); \ - \ - bool A_is_ll = std::is_same::value; \ - bool B_is_ll = std::is_same::value; \ - \ - const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ - const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 1 : BST; \ - \ - cublasSideMode_t side_; \ - cublasFillMode_t uplo_; \ - cublasOperation_t trans_; \ - cublasDiagType_t diag_; \ - \ - if (A_is_ll) { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_LEFT; \ - else \ - side_ = CUBLAS_SIDE_RIGHT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - } else { \ - if ((side[0] == 'L') || (side[0] == 'l')) \ - side_ = CUBLAS_SIDE_RIGHT; \ - else \ - side_ = CUBLAS_SIDE_LEFT; \ - if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ - uplo_ = CUBLAS_FILL_MODE_UPPER; \ - else \ - uplo_ = CUBLAS_FILL_MODE_LOWER; \ - } \ - \ - if ((trans[0] == 'N') || (trans[0] == 'n')) \ - trans_ = CUBLAS_OP_N; \ - else if ((trans[0] == 'T') || (trans[0] == 't')) \ - trans_ = CUBLAS_OP_T; \ - else \ - trans_ = CUBLAS_OP_C; \ - if ((diag[0] == 'U') || (diag[0] == 'u')) \ - diag_ = CUBLAS_DIAG_UNIT; \ - else \ - diag_ = CUBLAS_DIAG_NON_UNIT; \ - \ - KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ - if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); \ - } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB)); 
\ - } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_ZTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ + template \ + struct TRSM**, LAYOUTA, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTB, Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + \ + static void trsm(const ExecSpace& space, const char side[], const char uplo[], const char trans[], \ + const char diag[], typename BViewType::const_value_type& alpha, const AViewType& A, \ + const BViewType& B) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::trsm[TPL_CUBLAS,complex]"); \ + const int M = static_cast(B.extent(0)); \ + const int N = static_cast(B.extent(1)); \ + \ + bool A_is_ll = std::is_same::value; \ + bool B_is_ll = std::is_same::value; \ + \ + const int AST = A_is_ll ? A.stride(1) : A.stride(0), LDA = (AST == 0) ? 1 : AST; \ + const int BST = B_is_ll ? B.stride(1) : B.stride(0), LDB = (BST == 0) ? 
1 : BST; \ + \ + cublasSideMode_t side_; \ + cublasFillMode_t uplo_; \ + cublasOperation_t trans_; \ + cublasDiagType_t diag_; \ + \ + if (A_is_ll) { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_LEFT; \ + else \ + side_ = CUBLAS_SIDE_RIGHT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + } else { \ + if ((side[0] == 'L') || (side[0] == 'l')) \ + side_ = CUBLAS_SIDE_RIGHT; \ + else \ + side_ = CUBLAS_SIDE_LEFT; \ + if ((uplo[0] == 'L') || (uplo[0] == 'l')) \ + uplo_ = CUBLAS_FILL_MODE_UPPER; \ + else \ + uplo_ = CUBLAS_FILL_MODE_LOWER; \ + } \ + \ + if ((trans[0] == 'N') || (trans[0] == 'n')) \ + trans_ = CUBLAS_OP_N; \ + else if ((trans[0] == 'T') || (trans[0] == 't')) \ + trans_ = CUBLAS_OP_T; \ + else \ + trans_ = CUBLAS_OP_C; \ + if ((diag[0] == 'U') || (diag[0] == 'u')) \ + diag_ = CUBLAS_DIAG_UNIT; \ + else \ + diag_ = CUBLAS_DIAG_NON_UNIT; \ + \ + KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ + if (A_is_ll) { \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB)); \ + } else { \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasZtrsm(s.handle, side_, uplo_, trans_, diag_, N, M, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB)); \ + } \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS3_CTRSM_CUBLAS(LAYOUTA, LAYOUTB, MEM_SPACE, ETI_SPEC_AVAIL) \ @@ -613,17 +613,17 @@ namespace Impl { diag_ = CUBLAS_DIAG_NON_UNIT; \ \ KokkosBlas::Impl::CudaBlasSingleton& s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - 
KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, space.cuda_stream())); \ if (A_is_ll) { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCtrsm( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCtrsm( \ s.handle, side_, uplo_, trans_, diag_, M, N, reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), LDB)); \ } else { \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCtrsm( \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasCtrsm( \ s.handle, side_, uplo_, trans_, diag_, N, M, reinterpret_cast(&alpha), \ reinterpret_cast(A.data()), LDA, reinterpret_cast(B.data()), LDB)); \ } \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(cublasSetStream(s.handle, NULL)); \ \ Kokkos::Profiling::popRegion(); \ } \ diff --git a/blas/tpls/KokkosBlas_Cuda_tpl.hpp b/blas/tpls/KokkosBlas_Cuda_tpl.hpp index d80e3a23d8..fa2749c980 100644 --- a/blas/tpls/KokkosBlas_Cuda_tpl.hpp +++ b/blas/tpls/KokkosBlas_Cuda_tpl.hpp @@ -25,12 +25,24 @@ namespace Impl { CudaBlasSingleton::CudaBlasSingleton() { cublasStatus_t stat = cublasCreate(&handle); if (stat != CUBLAS_STATUS_SUCCESS) Kokkos::abort("CUBLAS initialization failed\n"); - - Kokkos::push_finalize_hook([&]() { cublasDestroy(handle); }); } CudaBlasSingleton& CudaBlasSingleton::singleton() { - static CudaBlasSingleton s; + std::unique_ptr& instance = get_instance(); + if (!instance) { + instance = std::make_unique(); + Kokkos::push_finalize_hook([&]() { + cublasDestroy(instance->handle); + instance.reset(); + }); + } + return *instance; +} + +bool CudaBlasSingleton::is_initialized() { return get_instance() != nullptr; } + +std::unique_ptr& CudaBlasSingleton::get_instance() { + static std::unique_ptr s; return s; } diff --git a/blas/tpls/KokkosBlas_Host_tpl.cpp b/blas/tpls/KokkosBlas_Host_tpl.cpp index 6989aea34d..c163dc726d 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.cpp +++ 
b/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -790,6 +790,19 @@ double HostBlas >::nrm2(KK_INT n, const std::complex double HostBlas >::asum(KK_INT n, const std::complex* x, KK_INT x_inc) { + // see issue 2005 + // On some platforms with OpenBLAS < 0.3.26, dzasum on vectors less than 16 entries is producing 0. + // this has been observed on some (not all) systems with: + // clang 14.0.6 / 15.0.7 AND OpenBLAS 0.3.23 AND Sapphire Rapids CPU + // unfortunately, it's not clear exactly what the trigger is + // The manual sum below honors x_inc; non-positive increments are left to the BLAS routine. + if (n > 0 && n < 16 && x_inc > 0) { + double ret = 0.0; + for (KK_INT i = 0; i < n; ++i) { + ret += Kokkos::abs(x[i * x_inc].real()) + Kokkos::abs(x[i * x_inc].imag()); + } + return ret; + } return F77_FUNC_DZASUM(&n, x, &x_inc); } template <> diff --git a/blas/tpls/KokkosBlas_Host_tpl.hpp b/blas/tpls/KokkosBlas_Host_tpl.hpp index 3ccf2f822a..576fde8471 100644 --- a/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -32,7 +32,7 @@ namespace KokkosBlas { namespace Impl { -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(MKL_PROVIDES_BLAS_LAPACK) using KK_INT = MKL_INT; #else using KK_INT = int; diff --git a/blas/tpls/KokkosBlas_Magma_tpl.hpp b/blas/tpls/KokkosBlas_Magma_tpl.hpp index f149a790df..bce5d4057a 100644 --- a/blas/tpls/KokkosBlas_Magma_tpl.hpp +++ b/blas/tpls/KokkosBlas_Magma_tpl.hpp @@ -25,12 +25,24 @@ namespace Impl { MagmaSingleton::MagmaSingleton() { magma_int_t stat = magma_init(); if (stat != MAGMA_SUCCESS) Kokkos::abort("MAGMA initialization failed\n"); - - Kokkos::push_finalize_hook([&]() { magma_finalize(); }); } MagmaSingleton& MagmaSingleton::singleton() { - static MagmaSingleton s; + std::unique_ptr& instance = get_instance(); + if (!instance) { + instance = std::make_unique(); + Kokkos::push_finalize_hook([&]() { + magma_finalize(); + instance.reset(); + }); + } + return *instance; +} + +bool MagmaSingleton::is_initialized() { return get_instance() != nullptr; } + +std::unique_ptr& MagmaSingleton::get_instance()
{ + static std::unique_ptr s; return s; } diff --git a/blas/tpls/KokkosBlas_Rocm_tpl.hpp b/blas/tpls/KokkosBlas_Rocm_tpl.hpp index b5a7dabf6f..8285570c2b 100644 --- a/blas/tpls/KokkosBlas_Rocm_tpl.hpp +++ b/blas/tpls/KokkosBlas_Rocm_tpl.hpp @@ -22,14 +22,24 @@ namespace KokkosBlas { namespace Impl { -RocBlasSingleton::RocBlasSingleton() { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_create_handle(&handle)); +RocBlasSingleton::RocBlasSingleton() { KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_create_handle(&handle)); } - Kokkos::push_finalize_hook([&]() { KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_destroy_handle(handle)); }); +RocBlasSingleton& RocBlasSingleton::singleton() { + std::unique_ptr& instance = get_instance(); + if (!instance) { + instance = std::make_unique(); + Kokkos::push_finalize_hook([&]() { + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_destroy_handle(instance->handle)); + instance.reset(); + }); + } + return *instance; } -RocBlasSingleton& RocBlasSingleton::singleton() { - static RocBlasSingleton s; +bool RocBlasSingleton::is_initialized() { return get_instance() != nullptr; } + +std::unique_ptr& RocBlasSingleton::get_instance() { + static std::unique_ptr s; return s; } diff --git a/blas/tpls/KokkosBlas_magma.hpp b/blas/tpls/KokkosBlas_magma.hpp index 5f5fcfe4e1..e86d2ee2cd 100644 --- a/blas/tpls/KokkosBlas_magma.hpp +++ b/blas/tpls/KokkosBlas_magma.hpp @@ -27,7 +27,11 @@ namespace Impl { struct MagmaSingleton { MagmaSingleton(); + static bool is_initialized(); static MagmaSingleton& singleton(); + + private: + static std::unique_ptr& get_instance(); }; } // namespace Impl diff --git a/blas/tpls/KokkosBlas_tpl_spec.hpp b/blas/tpls/KokkosBlas_tpl_spec.hpp index 7f40edf435..c8593e9199 100644 --- a/blas/tpls/KokkosBlas_tpl_spec.hpp +++ b/blas/tpls/KokkosBlas_tpl_spec.hpp @@ -29,7 +29,11 @@ struct CudaBlasSingleton { CudaBlasSingleton(); + static bool is_initialized(); static CudaBlasSingleton& singleton(); + + private: + static std::unique_ptr& get_instance(); }; 
inline void cublas_internal_error_throw(cublasStatus_t cublasState, const char* name, const char* file, @@ -82,7 +86,8 @@ inline void cublas_internal_safe_call(cublasStatus_t cublasState, const char* na // The macro below defines the interface for the safe cublas calls. // The functions themselves are protected by impl namespace and this // is not meant to be used by external application or libraries. -#define KOKKOS_CUBLAS_SAFE_CALL_IMPL(call) KokkosBlas::Impl::cublas_internal_safe_call(call, #call, __FILE__, __LINE__) +#define KOKKOSBLAS_IMPL_CUBLAS_SAFE_CALL(call) \ + KokkosBlas::Impl::cublas_internal_safe_call(call, #call, __FILE__, __LINE__) /// \brief This function converts KK transpose mode to cuBLAS transpose mode inline cublasOperation_t trans_mode_kk_to_cublas(const char kkMode[]) { @@ -111,7 +116,12 @@ struct RocBlasSingleton { RocBlasSingleton(); + static bool is_initialized(); + static RocBlasSingleton& singleton(); + + private: + static std::unique_ptr& get_instance(); }; inline void rocblas_internal_error_throw(rocblas_status rocblasState, const char* name, const char* file, @@ -171,7 +181,7 @@ inline void rocblas_internal_safe_call(rocblas_status rocblasState, const char* // The macro below defines the interface for the safe rocblas calls. // The functions themselves are protected by impl namespace and this // is not meant to be used by external application or libraries. 
-#define KOKKOS_ROCBLAS_SAFE_CALL_IMPL(call) \ +#define KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(call) \ KokkosBlas::Impl::rocblas_internal_safe_call(call, #call, __FILE__, __LINE__) /// \brief This function converts KK transpose mode to rocBLAS transpose mode diff --git a/blas/unit_test/Test_Blas2_gemv.hpp b/blas/unit_test/Test_Blas2_gemv.hpp index d70935c2ac..be6e4a6d51 100644 --- a/blas/unit_test/Test_Blas2_gemv.hpp +++ b/blas/unit_test/Test_Blas2_gemv.hpp @@ -19,6 +19,7 @@ #include #include #include +#include "KokkosKernels_TestVanilla.hpp" namespace Test { template diff --git a/blas/unit_test/Test_Blas2_ger.hpp b/blas/unit_test/Test_Blas2_ger.hpp index 6e975532e1..19e69a1c98 100644 --- a/blas/unit_test/Test_Blas2_ger.hpp +++ b/blas/unit_test/Test_Blas2_ger.hpp @@ -153,7 +153,7 @@ GerTester::GerT std::is_same>::value), _A_is_lr(std::is_same::value), _A_is_ll(std::is_same::value), - _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space()) + _testIsGpu(KokkosKernels::Impl::is_gpu_exec_space_v) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS , _vanillaUsesDifferentOrderOfOps(_A_is_lr && _testIsGpu) diff --git a/blas/unit_test/Test_Blas2_syr.hpp b/blas/unit_test/Test_Blas2_syr.hpp index 8dc7cadf51..cc1d011351 100644 --- a/blas/unit_test/Test_Blas2_syr.hpp +++ b/blas/unit_test/Test_Blas2_syr.hpp @@ -153,7 +153,7 @@ SyrTester::SyrTester() std::is_same>::value), _A_is_lr(std::is_same::value), _A_is_ll(std::is_same::value), - _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space()) + _testIsGpu(KokkosKernels::Impl::is_gpu_exec_space_v) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS , _vanillaUsesDifferentOrderOfOps(_A_is_lr) diff --git a/blas/unit_test/Test_Blas2_syr2.hpp b/blas/unit_test/Test_Blas2_syr2.hpp index 2d6792f8c8..de475b5340 100644 --- a/blas/unit_test/Test_Blas2_syr2.hpp +++ b/blas/unit_test/Test_Blas2_syr2.hpp @@ -164,7 +164,7 @@ Syr2Tester::Syr std::is_same>::value), _A_is_lr(std::is_same::value), _A_is_ll(std::is_same::value), - 
_testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space()) + _testIsGpu(KokkosKernels::Impl::is_gpu_exec_space_v) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS , _vanillaUsesDifferentOrderOfOps(_A_is_lr) diff --git a/blas/unit_test/Test_Blas_rocblas.hpp b/blas/unit_test/Test_Blas_rocblas.hpp index 091fac7259..aac11b180b 100644 --- a/blas/unit_test/Test_Blas_rocblas.hpp +++ b/blas/unit_test/Test_Blas_rocblas.hpp @@ -41,11 +41,11 @@ void test_rocblas_safe_call() { bool caught_exception = false; rocblas_status myStatus = rocblas_status_success; - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(myStatus); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(myStatus); try { myStatus = rocblas_status_internal_error; - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(myStatus); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(myStatus); } catch (std::runtime_error& e) { caught_exception = true; } diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index ef8fea78b8..fa9f556847 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -29,7 +29,6 @@ requires (a) header file(s) as well, and may use functions other than just BLAS and LAPACK functions. 
*/ #cmakedefine HAVE_KOKKOSKERNELS_MKL -#cmakedefine KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE #cmakedefine KOKKOSKERNELS_ENABLE_TESTS_AND_PERFSUITE #cmakedefine KOKKOSKERNELS_ENABLE_BENCHMARK @@ -154,6 +153,9 @@ #endif #endif +/* Whether MKL is providing the BLAS and LAPACK implementation */ +#cmakedefine MKL_PROVIDES_BLAS_LAPACK + #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_OPENMPTARGET) #define KOKKOSKERNELS_ENABLE_HOST_ONLY @@ -172,4 +174,12 @@ #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY false #endif +/* Enabled components */ +#cmakedefine KOKKOSKERNELS_ENABLE_COMPONENT_BATCHED +#cmakedefine KOKKOSKERNELS_ENABLE_COMPONENT_BLAS +#cmakedefine KOKKOSKERNELS_ENABLE_COMPONENT_LAPACK +#cmakedefine KOKKOSKERNELS_ENABLE_COMPONENT_SPARSE +#cmakedefine KOKKOSKERNELS_ENABLE_COMPONENT_GRAPH +#cmakedefine KOKKOSKERNELS_ENABLE_COMPONENT_ODE + #endif // KOKKOSKERNELS_CONFIG_H diff --git a/cmake/Modules/FindTPLBLAS.cmake b/cmake/Modules/FindTPLBLAS.cmake index 0bc73fc73f..67e4cc9a08 100644 --- a/cmake/Modules/FindTPLBLAS.cmake +++ b/cmake/Modules/FindTPLBLAS.cmake @@ -8,4 +8,3 @@ ELSE() FIND_PACKAGE(BLAS REQUIRED) KOKKOSKERNELS_CREATE_IMPORTED_TPL(BLAS INTERFACE LINK_LIBRARIES ${BLAS_LIBRARIES}) ENDIF() - diff --git a/cmake/Modules/FindTPLLAPACK.cmake b/cmake/Modules/FindTPLLAPACK.cmake index 463f61afeb..f6d345d5ee 100644 --- a/cmake/Modules/FindTPLLAPACK.cmake +++ b/cmake/Modules/FindTPLLAPACK.cmake @@ -8,4 +8,3 @@ ELSE() FIND_PACKAGE(LAPACK REQUIRED) KOKKOSKERNELS_CREATE_IMPORTED_TPL(LAPACK INTERFACE LINK_LIBRARIES ${LAPACK_LIBRARIES}) ENDIF() - diff --git a/cmake/Modules/FindTPLMKL.cmake b/cmake/Modules/FindTPLMKL.cmake index 52f4571976..1ecd882e71 100644 --- a/cmake/Modules/FindTPLMKL.cmake +++ b/cmake/Modules/FindTPLMKL.cmake @@ -74,3 +74,7 @@ ELSE() ) ENDIF() ENDIF() +# This logic to find MKL is only used in non-Trilinos builds. 
+# In this case, MKL can always be used as the host BLAS/LAPACK implementation +# (whether MKL_INT is 32- or 64-bit). +set (MKL_PROVIDES_BLAS_LAPACK ON INTERNAL) diff --git a/cmake/kokkoskernels_components.cmake b/cmake/kokkoskernels_components.cmake index 16a784bd1f..951900e9a9 100644 --- a/cmake/kokkoskernels_components.cmake +++ b/cmake/kokkoskernels_components.cmake @@ -102,4 +102,13 @@ IF ( KokkosKernels_ENABLE_COMPONENT_BATCHED ELSE() SET(KOKKOSKERNELS_ALL_COMPONENTS_ENABLED OFF CACHE BOOL "" FORCE) ENDIF() -mark_as_advanced(FORCE KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) \ No newline at end of file +mark_as_advanced(FORCE KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) +# Now that component enables are finalized, also set upper-case +# versions of component enables for the config.h + +SET(KOKKOSKERNELS_ENABLE_COMPONENT_BATCHED ${KokkosKernels_ENABLE_COMPONENT_BATCHED}) +SET(KOKKOSKERNELS_ENABLE_COMPONENT_BLAS ${KokkosKernels_ENABLE_COMPONENT_BLAS}) +SET(KOKKOSKERNELS_ENABLE_COMPONENT_LAPACK ${KokkosKernels_ENABLE_COMPONENT_LAPACK}) +SET(KOKKOSKERNELS_ENABLE_COMPONENT_GRAPH ${KokkosKernels_ENABLE_COMPONENT_GRAPH}) +SET(KOKKOSKERNELS_ENABLE_COMPONENT_SPARSE ${KokkosKernels_ENABLE_COMPONENT_SPARSE}) +SET(KOKKOSKERNELS_ENABLE_COMPONENT_ODE ${KokkosKernels_ENABLE_COMPONENT_ODE}) diff --git a/cmake/kokkoskernels_eti.cmake b/cmake/kokkoskernels_eti.cmake index 524cad11f9..6c7629900d 100644 --- a/cmake/kokkoskernels_eti.cmake +++ b/cmake/kokkoskernels_eti.cmake @@ -131,6 +131,7 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) STRING(TOUPPER "${FUNCTION_NAME}" UPPER_NAME) SET(ETI_AVAIL_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_AVAIL") + SET(ETI_DECL_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_DECL") SET(ETI_INST_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_INST") # if this is tied to particular components @@ -152,6 +153,7 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) STRING(REPLACE ",)" ")" MACRO_STRING ${MACRO_STRING}) #Make a single header file for all instances LIST(APPEND 
${UPPER_NAME}_ETI_AVAIL_LIST "${ETI_AVAIL_MACRO}${MACRO_STRING}") + LIST(APPEND ${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") #Make a different source file for each instance SET(INST_SOURCE "${ETI_COMPONENTS}/eti/generated_specializations_cpp/${SUBFOLDER}/${ETI}.cpp") SET(INST_TEMPLATE "${ETI_COMPONENTS}/eti/generated_specializations_cpp/${SUBFOLDER}/Kokkos${FUNCTION_NAME}_eti_spec_inst.cpp.in") @@ -174,4 +176,14 @@ MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) ${CMAKE_CURRENT_BINARY_DIR}/${AVAIL_HEADER}) LIST(APPEND ${ETI_HEADER_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${AVAIL_HEADER}) + + SET(DECL_HEADER "${ETI_COMPONENTS}/eti/generated_specializations_hpp/Kokkos${FUNCTION_NAME}_eti_spec_decl.hpp") + SET(DECL_TEMPLATE "${DECL_HEADER}.in") + + STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_DECL_BLOCK "${${UPPER_NAME}_ETI_DECL_LIST}") + + CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${DECL_TEMPLATE} + ${CMAKE_CURRENT_BINARY_DIR}/${DECL_HEADER}) + + LIST(APPEND ${ETI_HEADER_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${DECL_HEADER}) ENDMACRO(KOKKOSKERNELS_GENERATE_ETI) diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index b8267c4955..49d1adcdcb 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -31,10 +31,6 @@ MACRO(KOKKOSKERNELS_ADD_TPL_OPTION NAME DEFAULT_VALUE DOCSTRING) SET(ROOT_DEFAULT $ENV{${_NAME_ORIG}_ROOT}) KOKKOSKERNELS_ADD_OPTION(${_NAME_ORIG}_ROOT "${ROOT_DEFAULT}" PATH "Location of ${_NAME} install root. 
Default: None or the value of the environment variable ${_NAME}_ROOT if set") IF (DEFINED TPL_ENABLE_${_NAME}) - IF (${_NAME} STREQUAL MKL AND KOKKOSKERNELS_HAS_TRILINOS) - MESSAGE("Trilinos has enabled MKL and SYCL but it does not detect oneMKL correctly so we disable it!") - SET(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE ON) - ENDIF () IF (TPL_ENABLE_${_NAME} AND NOT KOKKOSKERNELS_ENABLE_TPL_${_NAME}) MESSAGE("Overriding KOKKOSKERNELS_ENABLE_TPL_${_NAME_ORIG}=OFF with TPL_ENABLE_${_NAME}=ON") SET(KOKKOSKERNELS_ENABLE_TPL_${_NAME_ORIG} ON) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index b065869296..fd180f7827 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -2,3 +2,5 @@ LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/src) LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/impl) LIST(APPEND KK_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/common/unit_test) + +LIST(APPEND SOURCES common/src/KokkosKernels_EagerInitialize.cpp) diff --git a/common/src/KokkosKernels_EagerInitialize.cpp b/common/src/KokkosKernels_EagerInitialize.cpp new file mode 100644 index 0000000000..214de93109 --- /dev/null +++ b/common/src/KokkosKernels_EagerInitialize.cpp @@ -0,0 +1,78 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosKernels_EagerInitialize.hpp" +#include "KokkosKernels_config.h" +#include "Kokkos_Core.hpp" + +// Include the minimal set of headers that declare all TPL singletons +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_BLAS +#include "KokkosBlas_tpl_spec.hpp" //cuBLAS, rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include "KokkosBlas_magma.hpp" +#endif +#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_SPARSE +// note: this file declares both cuSPARSE and rocSPARSE singletons +#include "KokkosKernels_tpl_handles_decl.hpp" +#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_LAPACK +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +#include "KokkosLapack_cusolver.hpp" +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include "KokkosLapack_magma.hpp" +#endif +#endif + +namespace KokkosKernels { +void eager_initialize() { + if (!Kokkos::is_initialized()) { + throw std::runtime_error("Kokkos::initialize must be called before KokkosKernels::eager_initialize"); + } +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_BLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + (void)KokkosBlas::Impl::CudaBlasSingleton::singleton(); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + (void)KokkosBlas::Impl::RocBlasSingleton::singleton(); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA + (void)KokkosBlas::Impl::MagmaSingleton::singleton(); +#endif +#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_SPARSE +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + (void)KokkosKernels::Impl::CusparseSingleton::singleton(); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + (void)KokkosKernels::Impl::RocsparseSingleton::singleton(); +#endif +#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_LAPACK +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER + (void)KokkosLapack::Impl::CudaLapackSingleton::singleton(); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA + (void)KokkosLapack::Impl::MagmaSingleton::singleton(); +#endif +#endif +} +} // namespace KokkosKernels diff
--git a/common/src/KokkosKernels_EagerInitialize.hpp b/common/src/KokkosKernels_EagerInitialize.hpp new file mode 100644 index 0000000000..83ddba74ee --- /dev/null +++ b/common/src/KokkosKernels_EagerInitialize.hpp @@ -0,0 +1,38 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_EAGER_INITIALIZE_HPP +#define KOKKOSKERNELS_EAGER_INITIALIZE_HPP + +namespace KokkosKernels { +/// \brief Eagerly initialize handles for all enabled TPLs, as well +/// as any other globally shared resources that would otherwise be lazily initialized. +/// +/// Eagerly initializing a TPL means that it doesn't have to be +/// lazily initialized when first calling a kernel that uses it. +/// For example, \c eager_initialize() will call \c cusparseCreate() upfront +/// so that the first call to \c KokkosSparse::spmv doesn't have to. +/// Lazy initialization can add a significant amount of apparent runtime to that first kernel +/// call, even though the added time isn't really spent in the kernel. +/// +/// Calling this before using any kernels/TPLs is optional. +/// This function is idempotent (any calls after the first have no effect). +/// +/// \pre \c Kokkos::initialize() has been called.
+void eager_initialize(); +} // namespace KokkosKernels + +#endif diff --git a/common/src/KokkosKernels_ExecSpaceUtils.hpp b/common/src/KokkosKernels_ExecSpaceUtils.hpp index 2d167f5c73..625fa3e710 100644 --- a/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -78,29 +78,21 @@ KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type() { //////////////////////////////////////////////////////////////////////////////// template -constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { - return false; -} +constexpr inline bool is_gpu_exec_space_v = false; #ifdef KOKKOS_ENABLE_CUDA template <> -constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { - return true; -} +constexpr inline bool is_gpu_exec_space_v = true; #endif #ifdef KOKKOS_ENABLE_HIP template <> -constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { - return true; -} +constexpr inline bool is_gpu_exec_space_v = true; #endif #ifdef KOKKOS_ENABLE_SYCL template <> -constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { - return true; -} +constexpr inline bool is_gpu_exec_space_v = true; #endif //////////////////////////////////////////////////////////////////////////////// diff --git a/common/src/KokkosKernels_HashmapAccumulator.hpp b/common/src/KokkosKernels_HashmapAccumulator.hpp index c57dfa83fd..c5ea080e05 100644 --- a/common/src/KokkosKernels_HashmapAccumulator.hpp +++ b/common/src/KokkosKernels_HashmapAccumulator.hpp @@ -516,7 +516,7 @@ struct HashmapAccumulator { Kokkos::atomic_add(values + hash, value); return __insert_success; } else if (keys[hash] == -1) { - if (Kokkos::atomic_compare_exchange_strong(keys + hash, -1, key)) { + if (-1 == Kokkos::atomic_compare_exchange(keys + hash, -1, key)) { // should only be here if we used a new hash used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = hash; Kokkos::atomic_add(values + hash, value); diff --git a/common/src/KokkosKernels_Sorting.hpp 
b/common/src/KokkosKernels_Sorting.hpp index f91f11c164..c8e7ee4771 100644 --- a/common/src/KokkosKernels_Sorting.hpp +++ b/common/src/KokkosKernels_Sorting.hpp @@ -17,8 +17,9 @@ #define _KOKKOSKERNELS_SORTING_HPP #include "Kokkos_Core.hpp" +#include "Kokkos_Sort.hpp" #include "KokkosKernels_SimpleUtils.hpp" //for kk_exclusive_parallel_prefix_sum -#include "KokkosKernels_ExecSpaceUtils.hpp" //for kk_is_gpu_exec_space +#include "KokkosKernels_ExecSpaceUtils.hpp" //for is_gpu_exec_space #include namespace KokkosKernels { @@ -59,30 +60,13 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* value // Team-level parallel sorting (callable inside any TeamPolicy kernel) // ------------------------------------------------------------------- -// Comparison based sorting that uses the entire team (described by mem) to sort -// raw array according to the comparator. -template > -KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()); - -// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts -// values[0...n]. 
-template > -KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()); - namespace Impl { // Functor that sorts a view on one team template struct BitonicSingleTeamFunctor { BitonicSingleTeamFunctor(View& v_, const Comparator& comp_) : v(v_), comp(comp_) {} - KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { - KokkosKernels::TeamBitonicSort(v.data(), v.extent(0), t, - comp); - }; + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember t) const { Kokkos::Experimental::sort_team(t, v, comp); }; View v; Comparator comp; }; @@ -97,8 +81,7 @@ struct BitonicChunkFunctor { Ordinal chunkStart = chunk * chunkSize; Ordinal n = chunkSize; if (chunkStart + n > Ordinal(v.extent(0))) n = v.extent(0) - chunkStart; - KokkosKernels::TeamBitonicSort(v.data() + chunkStart, n, - t, comp); + Kokkos::Experimental::sort_team(t, Kokkos::subview(v, Kokkos::make_pair(chunkStart, chunkStart + n)), comp); }; View v; Comparator comp; @@ -217,10 +200,11 @@ void bitonicSort(View v, const Comparator& comp) { Ordinal npot = 1; while (npot < n) npot <<= 1; // Partition the data equally among fixed number of teams - Ordinal chunkSize = 512; - Ordinal numTeams = npot / chunkSize; + Ordinal chunkSize = 512; + Ordinal numTeamsChunkSort = (n + chunkSize - 1) / chunkSize; + Ordinal numTeams = npot / chunkSize; // First, sort within teams - Kokkos::parallel_for(team_policy(numTeams, Kokkos::AUTO()), + Kokkos::parallel_for(team_policy(numTeamsChunkSort, Kokkos::AUTO()), Impl::BitonicChunkFunctor(v, comp, chunkSize)); for (int teamsPerBox = 2; teamsPerBox <= npot / chunkSize; teamsPerBox *= 2) { Ordinal boxSize = teamsPerBox * chunkSize; @@ -388,165 +372,23 @@ KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* value // trivially-copyable) Pros: In-place, plenty of parallelism for GPUs, and // memory references are coalesced Con: O(n log^2(n)) serial time is bad on CPUs 
// Good diagram of the algorithm at https://en.wikipedia.org/wiki/Bitonic_sorter -template -KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, - const Comparator& comp) { - // Algorithm only works on power-of-two input size only. - // If n is not a power-of-two, will implicitly pretend - // that values[i] for i >= n is just the max for ValueType, so it never gets - // swapped - Ordinal npot = 1; - Ordinal levels = 0; - while (npot < n) { - levels++; - npot <<= 1; - } - for (Ordinal i = 0; i < levels; i++) { - for (Ordinal j = 0; j <= i; j++) { - // n/2 pairs of items are compared in parallel - Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) { - // How big are the brown/pink boxes? - Ordinal boxSize = Ordinal(2) << (i - j); - // Which box contains this thread? - Ordinal boxID = t >> (i - j); // t * 2 / boxSize; - Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize - Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize / - // 2; - Ordinal elem1 = boxStart + boxOffset; - if (j == 0) { - // first phase (brown box): within a block, compare with the - // opposite value in the box - Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; - if (elem2 < n) { - // both elements in bounds, so compare them and swap if out of - // order - if (comp(values[elem2], values[elem1])) { - ValueType temp = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp; - } - } - } else { - // later phases (pink box): within a block, compare with fixed - // distance (boxSize / 2) apart - Ordinal elem2 = elem1 + boxSize / 2; - if (elem2 < n) { - if (comp(values[elem2], values[elem1])) { - ValueType temp = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp; - } - } - } - }); - mem.team_barrier(); - } - } -} - -// Sort "values", while applying the same swaps to "perm" -template -KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, 
const TeamMember mem, - const Comparator& comp) { - // Algorithm only works on power-of-two input size only. - // If n is not a power-of-two, will implicitly pretend - // that values[i] for i >= n is just the max for ValueType, so it never gets - // swapped - Ordinal npot = 1; - Ordinal levels = 0; - while (npot < n) { - levels++; - npot <<= 1; - } - for (Ordinal i = 0; i < levels; i++) { - for (Ordinal j = 0; j <= i; j++) { - // n/2 pairs of items are compared in parallel - Kokkos::parallel_for(Kokkos::TeamVectorRange(mem, npot / 2), [=](const Ordinal t) { - // How big are the brown/pink boxes? - Ordinal boxSize = Ordinal(2) << (i - j); - // Which box contains this thread? - Ordinal boxID = t >> (i - j); // t * 2 / boxSize; - Ordinal boxStart = boxID << (1 + i - j); // boxID * boxSize - Ordinal boxOffset = t - (boxStart >> 1); // t - boxID * boxSize / - // 2; - Ordinal elem1 = boxStart + boxOffset; - if (j == 0) { - // first phase (brown box): within a block, compare with the - // opposite value in the box - Ordinal elem2 = boxStart + boxSize - 1 - boxOffset; - if (elem2 < n) { - // both elements in bounds, so compare them and swap if out of - // order - if (comp(values[elem2], values[elem1])) { - ValueType temp1 = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp1; - PermType temp2 = perm[elem1]; - perm[elem1] = perm[elem2]; - perm[elem2] = temp2; - } - } - } else { - // later phases (pink box): within a block, compare with fixed - // distance (boxSize / 2) apart - Ordinal elem2 = elem1 + boxSize / 2; - if (elem2 < n) { - if (comp(values[elem2], values[elem1])) { - ValueType temp1 = values[elem1]; - values[elem1] = values[elem2]; - values[elem2] = temp1; - PermType temp2 = perm[elem1]; - perm[elem1] = perm[elem2]; - perm[elem2] = temp2; - } - } - } - }); - mem.team_barrier(); - } - } -} - -// For backward compatibility: keep the public interface accessible in -// KokkosKernels::Impl:: -namespace Impl { - -template > -[[deprecated]] void 
bitonicSort(View v, const Comparator& comp = Comparator()) { - KokkosKernels::bitonicSort(v, comp); -} - -template -[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort(ValueType* values, ValueType* valuesAux, Ordinal n) { - KokkosKernels::SerialRadixSort(values, valuesAux, n); -} - -// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts -// values[0...n]. -template -[[deprecated]] KOKKOS_INLINE_FUNCTION void SerialRadixSort2(ValueType* values, ValueType* valuesAux, PermType* perm, - PermType* permAux, Ordinal n) { - KokkosKernels::SerialRadixSort2(values, valuesAux, perm, permAux, n); -} - template > -[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort(ValueType* values, Ordinal n, const TeamMember mem, - const Comparator& comp = Comparator()) { - KokkosKernels::TeamBitonicSort(values, n, mem, comp); +[[deprecated("Use Kokkos::Experimental::sort_team instead")]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort( + ValueType* values, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) { + Kokkos::View valuesView(values, n); + Kokkos::Experimental::sort_team(mem, valuesView, comp); } -// Same as SerialRadixSort, but also permutes perm[0...n] as it sorts -// values[0...n]. 
+// Sort "values", while applying the same swaps to "perm" template > -[[deprecated]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, Ordinal n, - const TeamMember mem, - const Comparator& comp = Comparator()) { - KokkosKernels::TeamBitonicSort2(values, perm, n, mem, comp); +[[deprecated("Use Kokkos::Experimental::sort_by_key_team instead")]] KOKKOS_INLINE_FUNCTION void TeamBitonicSort2( + ValueType* values, PermType* perm, Ordinal n, const TeamMember mem, const Comparator& comp = Comparator()) { + Kokkos::View valuesView(values, n); + Kokkos::View permView(perm, n); + Kokkos::Experimental::sort_by_key_team(mem, valuesView, permView, comp); } -} // namespace Impl } // namespace KokkosKernels diff --git a/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp b/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp index aa477815d6..caf8e6b307 100644 --- a/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp +++ b/common/src/KokkosKernels_Uniform_Initialized_MemoryPool.hpp @@ -294,7 +294,7 @@ class UniformMemoryPool { data_type *get_arbitrary_free_chunk(const size_t &thread_index, const size_t max_tries) const { size_t chunk_index = thread_index & modular_num_chunks; size_t num_try = 0; - while (!Kokkos::atomic_compare_exchange_strong(pchunk_locks + chunk_index, 0, 1)) { + while (0 != Kokkos::atomic_compare_exchange(pchunk_locks + chunk_index, 0, 1)) { chunk_index = (chunk_index + 1) & modular_num_chunks; ++num_try; if (num_try > max_tries) { diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index f0add80c50..f24b6362eb 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -58,7 +58,7 @@ void get_suggested_vector_size(int &suggested_vector_size_, idx nr, idx nnz) { template int get_suggested_team_size(Functor &f, int vector_size) { using execution_space = typename team_policy_t::traits::execution_space; - if (kk_is_gpu_exec_space()) { + if 
(is_gpu_exec_space_v) { team_policy_t temp(1, 1, vector_size); return temp.team_size_recommended(f, ParallelTag()); } else @@ -68,7 +68,7 @@ int get_suggested_team_size(Functor &f, int vector_size) { template int get_suggested_team_size(Functor &f, int vector_size, size_t sharedPerTeam, size_t sharedPerThread) { using execution_space = typename team_policy_t::traits::execution_space; - if (kk_is_gpu_exec_space()) { + if (is_gpu_exec_space_v) { team_policy_t temp = team_policy_t(1, 1, vector_size) .set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam), Kokkos::PerThread(sharedPerThread)); return temp.team_size_recommended(f, ParallelTag()); diff --git a/common/src/KokkosKernels_default_types.hpp b/common/src/KokkosKernels_default_types.hpp index 1da965a082..0a55be6d0b 100644 --- a/common/src/KokkosKernels_default_types.hpp +++ b/common/src/KokkosKernels_default_types.hpp @@ -20,57 +20,77 @@ #include "Kokkos_Core.hpp" //for LayoutLeft/LayoutRight #include //for all the ETI #cmakedefine macros +// define a deprecated symbol = type in the global namespace +// and a non-deprecated version in Kokkos Kernels +// these deprecations were done in 4.4. 
+// Intel 19 doesn't seem to like deprecating a type alias +#if defined(KOKKOS_COMPILER_INTEL) && (KOKKOS_COMPILER_INTEL < 2000) +#define KK_IMPL_MAKE_TYPE_ALIAS(symbol, type) \ + using symbol = type; \ + namespace KokkosKernels { \ + using symbol = type; \ + } +#else +#define KK_IMPL_MAKE_TYPE_ALIAS(symbol, type) \ + using symbol [[deprecated("use KokkosKernels::" #symbol ".")]] = type; \ + namespace KokkosKernels { \ + using symbol = type; \ + } +#endif + #if defined(KOKKOSKERNELS_INST_ORDINAL_INT) -using default_lno_t = int; +KK_IMPL_MAKE_TYPE_ALIAS(default_lno_t, int) #elif defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) -using default_lno_t = int64_t; +KK_IMPL_MAKE_TYPE_ALIAS(default_lno_t, int64_t) #else // Non-ETI build: default to int -using default_lno_t = int; +KK_IMPL_MAKE_TYPE_ALIAS(default_lno_t, int) #endif // Prefer int as the default offset type, because cuSPARSE doesn't support // size_t for rowptrs. #if defined(KOKKOSKERNELS_INST_OFFSET_INT) -using default_size_type = int; +KK_IMPL_MAKE_TYPE_ALIAS(default_size_type, int) #elif defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T) -using default_size_type = size_t; +KK_IMPL_MAKE_TYPE_ALIAS(default_size_type, size_t) #else // Non-ETI build: default to int -using default_size_type = int; +KK_IMPL_MAKE_TYPE_ALIAS(default_size_type, int) #endif #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -using default_layout = Kokkos::LayoutLeft; +KK_IMPL_MAKE_TYPE_ALIAS(default_layout, Kokkos::LayoutLeft) #elif defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -using default_layout = Kokkos::LayoutRight; +KK_IMPL_MAKE_TYPE_ALIAS(default_layout, Kokkos::LayoutRight) #else -using default_layout = Kokkos::LayoutLeft; +KK_IMPL_MAKE_TYPE_ALIAS(default_layout, Kokkos::LayoutLeft) #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -using default_scalar = double; +KK_IMPL_MAKE_TYPE_ALIAS(default_scalar, double) #elif defined(KOKKOSKERNELS_INST_FLOAT) -using default_scalar = float; +KK_IMPL_MAKE_TYPE_ALIAS(default_scalar, float) #elif 
defined(KOKKOSKERNELS_INST_HALF) -using default_scalar = Kokkos::Experimental::half_t; +KK_IMPL_MAKE_TYPE_ALIAS(default_scalar, Kokkos::Experimental::half_t) #elif defined(KOKKOSKERNELS_INST_BHALF) -using default_scalar = Kokkos::Experimental::bhalf_t; +KK_IMPL_MAKE_TYPE_ALIAS(default_scalar, Kokkos::Experimental::bhalf_t) #else -using default_scalar = double; +KK_IMPL_MAKE_TYPE_ALIAS(default_scalar, double) #endif #if defined(KOKKOS_ENABLE_CUDA) -using default_device = Kokkos::Cuda; +KK_IMPL_MAKE_TYPE_ALIAS(default_device, Kokkos::Cuda) #elif defined(KOKKOS_ENABLE_HIP) -using default_device = Kokkos::HIP; +KK_IMPL_MAKE_TYPE_ALIAS(default_device, Kokkos::HIP) #elif defined(KOKKOS_ENABLE_OPENMPTARGET) -using default_device = Kokkos::Experimental::OpenMPTarget; +KK_IMPL_MAKE_TYPE_ALIAS(default_device, Kokkos::Experimental::OpenMPTarget) #elif defined(KOKKOS_ENABLE_OPENMP) -using default_device = Kokkos::OpenMP; +KK_IMPL_MAKE_TYPE_ALIAS(default_device, Kokkos::OpenMP) #elif defined(KOKKOS_ENABLE_THREADS) -using default_device = Kokkos::Threads; +KK_IMPL_MAKE_TYPE_ALIAS(default_device, Kokkos::Threads) #else -using default_device = Kokkos::Serial; +KK_IMPL_MAKE_TYPE_ALIAS(default_device, Kokkos::Serial) #endif +#undef KK_IMPL_MAKE_TYPE_ALIAS + #endif // KOKKOSKERNELS_DEFAULT_TYPES_H diff --git a/common/src/Kokkos_ArithTraits.hpp b/common/src/Kokkos_ArithTraits.hpp index 25089613d4..f2d84edf81 100644 --- a/common/src/Kokkos_ArithTraits.hpp +++ b/common/src/Kokkos_ArithTraits.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_ARITHTRAITS_HPP -#define KOKKOS_ARITHTRAITS_HPP +#ifndef KOKKOSKERNELS_KOKKOS_ARITHTRAITS_HPP +#define KOKKOSKERNELS_KOKKOS_ARITHTRAITS_HPP /// \file Kokkos_ArithTraits.hpp /// \brief Declaration and definition of Kokkos::ArithTraits @@ -1641,4 +1641,4 @@ using ArithTraits [[deprecated("Use Kokkos::ArithTraits instead")]] = ::Kokkos:: } // namespace Details } // namespace Kokkos -#endif // KOKKOS_ARITHTRAITS_HPP +#endif // 
KOKKOSKERNELS_KOKKOS_ARITHTRAITS_HPP diff --git a/common/src/Kokkos_InnerProductSpaceTraits.hpp b/common/src/Kokkos_InnerProductSpaceTraits.hpp index 25337c925f..d190009168 100644 --- a/common/src/Kokkos_InnerProductSpaceTraits.hpp +++ b/common/src/Kokkos_InnerProductSpaceTraits.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_INNERPRODUCTSPACETRAITS_HPP -#define KOKKOS_INNERPRODUCTSPACETRAITS_HPP +#ifndef KOKKOSKERNELS_KOKKOS_INNERPRODUCTSPACETRAITS_HPP +#define KOKKOSKERNELS_KOKKOS_INNERPRODUCTSPACETRAITS_HPP /// \file Kokkos_InnerProductSpaceTraits.hpp /// \brief Declaration and definition of @@ -238,13 +238,12 @@ KOKKOS_INLINE_FUNCTION void updateDot(ResultType& sum, const InputType1& x, cons KOKKOS_INLINE_FUNCTION void updateDot(double& sum, const double x, const double y) { sum += x * y; } -KOKKOS_INLINE_FUNCTION void updateDot(double& sum, const float x, const float y) { sum += x * y; } +KOKKOS_INLINE_FUNCTION void updateDot(double& sum, const float x, const float y) { sum += static_cast(x) * y; } // This exists because complex += complex is not defined. 
KOKKOS_INLINE_FUNCTION void updateDot(Kokkos::complex& sum, const Kokkos::complex x, const Kokkos::complex y) { - const auto tmp = Kokkos::conj(x) * y; - sum += Kokkos::complex(tmp.real(), tmp.imag()); + sum += Kokkos::conj(Kokkos::complex(x)) * Kokkos::complex(y); } // This exists in case people call the overload of KokkosBlas::dot @@ -272,4 +271,4 @@ struct CastPossiblyComplex, Kokkos::complex> { } // namespace Details } // namespace Kokkos -#endif // KOKKOS_INNERPRODUCTSPACETRAITS_HPP +#endif // KOKKOSKERNELS_KOKKOS_INNERPRODUCTSPACETRAITS_HPP diff --git a/common/unit_test/CMakeLists.txt b/common/unit_test/CMakeLists.txt index c0d8fc116f..c963e908e5 100644 --- a/common/unit_test/CMakeLists.txt +++ b/common/unit_test/CMakeLists.txt @@ -95,3 +95,10 @@ IF (KOKKOS_ENABLE_THREADS) ) ENDIF () +# Add eager_initialize test, which is not backend-specific +KOKKOSKERNELS_ADD_UNIT_TEST( + common_eager_initialize + SOURCES Test_Common_EagerInitialize.cpp + COMPONENTS common +) + diff --git a/common/unit_test/Test_Common_ArithTraits.hpp b/common/unit_test/Test_Common_ArithTraits.hpp index 73a4ebfefe..b1fe860f8a 100644 --- a/common/unit_test/Test_Common_ArithTraits.hpp +++ b/common/unit_test/Test_Common_ArithTraits.hpp @@ -26,8 +26,8 @@ /// use Kokkos::ArithTraits, so it may be useful for users to /// read it. 
-#ifndef KOKKOS_ARITHTRAITSTEST_HPP -#define KOKKOS_ARITHTRAITSTEST_HPP +#ifndef KOKKOSKERNELS_TEST_COMMON_ARITHTRAITSTEST_HPP +#define KOKKOSKERNELS_TEST_COMMON_ARITHTRAITSTEST_HPP #include #include "Kokkos_ArithTraits.hpp" @@ -1693,4 +1693,4 @@ void test_ArithTraits() { } TEST_F(TestCategory, common_ArithTraits) { test_ArithTraits(); } -#endif // KOKKOS_ARITHTRAITSTEST_HPP +#endif // KOKKOSKERNELS_TEST_COMMON_ARITHTRAITSTEST_HPP diff --git a/common/unit_test/Test_Common_EagerInitialize.cpp b/common/unit_test/Test_Common_EagerInitialize.cpp new file mode 100644 index 0000000000..fc495e78fc --- /dev/null +++ b/common/unit_test/Test_Common_EagerInitialize.cpp @@ -0,0 +1,113 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KK_EAGERINIT_TEST_HPP +#define KK_EAGERINIT_TEST_HPP + +#include +#include "Kokkos_Core.hpp" +#include "KokkosKernels_config.h" +#include "KokkosKernels_EagerInitialize.hpp" + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_BLAS +#include "KokkosBlas_tpl_spec.hpp" //cuBLAS, rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include "KokkosBlas_magma.hpp" +#endif +#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_SPARSE +// note: this file declares both cuSPARSE and rocSPARSE singletons +#include "KokkosKernels_tpl_handles_decl.hpp" +#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_LAPACK +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +#include "KokkosLapack_cusolver.hpp" +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include "KokkosLapack_magma.hpp" +#endif +#endif + +// Count the number of singletons which are currently initialized, +// and the numInitialized number of singleton classes that are currently enabled +// (based on which TPLs and components were enabled at configure-time) +void countSingletons(int& numInitialized, int& numEnabled) { + numInitialized = 0; + numEnabled = 0; +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_BLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + numEnabled++; + if (KokkosBlas::Impl::CudaBlasSingleton::is_initialized()) numInitialized++; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + numEnabled++; + if (KokkosBlas::Impl::RocBlasSingleton::is_initialized()) numInitialized++; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA + numEnabled++; + if (KokkosBlas::Impl::MagmaSingleton::is_initialized()) numInitialized++; +#endif +#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_SPARSE +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + numEnabled++; + if (KokkosKernels::Impl::CusparseSingleton::is_initialized()) numInitialized++; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + numEnabled++; + if (KokkosKernels::Impl::RocsparseSingleton::is_initialized()) numInitialized++; +#endif 
+#endif + +#ifdef KOKKOSKERNELS_ENABLE_COMPONENT_LAPACK +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER + numEnabled++; + if (KokkosLapack::Impl::CudaLapackSingleton::is_initialized()) numInitialized++; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA + numEnabled++; + if (KokkosLapack::Impl::MagmaSingleton::is_initialized()) numInitialized++; +#endif +#endif +} + +int main() { + int numInitialized, numEnabled; + Kokkos::initialize(); + { + // Check that no singletons are already initialized. + countSingletons(numInitialized, numEnabled); + if (numInitialized != 0) + throw std::runtime_error("At least one singleton was initialized before it should have been"); + KokkosKernels::eager_initialize(); + // Check that all singletons are now initialized. + countSingletons(numInitialized, numEnabled); + std::cout << "Kokkos::eager_initialize() set up " << numInitialized << " of " << numEnabled << " TPL singletons.\n"; + if (numInitialized != numEnabled) + throw std::runtime_error("At least one singleton was not initialized by eager_initialize()"); + } + Kokkos::finalize(); + // Finally, make sure that all singletons were finalized during Kokkos::finalize(). 
+ countSingletons(numInitialized, numEnabled); + if (numInitialized != 0) + throw std::runtime_error("At least one singleton was not correctly finalized by Kokkos::finalize()"); + return 0; +} + +#endif diff --git a/common/unit_test/Test_Common_IOUtils.hpp b/common/unit_test/Test_Common_IOUtils.hpp index 1219304421..db59a0ee69 100644 --- a/common/unit_test/Test_Common_IOUtils.hpp +++ b/common/unit_test/Test_Common_IOUtils.hpp @@ -42,7 +42,7 @@ class ViewPrintHelper { template void testPrintView() { - using scalar_t = default_scalar; + using scalar_t = KokkosKernels::default_scalar; using Unmanaged = Kokkos::MemoryTraits; using rank0_view = Kokkos::View; using rank1_view = Kokkos::View; diff --git a/common/unit_test/Test_Common_LowerBound.hpp b/common/unit_test/Test_Common_LowerBound.hpp index d471801a30..54ce1c2e00 100644 --- a/common/unit_test/Test_Common_LowerBound.hpp +++ b/common/unit_test/Test_Common_LowerBound.hpp @@ -117,7 +117,7 @@ void test_lower_bound_team(const std::vector &_haystack, const T _needle) { // test lower_bound search const int leagueSize = 1; - const int teamSize = KokkosKernels::Impl::kk_is_gpu_exec_space() ? 64 : 1; + const int teamSize = KokkosKernels::Impl::is_gpu_exec_space_v ? 
64 : 1; int errCount; Kokkos::parallel_reduce(Policy(leagueSize, teamSize), TeamLowerBoundFunctor(expected, haystack, _needle), errCount); diff --git a/common/unit_test/Test_Common_Sorting.hpp b/common/unit_test/Test_Common_Sorting.hpp index 30623a8691..e4e62e5936 100644 --- a/common/unit_test/Test_Common_Sorting.hpp +++ b/common/unit_test/Test_Common_Sorting.hpp @@ -248,125 +248,6 @@ void testSerialRadixSort2(size_t k, size_t subArraySize) { } } -template -struct TestTeamBitonicFunctor { - typedef typename ValView::value_type Value; - - TestTeamBitonicFunctor(ValView& values_, OrdView& counts_, OrdView& offsets_) - : values(values_), counts(counts_), offsets(offsets_) {} - - template - KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { - int i = t.league_rank(); - KokkosKernels::TeamBitonicSort(values.data() + offsets(i), counts(i), t); - } - - ValView values; - OrdView counts; - OrdView offsets; -}; - -template -struct TestTeamBitonic2Functor { - typedef typename KeyView::value_type Key; - typedef typename ValView::value_type Value; - - TestTeamBitonic2Functor(KeyView& keys_, ValView& values_, OrdView& counts_, OrdView& offsets_) - : keys(keys_), values(values_), counts(counts_), offsets(offsets_) {} - - template - KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { - int i = t.league_rank(); - KokkosKernels::TeamBitonicSort2(keys.data() + offsets(i), values.data() + offsets(i), - counts(i), t); - } - - KeyView keys; - ValView values; - OrdView counts; - OrdView offsets; -}; - -template -void testTeamBitonicSort(size_t k, size_t subArraySize) { - // Create a view of randomized data - typedef typename Device::execution_space exec_space; - typedef typename Device::memory_space mem_space; - typedef Kokkos::View OrdView; - typedef Kokkos::View ValView; - OrdView counts("Subarray Sizes", k); - OrdView offsets("Subarray Offsets", k); - // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, 
subArraySize); - ValView data("Bitonic sort testing data", n); - fillRandom(data); - Kokkos::View gold("Host sorted", n); - Kokkos::deep_copy(gold, data); - // Run the sorting on device in all sub-arrays in parallel - Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), - TestTeamBitonicFunctor(data, counts, offsets)); - // Copy result to host - auto dataHost = Kokkos::create_mirror_view(data); - Kokkos::deep_copy(dataHost, data); - // Sort using std::sort on host to do correctness test - exec_space().fence(); - auto countsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); - for (size_t i = 0; i < k; i++) { - Scalar* begin = gold.data() + offsetsHost(i); - Scalar* end = begin + countsHost(i); - std::sort(begin, end); - } - for (size_t i = 0; i < n; i++) { - ASSERT_EQ(dataHost(i), gold(i)); - } -} - -template -void testTeamBitonicSort2(size_t k, size_t subArraySize) { - // Create a view of randomized data - typedef typename Device::execution_space exec_space; - typedef typename Device::memory_space mem_space; - typedef Kokkos::View OrdView; - typedef Kokkos::View KeyView; - typedef Kokkos::View ValView; - OrdView counts("Subarray Sizes", k); - OrdView offsets("Subarray Offsets", k); - // Generate k sub-array sizes, each with size about 20 - size_t n = generateRandomOffsets(counts, offsets, k, subArraySize); - KeyView keys("Bitonic test keys", n); - ValView data("Bitonic test data", n); - // The keys are randomized - fillRandom(keys, data); - Kokkos::View gold("Host sorted", n); - Kokkos::deep_copy(gold, keys); - // Run the sorting on device in all sub-arrays in parallel, just using vector - // loops Deliberately using a weird number for vector length - Kokkos::parallel_for(Kokkos::TeamPolicy(k, Kokkos::AUTO()), - TestTeamBitonic2Functor(keys, data, counts, offsets)); - exec_space().fence(); - auto countsHost = 
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), counts); - auto offsetsHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), offsets); - // Sort using std::sort on host to do correctness test - for (size_t i = 0; i < k; i++) { - Key* begin = gold.data() + offsetsHost(i); - Key* end = begin + countsHost(i); - std::sort(begin, end); - } - // Copy results to host - auto keysHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), keys); - auto dataHost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), data); - // Make sure keys are sorted exactly (stability of sort doesn't matter) - for (size_t i = 0; i < n; i++) { - ASSERT_EQ(keysHost(i), gold(i)); - } - // Make sure the hashes of each key still matches the corresponding value - for (size_t i = 0; i < n; i++) { - auto correctHash = kvHash()(keysHost(i)); - ASSERT_EQ(dataHost(i), correctHash); - } -} - template struct CheckSortedFunctor { CheckSortedFunctor(View& v_) : v(v_) {} @@ -480,27 +361,6 @@ TEST_F(TestCategory, common_serial_radix2) { } } -TEST_F(TestCategory, common_team_bitonic) { - // Test team-level bitonic over some contiguous medium arrays - // 1st arg is #arrays, 2nd arg is max subarray size - size_t numArrays = 20; - for (size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) { - testTeamBitonicSort(numArrays, arrayMax); - testTeamBitonicSort(numArrays, arrayMax); - } -} - -TEST_F(TestCategory, common_team_bitonic2) { - // Test team-level bitonic over some contiguous medium arrays - // 1st arg is #arrays, 2nd arg is max subarray size - size_t numArrays = 20; - for (size_t arrayMax = 0; arrayMax < 10000; arrayMax = 1 + 4 * arrayMax) { - testTeamBitonicSort2(numArrays, arrayMax); - testTeamBitonicSort2(numArrays, arrayMax); - testTeamBitonicSort2>(numArrays, arrayMax); - } -} - TEST_F(TestCategory, common_device_bitonic) { // Test device-level bitonic with some larger arrays testBitonicSort(243743); diff --git a/common/unit_test/Test_Common_UpperBound.hpp 
b/common/unit_test/Test_Common_UpperBound.hpp index abd4cf655a..ad84c8272d 100644 --- a/common/unit_test/Test_Common_UpperBound.hpp +++ b/common/unit_test/Test_Common_UpperBound.hpp @@ -117,7 +117,7 @@ void test_upper_bound_team(const std::vector &_haystack, const T _needle) { // test upper_bound search const int leagueSize = 1; - const int teamSize = KokkosKernels::Impl::kk_is_gpu_exec_space() ? 64 : 1; + const int teamSize = KokkosKernels::Impl::is_gpu_exec_space_v ? 64 : 1; int errCount; Kokkos::parallel_reduce(Policy(leagueSize, teamSize), TeamUpperBoundFunctor(expected, haystack, _needle), errCount); diff --git a/common/unit_test/Test_Common_float128.hpp b/common/unit_test/Test_Common_float128.hpp index 063fd06d80..b117753493 100644 --- a/common/unit_test/Test_Common_float128.hpp +++ b/common/unit_test/Test_Common_float128.hpp @@ -131,8 +131,8 @@ void testfloat128() { } // Assign to the first entry, atomically. - Kokkos::atomic_assign(&view(0), z); - cout << "view(0) after atomic_assign (z) = " << view(0) << endl; + Kokkos::atomic_store(&view(0), z); + cout << "view(0) after atomic_store (z) = " << view(0) << endl; if (view(0) != z) { success = false; } diff --git a/example/gmres/test_prec.cpp b/example/gmres/test_prec.cpp index 942dc176b6..c557a735bd 100644 --- a/example/gmres/test_prec.cpp +++ b/example/gmres/test_prec.cpp @@ -126,6 +126,7 @@ int main(int argc, char* argv[]) { if (endRes < convTol && numIters == 1) { pass = true; } + delete myPrec; } Kokkos::finalize(); diff --git a/example/half/xpy.cpp b/example/half/xpy.cpp index cf3b5767f7..f2c56383ca 100644 --- a/example/half/xpy.cpp +++ b/example/half/xpy.cpp @@ -94,7 +94,7 @@ int main(int argc, char **argv) { return 1; } using LayoutType = Kokkos::LayoutLeft; - using DeviceType = default_device; + using DeviceType = KokkosKernels::default_device; size_t n = atoi(argv[1]); bool time_only = static_cast(atoi(argv[2])); do_xpy(n, time_only); diff --git a/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp 
b/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp index 2137bf09e5..000fa18c25 100644 --- a/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp +++ b/example/wiki/graph/KokkosGraph_wiki_9pt_stencil.hpp @@ -25,16 +25,16 @@ #include #include -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; using ExecSpace = Kokkos::DefaultExecutionSpace; using DeviceSpace = typename ExecSpace::memory_space; using Kokkos::HostSpace; using RowmapType = Kokkos::View; using ColindsType = Kokkos::View; -using Handle = KokkosKernels::Experimental::KokkosKernelsHandle; +using Handle = KokkosKernels::Experimental::KokkosKernelsHandle; namespace GraphDemo { Ordinal gridX = 15; diff --git a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp index 49721e595e..0e129e2baa 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix.cpp @@ -23,10 +23,10 @@ #include "KokkosSparse_BsrMatrix.hpp" #include "KokkosSparse_CrsMatrix.hpp" -using Scalar = default_scalar; -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; int main() { Kokkos::initialize(); diff --git a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp index 527b0d56c4..83d1ea665e 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_bsrmatrix_2.cpp @@ -23,10 +23,10 @@ #include "KokkosKernels_default_types.hpp" #include "KokkosSparse_BsrMatrix.hpp" -using Scalar = default_scalar; -using 
Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; template struct bsr_fill { diff --git a/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp b/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp index 21257d8034..228958908b 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_crsmatrix.cpp @@ -21,10 +21,10 @@ #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_spmv.hpp" -using Scalar = default_scalar; -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; int main() { Kokkos::initialize(); diff --git a/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp b/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp index 31ccea3b0a..0ecf5c1828 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp @@ -30,17 +30,17 @@ // Helper to print out colors in the shape of the grid int main() { - using Scalar = default_scalar; + using Scalar = KokkosKernels::default_scalar; using Mag = Kokkos::ArithTraits::mag_type; - using Ordinal = default_lno_t; - using Offset = default_size_type; + using Ordinal = KokkosKernels::default_lno_t; + using Offset = KokkosKernels::default_size_type; using ExecSpace = Kokkos::DefaultExecutionSpace; using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; - using Handle = - KokkosKernels::Experimental::KokkosKernelsHandle; - using Matrix = KokkosSparse::CrsMatrix; - using Vector = typename Matrix::values_type; + using Handle = 
KokkosKernels::Experimental::KokkosKernelsHandle; + using Matrix = KokkosSparse::CrsMatrix; + using Vector = typename Matrix::values_type; constexpr Ordinal numRows = 10000; const Scalar one = Kokkos::ArithTraits::one(); const Mag magOne = Kokkos::ArithTraits::one(); diff --git a/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp b/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp index c9edd7bc0c..6282b8f250 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_spadd.cpp @@ -20,10 +20,10 @@ #include "KokkosKernels_Test_Structured_Matrix.hpp" -using Scalar = default_scalar; -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; int main() { Kokkos::initialize(); diff --git a/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp b/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp index 2b3ccd13d2..2e622f8b61 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_spgemm.cpp @@ -20,10 +20,10 @@ #include "KokkosKernels_Test_Structured_Matrix.hpp" -using Scalar = default_scalar; -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; int main() { Kokkos::initialize(); diff --git a/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp b/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp index 5778684a8a..1caeec7e54 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_spmv.cpp @@ -21,10 +21,10 @@ #include "KokkosKernels_Test_Structured_Matrix.hpp" -using Scalar = default_scalar; 
-using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; template struct check_spmv_functor { diff --git a/graph/eti/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in b/graph/eti/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..23e1699557 --- /dev/null +++ b/graph/eti/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL_HPP_ +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL_HPP_ +namespace KokkosGraph { +namespace Impl { +@GRAPH_COLOR_D1_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/graph/impl/KokkosGraph_Distance1Color_impl.hpp index 2abc5c76e4..8d3808453b 100644 --- a/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ b/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -2529,7 +2529,7 @@ class GraphColor_EB : public GraphColor(&(color_ban(uncolored_vertex)), src_col | dst_col); + Kokkos::atomic_fetch_or(&(color_ban(uncolored_vertex)), src_col | dst_col); edge_conflict_marker(work_index) = 0; } } @@ -2616,7 +2616,7 @@ class GraphColor_EB : public GraphColor(&(tentative_color_ban(smaller_index)), -src_col); + Kokkos::atomic_fetch_or(&(tentative_color_ban(smaller_index)), -src_col); nnz_lno_t banned_colors = ~(color_ban(smaller_index) | tentative_color_ban(smaller_index)); nnz_lno_t larger_col = banned_colors & (-banned_colors); kokcolors(smaller_index) = -(larger_col); @@ -2625,16 +2625,14 @@ class GraphColor_EB : public GraphColor(&(color_ban(dst_id)), - // -src_col); - Kokkos::atomic_fetch_or(&(tentative_color_ban(dst_id)), -src_col); + // Kokkos::atomic_fetch_or(&(color_ban(dst_id)), -src_col); + Kokkos::atomic_fetch_or(&(tentative_color_ban(dst_id)), -src_col); } else if (dst_col != 0) { // if it is dst tentatively colors, but src is not colored, // then we send the dst color info to src's tentative_ban - // Kokkos::atomic_fetch_or(&(color_ban(src_id)), - // -dst_col); - Kokkos::atomic_fetch_or(&(tentative_color_ban(src_id)), -dst_col); + // Kokkos::atomic_fetch_or(&(color_ban(src_id)), -dst_col); + Kokkos::atomic_fetch_or(&(tentative_color_ban(src_id)), -dst_col); } else { // idx smaller_index = src_id < dst_id > 0 ? src_id: dst_id; // idx larger_index = src_id < dst_id > 0 ? 
dst_id : src_id; @@ -2660,9 +2658,8 @@ class GraphColor_EB : public GraphColor(&(tentative_color_ban(larger_index)), src_col); - // Kokkos::atomic_fetch_or(&(color_ban(dst_id)), - // src_col); + Kokkos::atomic_fetch_or(&(tentative_color_ban(larger_index)), src_col); + // Kokkos::atomic_fetch_or(&(color_ban(dst_id)), src_col); } } } diff --git a/graph/impl/KokkosGraph_Distance2Color_impl.hpp b/graph/impl/KokkosGraph_Distance2Color_impl.hpp index cfa5186283..2a65158515 100644 --- a/graph/impl/KokkosGraph_Distance2Color_impl.hpp +++ b/graph/impl/KokkosGraph_Distance2Color_impl.hpp @@ -616,7 +616,7 @@ class GraphColorDistance2 { const lno_t numVerts = this->nr; const lno_t numCols = this->nc; // note: relying on forbidden and colors_out being initialized to 0 - forbidden_view forbidden("Forbidden", batch * numCols); + forbidden_view forbidden("Forbidden", static_cast(batch) * numCols); int iter = 0; Kokkos::Timer timer; lno_t currentWork = this->nr; diff --git a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index e39e1e7ad3..89562fd901 100644 --- a/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -317,7 +317,7 @@ struct D2_MIS_RandomPriority { KokkosKernels::Impl::sequential_fill(colWorklist); worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); auto execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); - bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && (entries.extent(0) / numVerts >= 16); + bool useTeams = KokkosKernels::Impl::is_gpu_exec_space_v && (entries.extent(0) / numVerts >= 16); int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(numVerts, entries.extent(0), execSpaceEnum); int round = 0; lno_t rowWorkLen = numVerts; @@ -396,7 +396,7 @@ struct D2_MIS_RandomPriority { Kokkos::deep_copy(rowStatus, ~(status_t(0))); worklist_t thirdWorklist = Kokkos::subview(allWorklists, Kokkos::ALL(), 2); auto 
execSpaceEnum = KokkosKernels::Impl::kk_get_exec_space_type(); - bool useTeams = KokkosKernels::Impl::kk_is_gpu_exec_space() && (entries.extent(0) / numVerts >= 16); + bool useTeams = KokkosKernels::Impl::is_gpu_exec_space_v && (entries.extent(0) / numVerts >= 16); int vectorLength = KokkosKernels::Impl::kk_get_suggested_vector_size(numVerts, entries.extent(0), execSpaceEnum); int round = 0; int refreshColTeamSize = 0; @@ -963,7 +963,7 @@ struct D2_MIS_Aggregation { KOKKOS_INLINE_FUNCTION void operator()(lno_t i) const { lno_t agg = labels_(i); if (agg != -1) { - Kokkos::atomic_increment(&aggSizes_(agg)); + Kokkos::atomic_inc(&aggSizes_(agg)); // compute connectivity of i size_type rowBegin = rowmap_(i); size_type rowEnd = rowmap_(i + 1); diff --git a/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp b/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp index dc0e802485..7aff62f318 100644 --- a/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp +++ b/graph/impl/KokkosGraph_ExplicitCoarsening_impl.hpp @@ -33,7 +33,7 @@ struct ExplicitGraphCoarsening { struct ClusterSizeFunctor { ClusterSizeFunctor(const ordinal_view_t& counts_, const labels_t& vertClusters_) : counts(counts_), vertClusters(vertClusters_) {} - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { Kokkos::atomic_increment(&counts(vertClusters(i))); } + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { Kokkos::atomic_inc(&counts(vertClusters(i))); } ordinal_view_t counts; labels_t vertClusters; }; @@ -98,7 +98,7 @@ struct ExplicitGraphCoarsening { KOKKOS_INLINE_FUNCTION bool insert(lno_t cluster, lno_t nei, int* table) const { unsigned h = xorshiftHash(nei); for (unsigned i = h; i < h + 2; i++) { - if (Kokkos::atomic_compare_exchange_strong(&table[i % tableSize()], cluster, nei)) return true; + if (cluster == Kokkos::atomic_compare_exchange(&table[i % tableSize()], cluster, nei)) return true; } return false; } diff --git a/graph/impl/KokkosGraph_color_d1_spec.hpp 
b/graph/impl/KokkosGraph_color_d1_spec.hpp index 178fdd9182..21b9dae2c7 100644 --- a/graph/impl/KokkosGraph_color_d1_spec.hpp +++ b/graph/impl/KokkosGraph_color_d1_spec.hpp @@ -103,4 +103,6 @@ struct COLOR_D1>, \ false, true>; +#include + #endif diff --git a/graph/src/KokkosGraph_CoarsenConstruct.hpp b/graph/src/KokkosGraph_CoarsenConstruct.hpp index 8e1cce3ddb..d0a48a6f88 100644 --- a/graph/src/KokkosGraph_CoarsenConstruct.hpp +++ b/graph/src/KokkosGraph_CoarsenConstruct.hpp @@ -73,8 +73,8 @@ struct SortLowDegreeCrsMatrixFunctor { Kokkos::single(Kokkos::PerTeam(t), [&]() { reducer++; }); return; } - KokkosKernels::TeamBitonicSort2(entries.data() + rowStart, - values.data() + rowStart, rowNum, t); + Kokkos::Experimental::sort_by_key_team(t, Kokkos::subview(entries, Kokkos::make_pair(rowStart, rowEnd)), + Kokkos::subview(values, Kokkos::make_pair(rowStart, rowEnd))); } rowmap_t rowmap; @@ -96,7 +96,7 @@ typename entries_t::non_const_value_type sort_low_degree_rows_crs_matrix( const typename entries_t::non_const_value_type degreeLimit) { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; - bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); + bool useRadix = !KokkosKernels::Impl::is_gpu_exec_space_v; Impl::SortLowDegreeCrsMatrixFunctor funct(useRadix, rowmap, entries, values, degreeLimit); lno_t numRows = rowmap.extent(0) ? 
rowmap.extent(0) - 1 : 0; diff --git a/graph/src/KokkosGraph_CoarsenHeuristics.hpp b/graph/src/KokkosGraph_CoarsenHeuristics.hpp index f136882d89..3a6dfbda6d 100644 --- a/graph/src/KokkosGraph_CoarsenHeuristics.hpp +++ b/graph/src/KokkosGraph_CoarsenHeuristics.hpp @@ -86,7 +86,7 @@ class coarsen_heuristics { if (bucket >= t_buckets) bucket -= t_buckets; if (buckets(bucket) == ORD_MAX) { // attempt to insert into bucket - if (Kokkos::atomic_compare_exchange_strong(&buckets(bucket), ORD_MAX, i)) { + if (ORD_MAX == Kokkos::atomic_compare_exchange(&buckets(bucket), ORD_MAX, i)) { break; } } @@ -140,8 +140,8 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { - if (u == v || Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { + if (ORD_MAX == Kokkos::atomic_compare_exchange(&match(u), ORD_MAX, v)) { + if (u == v || ORD_MAX == Kokkos::atomic_compare_exchange(&match(v), ORD_MAX, u)) { ordinal_t cv = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); vcmap(u) = cv; vcmap(v) = cv; @@ -201,8 +201,8 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { - if (u == v || Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { + if (ORD_MAX == Kokkos::atomic_compare_exchange(&match(u), ORD_MAX, v)) { + if (u == v || ORD_MAX == Kokkos::atomic_compare_exchange(&match(v), ORD_MAX, u)) { ordinal_t cv = u; if (v < u) { cv = v; @@ -859,8 +859,8 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { - if (u == v || Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { + if (ORD_MAX == 
Kokkos::atomic_compare_exchange(&match(u), ORD_MAX, v)) { + if (u == v || ORD_MAX == Kokkos::atomic_compare_exchange(&match(v), ORD_MAX, u)) { // u == v avoids problems if there is a self-loop edge ordinal_t cv = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); vcmap(u) = cv; @@ -1084,8 +1084,8 @@ class coarsen_heuristics { // need to enforce an ordering condition to allow hard-stall // conditions to be broken if (condition ^ swap) { - if (Kokkos::atomic_compare_exchange_strong(&match(u), ORD_MAX, v)) { - if (Kokkos::atomic_compare_exchange_strong(&match(v), ORD_MAX, u)) { + if (ORD_MAX == Kokkos::atomic_compare_exchange(&match(u), ORD_MAX, v)) { + if (ORD_MAX == Kokkos::atomic_compare_exchange(&match(v), ORD_MAX, u)) { ordinal_t cv = Kokkos::atomic_fetch_add(&nvertices_coarse(), 1); vcmap(u) = cv; vcmap(v) = cv; diff --git a/graph/src/KokkosGraph_Distance1ColorHandle.hpp b/graph/src/KokkosGraph_Distance1ColorHandle.hpp index 1eefd07c4d..bcc71f147f 100644 --- a/graph/src/KokkosGraph_Distance1ColorHandle.hpp +++ b/graph/src/KokkosGraph_Distance1ColorHandle.hpp @@ -226,7 +226,7 @@ class GraphColoringHandle { #ifdef VERBOSE std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_VBBIT\n"; #endif - } else if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + } else if (KokkosKernels::Impl::is_gpu_exec_space_v) { this->coloring_algorithm_type = COLORING_EB; #ifdef VERBOSE std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_EB\n"; @@ -402,7 +402,7 @@ class GraphColoringHandle { size_type_temp_work_view_t lower_count("LowerXADJ", nv + 1); size_type new_num_edge = 0; typedef Kokkos::RangePolicy my_exec_space; - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { int teamSizeMax = 0; int vector_size = 0; diff --git a/graph/unit_test/CMakeLists.txt b/graph/unit_test/CMakeLists.txt index b497953159..00fa135481 100644 --- a/graph/unit_test/CMakeLists.txt +++ 
b/graph/unit_test/CMakeLists.txt @@ -10,12 +10,17 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_C # # ##################### +SET(KK_ENABLE_GRAPH_TESTS ON) + IF (KokkosKernels_TEST_ETI_ONLY) IF (NOT KokkosKernels_INST_DOUBLE AND NOT KokkosKernels_INST_FLOAT) - MESSAGE(FATAL_ERROR "Because only ETI'd type combinations are enabled for testing, the Kokkos Kernels graph tests require that double or float is enabled in ETI.") + MESSAGE(WARNING "Because only ETI'd type combinations are enabled for testing, the Kokkos Kernels graph tests require that double or float is enabled in ETI.") + SET(KK_ENABLE_GRAPH_TESTS OFF) ENDIF () ENDIF () +IF(KK_ENABLE_GRAPH_TESTS) + ##################### # # # Add GPU backends # @@ -97,4 +102,4 @@ IF (KOKKOS_ENABLE_THREADS) COMPONENTS graph ) ENDIF () - +ENDIF () diff --git a/graph/unit_test/Test_Graph_graph_color.hpp b/graph/unit_test/Test_Graph_graph_color.hpp index 3ddfa7c9b0..e53261edf7 100644 --- a/graph/unit_test/Test_Graph_graph_color.hpp +++ b/graph/unit_test/Test_Graph_graph_color.hpp @@ -169,30 +169,30 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size // device::execution_space::finalize(); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, graph##_##graph_color##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_coloring(50000, 50000 * 30, 200, 10); \ - test_coloring(50000, 50000 * 30, 100, 10); \ +#define EXECUTE_TEST(ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##graph_color##_default_scalar_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coloring(50000, 50000 * 30, 200, 10); \ + test_coloring(50000, 50000 * 30, 100, 10); \ } #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, int, TestDevice) +EXECUTE_TEST(int, int, TestDevice) #endif #if 
(defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, int, TestDevice) +EXECUTE_TEST(int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, size_t, TestDevice) +EXECUTE_TEST(int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, size_t, TestDevice) +EXECUTE_TEST(int64_t, size_t, TestDevice) #endif #undef EXECUTE_TEST diff --git a/graph/unit_test/Test_Graph_graph_color_deterministic.hpp b/graph/unit_test/Test_Graph_graph_color_deterministic.hpp index 87771de84f..66cda713dd 100644 --- a/graph/unit_test/Test_Graph_graph_color_deterministic.hpp +++ b/graph/unit_test/Test_Graph_graph_color_deterministic.hpp @@ -222,30 +222,30 @@ void test_coloring_deterministic(lno_t numRows, size_type nnz) { } } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, graph##_##graph_color_deterministic##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_coloring_deterministic(18, 74); \ - test_coloring_deterministic(18, 74); \ +#define EXECUTE_TEST(ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, graph##_##graph_color_deterministic##_default_scalar_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_coloring_deterministic(18, 74); \ + test_coloring_deterministic(18, 74); \ } #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, int, TestDevice) +EXECUTE_TEST(int, int, 
TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, int, TestDevice) +EXECUTE_TEST(int64_t, int, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int, size_t, TestDevice) +EXECUTE_TEST(int, size_t, TestDevice) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(default_scalar, int64_t, size_t, TestDevice) +EXECUTE_TEST(int64_t, size_t, TestDevice) #endif #undef EXECUTE_TEST diff --git a/graph/unit_test/Test_Graph_rcm.hpp b/graph/unit_test/Test_Graph_rcm.hpp index 0a9543367a..096bdb6961 100644 --- a/graph/unit_test/Test_Graph_rcm.hpp +++ b/graph/unit_test/Test_Graph_rcm.hpp @@ -119,7 +119,7 @@ void test_rcm(const rowmap_t& rowmap, const entries_t& entries, bool expectBandw template void test_rcm_zerorows() { - using graph_t = Kokkos::StaticCrsGraph; + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using entries_t = typename graph_t::entries_type::non_const_type; rowmap_t rowmap; @@ -129,7 +129,7 @@ void test_rcm_zerorows() { template void test_rcm_7pt(lno_t gridX, lno_t gridY, lno_t gridZ, bool expectBandwidthReduced) { - using graph_t = Kokkos::StaticCrsGraph; + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using entries_t = typename graph_t::entries_type::non_const_type; rowmap_t rowmap; @@ -140,7 +140,7 @@ void test_rcm_7pt(lno_t gridX, lno_t gridY, lno_t gridZ, bool expectBandwidthRed template void test_rcm_4clique() { - using graph_t = 
Kokkos::StaticCrsGraph; + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using entries_t = typename graph_t::entries_type::non_const_type; rowmap_t rowmap("rowmap", 5); @@ -156,7 +156,7 @@ void test_rcm_4clique() { template void test_rcm_multiple_components() { - using graph_t = Kokkos::StaticCrsGraph; + using graph_t = Kokkos::StaticCrsGraph; using rowmap_t = typename graph_t::row_map_type::non_const_type; using entries_t = typename graph_t::entries_type::non_const_type; // Generate a single 3D grid first diff --git a/lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_decl.hpp.in b/lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..89f5ac3cc1 --- /dev/null +++ b/lapack/eti/generated_specializations_hpp/KokkosLapack_gesv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_GESV_ETI_SPEC_DECL_HPP_ +#define KOKKOSLAPACK_GESV_ETI_SPEC_DECL_HPP_ +namespace KokkosLapack { +namespace Impl { +@LAPACK_GESV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/lapack/eti/generated_specializations_hpp/KokkosLapack_svd_eti_spec_decl.hpp.in b/lapack/eti/generated_specializations_hpp/KokkosLapack_svd_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..a6d110429b --- /dev/null +++ b/lapack/eti/generated_specializations_hpp/KokkosLapack_svd_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_SVD_ETI_SPEC_DECL_HPP_ +#define KOKKOSLAPACK_SVD_ETI_SPEC_DECL_HPP_ +namespace KokkosLapack { +namespace Impl { +@LAPACK_SVD_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_decl.hpp.in b/lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..3e82f0f8e6 --- /dev/null +++ b/lapack/eti/generated_specializations_hpp/KokkosLapack_trtri_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_TRTRI_ETI_SPEC_DECL_HPP_ +#define KOKKOSLAPACK_TRTRI_ETI_SPEC_DECL_HPP_ +namespace KokkosLapack { +namespace Impl { + +@LAPACK_TRTRI_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosLapack +#endif // KOKKOSLAPACK_TRTRI_ETI_SPEC_DECL_HPP_ diff --git a/lapack/impl/KokkosLapack_gesv_spec.hpp b/lapack/impl/KokkosLapack_gesv_spec.hpp index 60a69e72b3..ebed196e3b 100644 --- a/lapack/impl/KokkosLapack_gesv_spec.hpp +++ b/lapack/impl/KokkosLapack_gesv_spec.hpp @@ -120,5 +120,6 @@ struct GESV; #include +#include #endif // KOKKOSLAPACK_IMPL_GESV_SPEC_HPP_ diff --git a/lapack/impl/KokkosLapack_svd_spec.hpp b/lapack/impl/KokkosLapack_svd_spec.hpp index b0dfe3d091..470a8c0e8e 100644 --- a/lapack/impl/KokkosLapack_svd_spec.hpp +++ b/lapack/impl/KokkosLapack_svd_spec.hpp @@ -128,5 +128,6 @@ struct SVD; #include +#include #endif // KOKKOSLAPACK_IMPL_SVD_SPEC_HPP_ diff --git a/lapack/impl/KokkosLapack_trtri_spec.hpp b/lapack/impl/KokkosLapack_trtri_spec.hpp index ef458f7e57..0af55f4c6c 100644 --- a/lapack/impl/KokkosLapack_trtri_spec.hpp +++ b/lapack/impl/KokkosLapack_trtri_spec.hpp @@ -114,5 +114,6 @@ struct TRTRI { false, true>; #include +#include #endif // KOKKOSLAPACK_TRTRI_SPEC_HPP_ diff --git a/lapack/src/KokkosLapack_gesv.hpp b/lapack/src/KokkosLapack_gesv.hpp index 281d6a5651..1b183981fe 100644 --- a/lapack/src/KokkosLapack_gesv.hpp +++ b/lapack/src/KokkosLapack_gesv.hpp @@ -63,9 +63,15 @@ void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, const IP static_assert(Kokkos::SpaceAccessibility::accessible); static_assert(Kokkos::SpaceAccessibility::accessible); #if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) +#if defined(KOKKOS_ENABLE_CUDA) if constexpr (!std::is_same_v) { 
static_assert(Kokkos::SpaceAccessibility::accessible); } +#elif defined(KOKKOS_ENABLE_HIP) + if constexpr (!std::is_same_v) { + static_assert(Kokkos::SpaceAccessibility::accessible); + } +#endif #else static_assert(Kokkos::SpaceAccessibility::accessible); #endif @@ -96,6 +102,7 @@ void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, const IP // Check for no pivoting case. Only MAGMA supports no pivoting interface #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL +#if defined(KOKKOS_ENABLE_CUDA) if ((!std::is_same::value) && (IPIV0 == 0) && (IPIV.data() == nullptr)) { std::ostringstream os; @@ -103,6 +110,15 @@ void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, const IP << "LAPACK TPL does not support no pivoting."; KokkosKernels::Impl::throw_runtime_exception(os.str()); } +#elif defined(KOKKOS_ENABLE_HIP) + if ((!std::is_same::value) && (IPIV0 == 0) && + (IPIV.data() == nullptr)) { + std::ostringstream os; + os << "KokkosLapack::gesv: IPIV: " << IPIV0 << ". 
" + << "LAPACK TPL does not support no pivoting."; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } +#endif #endif #else // not have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // but have LAPACK TPL diff --git a/lapack/src/KokkosLapack_svd.hpp b/lapack/src/KokkosLapack_svd.hpp index c0c962fb19..e050e75b59 100644 --- a/lapack/src/KokkosLapack_svd.hpp +++ b/lapack/src/KokkosLapack_svd.hpp @@ -127,20 +127,34 @@ void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], con is_extent_invalid = true; os << "KokkosLapack::svd: S has extent " << S.extent(0) << ", instead of " << rankA << ".\n"; } - if ((jobu[0] == 'A') || (jobu[0] == 'a') || (jobu[0] == 'S') || (jobu[0] == 's')) { + if ((jobu[0] == 'A') || (jobu[0] == 'a')) { if (U.extent_int(0) != m || U.extent_int(1) != m) { is_extent_invalid = true; os << "KokkosLapack::svd: U has extents (" << U.extent(0) << ", " << U.extent(1) << ") instead of (" << m << ", " << m << ").\n"; } } - if ((jobvt[0] == 'A') || (jobvt[0] == 'a') || (jobvt[0] == 'S') || (jobvt[0] == 's')) { + if ((jobu[0] == 'S') || (jobu[0] == 's')) { + if (U.extent_int(0) != m || U.extent_int(1) != std::min(m, n)) { + is_extent_invalid = true; + os << "KokkosLapack::svd: U has extents (" << U.extent(0) << ", " << U.extent(1) << ") instead of (" << m << ", " + << std::min(m, n) << ").\n"; + } + } + if ((jobvt[0] == 'A') || (jobvt[0] == 'a')) { if (Vt.extent_int(0) != n || Vt.extent_int(1) != n) { is_extent_invalid = true; os << "KokkosLapack::svd: V has extents (" << Vt.extent(0) << ", " << Vt.extent(1) << ") instead of (" << n << ", " << n << ").\n"; } } + if ((jobvt[0] == 'S') || (jobvt[0] == 's')) { + if (Vt.extent_int(0) != std::min(m, n) || Vt.extent_int(1) != n) { + is_extent_invalid = true; + os << "KokkosLapack::svd: V has extents (" << Vt.extent(0) << ", " << Vt.extent(1) << ") instead of (" + << std::min(m, n) << ", " << n << ").\n"; + } + } if (is_extent_invalid) { 
KokkosKernels::Impl::throw_runtime_exception(os.str()); } diff --git a/lapack/tpls/KokkosLapack_Cuda_tpl.hpp b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp index 3ead12d5f4..e3191ea93b 100644 --- a/lapack/tpls/KokkosLapack_Cuda_tpl.hpp +++ b/lapack/tpls/KokkosLapack_Cuda_tpl.hpp @@ -25,12 +25,24 @@ namespace Impl { CudaLapackSingleton::CudaLapackSingleton() { cusolverStatus_t stat = cusolverDnCreate(&handle); if (stat != CUSOLVER_STATUS_SUCCESS) Kokkos::abort("CUSOLVER initialization failed\n"); - - Kokkos::push_finalize_hook([&]() { cusolverDnDestroy(handle); }); } CudaLapackSingleton& CudaLapackSingleton::singleton() { - static CudaLapackSingleton s; + std::unique_ptr& instance = get_instance(); + if (!instance) { + instance = std::make_unique(); + Kokkos::push_finalize_hook([&]() { + cusolverDnDestroy(instance->handle); + instance.reset(); + }); + } + return *instance; +} + +bool CudaLapackSingleton::is_initialized() { return get_instance() != nullptr; } + +std::unique_ptr& CudaLapackSingleton::get_instance() { + static std::unique_ptr s; return s; } diff --git a/lapack/tpls/KokkosLapack_Magma_tpl.hpp b/lapack/tpls/KokkosLapack_Magma_tpl.hpp index 636c40735d..542f681281 100644 --- a/lapack/tpls/KokkosLapack_Magma_tpl.hpp +++ b/lapack/tpls/KokkosLapack_Magma_tpl.hpp @@ -25,12 +25,24 @@ namespace Impl { MagmaSingleton::MagmaSingleton() { magma_int_t stat = magma_init(); if (stat != MAGMA_SUCCESS) Kokkos::abort("MAGMA initialization failed\n"); - - Kokkos::push_finalize_hook([&]() { magma_finalize(); }); } MagmaSingleton& MagmaSingleton::singleton() { - static MagmaSingleton s; + std::unique_ptr& instance = get_instance(); + if (!instance) { + instance = std::make_unique(); + Kokkos::push_finalize_hook([&]() { + magma_finalize(); + instance.reset(); + }); + } + return *instance; +} + +bool MagmaSingleton::is_initialized() { return get_instance() != nullptr; } + +std::unique_ptr& MagmaSingleton::get_instance() { + static std::unique_ptr s; return s; } diff --git 
a/lapack/tpls/KokkosLapack_cusolver.hpp b/lapack/tpls/KokkosLapack_cusolver.hpp index 272fb8b3b8..0084b70429 100644 --- a/lapack/tpls/KokkosLapack_cusolver.hpp +++ b/lapack/tpls/KokkosLapack_cusolver.hpp @@ -32,6 +32,11 @@ struct CudaLapackSingleton { CudaLapackSingleton(); static CudaLapackSingleton& singleton(); + + static bool is_initialized(); + + private: + static std::unique_ptr& get_instance(); }; inline void cusolver_internal_error_throw(cusolverStatus_t cusolverStatus, const char* name, const char* file, @@ -69,7 +74,7 @@ inline void cusolver_internal_safe_call(cusolverStatus_t cusolverStatus, const c // The macro below defines is the public interface for the safe cusolver calls. // The functions themselves are protected by impl namespace. -#define KOKKOS_CUSOLVER_SAFE_CALL_IMPL(call) \ +#define KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(call) \ KokkosLapack::Impl::cusolver_internal_safe_call(call, #call, __FILE__, __LINE__) } // namespace Impl diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp index 472b79ce85..50a6863b80 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp @@ -52,23 +52,28 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLe namespace KokkosLapack { namespace Impl { -#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ - template <> \ - struct gesv_tpl_spec_avail< \ - Kokkos::Cuda, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ + template <> \ + struct gesv_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; - 
-KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +#if defined(KOKKOS_ENABLE_CUDA) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +#endif +#if defined(KOKKOS_ENABLE_HIP) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +#endif } // namespace Impl } // namespace KokkosLapack #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA diff --git a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index 559f5d0509..7185f385cd 100644 --- a/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -197,42 +197,48 @@ void magmaGesvWrapper(const ExecSpace& space, const AViewType& A, const BViewTyp Kokkos::Profiling::popRegion(); } -#define KOKKOSLAPACK_GESV_MAGMA(SCALAR, LAYOUT, MEM_SPACE) \ - template <> \ - struct GESV, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, \ - 
gesv_eti_spec_avail, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>>::value> { \ - using AViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using BViewType = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using PViewType = \ - Kokkos::View, \ - Kokkos::MemoryTraits>; \ - \ - static void gesv(const Kokkos::Cuda& space, const AViewType& A, const BViewType& B, const PViewType& IPIV) { \ - magmaGesvWrapper(space, A, B, IPIV); \ - } \ +#define KOKKOSLAPACK_GESV_MAGMA(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct GESV< \ + EXEC_SPACE, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void gesv(const EXEC_SPACE& space, const AViewType& A, const BViewType& B, const PViewType& IPIV) { \ + magmaGesvWrapper(space, A, B, IPIV); \ + } \ }; -KOKKOSLAPACK_GESV_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) - +#if defined(KOKKOS_ENABLE_CUDA) +KOKKOSLAPACK_GESV_MAGMA(float, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(double, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Cuda, 
Kokkos::CudaSpace) +#endif +#if defined(KOKKOS_ENABLE_HIP) +KOKKOSLAPACK_GESV_MAGMA(float, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_MAGMA(double, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace) +#endif } // namespace Impl } // namespace KokkosLapack #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA @@ -264,54 +270,54 @@ void cusolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, Kokkos::View info("getrf info"); CudaLapackSingleton& s = CudaLapackSingleton::singleton(); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, space.cuda_stream())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnSetStream(s.handle, space.cuda_stream())); if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnSgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); Kokkos::View Workspace("getrf workspace", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( cusolverDnSgetrf(s.handle, m, n, A.data(), lda, Workspace.data(), IPIV.data(), info.data())); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( cusolverDnSgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnDgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); Kokkos::View Workspace("getrf workspace", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( cusolverDnDgetrf(s.handle, m, n, A.data(), lda, Workspace.data(), IPIV.data(), info.data())); - 
KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( cusolverDnDgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( cusolverDnCgetrf_bufferSize(s.handle, m, n, reinterpret_cast(A.data()), lda, &lwork)); Kokkos::View Workspace("getrf workspace", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrf(s.handle, m, n, reinterpret_cast(A.data()), lda, - reinterpret_cast(Workspace.data()), IPIV.data(), - info.data())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnCgetrf(s.handle, m, n, reinterpret_cast(A.data()), lda, + reinterpret_cast(Workspace.data()), IPIV.data(), + info.data())); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrs(s.handle, CUBLAS_OP_N, m, nrhs, - reinterpret_cast(A.data()), lda, IPIV.data(), - reinterpret_cast(B.data()), ldb, info.data())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnCgetrs(s.handle, CUBLAS_OP_N, m, nrhs, + reinterpret_cast(A.data()), lda, IPIV.data(), + reinterpret_cast(B.data()), ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( cusolverDnZgetrf_bufferSize(s.handle, m, n, reinterpret_cast(A.data()), lda, &lwork)); Kokkos::View Workspace("getrf workspace", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrf(s.handle, m, n, reinterpret_cast(A.data()), lda, - reinterpret_cast(Workspace.data()), IPIV.data(), - info.data())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnZgetrf(s.handle, m, n, reinterpret_cast(A.data()), + lda, reinterpret_cast(Workspace.data()), + IPIV.data(), info.data())); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrs(s.handle, CUBLAS_OP_N, m, nrhs, - reinterpret_cast(A.data()), lda, IPIV.data(), - reinterpret_cast(B.data()), ldb, info.data())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( + cusolverDnZgetrs(s.handle, CUBLAS_OP_N, m, nrhs, 
reinterpret_cast(A.data()), lda, IPIV.data(), + reinterpret_cast(B.data()), ldb, info.data())); } - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, NULL)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnSetStream(s.handle, NULL)); } #define KOKKOSLAPACK_GESV_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ @@ -385,26 +391,26 @@ void rocsolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, Kokkos::View info("rocsolver info"); KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( rocsolver_sgesv(s.handle, N, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( rocsolver_dgesv(s.handle, N, nrhs, A.data(), lda, IPIV.data(), B.data(), ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_cgesv(s.handle, N, nrhs, reinterpret_cast(A.data()), - lda, IPIV.data(), reinterpret_cast(B.data()), - ldb, info.data())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( + rocsolver_cgesv(s.handle, N, nrhs, reinterpret_cast(A.data()), lda, IPIV.data(), + reinterpret_cast(B.data()), ldb, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL( rocsolver_zgesv(s.handle, N, nrhs, reinterpret_cast(A.data()), lda, IPIV.data(), reinterpret_cast(B.data()), ldb, info.data())); } - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); } #define KOKKOSLAPACK_GESV_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ diff --git a/lapack/tpls/KokkosLapack_magma.hpp 
b/lapack/tpls/KokkosLapack_magma.hpp index dfde113fa6..b1b7bb1ab6 100644 --- a/lapack/tpls/KokkosLapack_magma.hpp +++ b/lapack/tpls/KokkosLapack_magma.hpp @@ -30,6 +30,11 @@ struct MagmaSingleton { MagmaSingleton(); static MagmaSingleton& singleton(); + + static bool is_initialized(); + + private: + static std::unique_ptr& get_instance(); }; } // namespace Impl diff --git a/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp b/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp index 01255bf427..03bc8d9c30 100644 --- a/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp +++ b/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp @@ -324,42 +324,42 @@ void cusolverSvdWrapper(const ExecutionSpace& space, const char jobu[], const ch Kokkos::View rwork("svd rwork buffer", Kokkos::min(m, n) - 1); CudaLapackSingleton& s = CudaLapackSingleton::singleton(); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, space.cuda_stream())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnSetStream(s.handle, space.cuda_stream())); if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnSgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgesvd(s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), - U.data(), ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), - info.data())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnSgesvd(s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, work.data(), lwork, + rwork.data(), info.data())); } if constexpr (std::is_same_v) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnDgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgesvd(s.handle, jobu[0], 
jobvt[0], m, n, A.data(), lda, S.data(), - U.data(), ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), - info.data())); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnDgesvd(s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, work.data(), lwork, + rwork.data(), info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnCgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL( cusolverDnCgesvd(s.handle, jobu[0], jobvt[0], m, n, reinterpret_cast(A.data()), lda, S.data(), reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), ldvt, reinterpret_cast(work.data()), lwork, rwork.data(), info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgesvd_bufferSize(s.handle, m, n, &lwork)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnZgesvd_bufferSize(s.handle, m, n, &lwork)); Kokkos::View work("svd work buffer", lwork); - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgesvd( + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnZgesvd( s.handle, jobu[0], jobvt[0], m, n, reinterpret_cast(A.data()), lda, S.data(), reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), ldvt, reinterpret_cast(work.data()), lwork, rwork.data(), info.data())); } - KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, NULL)); + KOKKOSLAPACK_IMPL_CUSOLVER_SAFE_CALL(cusolverDnSetStream(s.handle, NULL)); } #define KOKKOSLAPACK_SVD_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ @@ -473,30 +473,30 @@ void rocsolverSvdWrapper(const ExecutionSpace& space, const char jobu[], const c Kokkos::View rwork("svd rwork buffer", Kokkos::min(m, n) - 1); KokkosBlas::Impl::RocBlasSingleton& s = KokkosBlas::Impl::RocBlasSingleton::singleton(); - 
KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, space.hip_stream())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, space.hip_stream())); if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_sgesvd(s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), - U.data(), ldu, Vt.data(), ldvt, rwork.data(), WorkMode, - info.data())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocsolver_sgesvd(s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, rwork.data(), WorkMode, + info.data())); } if constexpr (std::is_same_v) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_dgesvd(s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), - U.data(), ldu, Vt.data(), ldvt, rwork.data(), WorkMode, - info.data())); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocsolver_dgesvd(s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, rwork.data(), WorkMode, + info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_cgesvd( + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocsolver_cgesvd( s.handle, UVecMode, VVecMode, m, n, reinterpret_cast(A.data()), lda, S.data(), reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), ldvt, rwork.data(), WorkMode, info.data())); } if constexpr (std::is_same_v>) { - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_zgesvd( + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocsolver_zgesvd( s.handle, UVecMode, VVecMode, m, n, reinterpret_cast(A.data()), lda, S.data(), reinterpret_cast(U.data()), ldu, reinterpret_cast(Vt.data()), ldvt, rwork.data(), WorkMode, info.data())); } - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); + KOKKOSBLAS_IMPL_ROCBLAS_SAFE_CALL(rocblas_set_stream(s.handle, NULL)); } #define KOKKOSLAPACK_SVD_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ diff --git a/lapack/unit_test/Test_Lapack_gesv.hpp b/lapack/unit_test/Test_Lapack_gesv.hpp index 653ed2cbf2..fb3f371927 100644 --- 
a/lapack/unit_test/Test_Lapack_gesv.hpp +++ b/lapack/unit_test/Test_Lapack_gesv.hpp @@ -15,11 +15,12 @@ //@HEADER // only enable this test where KokkosLapack supports gesv: -// CUDA+(MAGMA or CUSOLVER), HIP+ROCSOLVER and HOST+LAPACK -#if (defined(TEST_CUDA_LAPACK_CPP) && \ - (defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) || defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER))) || \ - (defined(TEST_HIP_LAPACK_CPP) && defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ - (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ +// CUDA+(MAGMA or CUSOLVER), HIP+(MAGMA or ROCSOLVER) and HOST+LAPACK +#if (defined(TEST_CUDA_LAPACK_CPP) && \ + (defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) || defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER))) || \ + (defined(TEST_HIP_LAPACK_CPP) && \ + (defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) || defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER))) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || defined(TEST_THREADS_LAPACK_CPP))) #include @@ -97,8 +98,13 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { bool notpl_runtime_err = false; #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL +#if defined(KOKKOS_ENABLE_CUDA) nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); +#elif defined(KOKKOS_ENABLE_HIP) + nopivot_runtime_err = (!std::is_same::value) && + (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); +#endif notpl_runtime_err = false; #else notpl_runtime_err = true; @@ -200,8 +206,13 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, int nrhs) bool notpl_runtime_err = false; #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA // have MAGMA TPL #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK // and have LAPACK TPL +#if defined(KOKKOS_ENABLE_CUDA) nopivot_runtime_err = (!std::is_same::value) && (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); +#elif 
defined(KOKKOS_ENABLE_HIP) + nopivot_runtime_err = (!std::is_same::value) && + (ipiv.extent(0) == 0) && (ipiv.data() == nullptr); +#endif notpl_runtime_err = false; #else notpl_runtime_err = true; @@ -222,9 +233,9 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, int nrhs) // Get the solution vector. Kokkos::deep_copy(h_B, B); - // Checking vs ref on CPU, this eps is about 10^-9 + // Checking vs ref on CPU, this eps is about 10^-8 typedef typename ats::mag_type mag_type; - const mag_type eps = 1.0e7 * ats::epsilon(); + const mag_type eps = 1.0e8 * ats::epsilon(); bool test_flag = true; for (int j = 0; j < nrhs; j++) { for (int i = 0; i < N; i++) { @@ -268,6 +279,19 @@ int test_gesv(const char* mode) { Test::impl_test_gesv(&mode[0], "N", 64); // no padding Test::impl_test_gesv(&mode[0], "N", 1024); // no padding + Test::impl_test_gesv(&mode[0], "Y", + 13); // padding + Test::impl_test_gesv(&mode[0], "Y", + 179); // padding + } +#elif defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_HIP) + if constexpr (std::is_same_v) { + Test::impl_test_gesv(&mode[0], "N", 2); // no padding + Test::impl_test_gesv(&mode[0], "N", 13); // no padding + Test::impl_test_gesv(&mode[0], "N", 179); // no padding + Test::impl_test_gesv(&mode[0], "N", 64); // no padding + Test::impl_test_gesv(&mode[0], "N", 1024); // no padding + Test::impl_test_gesv(&mode[0], "Y", 13); // padding Test::impl_test_gesv(&mode[0], "Y", @@ -307,6 +331,17 @@ int test_gesv_mrhs(const char* mode) { Test::impl_test_gesv_mrhs(&mode[0], "N", 64, 5); // no padding Test::impl_test_gesv_mrhs(&mode[0], "N", 1024, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "Y", 13, 5); // padding + Test::impl_test_gesv_mrhs(&mode[0], "Y", 179, 5); // padding + } +#elif defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_HIP) + if constexpr (std::is_same_v) { + Test::impl_test_gesv_mrhs(&mode[0], "N", 2, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 13, 5); // 
no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 179, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 64, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "N", 1024, 5); // no padding + Test::impl_test_gesv_mrhs(&mode[0], "Y", 13, 5); // padding Test::impl_test_gesv_mrhs(&mode[0], "Y", 179, 5); // padding } diff --git a/master_history.txt b/master_history.txt index c712462dd8..a02c157740 100644 --- a/master_history.txt +++ b/master_history.txt @@ -28,3 +28,4 @@ tag: 4.3.00 date: 04/03/2024 master: afd65f03 release: ebbf4b78 tag: 4.3.01 date: 05/07/2024 master: 1b0a15f5 release: 58785c1b tag: 4.4.00 date: 08/08/2024 master: d1a91b8a release: 1145f529 tag: 4.4.01 date: 09/12/2024 master: 0608a337 release: 6b340287 +tag: 4.5.00 date: 11/11/2024 master: 0b43169e release: 4a7590af diff --git a/ode/impl/KokkosODE_BDF_impl.hpp b/ode/impl/KokkosODE_BDF_impl.hpp index 3119ff0e3a..3e817563da 100644 --- a/ode/impl/KokkosODE_BDF_impl.hpp +++ b/ode/impl/KokkosODE_BDF_impl.hpp @@ -142,7 +142,7 @@ struct BDF_system_wrapper2 { if (compute_jac) { mySys.evaluate_jacobian(t, dt, y, jac); - // J = I - dt*(dy/dy) + // J = I - dt*(df/dy) for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { for (int colIdx = 0; colIdx < neqs; ++colIdx) { jac(rowIdx, colIdx) = -dt * jac(rowIdx, colIdx); diff --git a/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp b/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp index 6a0770d1a7..f5b844a358 100644 --- a/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp +++ b/ode/impl/KokkosODE_RungeKuttaTables_impl.hpp @@ -257,6 +257,59 @@ struct ButcherTableau<4, 6> // Referred to as DOPRI5 or RKDP 11.0 / 84.0 - 187.0 / 2100.0, -1.0 / 40.0}}; }; +// Coefficients obtained from: +// J. H. Verner +// "Explicit Runge-Kutta methods with estimates of the local truncation error", +// Journal of Numerical Analysis, Volume 15, Issue 4, 1978, +// https://doi.org/10.1137/0715051. 
+template <> +struct ButcherTableau<5, 7> // Referred to as Verner 5-6 or VER56 +{ + static constexpr int order = 6; + static constexpr int nstages = 8; + Kokkos::Array a{{0.0, + 1.0 / 6.0, + 0.0, + 4.0 / 75.0, + 16.0 / 75.0, + 0.0, + 5.0 / 6.0, + -8.0 / 3.0, + 5.0 / 2.0, + 0.0, + -165.0 / 64.0, + 55.0 / 6.0, + -425.0 / 64.0, + 85.0 / 96.0, + 0.0, + 12.0 / 5.0, + -8.0, + 4015.0 / 612.0, + -11.0 / 36.0, + 88.0 / 255.0, + 0.0, + -8263.0 / 15000.0, + 124.0 / 75.0, + -643.0 / 680.0, + -81.0 / 250.0, + 2484.0 / 10625.0, + 0.0, + 0.0, + 3501.0 / 1720.0, + -300.0 / 43.0, + 297275.0 / 52632.0, + -319.0 / 2322.0, + 24068.0 / 84065.0, + 3850.0 / 26703.0, + 0.0}}; + Kokkos::Array b{ + {3.0 / 4.0, 0.0, 875.0 / 2244.0, 23.0 / 72.0, 264.0 / 1955.0, 0.0, 125.0 / 11592.0, 43.0 / 616.0}}; + Kokkos::Array c{{0.0, 1.0 / 6.0, 4.0 / 15.0, 2.0 / 3.0, 5.0 / 6.0, 1.0, 1.0 / 15.0, 1.0}}; + Kokkos::Array e{{3.0 / 4.0 - 13.0 / 160.0, 0.0, 875.0 / 2244.0 - 2375.0 / 5984.0, + 23.0 / 72.0 - 5.0 / 16.0, 264.0 / 1955.0 - 12.0 / 85.0, -3.0 / 44.0, + 125.0 / 11592.0, 43.0 / 616.0}}; +}; + } // namespace Impl } // namespace KokkosODE diff --git a/ode/impl/KokkosODE_RungeKutta_impl.hpp b/ode/impl/KokkosODE_RungeKutta_impl.hpp index 83ab76758f..533914477d 100644 --- a/ode/impl/KokkosODE_RungeKutta_impl.hpp +++ b/ode/impl/KokkosODE_RungeKutta_impl.hpp @@ -23,19 +23,84 @@ #include "KokkosODE_RungeKuttaTables_impl.hpp" #include "KokkosODE_Types.hpp" +#include "iostream" + namespace KokkosODE { namespace Impl { +// This algorithm is mostly derived from +// E. Hairer, S. P. Norsett G. Wanner, +// "Solving Ordinary Differential Equations I: +// Nonstiff Problems", Sec. II.4. +// Note that all floating point values below +// have been heuristically selected for +// convergence performance. 
+template +KOKKOS_FUNCTION void first_step_size(const ode_type ode, const int order, const scalar_type t0, const scalar_type atol, + const scalar_type rtol, const vec_type& y0, const res_type& f0, const vec_type y1, + const mat_type temp, scalar_type& dt_ini) { + using KAT = Kokkos::ArithTraits; + + // Extract subviews to store intermediate data + auto f1 = Kokkos::subview(temp, 1, Kokkos::ALL()); + + // Compute norms for y0 and f0 + double n0 = KAT::zero(), n1 = KAT::zero(), dt0, scale; + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + scale = atol + rtol * Kokkos::abs(y0(eqIdx)); + n0 += Kokkos::pow(y0(eqIdx) / scale, 2); + n1 += Kokkos::pow(f0(eqIdx) / scale, 2); + } + n0 = Kokkos::sqrt(n0) / Kokkos::sqrt(ode.neqs); + n1 = Kokkos::sqrt(n1) / Kokkos::sqrt(ode.neqs); + + // Select dt0 + if ((n0 < 1e-5) || (n1 < 1e-5)) { + dt0 = 1e-6; + } else { + dt0 = 0.01 * n0 / n1; + } + + // Estimate y at t0 + dt0 + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + y1(eqIdx) = y0(eqIdx) + dt0 * f0(eqIdx); + } + + // Compute f at t0+dt0 and y1, + // then compute the norm of f(t0+dt0, y1) - f(t0, y0) + scalar_type n2 = KAT::zero(); + ode.evaluate_function(t0 + dt0, dt0, y1, f1); + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + n2 += Kokkos::pow((f1(eqIdx) - f0(eqIdx)) / (atol + rtol * Kokkos::abs(y0(eqIdx))), 2); + } + n2 = Kokkos::sqrt(n2) / (dt0 * Kokkos::sqrt(ode.neqs)); + + // Finally select initial time step dt_ini + if ((n1 <= 1e-15) && (n2 <= 1e-15)) { + dt_ini = Kokkos::max(1e-6, dt0 * 1e-3); + } else { + dt_ini = Kokkos::pow(0.01 / Kokkos::max(n1, n2), KAT::one() / order); + } + + dt_ini = Kokkos::min(100 * dt0, dt_ini); + + // Zero out temp variables just to be safe... 
+ for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + f0(eqIdx) = 0.0; + y1(eqIdx) = 0.0; + f1(eqIdx) = 0.0; + } +} // first_step_size + // y_new = y_old + dt*sum(b_i*k_i) i in [1, nstages] // k_i = f(t+c_i*dt, y_old+sum(a_{ij}*k_i)) j in [1, i-1] // we need to compute the k_i and store them as we go // to use them for k_{i+1} computation. template -KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, const bool adaptivity, scalar_type t, - scalar_type dt, const vec_type& y_old, const vec_type& y_new, const vec_type& temp, - const mv_type& k_vecs) { - const int neqs = ode.neqs; - const int nstages = table.nstages; +KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, scalar_type t, scalar_type dt, + const vec_type& y_old, const vec_type& y_new, const vec_type& temp, const mv_type& k_vecs) { + const int neqs = ode.neqs; + constexpr int nstages = table_type::nstages; // first set y_new = y_old for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { @@ -72,34 +137,42 @@ KOKKOS_FUNCTION void RKStep(ode_type& ode, const table_type& table, const bool a y_new(eqIdx) += dt * table.b[stageIdx] * k(eqIdx); } } - - // Compute estimation of the error using k_vecs and table.e - if (adaptivity == true) { - for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { - temp(eqIdx) = 0; - for (int stageIdx = 0; stageIdx < nstages; ++stageIdx) { - temp(eqIdx) += dt * table.e[stageIdx] * k_vecs(stageIdx, eqIdx); - } - } - } } // RKStep +// Note that the control values for +// time step increase/decrease are +// heuristically chosen based on +// L. F. Shampine and M. W. Reichelt +// "The Matlab ODE suite" SIAM J. Sci. +// Comput. Vol. 18, No. 1, pp. 1-22 +// Jan. 
1997 template KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve(const ode_type& ode, const table_type& table, const KokkosODE::Experimental::ODE_params& params, const scalar_type t_start, const scalar_type t_end, const vec_type& y0, const vec_type& y, const vec_type& temp, - const mv_type& k_vecs) { + const mv_type& k_vecs, int* const step_count) { constexpr scalar_type error_threshold = 1; - bool adapt = params.adaptivity; + scalar_type error_n; + bool adapt = params.adaptivity; bool dt_was_reduced; - if (std::is_same_v>) { + if constexpr (std::is_same_v>) { adapt = false; } // Set current time and initial time step - scalar_type t_now = t_start; - scalar_type dt = (t_end - t_start) / params.max_steps; + scalar_type t_now = t_start, dt = 0.0; + if (adapt == true) { + ode.evaluate_function(t_start, 0, y0, temp); + first_step_size(ode, table_type::order, t_start, params.abs_tol, params.rel_tol, y0, temp, y, k_vecs, dt); + if (dt < params.min_step_size) { + dt = params.min_step_size; + } + } else { + dt = (t_end - t_start) / params.num_steps; + } + + *step_count = 0; // Loop over time steps to integrate ODE for (int stepIdx = 0; (stepIdx < params.max_steps) && (t_now <= t_end); ++stepIdx) { @@ -119,28 +192,33 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve(const ode_type& ode, con // Take tentative steps until the requested error // is met. This of course only works for adaptive // solvers, for fix time steps we simply do not - // compute and check what error of the current step + // compute and check the error of the current step while (error_threshold < error) { // Take a step of Runge-Kutta integrator - RKStep(ode, table, adapt, t_now, dt, y0, y, temp, k_vecs); + RKStep(ode, table, t_now, dt, y0, y, temp, k_vecs); // Compute the largest error and decide on // the size of the next time step to take. 
error = 0; - if (adapt) { + + // Compute estimation of the error using k_vecs and table.e + if (adapt == true) { // Compute the error for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { - error = Kokkos::max(error, Kokkos::abs(temp(eqIdx))); - tol = Kokkos::max( - tol, params.abs_tol + params.rel_tol * Kokkos::max(Kokkos::abs(y(eqIdx)), Kokkos::abs(y0(eqIdx)))); + tol = params.abs_tol + params.rel_tol * Kokkos::max(Kokkos::abs(y(eqIdx)), Kokkos::abs(y0(eqIdx))); + error_n = 0; + for (int stageIdx = 0; stageIdx < table.nstages; ++stageIdx) { + error_n += dt * table.e[stageIdx] * k_vecs(stageIdx, eqIdx); + } + error += (error_n * error_n) / (tol * tol); } - error = error / tol; + error = Kokkos::sqrt(error / ode.neqs); // Reduce the time step if error // is too large and current step // is rejected. if (error > 1) { - dt = dt * Kokkos::max(0.2, 0.8 / Kokkos::pow(error, 1 / table.order)); + dt = dt * Kokkos::max(0.2, 0.8 * Kokkos::pow(error, -1.0 / table.order)); dt_was_reduced = true; } @@ -150,6 +228,7 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve(const ode_type& ode, con // Update time and initial condition for next time step t_now += dt; + *step_count += 1; for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { y0(eqIdx) = y(eqIdx); } @@ -157,7 +236,7 @@ KOKKOS_FUNCTION Experimental::ode_solver_status RKSolve(const ode_type& ode, con if (t_now < t_end) { if (adapt && !dt_was_reduced && error < 0.5) { // Compute new time increment - dt = dt * Kokkos::min(10.0, Kokkos::max(2.0, 0.9 * Kokkos::pow(error, 1 / table.order))); + dt = dt * Kokkos::min(10.0, Kokkos::max(2.0, 0.9 * Kokkos::pow(error, -1.0 / table.order))); } } else { return Experimental::ode_solver_status::SUCCESS; diff --git a/ode/src/KokkosODE_BDF.hpp b/ode/src/KokkosODE_BDF.hpp index 419316ba45..2afb6e46e2 100644 --- a/ode/src/KokkosODE_BDF.hpp +++ b/ode/src/KokkosODE_BDF.hpp @@ -93,6 +93,7 @@ struct BDF { const double dt = (t_end - t_start) / num_steps; double t = t_start; + int count = 0; // Load 
y0 into y_vecs(:, 0) for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { @@ -107,7 +108,7 @@ struct BDF { } KokkosODE::Experimental::ODE_params params(table.order - 1); for (int stepIdx = 0; stepIdx < init_steps; ++stepIdx) { - KokkosODE::Experimental::RungeKutta::Solve(ode, params, t, t + dt, y0, y, update, kstack); + KokkosODE::Experimental::RungeKutta::Solve(ode, params, t, t + dt, y0, y, update, kstack, &count); for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { y_vecs(eqIdx, stepIdx + 1) = y(eqIdx); diff --git a/ode/src/KokkosODE_RungeKutta.hpp b/ode/src/KokkosODE_RungeKutta.hpp index 2d298a6568..39c1800b6c 100644 --- a/ode/src/KokkosODE_RungeKutta.hpp +++ b/ode/src/KokkosODE_RungeKutta.hpp @@ -38,7 +38,8 @@ enum RK_type : int { RK4 = 4, ///< Runge-Kutta classic order 4 method RKF45 = 5, ///< Fehlberg order 5 method RKCK = 6, ///< Cash-Karp method - RKDP = 7 ///< Dormand-Prince method + RKDP = 7, ///< Dormand-Prince method + VER56 = 8 ///< Verner order 6 method }; template @@ -86,6 +87,11 @@ struct RK_Tableau_helper { using table_type = KokkosODE::Impl::ButcherTableau<4, 6>; }; +template <> +struct RK_Tableau_helper { + using table_type = KokkosODE::Impl::ButcherTableau<5, 7>; +}; + /// \brief Unspecialized version of the RungeKutta solvers /// /// \tparam RK_type an RK_type enum value used to specify @@ -128,9 +134,10 @@ struct RungeKutta { template KOKKOS_FUNCTION static ode_solver_status Solve(const ode_type& ode, const KokkosODE::Experimental::ODE_params& params, const scalar_type t_start, const scalar_type t_end, const vec_type& y0, - const vec_type& y, const vec_type& temp, const mv_type& k_vecs) { + const vec_type& y, const vec_type& temp, const mv_type& k_vecs, + int* const count) { table_type table; - return KokkosODE::Impl::RKSolve(ode, table, params, t_start, t_end, y0, y, temp, k_vecs); + return KokkosODE::Impl::RKSolve(ode, table, params, t_start, t_end, y0, y, temp, k_vecs, count); } }; diff --git a/ode/src/KokkosODE_Types.hpp 
b/ode/src/KokkosODE_Types.hpp index 2145afb718..e28bcb82f0 100644 --- a/ode/src/KokkosODE_Types.hpp +++ b/ode/src/KokkosODE_Types.hpp @@ -27,12 +27,21 @@ struct ODE_params { int num_steps, max_steps; double abs_tol, rel_tol, min_step_size; + KOKKOS_FUNCTION + ODE_params() + : adaptivity(true), num_steps(100), max_steps(10000), abs_tol(1e-12), rel_tol(1e-6), min_step_size(1e-9) {} + // Constructor that only specify the desired number of steps. // In this case no adaptivity is provided, the time step will // be constant such that dt = (tend - tstart) / num_steps; KOKKOS_FUNCTION ODE_params(const int num_steps_) - : adaptivity(false), num_steps(num_steps_), max_steps(num_steps_), abs_tol(0), rel_tol(0), min_step_size(0) {} + : adaptivity(false), + num_steps(num_steps_), + max_steps(num_steps_ + 1), + abs_tol(1e-12), + rel_tol(1e-6), + min_step_size(0) {} /// ODE_parms construtor for adaptive time stepping. KOKKOS_FUNCTION diff --git a/ode/unit_test/Test_ODE.hpp b/ode/unit_test/Test_ODE.hpp index 1b55171581..f30ff39bd2 100644 --- a/ode/unit_test/Test_ODE.hpp +++ b/ode/unit_test/Test_ODE.hpp @@ -19,6 +19,7 @@ // Explicit integrators #include "Test_ODE_RK.hpp" #include "Test_ODE_RK_chem.hpp" +#include "Test_ODE_RK_counts.hpp" // Implicit integrators #include "Test_ODE_Newton.hpp" diff --git a/ode/unit_test/Test_ODE_BDF.hpp b/ode/unit_test/Test_ODE_BDF.hpp index 8f8319cb1d..dfee9b62a0 100644 --- a/ode/unit_test/Test_ODE_BDF.hpp +++ b/ode/unit_test/Test_ODE_BDF.hpp @@ -545,7 +545,7 @@ void update_D(const int order, const scalar_type factor, const mat_type& coeffs, template void test_Nordsieck() { using execution_space = Kokkos::HostSpace; - StiffChemistry mySys{}; + [[maybe_unused]] StiffChemistry mySys{}; Kokkos::View R("coeffs", 6, 6), U("coeffs", 6, 6); Kokkos::View D("D", 8, mySys.neqs), tempD("tmp", 8, mySys.neqs); diff --git a/ode/unit_test/Test_ODE_RK.hpp b/ode/unit_test/Test_ODE_RK.hpp index 90bec0e184..6d6f7877fe 100644 --- a/ode/unit_test/Test_ODE_RK.hpp +++ 
b/ode/unit_test/Test_ODE_RK.hpp @@ -21,12 +21,14 @@ namespace Test { -// damped harmonic undriven oscillator +// damped undriven harmonic oscillator // m y'' + c y' + k y = 0 -// solution: y=A * exp(-xi * omega_0 * t) * sin(sqrt(1-xi^2) * omega_0 * t + -// phi) omega_0 = sqrt(k/m); xi = c / sqrt(4*m*k) A and phi depend on y(0) and -// y'(0); Change of variables: x(t) = y(t)*exp(-c/(2m)*t) = y(t)*exp(-xi * -// omega_0 * t) Change of variables: X = [x ] +// solution: y=A * exp(-xi * omega_0 * t) * sin(sqrt(1-xi^2) * omega_0 * t + phi) +// omega_0 = sqrt(k/m) +// xi = c / sqrt(4*m*k) +// A and phi depend on y(0) and y'(0); +// Change of variables: x(t) = y(t)*exp(-c/(2m)*t) = y(t)*exp(-xi * omega_0 * t) +// Change of variables: X = [x ] // [x'] // Leads to X' = A*X with A = [ 0 1] // [-d 0] @@ -74,7 +76,8 @@ struct solution_wrapper { void operator()(const int /*idx*/) const { ode.solution(t, y_old, y_ref); } }; -template +template struct RKSolve_wrapper { using ode_params = KokkosODE::Experimental::ODE_params; @@ -84,10 +87,11 @@ struct RKSolve_wrapper { int max_steps; vec_type y_old, y_new, tmp; mv_type kstack; + count_type count; RKSolve_wrapper(const ode_type& my_ode_, const ode_params& params_, const scalar_type tstart_, const scalar_type tend_, const vec_type& y_old_, const vec_type& y_new_, const vec_type& tmp_, - const mv_type& kstack_) + const mv_type& kstack_, const count_type& count_) : my_ode(my_ode_), params(params_), tstart(tstart_), @@ -95,11 +99,13 @@ struct RKSolve_wrapper { y_old(y_old_), y_new(y_new_), tmp(tmp_), - kstack(kstack_) {} + kstack(kstack_), + count(count_) {} KOKKOS_FUNCTION void operator()(const int /*idx*/) const { - KokkosODE::Experimental::RungeKutta::Solve(my_ode, params, tstart, tend, y_old, y_new, tmp, kstack); + KokkosODE::Experimental::RungeKutta::Solve(my_ode, params, tstart, tend, y_old, y_new, tmp, kstack, + count.data()); } }; @@ -110,14 +116,16 @@ void test_method(const std::string label, ode_type& my_ode, const scalar_type& 
t const Kokkos::View& sol, typename vec_type::HostMirror y_ref_h) { using execution_space = typename vec_type::execution_space; using solver_type = KokkosODE::Experimental::RungeKutta; + using count_type = Kokkos::View; KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", my_ode.neqs); mv_type kstack("k stack", solver_type::num_stages(), my_ode.neqs); + count_type count("time step count", 1); Kokkos::RangePolicy my_policy(0, 1); - RKSolve_wrapper solve_wrapper(my_ode, params, tstart, tend, y_old, - y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper( + my_ode, params, tstart, tend, y_old, y_new, tmp, kstack, count); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror_view(y_new); @@ -284,7 +292,9 @@ void test_rate(ode_type& my_ode, const scalar_type& tstart, const scalar_type& t typename vec_type::HostMirror& y_ref_h, typename vec_type::HostMirror& error) { using execution_space = typename vec_type::execution_space; using solver_type = KokkosODE::Experimental::RungeKutta; + using count_type = Kokkos::View; + count_type count("time step count", 1); vec_type tmp("tmp vector", my_ode.neqs); mv_type kstack("k stack", solver_type::num_stages(), my_ode.neqs); @@ -295,10 +305,11 @@ void test_rate(ode_type& my_ode, const scalar_type& tstart, const scalar_type& t Kokkos::RangePolicy my_policy(0, 1); for (int idx = 0; idx < num_steps.extent_int(0); ++idx) { KokkosODE::Experimental::ODE_params params(num_steps(idx)); + params.adaptivity = false; Kokkos::deep_copy(y_old, y_old_h); Kokkos::deep_copy(y_new, y_old_h); - RKSolve_wrapper solve_wrapper(my_ode, params, tstart, tend, - y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper( + my_ode, params, tstart, tend, y_old, y_new, tmp, kstack, count); Kokkos::parallel_for(my_policy, solve_wrapper); Kokkos::deep_copy(y_new_h, y_new); @@ -306,8 +317,8 @@ void test_rate(ode_type& my_ode, const scalar_type& tstart, const scalar_type& t #if 
defined(HAVE_KOKKOSKERNELS_DEBUG) scalar_type dt = (tend - tstart) / num_steps(idx); - std::cout << "dt=" << dt << ", error=" << error(idx) << ", solution: {" << y_new_h(0) << ", " << y_new_h(1) << "}" - << std::endl; + std::cout << "count=" << count(0) << ", dt=" << dt << ", error=" << error(idx) << ", solution: {" << y_new_h(0) + << ", " << y_new_h(1) << "}" << std::endl; #endif } @@ -424,9 +435,11 @@ void test_adaptivity() { using RK_type = KokkosODE::Experimental::RK_type; using vec_type = Kokkos::View; using mv_type = Kokkos::View; + using count_type = Kokkos::View; duho my_oscillator(1, 1, 4); const int neqs = my_oscillator.neqs; + count_type count("time step count", 1); vec_type y("solution", neqs), f("function", neqs); auto y_h = Kokkos::create_mirror(y); @@ -471,8 +484,8 @@ void test_adaptivity() { KokkosODE::Experimental::ODE_params params(numSteps, maxSteps, absTol, relTol, minStepSize); Kokkos::deep_copy(y_old, y_old_h); Kokkos::deep_copy(y_new, y_old_h); - RKSolve_wrapper solve_wrapper(my_oscillator, params, tstart, tend, - y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper( + my_oscillator, params, tstart, tend, y_old, y_new, tmp, kstack, count); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror(y_new); diff --git a/ode/unit_test/Test_ODE_RK_chem.hpp b/ode/unit_test/Test_ODE_RK_chem.hpp index 690e271c84..aabdcbb490 100644 --- a/ode/unit_test/Test_ODE_RK_chem.hpp +++ b/ode/unit_test/Test_ODE_RK_chem.hpp @@ -92,6 +92,7 @@ void test_chem() { using mv_type = Kokkos::View; using RK_type = KokkosODE::Experimental::RK_type; using solver_type = KokkosODE::Experimental::RungeKutta; + using count_type = Kokkos::View; { chem_model_1 chem_model; @@ -101,6 +102,7 @@ void test_chem() { KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", neqs); mv_type kstack("k stack", solver_type::num_stages(), neqs); + count_type count("time steps count", 1); // Set initial conditions vec_type 
y_new("solution", neqs); @@ -112,8 +114,8 @@ void test_chem() { Kokkos::deep_copy(y_new, y_old_h); Kokkos::RangePolicy my_policy(0, 1); - RKSolve_wrapper solve_wrapper( - chem_model, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper( + chem_model, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, kstack, count); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror(y_new); @@ -137,6 +139,7 @@ void test_chem() { KokkosODE::Experimental::ODE_params params(num_steps); vec_type tmp("tmp vector", neqs); mv_type kstack("k stack", solver_type::num_stages(), neqs); + count_type count("time steps count", 1); // Set initial conditions vec_type y_new("solution", neqs); @@ -153,8 +156,8 @@ void test_chem() { Kokkos::deep_copy(y_new, y_old_h); Kokkos::RangePolicy my_policy(0, 1); - RKSolve_wrapper solve_wrapper( - chem_model, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, kstack); + RKSolve_wrapper solve_wrapper( + chem_model, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, kstack, count); Kokkos::parallel_for(my_policy, solve_wrapper); auto y_new_h = Kokkos::create_mirror(y_new); diff --git a/ode/unit_test/Test_ODE_RK_counts.hpp b/ode/unit_test/Test_ODE_RK_counts.hpp new file mode 100644 index 0000000000..f76fcb0134 --- /dev/null +++ b/ode/unit_test/Test_ODE_RK_counts.hpp @@ -0,0 +1,162 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include "KokkosKernels_TestUtils.hpp" + +#include "KokkosODE_RungeKutta.hpp" +#include "Test_ODE_TestProblems.hpp" + +namespace Test { + +std::string RK_type_to_name(const KokkosODE::Experimental::RK_type RK) { + std::string name; + + switch (RK) { + case KokkosODE::Experimental::RK_type::RKFE: name = "Forward-Euler"; break; + case KokkosODE::Experimental::RK_type::RKEH: name = "Euler-Heun"; break; + case KokkosODE::Experimental::RK_type::RKF12: name = "Fehlberg 1-2"; break; + case KokkosODE::Experimental::RK_type::RKBS: name = "Bogacki-Shampine"; break; + case KokkosODE::Experimental::RK_type::RK4: name = "Classic RK order 4"; break; + case KokkosODE::Experimental::RK_type::RKF45: name = "Fehlberg 4-5"; break; + case KokkosODE::Experimental::RK_type::RKCK: name = "Cash-Karp"; break; + case KokkosODE::Experimental::RK_type::RKDP: name = "Dormand-Prince"; break; + default: name = "Unknown Runge-Kutta method"; + } + + return name; +} + +template +void RK_Count(const Device, const OdeType myODE, const double relTol, const double absTol, + const int /*expected_count*/) { + using execution_space = typename Device::execution_space; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + using count_type = Kokkos::View; + + constexpr int neqs = myODE.neqs; + + constexpr double tstart = myODE.tstart(), tend = myODE.tend(); + constexpr int num_steps = myODE.numsteps(); + constexpr int maxSteps = 1e6; + constexpr double minStepSize = (tend - tstart) / (100 * maxSteps); + KokkosODE::Experimental::ODE_params params( + num_steps, maxSteps, 1.0e-12, (RK == KokkosODE::Experimental::RK_type::RKF12) ? 
1.0e-8 : 1.0e-6, minStepSize); + + vec_type y("solution", neqs), f("function", neqs); + vec_type y_new("y new", neqs), y_old("y old", neqs); + count_type count("time step count", 1); + + auto y_h = Kokkos::create_mirror_view(y); + typename vec_type::HostMirror y_old_h = Kokkos::create_mirror(y_old); + auto y_ref_h = Kokkos::create_mirror(y); + for (int dofIdx = 0; dofIdx < neqs; ++dofIdx) { + y_h(dofIdx) = myODE.expected_val(tstart, dofIdx); + y_old_h(dofIdx) = y_h(dofIdx); + y_ref_h(dofIdx) = myODE.expected_val(tend, dofIdx); + } + Kokkos::deep_copy(y, y_h); + + vec_type tmp("tmp vector", neqs); + mv_type kstack("k stack", KokkosODE::Experimental::RungeKutta::num_stages(), neqs); + + Kokkos::RangePolicy my_policy(0, 1); + Kokkos::deep_copy(y_old, y_old_h); + Kokkos::deep_copy(y_new, y_old_h); + RKSolve_wrapper solve_wrapper(myODE, params, tstart, tend, y_old, + y_new, tmp, kstack, count); + Kokkos::parallel_for(my_policy, solve_wrapper); + + auto y_new_h = Kokkos::create_mirror(y_new); + Kokkos::deep_copy(y_new_h, y_new); + + typename count_type::HostMirror count_h = Kokkos::create_mirror_view(count); + Kokkos::deep_copy(count_h, count); + + double error = 0.0; + for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { + error += Kokkos::pow(y_ref_h(eqIdx) - y_new_h(eqIdx), 2.0) / + Kokkos::pow(absTol + relTol * Kokkos::abs(y_new_h(eqIdx)), 2.0); + } + error = Kokkos::sqrt(error / neqs); + + std::string msg = std::string(OdeType::name) + ", " + RK_type_to_name(RK); + EXPECT_LE(error, 1.0) << msg.c_str(); + // EXPECT_LE(count_h(0), expected_count); +} // RK_Count + +} // namespace Test + +template +void test_RK_count() { + // RK_Count (Device, OdeType, relTol, absTol, /*expected_count*/) + Test::RK_Count(TestDevice(), TestProblem::DegreeOnePoly(), 1.0e-6, 1e-12, 2); + Test::RK_Count(TestDevice(), TestProblem::DegreeTwoPoly(), 1.0e-6, 1e-12, 2); + Test::RK_Count(TestDevice(), TestProblem::DegreeThreePoly(), 1.0e-6, 1e-12, 2); + Test::RK_Count(TestDevice(), 
TestProblem::DegreeFivePoly(), 1.0e-6, 1e-12, 5); + Test::RK_Count(TestDevice(), TestProblem::Exponential(0.7), 2.0e-6, 1e-12, 4); + Test::RK_Count(TestDevice(), TestProblem::SpringMassDamper(1001., 1000.), 1.0e-4, 0.0, 272); + Test::RK_Count(TestDevice(), TestProblem::CosExp(-10., 2., 1.), 5.3e-5, 0.0, 25); + if constexpr (RK == KokkosODE::Experimental::RK_type::RKF12) { + Test::RK_Count(TestDevice(), TestProblem::StiffChemicalDecayProcess(1e4, 1.), 4e-9, 1e-9, 2786); + } else { + Test::RK_Count(TestDevice(), TestProblem::StiffChemicalDecayProcess(1e4, 1.), 4e-9, 1.8e-10, 2786); + } + Test::RK_Count(TestDevice(), TestProblem::Tracer(10.0), 0.0, 1e-3, 10); + Test::RK_Count(TestDevice(), TestProblem::EnrightB5(), 1.3e-2, 0.0, 90); + if constexpr (RK == KokkosODE::Experimental::RK_type::RKF12) { + Test::RK_Count(TestDevice(), TestProblem::EnrightC1(), 1.e-4, 1e-14, 90); + } else { + Test::RK_Count(TestDevice(), TestProblem::EnrightC1(), 1.e-5, 1e-14, 90); + } + if constexpr (RK == KokkosODE::Experimental::RK_type::RKF12) { + Test::RK_Count(TestDevice(), TestProblem::EnrightC5(), 1.e-4, 1e-14, 97); + } else { + Test::RK_Count(TestDevice(), TestProblem::EnrightC5(), 1.e-5, 1e-14, 97); + } + if constexpr (RK == KokkosODE::Experimental::RK_type::RKF12) { + Test::RK_Count(TestDevice(), TestProblem::EnrightD2(), 2.e-4, 0.0, 590); + } else { + Test::RK_Count(TestDevice(), TestProblem::EnrightD2(), 1.e-5, 0.0, 590); + } + Test::RK_Count(TestDevice(), TestProblem::EnrightD4(), 1.e-5, 1.e-9, 932); +#if defined(KOKKOS_ENABLE_SYCL) + if constexpr ((RK != KokkosODE::Experimental::RK_type::RKF12) && + !std::is_same_v) { +#else + if constexpr (RK != KokkosODE::Experimental::RK_type::RKF12) { +#endif + Test::RK_Count(TestDevice(), TestProblem::KKStiffChemistry(), 1e-5, 0.0, 1); + } +} + +void test_count() { + using RK_type = KokkosODE::Experimental::RK_type; + + // test_RK_count(); + test_RK_count(); + test_RK_count(); + test_RK_count(); + test_RK_count(); + test_RK_count(); + 
test_RK_count(); + // test_RK_count(); +} + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, RK_Count) { test_count(); } +#endif diff --git a/ode/unit_test/Test_ODE_TestProblems.hpp b/ode/unit_test/Test_ODE_TestProblems.hpp new file mode 100644 index 0000000000..2b66ec682d --- /dev/null +++ b/ode/unit_test/Test_ODE_TestProblems.hpp @@ -0,0 +1,649 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef TEST_ODE_TESTPROBLEMS_HPP +#define TEST_ODE_TESTPROBLEMS_HPP + +namespace TestProblem { + +struct DegreeOnePoly { + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& /*y*/, View2& dydt) const { + for (int dofIdx = 0; dofIdx < neqs; ++dofIdx) { + dydt(dofIdx) = 1; + } + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0; + } + } + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 1.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int /*n*/) const { return t + 1.0; } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 1; + static constexpr char name[] = "DegreeOnePoly"; +}; + +struct DegreeTwoPoly { + template + KOKKOS_FUNCTION void 
evaluate_function(double t, double /*dt*/, View1& /*y*/, View2& dydt) const { + for (int dofIdx = 0; dofIdx < neqs; ++dofIdx) { + dydt(dofIdx) = t + 1; + } + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0; + } + } + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 1.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int /*n*/) const { return 0.5 * t * t + t + 1.0; } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 1; + static constexpr char name[] = "DegreeTwoPoly"; +}; + +struct DegreeThreePoly { + template + KOKKOS_FUNCTION void evaluate_function(double t, double /*dt*/, View1& /*y*/, View2& dydt) const { + for (int dofIdx = 0; dofIdx < neqs; ++dofIdx) { + dydt(dofIdx) = (t * t) + t + 1; + } + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0; + } + } + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 1.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int /*n*/) const { + return (1. / 3) * (t * t * t) + (1. 
/ 2) * (t * t) + t + 1; + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 1; + static constexpr char name[] = "DegreeThreePoly"; +}; + +struct DegreeFivePoly { + template + KOKKOS_FUNCTION void evaluate_function(double t, double /*dt*/, View1& /*y*/, View2& dydt) const { + for (int dofIdx = 0; dofIdx < neqs; ++dofIdx) { + dydt(dofIdx) = (t * t * t * t) + (t * t * t) + (t * t) + t + 1; + } + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0; + } + } + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 1.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int /*n*/) const { + return (1. / 5) * (t * t * t * t * t) + (1. / 4) * (t * t * t * t) + (1. / 3) * (t * t * t) + (1. 
/ 2) * (t * t) + + t + 1; + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 1; + static constexpr char name[] = "DegreeFivePoly"; +}; + +struct Exponential { + Exponential(double rate_) : rate(rate_) {} + + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + for (int dofIdx = 0; dofIdx < neqs; ++dofIdx) { + dydt(dofIdx) = rate * y(dofIdx); + } + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0; + } + } + + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + jac(rowIdx, rowIdx) = rate; + } + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 1.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int /*n*/) const { return Kokkos::exp(rate * t); } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 1; + const double rate; + static constexpr char name[] = "Exponential"; +}; + +struct SpringMassDamper { + SpringMassDamper(double c_, double k_) + : c(c_), + k(k_), + lambda1((-c + Kokkos::pow(c * c - 4. * k, 0.5)) / 2.), + lambda2((-c - Kokkos::pow(c * c - 4. * k, 0.5)) / 2.) 
{} + + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt[0] = y[1]; + dydt[1] = -k * y[0] - c * y[1]; + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + jac(0, 0) = 0.; + jac(0, 1) = 1.; + jac(1, 0) = -k; + jac(1, 1) = -c; + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 1.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + using Kokkos::exp; + + const double det = lambda1 - lambda2; + double val = 0; + + if (n == 0) { + val = -(lambda2 / det) * exp(lambda1 * t) + (lambda1 / det) * exp(lambda2 * t); + } else { + val = -(lambda2 * lambda1 / det) * exp(lambda1 * t) + (lambda1 * lambda2 / det) * exp(lambda2 * t); + } + + return val; + } + + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + + static constexpr int neqs = 2; + const double c; + const double k; + const double lambda1; + const double lambda2; + static constexpr char name[] = "SpringMassDamper"; +}; + +// Example 8.1 from Leveque + +struct CosExp { + CosExp(double lambda_, double t0_, double eta_) : lambda(lambda_), t0(t0_), eta(eta_) {} + + template + KOKKOS_FUNCTION void evaluate_function(double t, double /*dt*/, View1& y, View2& dydt) const { + for (int i = 0; i < neqs; i++) { + dydt(i) = lambda * (y(i) - Kokkos::cos(t)) - Kokkos::sin(t); + } + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + jac(0, 0) = 0.0; + + for (int i = 0; i < neqs; ++i) { + jac(i, i) = lambda; + } + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 10.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double 
expected_val(const double t, const int /*n*/) const { + return Kokkos::exp(lambda * (t - t0)) * (eta - Kokkos::cos(t0)) + Kokkos::cos(t); + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + + static constexpr int neqs = 1; + const double lambda; + const double t0; + const double eta; + static constexpr char name[] = "CosExp"; +}; + +// Example 7.9 in LeVeque + +struct StiffChemicalDecayProcess { + StiffChemicalDecayProcess(double K1_, double K2_) : K1(K1_), K2(K2_) {} + + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt[0] = -K1 * y[0]; + dydt[1] = K1 * y[0] - K2 * y[1]; + dydt[2] = K2 * y[1]; + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + jac(0, 0) = -K1; + jac(0, 1) = 0.; + jac(0, 2) = 0.; + + jac(1, 0) = K1; + jac(1, 1) = -K2; + jac(1, 2) = 0.; + + jac(2, 0) = 0.; + jac(2, 1) = K2; + jac(2, 2) = 0.; + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 0.2; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + using Kokkos::exp; + + const double C21 = y1_init * K1 / (K2 - K1); + const double C22 = y2_init - C21; + + double val = 0.0; + + if (n == 0) { + val = y1_init * exp(-K1 * t); + } else if (n == 1) { + val = C21 * exp(-K1 * t) + C22 * exp(-K2 * t); + } else { + const double C31 = -K2 * C21 / K1; + const double C32 = -C22; + const double C33 = y1_init + y2_init + y3_init; + + val = C31 * exp(-K1 * t) + C32 * exp(-K2 * t) + C33; + } + + return val; + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + + static constexpr int neqs = 3; + const double y1_init = 3.0; + const double y2_init = 4.0; + const double y3_init = 2.0; + const double K1; + const double K2; + static constexpr char name[] = 
"StiffChemicalDecay"; +}; + +struct Tracer { + Tracer(double rate_) : rate(rate_) {} + + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + for (int i = 0; i < neqs; i += 2) { + // const double R = Kokkos::sqrt(y[i] * y[i] + y[i + 1] * y[i + 1]); + // dydt[i] = -rate * y[i + 1] / R; + // dydt[i + 1] = rate * y[i] / R; + dydt[i] = -rate * y[i + 1]; + dydt[i + 1] = rate * y[i]; + } + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 2.0 * pi; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + const double theta = rate * t; + double val = 0.0; + + if (n % 2 == 0) { + val = Kokkos::cos(theta); + } else { + val = Kokkos::sin(theta); + } + return val; + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + + static constexpr int neqs = 2; + static constexpr double pi = 3.14159265358979323846; + const double rate; + static constexpr char name[] = "Tracer"; +}; + +struct EnrightB5 { + EnrightB5(double alpha_ = 100.0) : alpha(alpha_) {} + + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt[0] = -10. * y[0] + alpha * y[1]; + dydt[1] = -alpha * y[0] - 10. * y[1]; + dydt[2] = -4. 
* y[2]; + dydt[3] = -y[3]; + dydt[4] = -0.5 * y[4]; + dydt[5] = -0.1 * y[5]; + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& /*y*/, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0.0; + } + } + + jac(0, 0) = -10.; + jac(0, 1) = alpha; + jac(1, 0) = -alpha; + jac(1, 1) = -10.; + jac(2, 2) = -4.; + jac(3, 3) = -1.; + jac(4, 4) = -0.5; + jac(5, 5) = -0.1; + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 0.2; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + using Kokkos::cos; + using Kokkos::exp; + using Kokkos::sin; + + double val = 0.0; + + const double c1 = 1.0; + const double c2 = -1.0; + + const double a[2] = {0, 1}; + const double b[2] = {-1, 0}; + + if (n < 2) { + val = exp(-10. * t) * (c1 * (a[n] * cos(alpha * t) - b[n] * sin(alpha * t)) + + c2 * (a[n] * sin(alpha * t) + b[n] * cos(alpha * t))); + } else if (n == 2) { + val = exp(-4. * t); + } else if (n == 3) { + val = exp(-t); + } else if (n == 4) { + val = exp(-0.5 * t); + } else { + val = exp(-0.1 * t); + } + + return val; + } + + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 6; + const double alpha; + static constexpr char name[] = "EnrightB5"; +}; // EnrightB5 + +struct EnrightC1 { + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt[0] = -y[0] + y[1] * y[1] + y[2] * y[2] + y[3] * y[3]; + dydt[1] = -10. * y[1] + 10. * (y[2] * y[2] + y[3] * y[3]); + dydt[2] = -40. * y[2] + 40. 
* y[3] * y[3]; + dydt[3] = -100.0 * y[3] + 2.; + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& y, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0.0; + } + } + + jac(0, 0) = -1.; + jac(0, 1) = 2. * y[1]; + jac(0, 2) = 2. * y[2]; + jac(0, 3) = 2. * y[3]; + + jac(1, 1) = -10.; + jac(1, 2) = 20. * y[2]; + jac(1, 3) = 20. * y[3]; + + jac(2, 2) = -40.; + jac(2, 3) = 80. * y[3]; + + jac(3, 3) = -100.; + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 20.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + if (t == 0) { + return 1.; + } else { + // cvode sol + constexpr Kokkos::Array y = {4.003235e-04, 4.001600e-04, 4.000000e-04, 2.000000e-02}; + return y[n]; + } + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 4; + static constexpr char name[] = "EnrightC1"; +}; + +struct EnrightC5 { + EnrightC5(const double beta_ = 20.0) : beta(beta_) {} + + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt[0] = -y[0] + 2.; + dydt[1] = -10. * y[1] + beta * y[0] * y[0]; + dydt[2] = -40. * y[2] + 4. * beta * (y[0] * y[0] + y[1] * y[1]); + dydt[3] = -100.0 * y[3] + 10. * beta * (y[0] * y[0] + y[1] * y[1] + y[2] * y[2]); + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& y, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0.0; + } + } + + jac(0, 0) = -1.; + + jac(1, 0) = 2 * beta * y[0]; + jac(1, 1) = -10.; + + jac(2, 0) = 8. * beta * y[0]; + jac(2, 1) = 8. * beta * y[1]; + jac(2, 2) = -40.; + + jac(3, 0) = 20. 
* beta * y[0]; + jac(3, 1) = 20. * beta * y[1]; + jac(3, 2) = 20. * beta * y[2]; + jac(3, 3) = -100.; + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 20.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + if (t == 0) { + return 1.; + } else { + // cvode sol + constexpr Kokkos::Array y = {2.000000e+00, 8.000000e+00, 1.360000e+02, 3.712800e+04}; + return y[n]; + } + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 4; + const double beta; + static constexpr char name[] = "EnrightC5"; +}; + +struct EnrightD2 { + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt[0] = -0.04 * y[0] + 0.01 * y[1] * y[2]; + dydt[1] = 400.0 * y[0] - 100.0 * y[1] * y[2] - 3000. * y[1] * y[1]; + dydt[2] = 30. * y[1] * y[1]; + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& y, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0.0; + } + } + + jac(0, 0) = -0.04; + jac(0, 1) = 0.01 * y[2]; + jac(0, 2) = 0.01 * y[1]; + + jac(1, 0) = 400.; + jac(1, 1) = -100. * y[2] - 6000. * y[1]; + jac(1, 2) = -100. * y[1]; + + jac(2, 1) = 60. * y[1]; + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 40.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 100; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + if (t == 0.) 
{ + constexpr Kokkos::Array y = {1., 0., 0.}; + return y[n]; + } else { + // cvode solution + constexpr Kokkos::Array y = {7.158278e-01, 9.185559e-02, 2.841630e+01}; + return y[n]; + } + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 3; + static constexpr char name[] = "EnrightD2"; +}; + +struct EnrightD4 { + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt[0] = -0.013 * y[0] - 1000. * y[0] * y[2]; + dydt[1] = -2500. * y[1] * y[2]; + dydt[2] = dydt[0] + dydt[1]; + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& y, View2& jac) const { + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = 0.0; + } + } + + jac(0, 0) = -0.013 - 1000. * y[2]; + jac(0, 2) = -1000. * y[0]; + + jac(1, 1) = -2500. * y[2]; + jac(1, 2) = -2500. * y[1]; + + jac(2, 0) = jac(0, 0); + jac(2, 1) = jac(1, 1); + jac(2, 2) = jac(0, 2) + jac(1, 2); + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 50.0; } // 50.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 10; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + if (t == 0.) 
{ + constexpr Kokkos::Array y = {1., 1., 0}; + return y[n]; + } else { + // cvode solution at tend + constexpr Kokkos::Array y = {5.976506e-01, 1.402347e+00, -1.893371e-06}; + return y[n]; + } + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 3; + static constexpr char name[] = "EnrightD4"; +}; + +// Robertson Autocatalytic reaction +struct KKStiffChemistry { + template + KOKKOS_FUNCTION void evaluate_function(double /*t*/, double /*dt*/, View1& y, View2& dydt) const { + dydt(0) = -0.04 * y(0) + 1.e4 * y(1) * y(2); + dydt(1) = 0.04 * y(0) - 1.e4 * y(1) * y(2) - 3.e7 * y(1) * y(1); + dydt(2) = 3.e7 * y(1) * y(1); + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(double /*t*/, double /*dt*/, View1& y, View2& jac) const { + jac(0, 0) = -0.04; + jac(0, 1) = 1.e4 * y(2); + jac(0, 2) = 1.e4 * y(1); + jac(1, 0) = 0.04; + jac(1, 1) = -1.e4 * y(2) - 3.e7 * 2.0 * y(1); + jac(1, 2) = -1.e4 * y(1); + jac(2, 0) = 0.0; + jac(2, 1) = 3.e7 * 2.0 * y(1); + jac(2, 2) = 0.0; + } + + KOKKOS_FUNCTION constexpr double tstart() const { return 0.0; } + KOKKOS_FUNCTION constexpr double tend() const { return 500.0; } + KOKKOS_FUNCTION constexpr int numsteps() const { return 1000; } + KOKKOS_FUNCTION double expected_val(const double t, const int n) const { + if (t == 0) { + return n == 0 ? 1. 
: 0.; + } else { + // cvode solution + constexpr Kokkos::Array y = {4.226713e-01, 2.885221e-06, 5.773258e-01}; + return y[n]; + } + } + KOKKOS_FUNCTION static constexpr int num_equations() { return neqs; } + static constexpr int neqs = 3; + static constexpr char name[] = "Robertson Autocatalytic"; +}; + +} // namespace TestProblem + +#endif // TEST_ODE_TESTPROBLEMS_HPP diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index 28271dfb0d..8d740966f0 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -11,21 +11,6 @@ if (KokkosKernels_ENABLE_PERFTESTS) #build correctly with or without MPI, but only run them with a single #MPI process. - SET(GTEST_SOURCE_DIR ${PACKAGE_SOURCE_DIR}/tpls/gtest) - - KOKKOSKERNELS_ADD_TEST_LIBRARY( - kokkoskernelsperf_gtest - HEADERS ${GTEST_SOURCE_DIR}/gtest/gtest.h - SOURCES ${GTEST_SOURCE_DIR}/gtest/gtest-all.cc - ) - #Disables pthreads, this is a problem for serial builds in Trilinos & Sierra if it's enabled. - - TARGET_COMPILE_DEFINITIONS(kokkoskernelsperf_gtest PUBLIC "-DGTEST_HAS_PTHREAD=0") - TARGET_INCLUDE_DIRECTORIES(kokkoskernelsperf_gtest PUBLIC $) - - #Gtest minimally requires C++ 11 - TARGET_COMPILE_FEATURES(kokkoskernelsperf_gtest PUBLIC cxx_std_11) - KOKKOSKERNELS_INCLUDE_DIRECTORIES(sparse) if(KokkosKernels_ENABLE_TESTS_AND_PERFSUITE) diff --git a/perf_test/KokkosKernels_perf_test_utilities.hpp b/perf_test/KokkosKernels_perf_test_utilities.hpp index ec767c68f7..6d11e021a9 100644 --- a/perf_test/KokkosKernels_perf_test_utilities.hpp +++ b/perf_test/KokkosKernels_perf_test_utilities.hpp @@ -20,7 +20,7 @@ #ifndef KOKKOSKERNELS_PERF_TEST_UTILITIES_HPP #define KOKKOSKERNELS_PERF_TEST_UTILITIES_HPP -#include "KokkosKernels_TestUtils.hpp" // for string_compare_no_case +#include "KokkosKernels_TestStringUtils.hpp" // for string_compare_no_case // Namepsace that defines common utilities // for performance tests diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp 
b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp index 23d53ba106..49c7a76210 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp @@ -14,10 +14,12 @@ // //@HEADER +#include + #include #include #include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" struct Params { int use_cuda = 0; diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp index d4e3754f14..9dcf5d6515 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test_benchmark.cpp @@ -85,7 +85,7 @@ static void run(benchmark::State& state) { using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; - std::cout << "Running BLAS Level 1 DOT perfomrance experiment (" << ExecSpace::name() << ")\n"; + std::cout << "Running BLAS Level 1 DOT performance experiment (" << ExecSpace::name() << ")\n"; std::cout << "Each test input vector has a length of " << m << std::endl; diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp index a383739aac..36f11733ce 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp @@ -14,12 +14,14 @@ // //@HEADER +#include + #include #include // For RPS implementation #include "KokkosBlas_dot_perf_test.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" struct Params { int use_cuda = 0; @@ -110,7 +112,7 @@ void run(int m, int repeat) { using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; - std::cout << "Running BLAS Level 1 DOT perfomrance experiment (" << ExecSpace::name() << ")\n"; + std::cout << "Running BLAS Level 1 DOT performance experiment (" << ExecSpace::name() << ")\n"; std::cout << "Each test input 
vector has a length of " << m << std::endl; diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp index be622ffaec..10d3aa82d7 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test_benchmark.cpp @@ -84,7 +84,7 @@ static void run(benchmark::State& state) { using MemSpace = typename ExecSpace::memory_space; using Device = Kokkos::Device; - std::cout << "Running BLAS Level 1 DOT perfomrance experiment (" << ExecSpace::name() << ")\n"; + std::cout << "Running BLAS Level 1 DOT performance experiment (" << ExecSpace::name() << ")\n"; std::cout << "Each test input vector has a length of " << m << std::endl; diff --git a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp index 16dadd34b6..ca30ed50ac 100644 --- a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp @@ -14,10 +14,12 @@ // //@HEADER +#include + #include #include #include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" struct Params { int use_cuda = 0; diff --git a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp index 776c291ad0..ab0750e5b6 100644 --- a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp +++ b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test_benchmark.cpp @@ -42,10 +42,12 @@ //@HEADER */ +#include + #include #include #include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp index 924b63ecee..90fb065741 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp @@ 
-16,7 +16,7 @@ #include "KokkosBlas2_gemv.hpp" #include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" struct Params { int use_cuda = 0; diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp index cdae173ce0..2ff22b7078 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test_benchmark.cpp @@ -19,7 +19,7 @@ #include "KokkosBlas2_gemv.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include diff --git a/perf_test/blas/blas2/KokkosBlas2_ger_perf_test_benchmark.cpp b/perf_test/blas/blas2/KokkosBlas2_ger_perf_test_benchmark.cpp index 8efd9920b1..2feeb2c2ff 100644 --- a/perf_test/blas/blas2/KokkosBlas2_ger_perf_test_benchmark.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_ger_perf_test_benchmark.cpp @@ -21,7 +21,7 @@ #include "KokkosBlas2_ger.hpp" #include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include diff --git a/perf_test/blas/blas3/CMakeLists.txt b/perf_test/blas/blas3/CMakeLists.txt index 80c9d25c1c..4dbf0ae5dd 100644 --- a/perf_test/blas/blas3/CMakeLists.txt +++ b/perf_test/blas/blas3/CMakeLists.txt @@ -4,7 +4,6 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) KOKKOSKERNELS_ADD_EXECUTABLE( KokkosBlas3_perf_test SOURCES KokkosBlas3_perf_test.cpp - TESTONLYLIBS kokkoskernelsperf_gtest ) KOKKOSKERNELS_ADD_EXECUTABLE( diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index 3f13dbf8d8..1916eb5c4a 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -42,14 +42,14 @@ /************************ blas routine structure definitions **********/ struct perf_test_trmm_args { 
std::string trmm_args; - default_scalar alpha; + KokkosKernels::default_scalar alpha; }; typedef struct perf_test_trmm_args pt_trmm_args_t; struct perf_test_gemm_args { std::string gemm_args; //[N,T,C][N,T,C] for transA and transB - default_scalar alpha; - default_scalar beta; + KokkosKernels::default_scalar alpha; + KokkosKernels::default_scalar beta; }; typedef struct perf_test_gemm_args pt_gemm_args_t; // ADD MORE BLAS3 ROUTINE ARG STRUCTS HERE. diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 6710091e60..75a3c4c004 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -25,11 +25,12 @@ #include +#include "Kokkos_ArithTraits.hpp" #include "KokkosBatched_HostLevel_Gemm.hpp" #include "KokkosBatched_Gemm_Decl.hpp" #include "KokkosBatched_Util.hpp" -#include "gtest/gtest.h" // EXPECT_NEAR -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" +#include "KokkosKernels_TestVanilla.hpp" #include @@ -107,19 +108,24 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { #define DEFAULT_GEMM_ALPHA 1.0 #define DEFAULT_GEMM_BETA 1.0 -using view_type_3d = Kokkos::View; -using view_type_4d = Kokkos::View; -using view_type_5d = Kokkos::View; +using view_type_3d = + Kokkos::View; +using view_type_4d = + Kokkos::View; +using view_type_5d = + Kokkos::View; // Construct the vector type -using memory_space = typename default_device::execution_space::memory_space; -constexpr int simd_vector_size = KokkosBatched::DefaultVectorLength::value; +using memory_space = typename KokkosKernels::default_device::execution_space::memory_space; +constexpr int simd_vector_size = KokkosBatched::DefaultVectorLength::value; constexpr int simd_internal_vector_size = - KokkosBatched::DefaultInternalVectorLength::value; -using vector_type = KokkosBatched::Vector, simd_vector_size>; -using internal_vector_type = 
KokkosBatched::Vector, simd_internal_vector_size>; -using vector_view_type_3d = Kokkos::View; -using internal_vector_view_type_4d = Kokkos::View; + KokkosBatched::DefaultInternalVectorLength::value; +using vector_type = KokkosBatched::Vector, simd_vector_size>; +using internal_vector_type = + KokkosBatched::Vector, simd_internal_vector_size>; +using vector_view_type_3d = Kokkos::View; +using internal_vector_view_type_4d = + Kokkos::View; struct batched_params { int team_size; @@ -163,7 +169,7 @@ typedef struct gemm_simd_args gemm_simd_args_t; * https://developer.arm.com/documentation/101004/2100/Interleave-batch-functions/armpl-dgemm-interleave-batch. */ struct gemm_armpl_args { - default_scalar *mat; + KokkosKernels::default_scalar *mat; armpl_int_t jstrd, istrd, bstrd; }; typedef struct gemm_armpl_args gemm_armpl_args_t; @@ -199,8 +205,8 @@ typedef struct gemm_armpl_args gemm_armpl_args_t; */ struct gemm_args { char transA, transB; - default_scalar alpha; - default_scalar beta; + KokkosKernels::default_scalar alpha; + KokkosKernels::default_scalar beta; view_type_3d A, B, C; batched_params_t bp; // Below are matrices for simd tests @@ -224,9 +230,10 @@ static std::string gemm_csv_header_str = // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf static inline double __gemm_flop_count(double a_m, double a_n, double b_n) { // TODO: if not Kokkos::complex. 
- if (std::is_same::value || std::is_same::value || - std::is_same::value || - std::is_same::value) + if (std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value) return 2 * a_m * b_n * a_n; else // For complex, we need to count 2 flops for each add and 6 flops for each @@ -285,9 +292,9 @@ static void __print_gemm_perf_test_options(options_t options) { printf("options.n = %d\n", options.n); printf("options.blas_args.gemm.gemm_args = %s\n", options.blas_args.gemm.gemm_args.c_str()); printf("options.out_file = %s\n", options.out_file.c_str()); - if (std::is_same::value) + if (std::is_same::value) printf("options.alpha = %lf\n", options.blas_args.gemm.alpha); - else if (std::is_same::value) + else if (std::is_same::value) printf("options.alpha = %f\n", options.blas_args.gemm.alpha); return; } @@ -1140,7 +1147,7 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { using scalar_type = typename view_type_3d::value_type; constexpr int vl = KokkosBatched::DefaultVectorLength::value; using simd_type = KokkosBatched::Vector, simd_vector_size>; - using simd_view_type = Kokkos::View; + using simd_view_type = Kokkos::View; using functor_type = parallel_batched_gemm_experiment5; uint32_t warm_up_n = options.warm_up_n; @@ -1229,9 +1236,9 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { KokkosBatched::DefaultVectorLength::value; constexpr int il = KokkosBatched::DefaultInternalVectorLength::value; - using view_type = Kokkos::View; - using vector_view_type = Kokkos::View; - using internal_vector_view_type = Kokkos::View; + using view_type = Kokkos::View; + using vector_view_type = Kokkos::View; + using internal_vector_view_type = Kokkos::View; using functor_type = parallel_batched_gemm_experiment6; @@ -1301,7 +1308,7 @@ void __do_gemm_armpl(options_t options, gemm_args_t gemm_args) { char transa = std::is_same::value ? 'N' : 'T'; char transb = std::is_same::value ? 
'N' : 'T'; - if (!std::is_same::value) FATAL_ERROR("only double scalars are supported!"); + if (!std::is_same::value) FATAL_ERROR("only double scalars are supported!"); STATUS; @@ -1362,7 +1369,8 @@ static inline bool __gemm_print_compare_failure(ViewType h_expected, ViewType h_ */ template static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) { - double epsilon = Test::epsilon::value * 1e3; + double epsilon = Kokkos::ArithTraits::eps() * 1e3; + STATUS; typename view_type_3d::HostMirror h_expected = Kokkos::create_mirror_view(expected); @@ -1418,14 +1426,16 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie vector_batch_size = src.ivec_4d.extent(0); simd_batch_size = src.ivec_4d.extent(3); last_batch = dst.extent(2); - if (std::is_same::value && remainder == 0) data_layout_same_as_3d_view = true; + if (std::is_same::value && remainder == 0) + data_layout_same_as_3d_view = true; } else { remainder = dst.extent(0) % simd_internal_vector_size; vector_batch_size = src.ivec_4d.extent(3); simd_batch_size = src.ivec_4d.extent(0); last_batch = dst.extent(0); - if (std::is_same::value && remainder == 0) data_layout_same_as_3d_view = true; + if (std::is_same::value && remainder == 0) + data_layout_same_as_3d_view = true; } // When the batch_size is a multiple of the simd_vector_size and the @@ -1448,7 +1458,7 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie remainder += simd_internal_vector_size; // Views needed for slow manual copy - using h_view_type_5d = Kokkos::View; + using h_view_type_5d = Kokkos::View; using h_subview_type_2d = Kokkos::View; using h_subview_type_3d = Kokkos::View; using h_subview_type_4d = Kokkos::View; @@ -1458,7 +1468,7 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie h_subview_type_2d h_sv2; // TODO: Clean everything below this point up... 
- if (std::is_same::value) + if (std::is_same::value) h_src_raw = h_view_type_5d((src_scalar_type *)h_src.data(), src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3), simd_internal_vector_size); else @@ -1468,7 +1478,7 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie // The below loops copies each corresponding 2-rank matrix within the simd // view back to the 3-rank view. for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) { - if (std::is_same::value) + if (std::is_same::value) h_sv0 = Kokkos::subview(h_src_raw, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), simd_internal_vec_idx); else @@ -1599,7 +1609,8 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo if (gemm_args.C.data() != nullptr) { #if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 if (options.test == EXPERIMENT) { - using view_type_2d = Kokkos::View; + using view_type_2d = + Kokkos::View; view_type_2d C; for (int ib = 0; ib < gemm_args.nbatch; ++ib) { for (int i = 0; i < gemm_args.ninter; ++i) { @@ -1703,7 +1714,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { // Use the non-simd 4-rank view type to randomly populate the gemm simd // arguments - using tmp_view_type_4d = Kokkos::View; + using tmp_view_type_4d = Kokkos::View; tmp_view_type_4d tmpA("tmpA", gemm_args.Av.mat_4d.extent(0), gemm_args.Av.mat_4d.extent(1), gemm_args.Av.mat_4d.extent(2), gemm_args.Av.mat_4d.extent(3)); Kokkos::fill_random(tmpA, rand_pool, Kokkos::rand, double>::max()); @@ -1729,7 +1740,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n); } - using tmp_view_type_3d = Kokkos::View; + using tmp_view_type_3d = Kokkos::View; tmp_view_type_3d tmpA("tmpA", gemm_args.A.extent(0), gemm_args.A.extent(1), gemm_args.A.extent(2)); Kokkos::fill_random(tmpA, 
rand_pool, Kokkos::rand, double>::max()); tmp_view_type_3d tmpB("tmpB", gemm_args.B.extent(0), gemm_args.B.extent(1), gemm_args.B.extent(2)); @@ -1776,11 +1787,14 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.B_pl.bstrd = bstrd_B; gemm_args.C_pl.bstrd = bstrd_C; - default_scalar *A_p = (default_scalar *)malloc(sizeof(default_scalar) * bstrd_A * nbatch); - default_scalar *B_p = (default_scalar *)malloc(sizeof(default_scalar) * bstrd_B * nbatch); - default_scalar *C_p = (default_scalar *)malloc(sizeof(default_scalar) * bstrd_C * nbatch); + KokkosKernels::default_scalar *A_p = + (KokkosKernels::default_scalar *)malloc(sizeof(KokkosKernels::default_scalar) * bstrd_A * nbatch); + KokkosKernels::default_scalar *B_p = + (KokkosKernels::default_scalar *)malloc(sizeof(KokkosKernels::default_scalar) * bstrd_B * nbatch); + KokkosKernels::default_scalar *C_p = + (KokkosKernels::default_scalar *)malloc(sizeof(KokkosKernels::default_scalar) * bstrd_C * nbatch); - using view_type_2d = Kokkos::View; + using view_type_2d = Kokkos::View; view_type_2d A, B, C; // Populate interleave-batch matrices @@ -1837,8 +1851,10 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, gemm_args_t)) STATUS; __print_gemm_perf_test_options(options); - std::cout << "SCALAR:" << typeid(default_scalar).name() << ", LAYOUT:" << typeid(default_layout).name() - << ", DEVICE:" << typeid(default_device).name() << ", SPACE:" << typeid(memory_space).name() << std::endl; + std::cout << "SCALAR:" << typeid(KokkosKernels::default_scalar).name() + << ", LAYOUT:" << typeid(KokkosKernels::default_layout).name() + << ", DEVICE:" << typeid(KokkosKernels::default_device).name() << ", SPACE:" << typeid(memory_space).name() + << std::endl; options.out[0] << gemm_csv_header_str << std::endl; @@ -1847,10 +1863,12 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, gemm_args_t)) cur_dims.b.n <= options.stop.b.n && cur_dims.c.m <= options.stop.c.m && 
cur_dims.c.n <= options.stop.c.n; cur_dims.a.m += options.step, cur_dims.a.n += options.step, cur_dims.b.m += options.step, cur_dims.b.n += options.step, cur_dims.c.m += options.step, cur_dims.c.n += options.step) { - gemm_args = __do_setup(options, cur_dims); + gemm_args = __do_setup(options, cur_dims); if (options.verify) { - __gemm_do_verify(options, gemm_args, fn); + __gemm_do_verify( + options, gemm_args, fn); } else { fn(options, gemm_args); } @@ -1867,21 +1885,25 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, gemm_args_t)) /*************************** External fns **************************/ void do_gemm_serial_blas(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_gemm_serial_blas); + __do_loop_and_invoke( + options, + __do_gemm_serial_blas); return; } void do_gemm_serial_batched(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_gemm_serial_batched); + __do_loop_and_invoke(options, + __do_gemm_serial_batched); return; } void do_gemm_serial_batched_blocked(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_gemm_serial_batched); + __do_loop_and_invoke(options, + __do_gemm_serial_batched); return; } @@ -1893,29 +1915,31 @@ void do_gemm_heuristic_batched_parallel(options_t options) { exit(-EINVAL); } - __do_loop_and_invoke(options, __do_gemm_parallel_batched_heuristic); + __do_loop_and_invoke(options, __do_gemm_parallel_batched_heuristic); return; } void do_gemm_serial_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, __do_gemm_parallel_batched); else - __do_loop_and_invoke(options, - __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); return; } void do_gemm_serial_batched_blocked_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, 
__do_gemm_parallel_batched); + __do_loop_and_invoke(options, __do_gemm_parallel_batched); else - __do_loop_and_invoke(options, - __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); return; } @@ -1925,11 +1949,13 @@ void do_gemm_serial_simd_batched_parallel(options_t options) { // SerialSimdTag options.use_simd = true; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke(options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, + __do_gemm_parallel_batched); else - __do_loop_and_invoke(options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, + __do_gemm_parallel_batched); return; } @@ -1939,35 +1965,38 @@ void do_gemm_serial_simd_batched_blocked_parallel(options_t options) { // SerialSimdTag options.use_simd = true; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke(options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, + __do_gemm_parallel_batched); else - __do_loop_and_invoke(options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, + __do_gemm_parallel_batched); return; } -#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ +#if defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) && defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) && \ defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) void do_gemm_serial_batched_compact_mkl_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, - __do_gemm_parallel_batched); + __do_loop_and_invoke(options, + __do_gemm_parallel_batched); else - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, __do_gemm_parallel_batched); return; } #else void do_gemm_serial_batched_compact_mkl_parallel(options_t) { STATUS; -#if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) - std::cerr << std::string(__func__) << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL__ 
is undefined." << std::endl; -#elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) +#if !defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL) + std::cerr << std::string(__func__) << " disabled since KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL is undefined." + << std::endl; +#elif !defined(KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED) std::cerr << std::string(__func__) - << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is " + << " disabled since KOKKOSBATCHED_IMPL_ENABLE_INTEL_MKL_BATCHED is " "undefined." << std::endl; #elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) @@ -1983,34 +2012,37 @@ void do_gemm_serial_batched_compact_mkl_parallel(options_t) { void do_gemm_team_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, __do_gemm_parallel_batched); else - __do_loop_and_invoke(options, - __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); return; } void do_gemm_team_batched_blocked_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + __do_loop_and_invoke(options, __do_gemm_parallel_batched); else - __do_loop_and_invoke(options, - __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); return; } void do_gemm_team_vector_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, - __do_gemm_parallel_batched); + __do_loop_and_invoke(options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + options, + __do_gemm_parallel_batched); return; } @@ -2019,10 +2051,10 @@ void do_gemm_team_simd_batched_parallel(options_t options) { options.use_simd = true; if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke(options, 
__do_gemm_parallel_batched); + KokkosKernels::default_device, KokkosBatched::Mode::Team>); else __do_loop_and_invoke(options, __do_gemm_parallel_batched); + KokkosKernels::default_device, KokkosBatched::Mode::Team>); return; } @@ -2031,10 +2063,10 @@ void do_gemm_team_simd_batched_blocked_parallel(options_t options) { options.use_simd = true; if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke(options, __do_gemm_parallel_batched); + KokkosKernels::default_device, KokkosBatched::Mode::Team>); else __do_loop_and_invoke(options, __do_gemm_parallel_batched); + KokkosKernels::default_device, KokkosBatched::Mode::Team>); return; } @@ -2043,7 +2075,7 @@ void do_gemm_team_simd_batched_blocked_parallel(options_t options) { STATUS; __do_loop_and_invoke( options, __do_gemm_parallel_batched); return; +KokkosBatched::Algo::Gemm::Blocked, KokkosKernels::default_device>); return; } */ void do_gemm_experiment_parallel(options_t options) { @@ -2054,23 +2086,23 @@ void do_gemm_experiment_parallel(options_t options) { // __do_loop_and_invoke( // options, __do_gemm_parallel_experiment1); + // BlockingType, KokkosKernels::default_device>); // __do_loop_and_invoke( // options, __do_gemm_parallel_experiment2); + // BlockingType, KokkosKernels::default_device>); // __do_loop_and_invoke( // options, __do_gemm_parallel_experiment3); + // BlockingType, KokkosKernels::default_device>); // __do_loop_and_invoke( // options, __do_gemm_parallel_experiment4); + // BlockingType, KokkosKernels::default_device>); // __do_loop_and_invoke( // options, __do_gemm_parallel_experiment5); + // BlockingType, KokkosKernels::default_device>); // __do_loop_and_invoke( // options, __do_gemm_parallel_experiment6); - __do_loop_and_invoke(options, __do_gemm_armpl); + // BlockingType, KokkosKernels::default_device>); + __do_loop_and_invoke(options, __do_gemm_armpl); } #endif // KOKKOSBLAS3_GEMM_PERF_TEST_H_ diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp 
b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp index 22a268c2e6..a9f8db5be4 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp @@ -16,7 +16,7 @@ #include "KokkosBlas3_gemm.hpp" #include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" struct Params { int use_cuda = 0; @@ -103,7 +103,8 @@ template void run(int m, int n, int k, int repeat) { using LL = Kokkos::LayoutLeft; using LR = Kokkos::LayoutRight; - std::cout << "** Running GEMM experiments (" << ExecSpace::name() << ") **\n"; + std::cout << "** Running GEMM experiments (" << ExecSpace::name() << " m=" << m << " n=" << n << " k=" << k + << ") **\n"; std::cout << "Running: A LayoutLeft, B LayoutLeft : "; runImpl(m, n, k, repeat); std::cout << "Running: A LayoutLeft, B LayoutRight : "; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp index eee2eea53c..6f0737180e 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp @@ -16,7 +16,7 @@ #include "KokkosBlas3_gemm.hpp" #include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include "Benchmark_Context.hpp" #include diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 3b21fb7e70..cce1d80361 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -301,12 +301,12 @@ int main(int argc, char **argv) { double alpha, beta; if (sscanf(optarg, "%lf,%lf", &alpha, &beta) != 2) __blas3_perf_test_input_error(argv, ret, optarg); - options.blas_args.gemm.alpha = static_cast(alpha); - options.blas_args.gemm.beta 
= static_cast(beta); + options.blas_args.gemm.alpha = static_cast(alpha); + options.blas_args.gemm.beta = static_cast(beta); break; case 'a': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); - options.blas_args.trmm.alpha = (default_scalar)atof(optarg); + options.blas_args.trmm.alpha = (KokkosKernels::default_scalar)atof(optarg); break; case 'l': for (i = 0; i < LOOP_N; i++) { diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 9b23ce4d51..084c0759d2 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -64,8 +64,9 @@ static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int /*a_m* flops = (b_n * (b_n + 1)) * b_m; } - if (std::is_same::value || std::is_same::value || - std::is_same::value) + if (std::is_same::value || + std::is_same::value || + std::is_same::value) return flops; // Account for 6 additional flops when complex numbers are used. @@ -86,8 +87,9 @@ static inline double __trmm_flop_count(char side, double b_m, double b_n, double flops = b_n * b_n * b_m; } - if (std::is_same::value || std::is_same::value || - std::is_same::value) + if (std::is_same::value || + std::is_same::value || + std::is_same::value) return flops; // Account for 6 additional flops when complex numbers are used. 
@@ -97,10 +99,11 @@ static inline double __trmm_flop_count(char side, double b_m, double b_n, double return flops * 4; } -using view_type_3d = Kokkos::View; +using view_type_3d = + Kokkos::View; struct trmm_args { char side, uplo, trans, diag; - default_scalar alpha; + KokkosKernels::default_scalar alpha; view_type_3d A, B; }; typedef struct trmm_args trmm_args_t; @@ -117,7 +120,8 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, doub double gflops = flops / 1e9; double average_time = time_in_seconds / options.n; double gbytes_in_matrix = - (trmm_args.B.extent(0) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * sizeof(default_scalar)) / 1e9; + (trmm_args.B.extent(0) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * sizeof(KokkosKernels::default_scalar)) / + 1e9; double min_memory_transactions, max_memory_transactions; // Assuming infinite cache size @@ -157,9 +161,9 @@ static void __print_trmm_perf_test_options(options_t options) { printf("options.n = %d\n", options.n); printf("options.blas_args.trmm.trmm_args = %s\n", options.blas_args.trmm.trmm_args.c_str()); printf("options.out_file = %s\n", options.out_file.c_str()); - if (std::is_same::value) + if (std::is_same::value) printf("options.alpha = %lf\n", options.blas_args.trmm.alpha); - else if (std::is_same::value) + else if (std::is_same::value) printf("options.alpha = %f\n", options.blas_args.trmm.alpha); return; } @@ -551,8 +555,8 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { host_A = Kokkos::create_mirror_view(trmm_args.A); { - Kokkos::View tmp("tmp", trmm_args.A.extent(0), trmm_args.A.extent(1), - trmm_args.A.extent(2)); + Kokkos::View tmp( + "tmp", trmm_args.A.extent(0), trmm_args.A.extent(1), trmm_args.A.extent(2)); Kokkos::fill_random(tmp, rand_pool, Kokkos::rand, double>::max()); Kokkos::deep_copy(host_A, tmp); } @@ -592,8 +596,8 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { Kokkos::deep_copy(trmm_args.A, host_A); { - Kokkos::View 
tmp("tmp", trmm_args.B.extent(0), trmm_args.B.extent(1), - trmm_args.B.extent(2)); + Kokkos::View tmp( + "tmp", trmm_args.B.extent(0), trmm_args.B.extent(1), trmm_args.B.extent(2)); Kokkos::fill_random(tmp, rand_pool, Kokkos::rand, double>::max()); Kokkos::deep_copy(trmm_args.B, tmp); } @@ -608,8 +612,9 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, trmm_args_t)) STATUS; __print_trmm_perf_test_options(options); - std::cout << "SCALAR:" << typeid(default_scalar).name() << ", LAYOUT:" << typeid(default_layout).name() - << ", DEVICE:" << typeid(default_device).name() << std::endl; + std::cout << "SCALAR:" << typeid(KokkosKernels::default_scalar).name() + << ", LAYOUT:" << typeid(KokkosKernels::default_layout).name() + << ", DEVICE:" << typeid(KokkosKernels::default_device).name() << std::endl; options.out[0] << trmm_csv_header_str << std::endl; @@ -617,7 +622,8 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, trmm_args_t)) cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n; cur_dims.a.m += options.step, cur_dims.a.n += options.step, cur_dims.b.m += options.step, cur_dims.b.n += options.step) { - trmm_args = __do_setup(options, cur_dims); + trmm_args = __do_setup( + options, cur_dims); fn(options, trmm_args); } return; @@ -626,25 +632,30 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, trmm_args_t)) /*************************** External fns **************************/ void do_trmm_serial_blas(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_trmm_serial_blas); + __do_loop_and_invoke( + options, + __do_trmm_serial_blas); return; } void do_trmm_serial_batched(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_trmm_serial_batched); + __do_loop_and_invoke(options, __do_trmm_serial_batched); return; } void do_trmm_parallel_blas(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_trmm_parallel_blas); + __do_loop_and_invoke(options, 
__do_trmm_parallel_blas); return; } void do_trmm_parallel_batched(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_trmm_parallel_batched); + __do_loop_and_invoke(options, __do_trmm_parallel_batched); return; } diff --git a/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp index d2d2460551..6040cd5c43 100644 --- a/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas_trtri_perf_test.hpp @@ -59,8 +59,9 @@ static inline double __trtri_impl_flop_count(double a_m, double /*a_n*/) { double flop_count = 0; double flops_per_div, flops_per_mul, flops_per_add; - if (std::is_same::value || std::is_same::value || - std::is_same::value) { + if (std::is_same::value || + std::is_same::value || + std::is_same::value) { flops_per_div = 1; flops_per_mul = 1; flops_per_add = 1; @@ -93,8 +94,9 @@ static inline double __trtri_flop_count(double a_m, double a_n) { exit(255); } - if (std::is_same::value || std::is_same::value || - std::is_same::value) { + if (std::is_same::value || + std::is_same::value || + std::is_same::value) { flops_per_mul = 1; flops_per_add = 1; } else { @@ -110,7 +112,8 @@ static inline double __trtri_flop_count(double a_m, double a_n) { return flops; } -using view_type_3d = Kokkos::View; +using view_type_3d = + Kokkos::View; struct trtri_args { char uplo, diag; view_type_3d A; @@ -145,8 +148,9 @@ static void __print_trtri_perf_test_options(options_t options) { printf("options.n = %d\n", options.n); printf("options.blas_args.trtri.trtri_args = %s\n", options.blas_args.trtri.trtri_args.c_str()); printf("options.out_file = %s\n", options.out_file.c_str()); - std::cout << "SCALAR:" << typeid(default_scalar).name() << ", LAYOUT:" << typeid(default_layout).name() - << ", DEVICE:." 
<< typeid(default_device).name() << std::endl; + std::cout << "SCALAR:" << typeid(KokkosKernels::default_scalar).name() + << ", LAYOUT:" << typeid(KokkosKernels::default_layout).name() << ", DEVICE:." + << typeid(KokkosKernels::default_device).name() << std::endl; #else static void __print_trtri_perf_test_options(options_t) { #endif // TRTRI_PERF_TEST_DEBUG @@ -456,8 +460,9 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, trtri_args_t) STATUS; __print_trtri_perf_test_options(options); - std::cout << "SCALAR:" << typeid(default_scalar).name() << ", LAYOUT:" << typeid(default_layout).name() - << ", DEVICE:." << typeid(default_device).name() << std::endl; + std::cout << "SCALAR:" << typeid(KokkosKernels::default_scalar).name() + << ", LAYOUT:" << typeid(KokkosKernels::default_layout).name() << ", DEVICE:." + << typeid(KokkosKernels::default_device).name() << std::endl; options.out[0] << trtri_csv_header_str << std::endl; @@ -465,7 +470,8 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, trtri_args_t) cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n; cur_dims.a.m += options.step, cur_dims.a.n += options.step, cur_dims.b.m += options.step, cur_dims.b.n += options.step) { - trtri_args = __do_setup(options, cur_dims); + trtri_args = + __do_setup(options, cur_dims); fn(options, trtri_args); } return; @@ -474,25 +480,29 @@ void __do_loop_and_invoke(options_t options, void (*fn)(options_t, trtri_args_t) /*************************** External fns **************************/ void do_trtri_serial_blas(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_trtri_serial_blas); + __do_loop_and_invoke( + options, __do_trtri_serial_blas); return; } void do_trtri_serial_batched(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_trtri_serial_batched); + __do_loop_and_invoke( + options, __do_trtri_serial_batched); return; } void do_trtri_parallel_blas(options_t options) { STATUS; - 
__do_loop_and_invoke(options, __do_trtri_parallel_blas); + __do_loop_and_invoke( + options, __do_trtri_parallel_blas); return; } void do_trtri_parallel_batched(options_t options) { STATUS; - __do_loop_and_invoke(options, __do_trtri_parallel_batched); + __do_loop_and_invoke( + options, __do_trtri_parallel_batched); return; } diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index 548abd0052..0c3706bdfe 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -26,7 +26,7 @@ #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_TestParameters.hpp" #include "KokkosGraph_Distance1Color.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosSparse_IOUtils.hpp" void print_options(std::ostream &os, const char *app_name, unsigned int indent = 0) { diff --git a/perf_test/graph/KokkosGraph_color_d2.cpp b/perf_test/graph/KokkosGraph_color_d2.cpp index dc54250e9e..04242f31f0 100644 --- a/perf_test/graph/KokkosGraph_color_d2.cpp +++ b/perf_test/graph/KokkosGraph_color_d2.cpp @@ -36,7 +36,7 @@ #include #include #include "KokkosKernels_default_types.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosSparse_IOUtils.hpp" using namespace KokkosGraph; @@ -69,9 +69,9 @@ struct D2Parameters { } }; -typedef default_scalar kk_scalar_t; -typedef default_size_type kk_size_type; -typedef default_lno_t kk_lno_t; +using kk_scalar_t = KokkosKernels::default_scalar; +using kk_size_type = KokkosKernels::default_size_type; +using kk_lno_t = KokkosKernels::default_lno_t; using KokkosKernels::Impl::xorshiftHash; diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp index d45ecb4f1e..952f28e64c 100644 --- a/perf_test/graph/KokkosGraph_mis_d2.cpp +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -37,7 +37,7 @@ #include "KokkosSparse_spadd.hpp" #include 
"KokkosGraph_MIS2.hpp" #include "KokkosKernels_default_types.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosSparse_IOUtils.hpp" using namespace KokkosGraph; @@ -202,14 +202,15 @@ int parse_inputs(MIS2Parameters& params, int argc, char** argv) { template void run_mis2(const MIS2Parameters& params) { - using size_type = default_size_type; - using lno_t = default_lno_t; - using exec_space = typename device_t::execution_space; - using mem_space = typename device_t::memory_space; - using crsMat_t = typename KokkosSparse::CrsMatrix; - using lno_view_t = typename crsMat_t::index_type::non_const_type; - using KKH = KokkosKernels::Experimental::KokkosKernelsHandle; + using size_type = KokkosKernels::default_size_type; + using lno_t = KokkosKernels::default_lno_t; + using scalar_type = KokkosKernels::default_scalar; + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using lno_view_t = typename crsMat_t::index_type::non_const_type; + using KKH = + KokkosKernels::Experimental::KokkosKernelsHandle; Kokkos::Timer t; crsMat_t A_in = KokkosSparse::Impl::read_kokkos_crst_matrix(params.mtx_file); @@ -219,7 +220,7 @@ void run_mis2(const MIS2Parameters& params) { crsMat_t At_in = KokkosSparse::Impl::transpose_matrix(A_in); crsMat_t A; KKH kkh; - const default_scalar one = Kokkos::ArithTraits::one(); + const scalar_type one = Kokkos::ArithTraits::one(); kkh.create_spadd_handle(false); KokkosSparse::spadd_symbolic(&kkh, A_in, At_in, A); KokkosSparse::spadd_numeric(&kkh, one, A_in, one, At_in, A); diff --git a/perf_test/graph/KokkosGraph_triangle.cpp b/perf_test/graph/KokkosGraph_triangle.cpp index 3b5802a1ef..84676607d9 100644 --- a/perf_test/graph/KokkosGraph_triangle.cpp +++ b/perf_test/graph/KokkosGraph_triangle.cpp @@ -18,7 +18,7 @@ #include "KokkosGraph_Triangle.hpp" #include "KokkosSparse_CrsMatrix.hpp" 
#include "KokkosSparse_IOUtils.hpp" //for read_kokkos_crst_graph -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_TestParameters.hpp" #include "KokkosKernels_perf_test_utilities.hpp" @@ -230,13 +230,13 @@ void run_experiment(int argc, char **argv, perf_test::CommonInputParams) { using namespace KokkosSparse; using mem_space = typename exec_space::memory_space; using device_t = Kokkos::Device; - using lno_t = default_lno_t; - using size_type = default_size_type; - using graph_t = Kokkos::StaticCrsGraph; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; + using graph_t = Kokkos::StaticCrsGraph; using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { std::cerr << "** Triangle counting is currently not supported on GPU backends.\n"; return; } diff --git a/perf_test/lapack/KokkosLapack_SVD_benchmark.cpp b/perf_test/lapack/KokkosLapack_SVD_benchmark.cpp index 8336b3b737..55aa70b9cf 100644 --- a/perf_test/lapack/KokkosLapack_SVD_benchmark.cpp +++ b/perf_test/lapack/KokkosLapack_SVD_benchmark.cpp @@ -14,9 +14,11 @@ // //@HEADER -#include "KokkosLapack_svd.hpp" +#include -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_IOUtils.hpp" // getRandomBounds +#include "KokkosLapack_svd.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include @@ -72,7 +74,7 @@ void run_svd_benchmark(benchmark::State& state, const svd_parameters& svd_params // Initialize A with random numbers double randStart = 0, randEnd = 0; - Test::getRandomBounds(10.0, randStart, randEnd); + KokkosKernels::Impl::getRandomBounds(10.0, randStart, randEnd); Kokkos::fill_random(A, rand_pool, randStart, randEnd); for (auto _ : state) { diff --git a/perf_test/ode/KokkosODE_BDF.cpp b/perf_test/ode/KokkosODE_BDF.cpp index 
60cc5e01ff..0b1ecdf276 100644 --- a/perf_test/ode/KokkosODE_BDF.cpp +++ b/perf_test/ode/KokkosODE_BDF.cpp @@ -16,7 +16,7 @@ #include "KokkosODE_BDF.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include diff --git a/perf_test/ode/KokkosODE_RK.cpp b/perf_test/ode/KokkosODE_RK.cpp index 83635d7af6..7a5fd33999 100644 --- a/perf_test/ode/KokkosODE_RK.cpp +++ b/perf_test/ode/KokkosODE_RK.cpp @@ -16,7 +16,7 @@ #include "KokkosODE_RungeKutta.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include @@ -92,7 +92,7 @@ struct chem_model_2 { } }; -template +template struct RKSolve_wrapper { using ode_params = KokkosODE::Experimental::ODE_params; @@ -103,10 +103,11 @@ struct RKSolve_wrapper { scalar_type tstart, tend; vec_type y_old, y_new, tmp; mv_type kstack; + count_type count; RKSolve_wrapper(const ode_type& my_ode_, const table_type& table_, const ode_params& params_, const scalar_type tstart_, const scalar_type tend_, const vec_type& y_old_, const vec_type& y_new_, - const vec_type& tmp_, const mv_type& kstack_) + const vec_type& tmp_, const mv_type& kstack_, const count_type& count_) : my_ode(my_ode_), table(table_), params(params_), @@ -115,7 +116,8 @@ struct RKSolve_wrapper { y_old(y_old_), y_new(y_new_), tmp(tmp_), - kstack(kstack_) {} + kstack(kstack_), + count(count_) {} KOKKOS_FUNCTION void operator()(const int idx) const { @@ -124,10 +126,12 @@ struct RKSolve_wrapper { auto local_y_new = Kokkos::subview(y_new, Kokkos::pair(2 * idx, 2 * idx + 1)); auto local_tmp = Kokkos::subview(tmp, Kokkos::pair(2 * idx, 2 * idx + 1)); auto local_kstack = Kokkos::subview(kstack, Kokkos::ALL(), Kokkos::pair(2 * idx, 2 * idx + 1)); + auto local_count = Kokkos::subview(count, idx, Kokkos::ALL()); // Run Runge-Kutta time integrator - KokkosODE::Impl::RKSolve( - my_ode, table, params, tstart, tend, 
local_y_old, local_y_new, local_tmp, local_kstack); + // This should be replaced by a call to the public interface! + KokkosODE::Impl::RKSolve(my_ode, table, params, tstart, tend, local_y_old, local_y_new, local_tmp, local_kstack, + local_count.data()); } }; @@ -149,6 +153,7 @@ void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) { using mv_type = Kokkos::View; using table_type = KokkosODE::Impl::ButcherTableau<4, 5, 1>; using ode_params = KokkosODE::Experimental::ODE_params; + using count_type = Kokkos::View; const int num_odes = inputs.num_odes; const int model = inputs.model; @@ -159,6 +164,7 @@ void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) { const int neqs = chem_model.neqs; const int num_steps = 15000; const double dt = 0.1; + count_type count("time steps count", num_odes, 1); table_type table; ode_params params(num_steps); @@ -176,7 +182,7 @@ void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) { Kokkos::RangePolicy my_policy(0, num_odes); RKSolve_wrapper solve_wrapper(chem_model, table, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, - kstack); + kstack, count); Kokkos::Timer time; time.reset(); @@ -206,6 +212,7 @@ void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) { const int neqs = chem_model.neqs; const int num_steps = 15000; const double dt = 0.1; + count_type count("time steps count", num_odes, 1); table_type table; ode_params params(num_steps); @@ -228,7 +235,7 @@ void run_ode_chem(benchmark::State& state, const rk_input_parameters& inputs) { Kokkos::RangePolicy my_policy(0, num_odes); RKSolve_wrapper solve_wrapper(chem_model, table, params, chem_model.tstart, chem_model.tend, y_old, y_new, tmp, - kstack); + kstack, count); Kokkos::Timer time; time.reset(); diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index 514ef0ed82..70b5928efb 100644 --- a/perf_test/sparse/CMakeLists.txt +++ 
b/perf_test/sparse/CMakeLists.txt @@ -103,7 +103,6 @@ KOKKOSKERNELS_ADD_EXECUTABLE( KOKKOSKERNELS_ADD_EXECUTABLE( sparse_gs SOURCES KokkosSparse_gs.cpp - TESTONLYLIBS kokkoskernelsperf_gtest ) KOKKOSKERNELS_ADD_EXECUTABLE( diff --git a/perf_test/sparse/KokkosSparse_block_pcg.cpp b/perf_test/sparse/KokkosSparse_block_pcg.cpp index 528f4a20ea..91a599ad71 100644 --- a/perf_test/sparse/KokkosSparse_block_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_block_pcg.cpp @@ -23,7 +23,7 @@ #include "KokkosKernels_Utils.hpp" #include "KokkosSparse_IOUtils.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #define MAXVAL 1 @@ -62,7 +62,7 @@ crsMat_t create_crs_matrix(char *mtx_bin_file) { cols_view_t columns_view("colsmap_view", ne); values_view_t values_view("values_view", ne); - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view(rowmap_view); typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view(columns_view); typename values_view_t::HostMirror hv = Kokkos::create_mirror_view(values_view); diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index b4633fd4c0..0ed984a8f8 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -14,10 +14,17 @@ // //@HEADER +#include +#include +#include +#include +#include + #include #include + #include -#include +#include #include #include #include @@ -25,11 +32,7 @@ #include #include "KokkosKernels_default_types.hpp" #include "KokkosSparse_IOUtils.hpp" -#include -#include -#include -#include -#include +#include "KokkosKernels_TestMatrixUtils.hpp" using std::cout; using std::string; @@ -151,9 +154,9 @@ crsMat_t generateLongRowMatrix(const GS_Parameters& params) { template void runGS(const GS_Parameters& params) { - typedef default_scalar scalar_t; - typedef default_lno_t lno_t; - typedef default_size_type 
size_type; + using scalar_t = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; typedef typename device_t::execution_space exec_space; typedef typename device_t::memory_space mem_space; typedef KokkosKernels::Experimental::KokkosKernelsHandle diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp index 999a913e06..a0a9e3456a 100644 --- a/perf_test/sparse/KokkosSparse_kk_spmv.cpp +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -32,9 +32,9 @@ #include #include "KokkosKernels_default_types.hpp" -using Scalar = default_scalar; -using Ordinal = default_lno_t; -using Offset = default_size_type; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; using KAT = Kokkos::ArithTraits; struct SPMVBenchmarking { @@ -200,7 +200,7 @@ void print_help() { int main(int argc, char** argv) { SPMVBenchmarking sb; char layout; - if (std::is_same::value) + if (std::is_same::value) layout = 'L'; else layout = 'R'; diff --git a/perf_test/sparse/KokkosSparse_mdf.cpp b/perf_test/sparse/KokkosSparse_mdf.cpp index 4db45ce55f..0cd33f2eca 100644 --- a/perf_test/sparse/KokkosSparse_mdf.cpp +++ b/perf_test/sparse/KokkosSparse_mdf.cpp @@ -19,7 +19,7 @@ #include "KokkosKernels_Handle.hpp" #include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_Utils_cusparse.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include "KokkosSparse_mdf.hpp" diff --git a/perf_test/sparse/KokkosSparse_par_ilut.cpp b/perf_test/sparse/KokkosSparse_par_ilut.cpp index 73a6c6fc5e..5143e13e28 100644 --- a/perf_test/sparse/KokkosSparse_par_ilut.cpp +++ b/perf_test/sparse/KokkosSparse_par_ilut.cpp @@ -54,9 +54,9 @@ using KokkosSparse::Experimental::spiluk_symbolic; using KokkosSparse::Experimental::SPILUKAlgorithm; // 
Build up useful types -using scalar_t = default_scalar; -using lno_t = default_lno_t; -using size_type = default_size_type; +using scalar_t = KokkosKernels::default_scalar; +using lno_t = KokkosKernels::default_lno_t; +using size_type = KokkosKernels::default_size_type; using exe_space = Kokkos::DefaultExecutionSpace; using mem_space = typename exe_space::memory_space; using device = Kokkos::Device; @@ -126,7 +126,7 @@ void run_par_ilut_test(benchmark::State& state, KernelHandle& kh, const sp_matri #ifdef USE_GINKGO /////////////////////////////////////////////////////////////////////////////// -static constexpr bool IS_GPU = KokkosKernels::Impl::kk_is_gpu_exec_space(); +static constexpr bool IS_GPU = KokkosKernels::Impl::is_gpu_exec_space_v; using ginkgo_exec = std::conditional_t; @@ -277,7 +277,7 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows, int nnz_per_row // Now that we have A, we can set team_size if (team_size == -1) { - team_size = KokkosKernels::Impl::kk_is_gpu_exec_space() ? nnz_per_row : 1; + team_size = KokkosKernels::Impl::is_gpu_exec_space_v ? 
nnz_per_row : 1; } KokkosSparse::sort_crs_matrix(A); diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index d299952cb4..00466ae6c9 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -20,21 +20,22 @@ #include "KokkosKernels_Utils.hpp" #include "KokkosKernels_IOUtils.hpp" #include "KokkosKernels_default_types.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosSparse_IOUtils.hpp" #include #define MAXVAL 1 template -scalar_view_t create_x_vector(default_lno_t nv, default_scalar max_value = 1.0) { +scalar_view_t create_x_vector(KokkosKernels::default_lno_t nv, KokkosKernels::default_scalar max_value = 1.0) { scalar_view_t kok_x("X", nv); typename scalar_view_t::HostMirror h_x = Kokkos::create_mirror_view(kok_x); - for (default_lno_t i = 0; i < nv; ++i) { - default_scalar r = static_cast(rand()) / static_cast(RAND_MAX / max_value); - h_x(i) = r; + for (KokkosKernels::default_lno_t i = 0; i < nv; ++i) { + KokkosKernels::default_scalar r = static_cast(rand()) / + static_cast(RAND_MAX / max_value); + h_x(i) = r; } Kokkos::deep_copy(kok_x, h_x); return kok_x; @@ -57,9 +58,9 @@ void run_experiment(crsMat_t crsmat, int clusterSize, bool useSequential) { typedef typename lno_view_t::value_type size_type; typedef typename scalar_view_t::value_type scalar_t; - default_lno_t nv = crsmat.numRows(); - scalar_view_t kok_x_original = create_x_vector(nv, MAXVAL); - scalar_view_t kok_b_vector = create_y_vector(crsmat, kok_x_original); + KokkosKernels::default_lno_t nv = crsmat.numRows(); + scalar_view_t kok_x_original = create_x_vector(nv, MAXVAL); + scalar_view_t kok_b_vector = create_y_vector(crsmat, kok_x_original); // create X vector scalar_view_t kok_x_vector("kok_x_vector", nv); @@ -220,13 +221,15 @@ enum { template void run_pcg(int *cmdline, const char *mtx_file) { - default_lno_t nv = 0, ne = 0; - default_lno_t *xadj, *adj; - 
default_scalar *ew; + using lno_t = KokkosKernels::default_lno_t; + lno_t nv = 0, ne = 0; + lno_t *xadj, *adj; + KokkosKernels::default_scalar *ew; - KokkosSparse::Impl::read_matrix(&nv, &ne, &xadj, &adj, &ew, mtx_file); + KokkosSparse::Impl::read_matrix(&nv, &ne, &xadj, &adj, &ew, mtx_file); - typedef typename KokkosSparse::CrsMatrix + typedef typename KokkosSparse::CrsMatrix crsMat_t; typedef typename crsMat_t::StaticCrsGraphType graph_t; @@ -243,11 +246,11 @@ void run_pcg(int *cmdline, const char *mtx_file) { typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view(columns_view); typename values_view_t::HostMirror hv = Kokkos::create_mirror_view(values_view); - for (default_lno_t i = 0; i <= nv; ++i) { + for (lno_t i = 0; i <= nv; ++i) { hr(i) = xadj[i]; } - for (default_lno_t i = 0; i < ne; ++i) { + for (lno_t i = 0; i < ne; ++i) { hc(i) = adj[i]; hv(i) = ew[i]; } diff --git a/perf_test/sparse/KokkosSparse_pcg.hpp b/perf_test/sparse/KokkosSparse_pcg.hpp index 6d0b9180a0..12234d5773 100644 --- a/perf_test/sparse/KokkosSparse_pcg.hpp +++ b/perf_test/sparse/KokkosSparse_pcg.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_EXAMPLE_CG_SOLVE -#define KOKKOS_EXAMPLE_CG_SOLVE +#ifndef KOKKOSSPARSE_PCG_HPP +#define KOKKOSSPARSE_PCG_HPP #include #include @@ -468,4 +468,4 @@ void pcgsolve(KernelHandle_t &kh, const crsMatrix_t &crsMat, const y_vector_t &y //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -#endif /* #ifndef KOKKOS_EXAMPLE_CG_SOLVE */ +#endif /* #ifndef KOKKOSSPARSE_PCG_HPP */ diff --git a/perf_test/sparse/KokkosSparse_sort_crs.cpp b/perf_test/sparse/KokkosSparse_sort_crs.cpp index cd3ed91521..db5a0559b3 100644 --- a/perf_test/sparse/KokkosSparse_sort_crs.cpp +++ b/perf_test/sparse/KokkosSparse_sort_crs.cpp @@ -57,9 +57,9 @@ void run_experiment(int argc, char** argv, const CommonInputParams& common_param using mem_space = typename 
exec_space::memory_space; using device_t = typename Kokkos::Device; - using size_type = default_size_type; - using lno_t = default_lno_t; - using scalar_t = default_scalar; + using size_type = KokkosKernels::default_size_type; + using lno_t = KokkosKernels::default_lno_t; + using scalar_t = KokkosKernels::default_scalar; using crsMat_t = KokkosSparse::CrsMatrix; using graph_t = typename crsMat_t::StaticCrsGraphType; @@ -79,8 +79,12 @@ void run_experiment(int argc, char** argv, const CommonInputParams& common_param // Randomly shuffle the entries within each row, so that the rows aren't // already sorted. Leave the values alone; this changes the matrix numerically // but this doesn't affect sorting. + std::random_device rd; + std::mt19937 g(rd()); for (lno_t i = 0; i < m; i++) { - std::random_shuffle(entriesHost.data() + i, entriesHost.data() + i + 1); + const size_type rowBegin = rowmapHost(i); + const size_type rowEnd = rowmapHost(i + 1); + std::shuffle(entriesHost.data() + rowBegin, entriesHost.data() + rowEnd, g); } Kokkos::deep_copy(shuffledEntries, entriesHost); exec_space exec; diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index ac7bfe636b..063c151812 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -20,7 +20,7 @@ #include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_Utils_cusparse.hpp" #include "KokkosSparse_Utils_mkl.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #include "KokkosSparse_spadd.hpp" @@ -129,9 +129,9 @@ void run_experiment(int argc, char** argv, CommonInputParams) { using mem_space = typename exec_space::memory_space; using device_t = typename Kokkos::Device; - using size_type = default_size_type; - using lno_t = default_lno_t; - using scalar_t = default_scalar; + using size_type = KokkosKernels::default_size_type; + using lno_t = 
KokkosKernels::default_lno_t; + using scalar_t = KokkosKernels::default_scalar; using crsMat_t = KokkosSparse::CrsMatrix; using KernelHandle = diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp index cff88e4998..2c63127c2d 100644 --- a/perf_test/sparse/KokkosSparse_spgemm.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm.cpp @@ -23,7 +23,7 @@ #include "KokkosBlas1_nrminf.hpp" #include "KokkosBlas1_axpby.hpp" #include "KokkosKernels_TestParameters.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" #define TRANSPOSEFIRST false @@ -253,9 +253,9 @@ void run_spgemm(int argc, char** argv, perf_test::CommonInputParams) { using namespace KokkosSparse::Experimental; using MemSpace = typename ExecSpace::memory_space; - using size_type = default_size_type; - using lno_t = default_lno_t; - using scalar_t = default_scalar; + using size_type = KokkosKernels::default_size_type; + using lno_t = KokkosKernels::default_lno_t; + using scalar_t = KokkosKernels::default_scalar; using device_t = Kokkos::Device; using crsMat_t = typename KokkosSparse::CrsMatrix; using KernelHandle = diff --git a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp index 46610a8dab..a5646603b7 100644 --- a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp @@ -18,7 +18,7 @@ #include "KokkosKernels_default_types.hpp" #include "KokkosKernels_IOUtils.hpp" #include "KokkosSparse_run_spgemm_jacobi.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" void print_options() { std::cerr << "Options\n" << std::endl; @@ -201,9 +201,9 @@ int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, char** } int main(int argc, char** argv) { - using size_type = default_size_type; - using lno_t = default_lno_t; - using scalar_t = default_scalar; + 
using size_type = KokkosKernels::default_size_type; + using lno_t = KokkosKernels::default_lno_t; + using scalar_t = KokkosKernels::default_scalar; KokkosKernels::Experiment::Parameters params; diff --git a/perf_test/sparse/KokkosSparse_spiluk.cpp b/perf_test/sparse/KokkosSparse_spiluk.cpp index 4aa05f9522..13c6eefea2 100644 --- a/perf_test/sparse/KokkosSparse_spiluk.cpp +++ b/perf_test/sparse/KokkosSparse_spiluk.cpp @@ -54,7 +54,7 @@ enum { DEFAULT, CUSPARSE, LVLSCHED_RP, LVLSCHED_TP1 /*, LVLSCHED_TP2*/ }; int test_spiluk_perf(std::vector tests, std::string afilename, int kin, int team_size, int /*vector_length*/, /*int idx_offset,*/ int loop) { - typedef default_scalar scalar_t; + using scalar_t = KokkosKernels::default_scalar; typedef int lno_t; typedef int size_type; typedef Kokkos::DefaultExecutionSpace execution_space; diff --git a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp index 5ecf2f7248..6a2f92165b 100644 --- a/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp @@ -21,7 +21,7 @@ #include #include #include "KokkosKernels_default_types.hpp" -#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestStringUtils.hpp" #include "KokkosKernels_perf_test_utilities.hpp" // Headers for benchmark library diff --git a/perf_test/sparse/KokkosSparse_spmv_merge.cpp b/perf_test/sparse/KokkosSparse_spmv_merge.cpp index b04142ed08..8e509939a2 100644 --- a/perf_test/sparse/KokkosSparse_spmv_merge.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_merge.cpp @@ -154,8 +154,8 @@ void print_help() { } int main(int argc, char** argv) { - using Scalar = default_scalar; - using lno_t = default_lno_t; + using Scalar = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; bool compare = false; lno_t loop = 100; diff --git a/perf_test/sparse/KokkosSparse_spmv_struct.cpp b/perf_test/sparse/KokkosSparse_spmv_struct.cpp index 862f6c8175..1e95bd7de5 
100644 --- a/perf_test/sparse/KokkosSparse_spmv_struct.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_struct.cpp @@ -151,9 +151,9 @@ int main(int argc, char **argv) { Kokkos::initialize(argc, argv); { - typedef default_size_type size_type; - typedef default_lno_t lno_t; - typedef default_scalar Scalar; + using size_type = KokkosKernels::default_size_type; + using lno_t = KokkosKernels::default_lno_t; + using Scalar = KokkosKernels::default_scalar; typedef KokkosSparse::CrsMatrix matrix_type; typedef typename Kokkos::View mv_type; // typedef typename diff --git a/perf_test/sparse/KokkosSparse_spmv_test.hpp b/perf_test/sparse/KokkosSparse_spmv_test.hpp index 616543117a..f6008b9cd8 100644 --- a/perf_test/sparse/KokkosSparse_spmv_test.hpp +++ b/perf_test/sparse/KokkosSparse_spmv_test.hpp @@ -54,10 +54,10 @@ void armpl_matvec(AType /*A*/, XType x, YType y, spmv_additional_data* data); enum { KOKKOS, MKL, ARMPL, CUSPARSE, KK_KERNELS, KK_KERNELS_INSP, KK_INSP, OMP_STATIC, OMP_DYNAMIC, OMP_INSP }; enum { AUTO, DYNAMIC, STATIC }; -using Scalar = default_scalar; -using Ordinal = default_lno_t; -using Offset = default_size_type; -using Layout = default_layout; +using Scalar = KokkosKernels::default_scalar; +using Ordinal = KokkosKernels::default_lno_t; +using Offset = KokkosKernels::default_size_type; +using Layout = KokkosKernels::default_layout; #ifdef KOKKOSKERNELS_ENABLE_TESTS_AND_PERFSUITE std::vector make_spmv_kernel_base(const rajaperf::RunParams& params); diff --git a/perf_test/sparse/KokkosSparse_sptrsv.cpp b/perf_test/sparse/KokkosSparse_sptrsv.cpp index eb36b67fbf..ae5c0ad7eb 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv.cpp @@ -98,9 +98,9 @@ void check_entries_sorted(const RowMapType drow_map, const EntriesType dentries) int test_sptrsv_perf(std::vector tests, const std::string &lfilename, const std::string &ufilename, const int team_size, const int vector_length, const int /*idx_offset*/, const int loop, const int 
chain_threshold = 0, const float /*dense_row_percent*/ = -1.0) { - typedef default_scalar scalar_t; - typedef default_lno_t lno_t; - typedef default_size_type size_type; + using scalar_t = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; typedef Kokkos::DefaultExecutionSpace execution_space; typedef typename execution_space::memory_space memory_space; diff --git a/perf_test/sparse/spmv/Kokkos_SPMV.hpp b/perf_test/sparse/spmv/Kokkos_SPMV.hpp index 50f3f0cdd4..6e4578f93e 100644 --- a/perf_test/sparse/spmv/Kokkos_SPMV.hpp +++ b/perf_test/sparse/spmv/Kokkos_SPMV.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_SPMV_HPP_ -#define KOKKOS_SPMV_HPP_ +#ifndef KOKKOSKERNELS_PERFTEST_KOKKOS_SPMV_HPP_ +#define KOKKOSKERNELS_PERFTEST_KOKKOS_SPMV_HPP_ template struct SPMV_Functor { @@ -142,4 +142,4 @@ void kokkos_matvec(AType A, XType x, YType y, int rows_per_thread, int team_size Kokkos::parallel_for("KokkosSparse::PerfTest::SpMV", policy, func); } -#endif /* KOKKOS_SPMV_HPP_ */ +#endif /* KOKKOSKERNELS_PERFTEST_KOKKOS_SPMV_HPP_ */ diff --git a/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp b/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp index 6d82b9782b..258f1a1087 100644 --- a/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp +++ b/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_SPMV_INSPECTOR_HPP_ -#define KOKKOS_SPMV_INSPECTOR_HPP_ +#ifndef KOKKOSKERNELS_PERFTEST_SPMV_INSPECTOR_HPP +#define KOKKOSKERNELS_PERFTEST_SPMV_INSPECTOR_HPP #include "Kokkos_SPMV.hpp" @@ -125,4 +125,4 @@ void kk_inspector_matvec(AType A, XType x, YType y, int team_size, int vector_le Kokkos::parallel_for("KokkosSparse::PerfTest::SpMV_Inspector", policy, func); } -#endif /* KOKKOS_SPMV_HPP_ */ +#endif // KOKKOSKERNELS_PERFTEST_SPMV_INSPECTOR_HPP diff --git a/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp b/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp index 
1967d0d392..37a1fd05ed 100644 --- a/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp +++ b/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp @@ -20,6 +20,8 @@ #ifdef KOKKOS_ENABLE_OPENMP #include +#include +#include #define OMP_BENCH_RESTRICT __restrict__ @@ -33,8 +35,16 @@ void establishSmartSchedule(AType A) { // Generate a schedule Ordinal* rowSizes = NULL; - posix_memalign((void**)&rowSizes, 64, sizeof(int) * A.numRows()); - posix_memalign((void**)&threadStarts, 128, sizeof(int) * (omp_get_max_threads() + 1)); + if (posix_memalign((void**)&rowSizes, 64, sizeof(int) * A.numRows())) { + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " posix_memalign failed"; + throw std::runtime_error(ss.str()); + } + if (posix_memalign((void**)&threadStarts, 128, sizeof(int) * (omp_get_max_threads() + 1))) { + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " posix_memalign failed"; + throw std::runtime_error(ss.str()); + } for (int i = 0; i < omp_get_max_threads(); ++i) { threadStarts[i] = A.numRows(); diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 55749a67bb..f22d82ed6a 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -746,22 +746,22 @@ elif [ "$MACHINE" = "solo" ]; then module load cmake BASE_MODULE_LIST="cmake,/" - BASE_MODULE_LIST_LLVM="cmake,/,gnu/10.2.1" + BASE_MODULE_LIST_LLVM="cmake,/,gnu/10.3.1" BASE_MODULE_LIST_INTEL="cmake,/" ONEAPI_WARNING_FLAGS="" - GNU102_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.3.21" + GNU103_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.3.21" if [ "$SPOT_CHECK" = "True" ]; then - COMPILERS=("gnu/10.2.1 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" + COMPILERS=("gnu/10.3.1 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" "llvm/10.0.1 $BASE_MODULE_LIST_LLVM "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then COMPILERS=("intel/19.1 $BASE_MODULE_LIST_INTEL,mkl/19.1 "OpenMP,Threads" icpc 
$INTEL_WARNING_FLAGS" - "gnu/10.2.1 $GNU102_MODULE_TPL_LIST "OpenMP_Serial" g++ $GNU_WARNING_FLAGS" + "gnu/10.3.1 $GNU103_MODULE_TPL_LIST "OpenMP_Serial" g++ $GNU_WARNING_FLAGS" ) else - COMPILERS=("gnu/10.2.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" + COMPILERS=("gnu/10.3.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" "gnu/11.2.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" "gnu/12.1.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" "llvm/10.0.1 $BASE_MODULE_LIST_LLVM $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" diff --git a/sparse/CMakeLists.txt b/sparse/CMakeLists.txt index 97076655f7..1e34990bb5 100644 --- a/sparse/CMakeLists.txt +++ b/sparse/CMakeLists.txt @@ -18,13 +18,6 @@ LIST(APPEND SOURCES sparse/tpls/KokkosKernels_tpl_handles.cpp) #that should be instantiated based on input options #Generate @X@ variables in the template X.hpp.in and X.cpp.in #files containing the list of all needed macros -KOKKOSKERNELS_GENERATE_ETI(Sparse_sptrsv_solve sptrsv_solve - COMPONENTS sparse - HEADER_LIST ETI_HEADERS - SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES -) - KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_struct spmv COMPONENTS sparse HEADER_LIST ETI_HEADERS @@ -160,6 +153,13 @@ KOKKOSKERNELS_GENERATE_ETI(Sparse_sptrsv_symbolic sptrsv_symbolic TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) +KOKKOSKERNELS_GENERATE_ETI(Sparse_sptrsv_solve sptrsv_solve + COMPONENTS sparse + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES +) + KOKKOSKERNELS_GENERATE_ETI(Sparse_trsv trsv COMPONENTS sparse HEADER_LIST ETI_HEADERS diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..2fdcd740e2 --- /dev/null +++ 
b/sparse/eti/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_BSPGEMM_NUMERIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_apply_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_apply_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..9be44095f0 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_apply_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_GAUSS_SEIDEL_APPLY_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_GAUSS_SEIDEL_APPLY_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_GAUSS_SEIDEL_APPLY_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_numeric_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..1e3befcc89 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_GAUSS_SEIDEL_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_GAUSS_SEIDEL_NUMERIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_GAUSS_SEIDEL_NUMERIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_symbolic_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..493740dfb2 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_gauss_seidel_symbolic_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_GAUSS_SEIDEL_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_GAUSS_SEIDEL_SYMBOLIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_GAUSS_SEIDEL_SYMBOLIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_avail.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_avail.hpp.in index 16b0bf66b7..97f3ac20df 100644 --- a/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_avail.hpp.in +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_avail.hpp.in @@ -1,5 +1,3 @@ -#ifndef KOKKOSSPARSE_GMRES_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSSPARSE_GMRES_ETI_SPEC_AVAIL_HPP_ /* //@HEADER // ************************************************************************ @@ -18,11 +16,11 @@ //@HEADER */ +#ifndef KOKKOSSPARSE_GMRES_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSSPARSE_GMRES_ETI_SPEC_AVAIL_HPP_ namespace KokkosSparse { namespace Impl { - @SPARSE_GMRES_ETI_AVAIL_BLOCK@ - } // Impl } // KokkosSparse #endif // KOKKOSSPARSE_GMRES_ETI_SPEC_AVAIL_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..96fdca40b4 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_gmres_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +/* +//@HEADER +// ************************************************************************ +// 
+// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +*/ + +#ifndef KOKKOSSPARSE_GMRES_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_GMRES_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_GMRES_ETI_DECL_BLOCK@ +} // Impl +} // KokkosSparse +#endif // KOKKOSSPARSE_GMRES_ETI_SPEC_DECL_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_numeric_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..0fd514c1ca --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_PAR_ILUT_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_PAR_ILUT_NUMERIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { + +@SPARSE_PAR_ILUT_NUMERIC_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosSparse +#endif // KOKKOSSPARSE_PAR_ILUT_NUMERIC_ETI_SPEC_DECL_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_symbolic_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..c30fe10f82 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_par_ilut_symbolic_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_PAR_ILUT_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_PAR_ILUT_SYMBOLIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { + +@SPARSE_PAR_ILUT_SYMBOLIC_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosSparse +#endif // KOKKOSSPARSE_PAR_ILUT_SYMBOLIC_ETI_SPEC_DECL_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..43b1da79d1 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_NUMERIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..131960272e --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_SYMBOLIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_jacobi_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_jacobi_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..6356f7c438 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_jacobi_eti_spec_decl.hpp.in @@ -0,0 +1,25 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPGEMM_JACOBI_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPGEMM_JACOBI_ETI_SPEC_DECL_HPP_ + +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPGEMM_JACOBI_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..2ca1ecf07b --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_noreuse_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPGEMM_NOREUSE_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_numeric_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..af422e6fe5 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPGEMM_NUMERIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_symbolic_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..2f3870e948 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spgemm_symbolic_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPGEMM_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPGEMM_SYMBOLIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPGEMM_SYMBOLIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_numeric_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_numeric_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..4cc6d57e9b --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPILUK_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPILUK_NUMERIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { + +@SPARSE_SPILUK_NUMERIC_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosSparse +#endif // KOKKOSSPARSE_SPILUK_NUMERIC_ETI_SPEC_DECL_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_symbolic_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..bfffae9dc0 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spiluk_symbolic_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPILUK_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPILUK_SYMBOLIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { + +@SPARSE_SPILUK_SYMBOLIC_ETI_DECL_BLOCK@ + +} // Impl +} // KokkosSparse +#endif // KOKKOSSPARSE_SPILUK_SYMBOLIC_ETI_SPEC_DECL_HPP_ diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..ad3166a1e9 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_decl.hpp.in @@ -0,0 +1,26 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +// clang-format off +@SPARSE_SPMV_BSRMATRIX_ETI_DECL_BLOCK@ +// clang-format on +} // namespace Impl +} // namespace KokkosSparse +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..14813536f0 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPMV_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPMV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..9a44ea21e5 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in @@ -0,0 +1,27 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_DECL_HPP_ + +namespace KokkosSparse { +namespace Impl { +// clang-format off +@SPARSE_SPMV_MV_BSRMATRIX_ETI_DECL_BLOCK@ +// clang-format on +} // namespace Impl +} // namespace KokkosSparse +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..af58d3e7fc --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_MV_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPMV_MV_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPMV_MV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_struct_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_struct_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..11ba625f3c --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_struct_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPMV_MV_STRUCT_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_struct_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_struct_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..a03fcf586e --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_struct_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPMV_STRUCT_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_solve_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_solve_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..aa3d2b2cef --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_solve_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPTRSV_SOLVE_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPTRSV_SOLVE_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPTRSV_SOLVE_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_symbolic_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_symbolic_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..4c48c895a1 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_sptrsv_symbolic_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPTRSV_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPTRSV_SYMBOLIC_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPTRSV_SYMBOLIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/eti/generated_specializations_hpp/KokkosSparse_trsv_eti_spec_decl.hpp.in b/sparse/eti/generated_specializations_hpp/KokkosSparse_trsv_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..5b24a276d0 --- /dev/null +++ b/sparse/eti/generated_specializations_hpp/KokkosSparse_trsv_eti_spec_decl.hpp.in @@ -0,0 +1,24 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_TRSV_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_TRSV_ETI_SPEC_DECL_HPP_ +namespace KokkosSparse { +namespace Impl { +@SPARSE_TRSV_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/sparse/impl/KokkosSparse_bspgemm_impl.hpp b/sparse/impl/KokkosSparse_bspgemm_impl.hpp index ae0604cff4..7f40d2ffc2 100644 --- a/sparse/impl/KokkosSparse_bspgemm_impl.hpp +++ b/sparse/impl/KokkosSparse_bspgemm_impl.hpp @@ -91,7 +91,7 @@ class KokkosBSPGEMM : public KokkosSPGEMM alignof(nnz_lno_t)) ? (alignof(scalar_t) - alignof(nnz_lno_t)) : 0; - static constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + static constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; private: nnz_lno_t block_dim; diff --git a/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp b/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp index 1e2e598151..b306c219f1 100644 --- a/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp +++ b/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp @@ -376,7 +376,7 @@ struct KokkosBSPGEMM thread_shmem_key_size) { - volatile nnz_lno_t *tmp = NULL; + nnz_lno_t *tmp = NULL; // size_t tid = get_thread_id(row_index); // the code gets internal compiler error on gcc 4.7.2 // assuming that this part only runs on GPUs for now, below fix @@ -386,10 +386,7 @@ struct KokkosBSPGEMM max_first_level_hash_size) { { while (tmp == NULL) { Kokkos::single( Kokkos::PerTeam(teamMember), - [&](volatile nnz_lno_t *&memptr) { - memptr = (volatile nnz_lno_t *)(memory_space.allocate_chunk(row_index)); - }, - tmp); + [&](nnz_lno_t *&memptr) { memptr = (nnz_lno_t *)(memory_space.allocate_chunk(row_index)); }, tmp); } 
global_acc_row_keys = (nnz_lno_t *)(tmp); global_acc_row_vals = KokkosKernels::Impl::alignPtrTo(tmp + pow2_hash_size); @@ -613,9 +607,9 @@ struct KokkosBSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -635,9 +629,9 @@ struct KokkosBSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -657,9 +651,10 @@ struct KokkosBSPGEMMspgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm) { - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { // then chose the best method and parameters. size_type average_row_nnz = 0; size_t average_row_flops = 0; @@ -1207,7 +1203,7 @@ void KokkosBSPGEMM< // END OF SHARED MEMORY SIZE CALCULATIONS // required memory for L2 - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) { tmp_max_nnz = 1; } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) { @@ -1257,7 +1253,7 @@ void KokkosBSPGEMM< KokkosKernels::Impl::PoolType my_pool_type = KokkosKernels::Impl::OneThread2OneChunk; - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1288,7 +1284,7 @@ void KokkosBSPGEMM< } timer1.reset(); - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) { if (thread_shmem_key_size <= 0) { std::cout << "KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: " diff --git a/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp b/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp index 98dca331c6..a358d52532 100644 --- a/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp +++ b/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp @@ -451,7 +451,7 @@ void KokkosBSPGEMM< Kokkos::Timer numeric_speed_timer_with_free; - if 
(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { // allocate memory for begins and next to be used by the hashmap nnz_lno_temp_work_view_t beginsC(Kokkos::view_alloc(Kokkos::WithoutInitializing, "C keys"), valuesC_.extent(0)); nnz_lno_temp_work_view_t nextsC(Kokkos::view_alloc(Kokkos::WithoutInitializing, "C nexts"), valuesC_.extent(0)); diff --git a/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp b/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp index 2315339858..2d2d709553 100644 --- a/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp @@ -243,6 +243,8 @@ struct BSPGEMM_NUMERIC >, \ false, true>; +#include + #define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct BSPGEMM_NUMERIC< \ diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index fa4fa4a54e..2b15dcaa00 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -514,7 +514,7 @@ class PointGaussSeidel { }); #if KOKKOSSPARSE_IMPL_PRINTDEBUG - if (!KokkosKernels::Impl::kk_is_gpu_exec_space() && + if (!KokkosKernels::Impl::is_gpu_exec_space_v && (ii == 0 || (block_size == 1 && ii < 2))) { std::cout << "\n\n\nrow:" << ii * block_size + i; std::cout << "\nneighbors:"; @@ -606,8 +606,8 @@ class PointGaussSeidel { nnz_lno_t color = t.league_rank(); nnz_lno_t colorBegin = color_xadj(color); nnz_lno_t colorLen = color_xadj(color + 1) - colorBegin; - KokkosKernels::TeamBitonicSort(color_adj.data() + colorBegin, colorLen, t, comp); - t.team_barrier(); + Kokkos::Experimental::sort_team( + t, Kokkos::subview(color_adj, Kokkos::make_pair(colorBegin, colorBegin + colorLen)), comp); // Now that the color set is sorted, count how many long rows there were nnz_lno_t numLongRows; Kokkos::parallel_reduce( @@ 
-897,7 +897,7 @@ class PointGaussSeidel { nnz_lno_t num_values_in_l2 = 0; nnz_lno_t num_big_rows = 0; - if (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (!KokkosKernels::Impl::is_gpu_exec_space_v) { // again, if it is on CPUs, we make L1 as big as we need. size_t l1mem = 1; while (l1mem < level_1_mem) { @@ -937,7 +937,7 @@ class PointGaussSeidel { KOKKOSKERNELS_MACRO_MIN(num_large_rows, (size_type)(my_exec_space.concurrency() / suggested_vector_size)); // std::cout << "num_big_rows:" << num_big_rows << std::endl; - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { // check if we have enough memory for this. lower the concurrency if // we do not have enugh memory. size_t free_byte; @@ -1208,7 +1208,7 @@ class PointGaussSeidel { // this!!!!!!!!!!!!!!!!!! change fill_matrix_numeric so that they store // the internal matrix as above. the rest will wok fine. - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { Kokkos::parallel_for("KokkosSparse::GaussSeidel::Team_fill_matrix_numeric", team_policy_t(my_exec_space, (num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), @@ -1239,7 +1239,7 @@ class PointGaussSeidel { Get_Matrix_Diagonals gmd(newxadj_, newadj_, permuted_adj_vals, permuted_inverse_diagonal, this->num_rows, rows_per_team, block_size, block_matrix_size); - if (KokkosKernels::Impl::kk_is_gpu_exec_space() || block_size > 1) { + if (KokkosKernels::Impl::is_gpu_exec_space_v || block_size > 1) { Kokkos::parallel_for("KokkosSparse::GaussSeidel::team_get_matrix_diagonals", team_policy_t(my_exec_space, (num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), diff --git a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index 0581a332cf..6f62de44b1 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ 
b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -282,6 +282,8 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; +#include + #define KOKKOSSPARSE_GAUSS_SEIDEL_SYMBOLIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct GAUSS_SEIDEL_SYMBOLIC< \ @@ -321,6 +323,8 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; +#include + #define KOKKOSSPARSE_GAUSS_SEIDEL_NUMERIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct GAUSS_SEIDEL_NUMERIC< \ @@ -383,6 +387,8 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; +#include + #define KOKKOSSPARSE_GAUSS_SEIDEL_APPLY_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct GAUSS_SEIDEL_APPLY< \ diff --git a/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp b/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp index f4ed853e67..0c20595892 100644 --- a/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp +++ b/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_SPARSE_IMPL_GETDIAGCOPYWITHOFFSETS_HPP_ -#define KOKKOS_SPARSE_IMPL_GETDIAGCOPYWITHOFFSETS_HPP_ +#ifndef KOKKOSSPARSE_IMPL_GETDIAGCOPYWITHOFFSETS_HPP_ +#define KOKKOSSPARSE_IMPL_GETDIAGCOPYWITHOFFSETS_HPP_ #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_OrdinalTraits.hpp" @@ -124,4 +124,4 @@ struct CrsMatrixGetDiagCopyWithOffsets { } // namespace Impl } // namespace KokkosSparse -#endif // KOKKOS_SPARSE_IMPL_GETDIAGCOPYWITHOFFSETS_HPP_ +#endif // KOKKOSSPARSE_IMPL_GETDIAGCOPYWITHOFFSETS_HPP_ diff --git a/sparse/impl/KokkosSparse_gmres_spec.hpp b/sparse/impl/KokkosSparse_gmres_spec.hpp index ef9dd508bb..8e9dbe5041 100644 --- a/sparse/impl/KokkosSparse_gmres_spec.hpp +++ b/sparse/impl/KokkosSparse_gmres_spec.hpp @@ -128,6 +128,8 @@ struct GMRES >, \ false, true>; +#include + #define 
KOKKOSSPARSE_GMRES_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct GMRES< \ diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index 4949887a7d..537dd35468 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -568,7 +568,8 @@ struct MDF_compute_list_length { team.team_reduce(Kokkos::Sum(updateIdx)); // Sort update list - KokkosKernels::TeamBitonicSort(&update_list(0), updateIdx, team); + Kokkos::Experimental::sort_team( + team, Kokkos::subview(update_list, Kokkos::make_pair(0, updateIdx))); } { size_type numEntrU = 0; diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp index c347b24647..471dedfb76 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_spec.hpp @@ -147,6 +147,8 @@ struct PAR_ILUT_NUMERIC >, \ false, true>; +#include + #define KOKKOSSPARSE_PAR_ILUT_NUMERIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct PAR_ILUT_NUMERIC< \ diff --git a/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp b/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp index dfb66ca9c9..8b816e35be 100644 --- a/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_symbolic_spec.hpp @@ -115,6 +115,8 @@ struct PAR_ILUT_SYMBOLIC >, \ false, true>; +#include + #define KOKKOSSPARSE_PAR_ILUT_SYMBOLIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct PAR_ILUT_SYMBOLIC< \ diff --git a/sparse/impl/KokkosSparse_partitioning_impl.hpp b/sparse/impl/KokkosSparse_partitioning_impl.hpp index b926b74523..f336f42b06 100644 --- a/sparse/impl/KokkosSparse_partitioning_impl.hpp +++ b/sparse/impl/KokkosSparse_partitioning_impl.hpp @@ -113,7 +113,7 @@ 
struct BalloonClustering { auto state = randPool.get_state(); do { root = state.rand(numRows); - } while (!Kokkos::atomic_compare_exchange_strong(&vertClusters(root), numClusters, i)); + } while (numClusters != Kokkos::atomic_compare_exchange(&vertClusters(root), numClusters, i)); randPool.free_state(state); distances(root) = 0; pressure(root) = 1; @@ -125,7 +125,7 @@ struct BalloonClustering { nnz_lno_t cluster = state.rand(numClusters); randPool.free_state(state); vertClusters(i) = cluster; - Kokkos::atomic_increment(&clusterCounts(cluster)); + Kokkos::atomic_inc(&clusterCounts(cluster)); distances(i) = numRows; pressure(i) = 0.1; } @@ -175,8 +175,8 @@ struct BalloonClustering { // this cluster will take over weakNei if (vertLocks.set(i)) { if (vertLocks.set(weakNei)) { - Kokkos::atomic_increment(&clusterCounts(cluster)); - if (weakNeiCluster != numClusters) Kokkos::atomic_decrement(&clusterCounts(weakNeiCluster)); + Kokkos::atomic_inc(&clusterCounts(cluster)); + if (weakNeiCluster != numClusters) Kokkos::atomic_dec(&clusterCounts(weakNeiCluster)); vertClusters(weakNei) = cluster; pressure(i) -= pressure(weakNei); pressure(weakNei) = pressure(i); diff --git a/sparse/impl/KokkosSparse_sort_crs_impl.hpp b/sparse/impl/KokkosSparse_sort_crs_impl.hpp index 5e18c3fd5c..a90885a9fa 100644 --- a/sparse/impl/KokkosSparse_sort_crs_impl.hpp +++ b/sparse/impl/KokkosSparse_sort_crs_impl.hpp @@ -329,11 +329,32 @@ Kokkos::View computeEntryPerm } // Heuristic for choosing bulk sorting algorithm -template +template bool useBulkSortHeuristic(Ordinal avgDeg, Ordinal maxDeg) { - // Use bulk sort if matrix is highly imbalanced, - // OR the longest rows have many entries. - return (maxDeg / 10 > avgDeg) || (maxDeg > 1024); + // Issue 2352: the KokkosSparse::sort_crs_matrix uses Kokkos::Experimental::sort_by_key when this returns true. 
+ // sort_by_key executes on the host when a thrust-like library is not available, which really kills the performance in + // a scenario where the bulk sort algorithm would otherwise be appropriate. Additionally, on MI300A, sorting via + // ROCTHRUST was observed to be ~3x slower than the Kokkos Kernels native implementation on some matrices of interest, + // so on that architecture only, always bypass bulk sort. Bulk sort is bypassed when: + // * GPU execution space, SYCL is enabled, but ONEDPL does not have sort_by_key + // * GPU execution space, HIP is enabled, but no ROCTHRUST + // * GPU execution space, HIP is enabled, and GPU is GFX942 + // (Kokkos seems to require thrust when CUDA is enabled) + if constexpr (KokkosKernels::Impl::is_gpu_exec_space_v) { +#if (defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ONEDPL_HAS_SORT_BY_KEY)) || \ + (defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_ROCTHRUST)) || \ + (defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_ARCH_AMD_GFX942)) + return false; +#else + // Use bulk sort if matrix is highly imbalanced, + // OR the longest rows have many entries. 
+ return (maxDeg / 10 > avgDeg) || (maxDeg > 1024); + } } #endif diff --git a/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp b/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp index 1d4ccee220..a5367bc562 100644 --- a/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp @@ -145,6 +145,8 @@ struct SPADD_NUMERIC >, \ false, true>; +#include + #define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct SPADD_NUMERIC< \ diff --git a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp index bde3b02531..9ba03dae9a 100644 --- a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp @@ -342,7 +342,7 @@ template ()>::type* = nullptr) { + typename std::enable_if>::type* = nullptr) { using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; using range_type = Kokkos::RangePolicy; @@ -361,7 +361,7 @@ template ()>::type* = nullptr) { + typename std::enable_if>::type* = nullptr) { using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; using RangePol = Kokkos::RangePolicy; diff --git a/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp b/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp index f5286fd9d9..ca6898dbc9 100644 --- a/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp @@ -117,6 +117,8 @@ struct SPADD_SYMBOLIC >, \ false, true>; +#include + #define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPADD_SYMBOLIC< \ diff --git a/sparse/impl/KokkosSparse_spgemm_impl.hpp b/sparse/impl/KokkosSparse_spgemm_impl.hpp index 228840381e..99541537fb 100644 --- 
a/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -650,7 +650,7 @@ class KokkosSPGEMM { // chunk (no contention) template size_t compute_num_pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks) { - if (!KokkosKernels::Impl::kk_is_gpu_exec_space()) return ideal_num_chunks; + if (!KokkosKernels::Impl::is_gpu_exec_space_v) return ideal_num_chunks; size_t free_byte, total_byte; KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); size_t required_size = ideal_num_chunks * chunk_bytes; diff --git a/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp b/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp index bfbdaa9a45..0fb1a618ca 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_compression.hpp @@ -523,7 +523,7 @@ struct KokkosSPGEMMhandle->get_handle_exec_space(); - constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; // get the suggested vectorlane size based on the execution space, and average // number of nnzs per row. int suggested_vector_size = this->handle->get_suggested_vector_size(n, nnz); @@ -723,7 +723,7 @@ bool KokkosSPGEMM()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { #ifndef KOKKOSKERNELSMOREMEM size_type max_row_nnz = 0; KokkosKernels::Impl::view_reduce_maxsizerow(n, in_row_map, max_row_nnz); diff --git a/sparse/impl/KokkosSparse_spgemm_impl_def.hpp b/sparse/impl/KokkosSparse_spgemm_impl_def.hpp index 86f25ba47d..106c5825ff 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_def.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_def.hpp @@ -60,7 +60,7 @@ void KokkosSPGEMMhandle->get_spgemm_handle()->get_compression_step(); // compress in single step if it is GPU. 
- if (KokkosKernels::Impl::kk_is_gpu_exec_space()) compress_in_single_step = true; + if (KokkosKernels::Impl::is_gpu_exec_space_v) compress_in_single_step = true; // compressed B fields. row_lno_temp_work_view_t new_row_mapB(Kokkos::view_alloc(Kokkos::WithoutInitializing, "new row map"), n + 1); diff --git a/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp b/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp index 8d81117f93..cf61676b5d 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_kkmem.hpp @@ -692,9 +692,9 @@ struct KokkosSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -714,9 +714,9 @@ struct KokkosSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -737,9 +737,10 @@ struct KokkosSPGEMMspgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == this->spgemm_algorithm) { - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { // then chose the best method and parameters. 
size_type average_row_nnz = 0; size_t average_row_flops = 0; @@ -1308,7 +1310,7 @@ void KokkosSPGEMM()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) { tmp_max_nnz = 1; } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) { @@ -1358,7 +1360,7 @@ void KokkosSPGEMM()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1390,7 +1392,7 @@ void KokkosSPGEMM()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) { if (thread_shmem_key_size <= 0) { std::cout << "KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: " @@ -1516,7 +1518,7 @@ void KokkosSPGEMM()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; } @@ -1548,7 +1550,7 @@ void KokkosSPGEMM()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { Kokkos::parallel_for( "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY2", gpu_team_policy_t(a_row_cnt / team_row_chunk_size + 1, suggested_team_size, suggested_vector_size), sc); diff --git a/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp b/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp index 6dc365b58e..3a82679562 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp @@ -428,7 +428,7 @@ void KokkosSPGEMM()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { // allocate memory for begins and next to be used by the hashmap nnz_lno_temp_work_view_t beginsC(Kokkos::view_alloc(Kokkos::WithoutInitializing, "C keys"), valuesC_.extent(0)); nnz_lno_temp_work_view_t nextsC(Kokkos::view_alloc(Kokkos::WithoutInitializing, "C nexts"), valuesC_.extent(0)); diff --git a/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp b/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp index ccf747883b..8a6e8d0a62 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp +++ 
b/sparse/impl/KokkosSparse_spgemm_impl_symbolic.hpp @@ -1281,7 +1281,7 @@ void KokkosSPGEMMspgemm_algorithm; - constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); if (exec_gpu) { current_spgemm_algorithm = SPGEMM_KK_MEMORY; @@ -1523,7 +1523,7 @@ void KokkosSPGEMMspgemm_algorithm; - constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space = this->handle->get_handle_exec_space(); if (exec_gpu) { current_spgemm_algorithm = SPGEMM_KK_MEMORY; diff --git a/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp b/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp index 5b0352d99b..d9ca251e89 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_triangle.hpp @@ -1165,7 +1165,7 @@ void KokkosSPGEMM< nnz_lno_t *entriesC, // null if it is symbolic, otherwise not null! 
struct_visit_t visit_applier) { bool apply_compression = this->handle->get_spgemm_handle()->get_compression(); - constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; const nnz_lno_t *min_result_row_for_each_row = this->handle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = this->handle->get_spgemm_handle()->get_max_result_nnz( @@ -1468,7 +1468,7 @@ template void KokkosSPGEMM::KokkosSPGEMM_symbolic_triangle_setup() { - constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; nnz_lno_t n = this->row_mapB.extent(0) - 1; size_type nnz = this->entriesB.extent(0); diff --git a/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp b/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp index 298af30aea..5da138f04a 100644 --- a/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp +++ b/sparse/impl/KokkosSparse_spgemm_impl_triangle_no_compression.hpp @@ -778,7 +778,7 @@ void KokkosSPGEMMhandle->get_spgemm_handle()->get_min_col_of_row().data(); nnz_lno_t max_row_size = this->handle->get_spgemm_handle()->get_max_result_nnz(); - constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; typedef KokkosKernels::Impl::UniformMemoryPool pool_memory_space; int suggested_vector_size = this->handle->get_suggested_vector_size(this->b_row_cnt, bnnz); diff --git a/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp b/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp index 3730c6ec51..5642a9f11a 100644 --- a/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp +++ b/sparse/impl/KokkosSparse_spgemm_jacobi_sparseacc_impl.hpp @@ -591,7 +591,7 @@ struct KokkosSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -835,9 +835,9 @@ struct 
KokkosSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -856,7 +856,7 @@ struct KokkosSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -960,9 +961,9 @@ struct KokkosSPGEMM max_first_level_hash_size) insert_is_on = false; fail = 0; break; @@ -980,7 +981,7 @@ struct KokkosSPGEMM; - constexpr bool exec_gpu = KokkosKernels::Impl::kk_is_gpu_exec_space(); + constexpr bool exec_gpu = KokkosKernels::Impl::is_gpu_exec_space_v; if (KOKKOSKERNELS_VERBOSE) { std::cout << "\tSPARSE ACC MODE" << std::endl; } diff --git a/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp b/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp index 65ab7b4cc8..7dbc3083a4 100644 --- a/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp +++ b/sparse/impl/KokkosSparse_spgemm_jacobi_spec.hpp @@ -200,6 +200,8 @@ struct SPGEMM_JACOBI >, \ false, true>; +#include + #define KOKKOSSPARSE_SPGEMM_JACOBI_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct SPGEMM_JACOBI< \ diff --git a/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp b/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp index bdbcbe1e85..e80d9dfb26 100644 --- a/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp +++ b/sparse/impl/KokkosSparse_spgemm_noreuse_spec.hpp @@ -120,6 +120,8 @@ struct SPGEMM_NOREUSE, const OFFSET_TYPE>, \ false, true>; +#include + #define KOKKOSSPARSE_SPGEMM_NOREUSE_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct SPGEMM_NOREUSE< \ diff --git a/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp b/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp index 18e35c1df8..460033e950 100644 --- a/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp @@ -173,6 +173,8 @@ struct SPGEMM_NUMERIC >, \ false, true>; +#include + #define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ 
EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPGEMM_NUMERIC< \ diff --git a/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp b/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp index 971ddec716..117492e21d 100644 --- a/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp @@ -152,6 +152,8 @@ struct SPGEMM_SYMBOLIC >, \ false, true>; +#include + #define KOKKOSSPARSE_SPGEMM_SYMBOLIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPGEMM_SYMBOLIC< \ diff --git a/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp b/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp index cd16aa0f4f..92c9b80753 100644 --- a/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp @@ -181,6 +181,8 @@ struct SPILUK_NUMERIC >, \ false, true>; +#include + #define KOKKOSSPARSE_SPILUK_NUMERIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPILUK_NUMERIC< \ diff --git a/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp b/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp index 622e929dfc..433c9c4e80 100644 --- a/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_spiluk_symbolic_spec.hpp @@ -131,6 +131,8 @@ struct SPILUK_SYMBOLIC >, \ false, true>; +#include + #define KOKKOSSPARSE_SPILUK_SYMBOLIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPILUK_SYMBOLIC< \ diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index 0223087f1f..d9702af900 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -589,7 +589,7 @@ struct BSR_GEMV_Functor { // template ()>::type + typename std::enable_if>::type * = nullptr> void 
spMatVec_no_transpose( const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, @@ -636,8 +636,8 @@ void spMatVec_no_transpose( // template ()>::type - * = nullptr> + typename std::enable_if>::type * = + nullptr> void spMatVec_no_transpose( const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix, AS> &A, @@ -839,7 +839,7 @@ struct BSR_GEMV_Transpose_Functor { /// trivial serial impl used) template ()>::type + typename std::enable_if>::type * = nullptr> void spMatVec_transpose( const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, @@ -885,8 +885,8 @@ void spMatVec_transpose( // spMatVec_transpose: version for GPU execution spaces (TeamPolicy used) // template ()>::type - * = nullptr> + typename std::enable_if>::type * = + nullptr> void spMatVec_transpose(const typename AMatrix::execution_space &exec, Handle *handle, const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { if (A.numRows() <= 0) { @@ -1093,7 +1093,7 @@ struct BSR_GEMM_Functor { // template ()>::type + typename std::enable_if>::type * = nullptr> void spMatMultiVec_no_transpose( const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, @@ -1139,8 +1139,8 @@ void spMatMultiVec_no_transpose( // template ()>::type - * = nullptr> + typename std::enable_if>::type * = + nullptr> void spMatMultiVec_no_transpose( const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix, AS> &A, @@ -1352,7 +1352,7 @@ struct BSR_GEMM_Transpose_Functor { /// (RangePolicy or trivial serial impl used) template ()>::type + typename std::enable_if>::type * = nullptr> void spMatMultiVec_transpose( const execution_space &exec, Handle *handle, const AlphaType &alpha, @@ -1391,7 +1391,7 @@ void spMatMultiVec_transpose( // template ()>::type * = nullptr> + typename 
std::enable_if>::type * = nullptr> void spMatMultiVec_transpose(const execution_space &exec, Handle *handle, const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { if (A.numRows() <= 0) { diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 1c4564f1ba..b6ba3e5b88 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -134,7 +134,7 @@ struct SPMV_BSRMATRIX() || handle->algo == SPMV_BSR_V42) { + if (KokkosKernels::Impl::is_gpu_exec_space_v || handle->algo == SPMV_BSR_V42) { if (modeIsNoTrans) { ::KokkosSparse::Impl::apply_v42(space, alpha, A, X, beta, Y); return; @@ -258,7 +258,7 @@ struct SPMV_MV_BSRMATRIX() || handle->algo == SPMV_BSR_V42) { + if (KokkosKernels::Impl::is_gpu_exec_space_v || handle->algo == SPMV_BSR_V42) { if (modeIsNoTrans) { ::KokkosSparse::Impl::apply_v42(space, alpha, A, X, beta, Y); return; @@ -321,6 +321,8 @@ struct SPMV_MV_BSRMATRIX>, \ false, true>; +#include + #define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPMV_BSRMATRIX< \ @@ -352,6 +354,8 @@ struct SPMV_MV_BSRMATRIX>, \ std::is_integral_v, false, true>; +#include + #define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPMV_MV_BSRMATRIX< \ diff --git a/sparse/impl/KokkosSparse_spmv_impl.hpp b/sparse/impl/KokkosSparse_spmv_impl.hpp index 5fefcb897d..ce1bd9c3a4 100644 --- a/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -187,7 +187,7 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th // Determine rows per thread if (rows_per_thread < 1) { - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) + if 
(KokkosKernels::Impl::is_gpu_exec_space_v) rows_per_thread = 1; else { if (nnz_per_row < 20 && nnz > 5000000) { @@ -198,7 +198,7 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th } if (team_size < 1) { - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { team_size = 256 / vector_length; } else { team_size = 1; @@ -220,7 +220,7 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, int64_t rows_per_th // spmv_beta_no_transpose: version for CPU execution spaces (RangePolicy or // trivial serial impl used) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_beta_no_transpose(const execution_space& exec, Handle* handle, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { @@ -334,7 +334,7 @@ static void spmv_beta_no_transpose(const execution_space& exec, Handle* handle, // spmv_beta_no_transpose: version for GPU execution spaces (TeamPolicy used) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_beta_no_transpose(const execution_space& exec, Handle* handle, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { @@ -380,7 +380,7 @@ static void spmv_beta_no_transpose(const execution_space& exec, Handle* handle, // spmv_beta_transpose: version for CPU execution spaces (RangePolicy or trivial // serial impl used) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_beta_transpose(const execution_space& exec, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { @@ -460,7 +460,7 @@ static void spmv_beta_transpose(const execution_space& exec, typename YVector::c // spmv_beta_transpose: version for GPU 
execution spaces (TeamPolicy used) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_beta_transpose(const execution_space& exec, typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { @@ -1005,7 +1005,7 @@ struct SPMV_MV_LayoutLeft_Functor { // spmv_alpha_beta_mv_no_transpose: version for CPU execution spaces // (RangePolicy) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_alpha_beta_mv_no_transpose(const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, @@ -1054,7 +1054,7 @@ static void spmv_alpha_beta_mv_no_transpose(const execution_space& exec, // spmv_alpha_beta_mv_no_transpose: version for GPU execution spaces // (TeamPolicy) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_alpha_beta_mv_no_transpose(const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, @@ -1119,7 +1119,7 @@ static void spmv_alpha_beta_mv_no_transpose(const execution_space& exec, // spmv_alpha_beta_mv_transpose: version for CPU execution spaces (RangePolicy) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_alpha_beta_mv_transpose(const execution_space& exec, const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, @@ -1160,7 +1160,7 @@ static void spmv_alpha_beta_mv_transpose(const execution_space& exec, // spmv_alpha_beta_mv_transpose: version for GPU execution spaces (TeamPolicy) template ()>::type* = nullptr> + typename std::enable_if>::type* = nullptr> static void spmv_alpha_beta_mv_transpose(const execution_space& exec, 
const typename YVector::non_const_value_type& alpha, const AMatrix& A, const XVector& x, const typename YVector::non_const_value_type& beta, diff --git a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp index 0db896adec..b74bcc5e20 100644 --- a/sparse/impl/KokkosSparse_spmv_impl_merge.hpp +++ b/sparse/impl/KokkosSparse_spmv_impl_merge.hpp @@ -291,7 +291,7 @@ struct SpmvMergeHierarchical { const A_size_type pathLength = A.numRows() + A.nnz(); A_size_type pathLengthThreadChunk; int teamSize; - if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if constexpr (KokkosKernels::Impl::is_gpu_exec_space_v) { pathLengthThreadChunk = 4; teamSize = 128; } else { @@ -315,7 +315,7 @@ struct SpmvMergeHierarchical { using GpuOp = SpmvMergeImplFunctor; using CpuOp = SpmvMergeImplFunctor; using Op = - typename std::conditional(), GpuOp, CpuOp>::type; + typename std::conditional, GpuOp, CpuOp>::type; Op op(alpha, A, x, y, pathLengthThreadChunk); Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); } else if (KokkosSparse::Conjugate[0] == mode[0]) { @@ -323,7 +323,7 @@ struct SpmvMergeHierarchical { using GpuOp = SpmvMergeImplFunctor; using CpuOp = SpmvMergeImplFunctor; using Op = - typename std::conditional(), GpuOp, CpuOp>::type; + typename std::conditional, GpuOp, CpuOp>::type; Op op(alpha, A, x, y, pathLengthThreadChunk); Kokkos::parallel_for("SpmvMergeHierarchical::spmv", policy, op); } else { diff --git a/sparse/impl/KokkosSparse_spmv_spec.hpp b/sparse/impl/KokkosSparse_spmv_spec.hpp index a03b15241b..d9a0175ea6 100644 --- a/sparse/impl/KokkosSparse_spmv_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_spec.hpp @@ -222,6 +222,8 @@ struct SPMV_MV>, \ false, true>; +#include + #define KOKKOSSPARSE_SPMV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct SPMV< \ @@ -248,6 +250,8 @@ struct SPMV_MV>, \ std::is_integral_v, false, true>; +#include + #define 
KOKKOSSPARSE_SPMV_MV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct SPMV_MV< \ diff --git a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp index 80b2c908f7..e3283fe532 100644 --- a/sparse/impl/KokkosSparse_spmv_struct_impl.hpp +++ b/sparse/impl/KokkosSparse_spmv_struct_impl.hpp @@ -617,7 +617,7 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, int nnz_ // Determine rows per thread if (rows_per_thread < 1) { - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) + if (KokkosKernels::Impl::is_gpu_exec_space_v) rows_per_thread = 1; else { if (nnz_per_row < 20 && numInterior * nnz_per_row > 5000000) { @@ -628,7 +628,7 @@ int64_t spmv_struct_launch_parameters(int64_t numInterior, int64_t nnz, int nnz_ } if (team_size < 1) { - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (KokkosKernels::Impl::is_gpu_exec_space_v) { team_size = 128 / vector_length; } else { team_size = 1; diff --git a/sparse/impl/KokkosSparse_spmv_struct_spec.hpp b/sparse/impl/KokkosSparse_spmv_struct_spec.hpp index 8770f51ea0..ac0bc580f4 100644 --- a/sparse/impl/KokkosSparse_spmv_struct_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_struct_spec.hpp @@ -232,6 +232,8 @@ struct SPMV_MV_STRUCT>, \ false, true>; +#include + #define KOKKOSSPARSE_SPMV_STRUCT_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct SPMV_STRUCT< \ @@ -256,6 +258,8 @@ struct SPMV_MV_STRUCT>, \ std::is_integral_v, false, true>; +#include + #define KOKKOSSPARSE_SPMV_MV_STRUCT_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPMV_MV_STRUCT< \ diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 6dbc3039da..3afeadc559 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ 
b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -79,7 +79,7 @@ struct SptrsvWrap { using range_type = Kokkos::pair; // Tag structs - struct UnsortedTag {}; // This doesn't appear to be supported + struct UnsortedTag {}; struct LargerCutoffTag {}; struct UnsortedLargerCutoffTag {}; @@ -115,7 +115,9 @@ struct SptrsvWrap { RHSType rhs; entries_t nodes_grouped_by_level; - using reftype = scalar_t &; + using reftype = scalar_t &; + using ArrayType = reftype; + using SumArray = reftype; struct SBlock { template @@ -141,6 +143,16 @@ struct SptrsvWrap { KOKKOS_INLINE_FUNCTION size_type get_block_size() const { return 0; } + // multiply_subtract. C -= A * B + KOKKOS_INLINE_FUNCTION + static void multiply_subtract(const scalar_t &a, const scalar_t &b, scalar_t &c) { c -= a * b; } + + KOKKOS_INLINE_FUNCTION + static void copy(const member_type &, scalar_t &, const scalar_t &) {} + + KOKKOS_INLINE_FUNCTION + static void copy(scalar_t &, const scalar_t &) {} + // lget KOKKOS_INLINE_FUNCTION scalar_t &lget(const size_type row) const { return lhs(row); } @@ -195,6 +207,60 @@ struct SptrsvWrap { using reftype = Vector; + struct ArrayType { + scalar_t m_data[MAX_VEC_SIZE]; + + KOKKOS_INLINE_FUNCTION + ArrayType() { init(); } + + KOKKOS_INLINE_FUNCTION + ArrayType(const ArrayType &rhs_) { + for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] = rhs_.m_data[i]; + } + + KOKKOS_INLINE_FUNCTION + ArrayType(const Vector &) { init(); } + + KOKKOS_INLINE_FUNCTION + void init() { + for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] = 0; + } + + KOKKOS_INLINE_FUNCTION + ArrayType &operator+=(const ArrayType &rhs_) { + for (size_type i = 0; i < MAX_VEC_SIZE; ++i) m_data[i] += rhs_.m_data[i]; + return *this; + } + }; + + struct SumArray { + using reducer = SumArray; + using value_type = ArrayType; + using result_view_type = Kokkos::View; + + private: + value_type &m_value; + + public: + KOKKOS_INLINE_FUNCTION + SumArray(value_type &value) : m_value(value) {} + + 
KOKKOS_INLINE_FUNCTION + void join(value_type &dest, const value_type &src) const { dest += src; } + + KOKKOS_INLINE_FUNCTION + void init(value_type &val) const { val.init(); } + + KOKKOS_INLINE_FUNCTION + value_type &reference() const { return m_value; } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return result_view_type(&m_value, 1); } + + KOKKOS_INLINE_FUNCTION + bool reference_scalar() const { return true; } + }; + RowMapType row_map; EntriesType entries; ValuesType values; @@ -215,6 +281,7 @@ struct SptrsvWrap { block_size(block_size_), block_items(block_size * block_size) { KK_REQUIRE_MSG(block_size > 0, "Tried to use block_size=0 with the blocked Common?"); + KK_REQUIRE_MSG(block_size <= MAX_VEC_SIZE, "Max supported block size is " << MAX_VEC_SIZE); } KOKKOS_INLINE_FUNCTION @@ -257,17 +324,17 @@ struct SptrsvWrap { team.team_barrier(); KokkosBatched::TeamLU::invoke(team, LU); - // A = LU - // A^-1 = U^-1 * L^-1 - // b = (b * U^-1) * L^-1, so do U trsv first + // Ax = LUx = Lz = b, we use the change of variable z = U*x + // z = L^-1 * b, first we solve for z, storing the result back into b + // x = U^-1 * z, second we solve for x, again storing the result back into b + team.team_barrier(); + KokkosBatched::TeamTrsv::invoke(team, 1.0, LU, b); + team.team_barrier(); KokkosBatched::TeamTrsv::invoke(team, 1.0, LU, b); - - team.team_barrier(); - KokkosBatched::TeamTrsv::invoke(team, 1.0, LU, b); } // serial divide. b /= A (b = b * A^-1) @@ -278,21 +345,44 @@ struct SptrsvWrap { // Need a temp block to do LU of A const auto block_size_ = b.size(); - KK_KERNEL_REQUIRE_MSG(block_size_ <= MAX_VEC_SIZE, - "Max supported block size for range-policy is 16. 
Use team-policy alg if you need more."); - Block LU(&buff[0], block_size_, block_size_); assign(LU, A); KokkosBatched::SerialLU::invoke(LU); - // A = LU - // A^-1 = U^-1 * L^-1 - // b = (b * U^-1) * L^-1, so do U trsv first + // Ax = LUx = Lz = b, we use the change of variable z = U*x + // z = L^-1 * b, first we solve for z, storing the result back into b + // x = U^-1 * z, second we solve for x, again storing the result back into b + KokkosBatched::SerialTrsv::invoke(1.0, LU, b); + KokkosBatched::SerialTrsv::invoke(1.0, LU, b); + } - KokkosBatched::SerialTrsv::invoke(1.0, LU, b); + // multiply_subtract. C -= A * B + KOKKOS_INLINE_FUNCTION + static void multiply_subtract(const CBlock &A, const CVector &b, ArrayType &ca) { + Vector c(&ca.m_data[0], b.size()); + multiply_subtract(A, b, c); + } + + KOKKOS_INLINE_FUNCTION + static void multiply_subtract(const CBlock &A, const CVector &b, Vector &c) { + // Use gemv. alpha is hardcoded to -1, beta hardcoded to 1 + KokkosBlas::SerialGemv::invoke(-1.0, A, b, 1.0, + c); + } + + KOKKOS_INLINE_FUNCTION + static void copy(const member_type &team, const Vector &lhs_, ArrayType &rhsa) { + CVector rhs_(&rhsa.m_data[0], lhs_.size()); + assign(team, lhs_, rhs_); + } + + KOKKOS_INLINE_FUNCTION + static void copy(const Vector &lhs_, ArrayType &rhsa) { + CVector rhs_(&rhsa.m_data[0], lhs_.size()); + assign(lhs_, rhs_); } // lget @@ -331,58 +421,69 @@ struct SptrsvWrap { */ template struct Intermediate : public Common { - using Base = Common; + using Base = Common; + using accum_t = std::conditional_t; Intermediate(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const entries_t &nodes_grouped_by_level_, const size_type block_size_ = 0) : Base(row_map_, entries_, values_, lhs_, rhs_, nodes_grouped_by_level_, block_size_) {} - struct ReduceFunctorBasic { + struct ReduceSumFunctor { const Base *m_obj; + const lno_t rowid; + lno_t diag; KOKKOS_INLINE_FUNCTION - 
ReduceFunctorBasic(const Base *obj, const lno_t = 0) : m_obj(obj) {} - - KOKKOS_INLINE_FUNCTION - static void multiply_subtract(const scalar_t &val, const scalar_t &lhs_col_val, scalar_t &accum) { - accum -= val * lhs_col_val; - } - - KOKKOS_INLINE_FUNCTION - void operator()(size_type i, scalar_t &accum) const { + void operator()(size_type i, accum_t &accum) const { const auto colid = m_obj->entries(i); - multiply_subtract(m_obj->vget(i), m_obj->lget(colid), accum); + auto val = m_obj->vget(i); + auto lhs_colid = m_obj->lget(colid); + // accum -= val * lhs_colid; + if constexpr (BlockEnabled) { + accum_t temp; + Base::multiply_subtract(val, lhs_colid, temp); + accum += temp; + } else { + Base::multiply_subtract(val, lhs_colid, accum); + } + KK_KERNEL_ASSERT_MSG(colid != rowid, "Should not have hit diag"); } }; - struct ReduceFunctorBlock : public ReduceFunctorBasic { - using P = ReduceFunctorBasic; - - const size_type block_size; - const size_type b; - - KOKKOS_INLINE_FUNCTION - ReduceFunctorBlock(const Base *obj, const size_type block_size_, const size_type b_, const lno_t = 0) - : P(obj), block_size(block_size_), b(b_) {} + struct ReduceSumDiagFunctor { + const Base *m_obj; + const lno_t rowid; + mutable lno_t diag; KOKKOS_INLINE_FUNCTION - void operator()(size_type i, scalar_t &accum) const { - const auto idx = i / block_size; - const auto colid = P::m_obj->entries(idx); - P::multiply_subtract(P::m_obj->vget(idx)(b, i % block_size), P::m_obj->lget(colid)(b), accum); + void operator()(size_type i, accum_t &accum) const { + const auto colid = m_obj->entries(i); + if (colid != rowid) { + auto val = m_obj->vget(i); + auto lhs_colid = m_obj->lget(colid); + // accum -= val * lhs_colid; + if constexpr (BlockEnabled) { + accum_t temp; + Base::multiply_subtract(val, lhs_colid, temp); + accum += temp; + } else { + Base::multiply_subtract(val, lhs_colid, accum); + } + } else { + diag = i; + } } }; - /** - * If we want to support Unsorted, we'll need a Functor that returns 
the ptr - * of the diag item (colid == rowid). Possibly via multi-reduce? The UnsortedTag - * is defined above but no policies actually use it. - */ - template KOKKOS_INLINE_FUNCTION void solve_impl(const member_type *team, const int my_rank, const long node_count) const { + using reduce_item_t = typename Base::ArrayType; + using reducer_t = typename Base::SumArray; + using functor_t = std::conditional_t; + + static_assert(!((!IsSerial && BlockEnabled) && UseThreadVec), + "ThreadVectorRanges are not yet supported for block-enabled"); static_assert(!(IsSerial && UseThreadVec), "Requested thread vector range in serial?"); - static_assert(IsSorted, "Unsorted is not yet supported."); const auto rowid = Base::nodes_grouped_by_level(my_rank + node_count); const auto soffset = Base::row_map(rowid); @@ -394,76 +495,58 @@ struct SptrsvWrap { const auto itr_e = eoffset - (IsSorted ? (IsLower ? 1 : 0) : 0); // We don't need the reducer to find the diag item if sorted + functor_t rf{this, rowid, -1}; typename Base::reftype lhs_val = Base::lget(rowid); - - const auto block_size_ = BlockEnabled ? 
Base::get_block_size() : 1; - (void)block_size_; // Some settings do not use this var + reduce_item_t reduce = lhs_val; if constexpr (IsSerial) { KK_KERNEL_ASSERT_MSG(my_rank == 0, "Non zero rank in serial"); KK_KERNEL_ASSERT_MSG(team == nullptr, "Team provided in serial?"); - if constexpr (BlockEnabled) { - for (size_type b = 0; b < block_size_; ++b) { - ReduceFunctorBlock rf(this, block_size_, b, rowid); - for (size_type i = itr_b * block_size_; i < itr_e * block_size_; ++i) { - rf(i, lhs_val(b)); - } - } - } else { - ReduceFunctorBasic rf(this, rowid); - for (size_type i = itr_b; i < itr_e; ++i) { - rf(i, lhs_val); - } + for (auto ptr = itr_b; ptr < itr_e; ++ptr) { + rf(ptr, reduce); } + Base::copy(lhs_val, reduce); } else { KK_KERNEL_ASSERT_MSG(team != nullptr, "Cannot do team operations without team"); if constexpr (!UseThreadVec) { - if constexpr (BlockEnabled) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(*team, block_size_), [&](size_type b) { - ReduceFunctorBlock rf(this, block_size_, b, rowid); - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b * block_size_, itr_e * block_size_), rf, - lhs_val(b)); - }); - } else { - ReduceFunctorBasic rf(this, rowid); - Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), rf, lhs_val); - } + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(*team, itr_b, itr_e), rf, reducer_t(reduce)); + team->team_barrier(); + Base::copy(*team, lhs_val, reduce); team->team_barrier(); } else { - if constexpr (BlockEnabled) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(*team, block_size_), [&](size_type b) { - ReduceFunctorBlock rf(this, block_size_, b, rowid); - for (size_type i = itr_b * block_size_; i < itr_e * block_size_; ++i) { - rf(i, lhs_val(b)); - } - }); - } else { - ReduceFunctorBasic rf(this, rowid); - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, lhs_val); - } + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(*team, itr_b, itr_e), rf, reducer_t(reduce)); + 
Base::copy(lhs_val, reduce); } } // If sorted, we already know the diag. Otherwise, get it from the reducer - const lno_t diag = IsLower ? eoffset - 1 : soffset; + rf.diag = IsSorted ? (IsLower ? eoffset - 1 : soffset) : rf.diag; // At end, handle the diag element. We need to be careful to avoid race // conditions here. if constexpr (IsSerial) { // Serial case is easy, there's only 1 thread so just do the // add_and_divide - KK_KERNEL_ASSERT_MSG(diag != -1, "Serial should always know diag"); - Base::add_and_divide(lhs_val, rhs_val, Base::vget(diag)); + KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Serial should always know diag"); + Base::add_and_divide(lhs_val, rhs_val, Base::vget(rf.diag)); } else { - // Parallel sorted case is complex. All threads know what the diag is. - // If we have a team sharing the work, we need to ensure only one - // thread performs the add_and_divide (except in BlockEnabled, then - // we can use team operations). - KK_KERNEL_ASSERT_MSG(diag != -1, "Sorted should always know diag"); - if constexpr (!UseThreadVec) { - Base::add_and_divide(*team, lhs_val, rhs_val, Base::vget(diag)); + if constexpr (IsSorted) { + // Parallel sorted case is complex. All threads know what the diag is. + // If we have a team sharing the work, we need to ensure only one + // thread performs the add_and_divide (except in BlockEnabled, then + // we can use team operations). + KK_KERNEL_ASSERT_MSG(rf.diag != -1, "Sorted should always know diag"); + if constexpr (!UseThreadVec) { + Base::add_and_divide(*team, lhs_val, rhs_val, Base::vget(rf.diag)); + } else { + Base::add_and_divide(lhs_val, rhs_val, Base::vget(rf.diag)); + } } else { - Base::add_and_divide(lhs_val, rhs_val, Base::vget(diag)); + // Parallel unsorted case. Only one thread should know what the diag + // item is. We have that one do the add_and_divide. 
+ if (rf.diag != -1) { + Base::add_and_divide(lhs_val, rhs_val, Base::vget(rf.diag)); + } } } } @@ -734,10 +817,10 @@ struct SptrsvWrap { const int nsrow = colptr(j1 + 1) - i1; // create a view for the s-th supernocal column - // NOTE: we currently supports only default_layout = LayoutLeft + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft scalar_t *dataL = const_cast(values.data()); - Kokkos::View viewL(&dataL[i1], nsrow, - nscol); + Kokkos::View viewL( + &dataL[i1], nsrow, nscol); // extract part of the solution, corresponding to the diagonal block auto Xj = Kokkos::subview(X, range_type(j1, j2)); @@ -776,8 +859,9 @@ struct SptrsvWrap { KokkosBlas::TeamGemv::invoke(team, one, Ljj, Y, zero, Xj); } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + Kokkos::View Xjj( + Xj.data(), nscol, 1); if (unit_diagonal) { KokkosBatched::TeamTrsm struct UpperTriSupernodalFunctor { - // NOTE: we currently supports only default_layout = LayoutLeft - using SupernodeView = typename Kokkos::View; + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + using SupernodeView = + typename Kokkos::View; bool invert_diagonal; const int *supercols; @@ -939,8 +1024,9 @@ struct SptrsvWrap { KokkosBlas::TeamGemv:: template invoke(team, one, Ujj, Y, zero, Xj); } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + Kokkos::View Xjj( + Xj.data(), nscol, 1); KokkosBatched::TeamTrsm::invoke(team, one, Ujj, Xjj); @@ -1030,10 +1116,10 @@ struct SptrsvWrap { const int nsrow2 = nsrow - nscol; // create a view of the s-th supernocal column of U - // NOTE: we currently supports only default_layout = LayoutLeft + // NOTE: we currently supports only 
KokkosKernels::default_layout = LayoutLeft scalar_t *dataU = const_cast(values.data()); - Kokkos::View viewU(&dataU[i1], nsrow, - nscol); + Kokkos::View viewU( + &dataU[i1], nsrow, nscol); // extract part of solution, corresponding to the diagonal block U(s, s) auto Xj = Kokkos::subview(X, range_type(j1, j2)); @@ -1069,8 +1155,9 @@ struct SptrsvWrap { KokkosBlas::TeamGemv::invoke(team, one, Ujj, Y, zero, Xj); } else { - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + Kokkos::View Xjj( + Xj.data(), nscol, 1); KokkosBatched::TeamTrsm::invoke(team, one, Ujj, Xjj); @@ -1280,8 +1367,9 @@ struct SptrsvWrap { timer.reset(); #endif - // NOTE: we currently supports only default_layout = LayoutLeft - using supernode_view_type = Kokkos::View; + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + using supernode_view_type = + Kokkos::View; if (diag_kernel_type_host(lvl) == 3) { // using device-level kernels (functor is called to scatter the // results) @@ -1340,9 +1428,10 @@ struct SptrsvWrap { KokkosBlas::gemv(space, "N", one, Ljj, Y, zero, Xj); } else { char unit_diag = (unit_diagonal ? 
'U' : 'N'); - // NOTE: we currently supports only default_layout = + // NOTE: we currently supports only KokkosKernels::default_layout = // LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); + Kokkos::View Xjj( + Xj.data(), nscol, 1); KokkosBlas::trsm(space, "L", "L", "N", &unit_diag, one, Ljj, Xjj); // TODO: space.fence(); Kokkos::fence(); @@ -1612,9 +1701,9 @@ struct SptrsvWrap { int workoffset = work_offset_host(s); // create a view for the s-th supernocal block column - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View viewU(&dataU[i1], nsrow, - nscol); + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + Kokkos::View viewU( + &dataU[i1], nsrow, nscol); if (invert_offdiagonal) { auto Uij = Kokkos::subview(viewU, range_type(0, nsrow), Kokkos::ALL()); @@ -1637,10 +1726,10 @@ struct SptrsvWrap { // instead of trmv/trsv KokkosBlas::gemv(space, "N", one, Ujj, Y, zero, Xj); } else { - // NOTE: we currently supports only default_layout = + // NOTE: we currently supports only KokkosKernels::default_layout = // LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, - 1); + Kokkos::View Xjj( + Xj.data(), nscol, 1); KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); } // update off-diagonal blocks @@ -1713,9 +1802,9 @@ struct SptrsvWrap { int workoffset = work_offset_host(s); // create a view for the s-th supernocal block column - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View viewU(&dataU[i1], nsrow, - nscol); + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + Kokkos::View viewU( + &dataU[i1], nsrow, nscol); // extract part of the solution, corresponding to the diagonal // block @@ -1741,9 +1830,10 @@ struct SptrsvWrap { if (invert_diagonal) { KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); } else { - // NOTE: we currently supports only default_layout = + // NOTE: we currently supports only KokkosKernels::default_layout = // LayoutLeft - 
Kokkos::View Xjj(Xj.data(), nscol, 1); + Kokkos::View Xjj( + Xj.data(), nscol, 1); KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); } } diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index 87cf72686c..de9f31dbd6 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -217,6 +217,8 @@ struct SPTRSV_SOLVE >, \ false, true>; +#include + #define KOKKOSSPARSE_SPTRSV_SOLVE_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct SPTRSV_SOLVE< \ diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp index 2229e90414..0b009f0e71 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp @@ -113,6 +113,8 @@ struct SPTRSV_SYMBOLIC >, \ false, true>; +#include + #define KOKKOSSPARSE_SPTRSV_SYMBOLIC_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct SPTRSV_SYMBOLIC< \ diff --git a/sparse/impl/KokkosSparse_trsv_impl.hpp b/sparse/impl/KokkosSparse_trsv_impl.hpp index 443a91ed02..d166024049 100644 --- a/sparse/impl/KokkosSparse_trsv_impl.hpp +++ b/sparse/impl/KokkosSparse_trsv_impl.hpp @@ -261,10 +261,8 @@ struct TrsvWrap { return; } - // Don't use r >= 0 as the test, because that fails if - // lno_t is unsigned. We do r == 0 (last - // iteration) below. - for (lno_t r = numRows - 1; r != 0; --r) { + // Iterate backwards with care due to potentially unsigned type + for (lno_t r = numRows - 1; r != static_cast(-1); --r) { const offset_type beg = ptr(r); const offset_type end = ptr(r + 1); for (offset_type k = beg; k < end; ++k) { @@ -275,20 +273,6 @@ struct TrsvWrap { } } // for each entry A_rc in the current row r } // for each row r - - // Last iteration: r = 0. 
- { - const lno_t r = 0; - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - for (offset_type k = beg; k < end; ++k) { - const scalar_t A_rc = val(k); - const lno_t c = ind(k); - for (lno_t j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } // for each entry A_rc in the current row r - } // last iteration: r = 0 } static void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { @@ -312,10 +296,8 @@ struct TrsvWrap { return; } - // Don't use r >= 0 as the test, because that fails if - // lno_t is unsigned. We do r == 0 (last - // iteration) below. - for (lno_t r = numRows - 1; r != 0; --r) { + // Iterate backwards with care due to potentially unsigned type + for (lno_t r = numRows - 1; r != static_cast(-1); --r) { const offset_type beg = ptr(r); const offset_type end = ptr(r + 1); auto A_rr = co.zero(); @@ -334,28 +316,6 @@ struct TrsvWrap { co.template divide(X, A_rr, r, j); } } // for each row r - - // Last iteration: r = 0. - { - const lno_t r = 0; - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - auto A_rr = co.zero(); - for (offset_type k = beg; k < end; ++k) { - const auto A_rc = co.get(val, k); - const lno_t c = ind(k); - if (r == c) { - co.pluseq(A_rr, A_rc); - } else { - for (lno_t j = 0; j < numVecs; ++j) { - co.gemv(X, A_rc, r, c, j); - } - } - } // for each entry A_rc in the current row r - for (lno_t j = 0; j < numVecs; ++j) { - co.template divide(X, A_rr, r, j); - } - } // last iteration: r = 0 } static void upperTriSolveCscUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { @@ -380,10 +340,8 @@ struct TrsvWrap { return; } - // Don't use c >= 0 as the test, because that fails if - // lno_t is unsigned. We do c == 0 (last - // iteration) below. 
- for (lno_t c = numCols - 1; c != 0; --c) { + // Iterate backwards with care due to potentially unsigned type + for (lno_t c = numCols - 1; c != static_cast(-1); --c) { const offset_type beg = ptr(c); const offset_type end = ptr(c + 1); for (offset_type k = beg; k < end; ++k) { @@ -394,20 +352,6 @@ struct TrsvWrap { } } // for each entry A_rc in the current column c } // for each column c - - // Last iteration: c = 0. - { - const lno_t c = 0; - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const scalar_t A_rc = val(k); - const lno_t r = ind(k); - for (lno_t j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } // for each entry A_rc in the current column c - } } static void upperTriSolveCsc(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { @@ -432,13 +376,11 @@ struct TrsvWrap { return; } - // Don't use c >= 0 as the test, because that fails if - // lno_t is unsigned. We do c == 0 (last - // iteration) below. - for (lno_t c = numCols - 1; c != 0; --c) { + // Iterate backwards with care due to potentially unsigned type + for (lno_t c = numCols - 1; c != static_cast(-1); --c) { const offset_type beg = ptr(c); const offset_type end = ptr(c + 1); - for (offset_type k = end - 1; k >= beg; --k) { + for (offset_type k = end - 1; k != beg - 1; --k) { const lno_t r = ind(k); const auto A_rc = val(k); /*(vqd 20 Jul 2020) This assumes that the diagonal entry @@ -454,19 +396,6 @@ struct TrsvWrap { } } // for each entry A_rc in the current column c } // for each column c - - // Last iteration: c = 0. - { - const offset_type beg = ptr(0); - const auto A_rc = val(beg); - /*(vqd 20 Jul 2020) This assumes that the diagonal entry - has equal local row and column indices. That may not - necessarily hold, depending on the row and column Maps. 
See - note above.*/ - for (lno_t j = 0; j < numVecs; ++j) { - X(0, j) = X(0, j) / A_rc; - } - } } static void lowerTriSolveCscUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { @@ -520,10 +449,8 @@ struct TrsvWrap { return; } - // Don't use c >= 0 as the test, because that fails if - // lno_t is unsigned. We do c == 0 (last - // iteration) below. - for (lno_t c = numCols - 1; c != 0; --c) { + // Iterate backwards with care due to potentially unsigned type + for (lno_t c = numCols - 1; c != static_cast(-1); --c) { const offset_type beg = ptr(c); const offset_type end = ptr(c + 1); for (offset_type k = beg; k < end; ++k) { @@ -534,20 +461,6 @@ struct TrsvWrap { } } // for each entry A_rc in the current column c } // for each column c - - // Last iteration: c = 0. - { - const lno_t c = 0; - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const lno_t r = ind(k); - const scalar_t A_rc = STS::conj(val(k)); - for (lno_t j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } // for each entry A_rc in the current column c - } } static void upperTriSolveCscConj(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { @@ -572,13 +485,11 @@ struct TrsvWrap { return; } - // Don't use c >= 0 as the test, because that fails if - // lno_t is unsigned. We do c == 0 (last - // iteration) below. 
- for (lno_t c = numCols - 1; c != 0; --c) { + // Iterate backwards with care due to potentially unsigned type + for (lno_t c = numCols - 1; c != static_cast(-1); --c) { const offset_type beg = ptr(c); const offset_type end = ptr(c + 1); - for (offset_type k = end - 1; k >= beg; --k) { + for (offset_type k = end - 1; k != beg - 1; --k) { const lno_t r = ind(k); const scalar_t A_rc = STS::conj(val(k)); /*(vqd 20 Jul 2020) This assumes that the diagonal entry @@ -594,19 +505,6 @@ struct TrsvWrap { } } // for each entry A_rc in the current column c } // for each column c - - // Last iteration: c = 0. - { - const offset_type beg = ptr(0); - const scalar_t A_rc = STS::conj(val(beg)); - /*(vqd 20 Jul 2020) This assumes that the diagonal entry - has equal local row and column indices. That may not - necessarily hold, depending on the row and column Maps. See - note above.*/ - for (lno_t j = 0; j < numVecs; ++j) { - X(0, j) = X(0, j) / A_rc; - } - } } static void lowerTriSolveCsc(RangeMultiVectorType X, const CrsMatrixType& A, DomainMultiVectorType Y) { diff --git a/sparse/impl/KokkosSparse_trsv_spec.hpp b/sparse/impl/KokkosSparse_trsv_spec.hpp index ff852f97e2..95c6d23c5f 100644 --- a/sparse/impl/KokkosSparse_trsv_spec.hpp +++ b/sparse/impl/KokkosSparse_trsv_spec.hpp @@ -174,6 +174,8 @@ struct TRSV >, \ false, true>; +#include + #define KOKKOSSPARSE_TRSV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template struct TRSV< \ diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 4d01e23a1d..f5e3fb4a00 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -223,7 +223,7 @@ class KokkosKernelsHandle { typename size_type_persistent_work_view_t::HostMirror size_type_persistent_work_host_view_t; // Host view type typedef typename Kokkos::View scalar_temp_work_view_t; typedef typename Kokkos::View scalar_persistent_work_view_t; - typedef typename 
Kokkos::View + typedef typename Kokkos::View scalar_persistent_work_view2d_t; typedef typename Kokkos::View nnz_lno_temp_work_view_t; typedef typename Kokkos::View nnz_lno_persistent_work_view_t; diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index d174aa9b31..1fecb3b7b9 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -21,8 +21,8 @@ /// This implements a local (no MPI) sparse matrix stored in block-by-block /// compressed row sparse format. -#ifndef KOKKOS_SPARSE_BSRMATRIX_HPP_ -#define KOKKOS_SPARSE_BSRMATRIX_HPP_ +#ifndef KOKKOSSPARSE_BSRMATRIX_HPP_ +#define KOKKOSSPARSE_BSRMATRIX_HPP_ #include #include @@ -300,7 +300,7 @@ struct BsrRowViewConst { /// storage for sparse matrices, as described, for example, in Saad /// (2nd ed.). template + class SizeType = KokkosKernels::default_size_type> class BsrMatrix { static_assert(std::is_signed::value, "BsrMatrix requires that OrdinalType is a signed integer type."); static_assert(Kokkos::is_memory_traits_v || std::is_void_v, @@ -529,7 +529,9 @@ class BsrMatrix { auto it = blocks.find(block); if (it == blocks.end()) { std::vector entries = {entry}; - entries.reserve(blockDim_ * blockDim_); + entries.reserve( + static_cast::size_type, ordinal_type>>(blockDim_) * + blockDim_); blocks[block] = std::move(entries); // new block with entry } else { it->second.push_back(entry); // add entry to block @@ -724,9 +726,9 @@ class BsrMatrix { Kokkos::deep_copy(h_crs_values, crs_mtx.values); typename values_type::HostMirror h_values = Kokkos::create_mirror_view(values); - if (h_values.extent(0) < size_t(numBlocks * blockDim_ * blockDim_)) { - Kokkos::resize(h_values, numBlocks * blockDim_ * blockDim_); - Kokkos::resize(values, numBlocks * blockDim_ * blockDim_); + if (h_values.extent(0) < static_cast(numBlocks) * blockDim_ * blockDim_) { + Kokkos::resize(h_values, static_cast(numBlocks) * blockDim_ * blockDim_); + Kokkos::resize(values, 
static_cast(numBlocks) * blockDim_ * blockDim_); } Kokkos::deep_copy(h_values, 0); @@ -967,7 +969,7 @@ class BsrMatrix { case BsrMatrix::valueOperation::ASSIGN: { for (ordinal_type lcol = 0; lcol < block_size; ++lcol) { if (force_atomic) { - Kokkos::atomic_assign(&(local_row_values[lcol]), vals[offset_into_vals + lrow * block_size + lcol]); + Kokkos::atomic_store(&(local_row_values[lcol]), vals[offset_into_vals + lrow * block_size + lcol]); } else { local_row_values[lcol] = vals[offset_into_vals + lrow * block_size + lcol]; } @@ -1005,4 +1007,4 @@ inline constexpr bool is_bsr_matrix_v = is_bsr_matrix::value; } // namespace Experimental } // namespace KokkosSparse -#endif +#endif // KOKKOSSPARSE_BSRMATRIX_HPP_ diff --git a/sparse/src/KokkosSparse_CcsMatrix.hpp b/sparse/src/KokkosSparse_CcsMatrix.hpp index 58665708b1..e39ab730e6 100644 --- a/sparse/src/KokkosSparse_CcsMatrix.hpp +++ b/sparse/src/KokkosSparse_CcsMatrix.hpp @@ -21,8 +21,8 @@ /// local (no MPI) sparse matrix stored in compressed column sparse /// ("Ccs") format. -#ifndef KOKKOS_SPARSE_CCSMATRIX_HPP_ -#define KOKKOS_SPARSE_CCSMATRIX_HPP_ +#ifndef KOKKOSSPARSE_CCSMATRIX_HPP_ +#define KOKKOSSPARSE_CCSMATRIX_HPP_ #include "Kokkos_Core.hpp" #include @@ -142,7 +142,7 @@ class CcsMatrix { //! Type of each (column) index in the matrix. typedef OrdinalType ordinal_type; //! Type of the graph structure of the sparse matrix - consistent with Kokkos. - typedef Kokkos::StaticCcsGraph + typedef Kokkos::StaticCcsGraph staticccsgraph_type; //! Type of the "column map" (which contains the offset for each column's //! data). 
@@ -236,4 +236,4 @@ template struct is_ccs_matrix> : public std::true_type {}; } // namespace KokkosSparse -#endif +#endif // KOKKOSSPARSE_CCSMATRIX_HPP_ diff --git a/sparse/src/KokkosSparse_CooMatrix.hpp b/sparse/src/KokkosSparse_CooMatrix.hpp index 996b3c29aa..c91a05f863 100644 --- a/sparse/src/KokkosSparse_CooMatrix.hpp +++ b/sparse/src/KokkosSparse_CooMatrix.hpp @@ -21,8 +21,8 @@ /// local (no MPI) sparse matrix stored in coordinate ("Coo") format /// which is also known as ivj or triplet format. -#ifndef KOKKOS_SPARSE_COOMATRIX_HPP_ -#define KOKKOS_SPARSE_COOMATRIX_HPP_ +#ifndef KOKKOSSPARSE_COOMATRIX_HPP_ +#define KOKKOSSPARSE_COOMATRIX_HPP_ #include "Kokkos_Core.hpp" #include "KokkosKernels_Error.hpp" @@ -147,4 +147,4 @@ template struct is_coo_matrix> : public std::true_type {}; } // namespace KokkosSparse -#endif +#endif // KOKKOSSPARSE_COOMATRIX_HPP_ diff --git a/sparse/src/KokkosSparse_CrsMatrix.hpp b/sparse/src/KokkosSparse_CrsMatrix.hpp index 86586401cd..676dfb64cb 100644 --- a/sparse/src/KokkosSparse_CrsMatrix.hpp +++ b/sparse/src/KokkosSparse_CrsMatrix.hpp @@ -21,8 +21,8 @@ /// local (no MPI) sparse matrix stored in compressed row sparse /// ("Crs") format. -#ifndef KOKKOS_SPARSE_CRSMATRIX_HPP_ -#define KOKKOS_SPARSE_CRSMATRIX_HPP_ +#ifndef KOKKOSSPARSE_CRSMATRIX_HPP_ +#define KOKKOSSPARSE_CRSMATRIX_HPP_ #include "Kokkos_Core.hpp" #include "Kokkos_StaticCrsGraph.hpp" @@ -315,7 +315,7 @@ struct SparseRowViewConst { /// storage for sparse matrices, as described, for example, in Saad /// (2nd ed.). template + class SizeType = KokkosKernels::default_size_type> class CrsMatrix { static_assert(std::is_signed::value, "CrsMatrix requires that OrdinalType is a signed integer type."); @@ -344,10 +344,10 @@ class CrsMatrix { //! Type of a host-memory mirror of the sparse matrix. typedef CrsMatrix HostMirror; //! Type of the graph structure of the sparse matrix. - typedef Kokkos::StaticCrsGraph + typedef Kokkos::StaticCrsGraph StaticCrsGraphType; //! 
Type of the graph structure of the sparse matrix - consistent with Kokkos. - typedef Kokkos::StaticCrsGraph + typedef Kokkos::StaticCrsGraph staticcrsgraph_type; //! Type of column indices in the sparse matrix. typedef typename staticcrsgraph_type::entries_type index_type; @@ -611,7 +611,7 @@ class CrsMatrix { const ordinal_type offset = findRelOffset(&(row_view.colidx(0)), length, cols[i], hint, is_sorted); if (offset != length) { if (force_atomic) { - Kokkos::atomic_assign(&(row_view.value(offset)), vals[i]); + Kokkos::atomic_store(&(row_view.value(offset)), vals[i]); } else { row_view.value(offset) = vals[i]; } @@ -787,4 +787,4 @@ template inline constexpr bool is_crs_matrix_v = is_crs_matrix::value; } // namespace KokkosSparse -#endif +#endif // KOKKOSSPARSE_CRSMATRIX_HPP_ diff --git a/sparse/src/KokkosSparse_IOUtils.hpp b/sparse/src/KokkosSparse_IOUtils.hpp index 588c9dbca9..edb2cb6e97 100644 --- a/sparse/src/KokkosSparse_IOUtils.hpp +++ b/sparse/src/KokkosSparse_IOUtils.hpp @@ -73,7 +73,7 @@ void kk_sparseMatrix_generate(OrdinalType nrows, OrdinalType ncols, SizeType &nn } // Sample each value from uniform (-50, 50) for real types, or (-50 - 50i, 50 // + 50i) for complex types. 
- Kokkos::View valuesView(values, nnz * block_elem_count); + Kokkos::View valuesView(values, nnz * static_cast(block_elem_count)); ScalarType randStart, randEnd; KokkosKernels::Impl::getRandomBounds(50.0, randStart, randEnd); Kokkos::Random_XorShift64_Pool pool(13718); @@ -888,7 +888,7 @@ int read_mtx(const char *fileName, lno_t *nrows, lno_t *ncols, size_type *ne, si if (mtx_format == COORDINATE) ss >> nnz; else - nnz = nr * nc; + nnz = static_cast(nr) * nc; size_type numEdges = nnz; symmetrize = symmetrize || mtx_sym != GENERAL; if (symmetrize && nr != nc) { diff --git a/sparse/src/KokkosSparse_OrdinalTraits.hpp b/sparse/src/KokkosSparse_OrdinalTraits.hpp index ef08eb89e2..42cd564dc9 100644 --- a/sparse/src/KokkosSparse_OrdinalTraits.hpp +++ b/sparse/src/KokkosSparse_OrdinalTraits.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_SPARSE_ORDINALTRAITS_HPP_ -#define KOKKOS_SPARSE_ORDINALTRAITS_HPP_ +#ifndef KOKKOSSPARSE_ORDINALTRAITS_HPP_ +#define KOKKOSSPARSE_ORDINALTRAITS_HPP_ /// \file KokkosSparse_OrdinalTraits.hpp /// \brief Declaration and definition of KokkosSparse::OrdinalTraits, @@ -95,4 +95,4 @@ struct OrdinalTraits { } // namespace KokkosSparse -#endif // KOKKOS_SPARSE_ORDINALTRAITS_HPP_ +#endif // KOKKOSSPARSE_ORDINALTRAITS_HPP_ diff --git a/sparse/src/KokkosSparse_SortCrs.hpp b/sparse/src/KokkosSparse_SortCrs.hpp index 1203cd244b..8da608be56 100644 --- a/sparse/src/KokkosSparse_SortCrs.hpp +++ b/sparse/src/KokkosSparse_SortCrs.hpp @@ -63,7 +63,7 @@ void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, const return; } Ordinal numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; - if constexpr (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if constexpr (!KokkosKernels::Impl::is_gpu_exec_space_v) { // On CPUs, use a sequential radix sort within each row. 
Kokkos::parallel_for("sort_crs_matrix[CPU,radix]", Kokkos::RangePolicy>(exec, 0, numRows), @@ -77,7 +77,7 @@ void sort_crs_matrix(const execution_space& exec, const rowmap_t& rowmap, const #ifndef KK_DISABLE_BULK_SORT_BY_KEY Ordinal maxDeg = KokkosSparse::Impl::graph_max_degree(exec, rowmap); bool useBulkSort = false; - if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) { + if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) { // Calculate the true number of columns if user didn't pass it in if (numCols == Kokkos::ArithTraits::max()) { KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0), entries, numCols); @@ -240,7 +240,7 @@ void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, const e if (entries.extent(0) <= size_t(1)) { return; } - if constexpr (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if constexpr (!KokkosKernels::Impl::is_gpu_exec_space_v) { // If on CPU, sort each row independently. Don't need to know numCols for // this. 
Kokkos::parallel_for("sort_crs_graph[CPU,radix]", @@ -255,7 +255,7 @@ void sort_crs_graph(const execution_space& exec, const rowmap_t& rowmap, const e #ifndef KK_DISABLE_BULK_SORT_BY_KEY Ordinal maxDeg = KokkosSparse::Impl::graph_max_degree(exec, rowmap); bool useBulkSort = false; - if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) { + if (KokkosSparse::Impl::useBulkSortHeuristic(avgDeg, maxDeg)) { // Calculate the true number of columns if user didn't pass it in if (numCols == Kokkos::ArithTraits::max()) { KokkosKernels::Impl::kk_view_reduce_max(exec, entries.extent(0), entries, numCols); diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index d73787481e..e7d7b1c5bf 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -526,7 +526,7 @@ bsrMat_t transpose_bsr_matrix(const bsrMat_t &A) { rowmap_t AT_rowmap("Transpose rowmap", A.numCols() + 1); entries_t AT_entries(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Transpose entries"), A.nnz()); values_t AT_values(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Transpose values"), - A.nnz() * A.blockDim() * A.blockDim()); + A.nnz() * static_cast(A.blockDim()) * A.blockDim()); transpose_bsr_matrix(A.numRows(), A.numCols(), A.blockDim(), A.graph.row_map, A.graph.entries, A.values, AT_rowmap, AT_entries, AT_values); @@ -983,7 +983,7 @@ struct LowerTriangularMatrix { // TODO: Write GPU (vector-level) version here: /* - if(kk_is_gpu_exec_space()) + if(is_gpu_exec_space_v) { Kokkos::parallel_for( Kokkos::ThreadVectorRange(teamMember, read_left_work), diff --git a/sparse/src/KokkosSparse_Utils_rocsparse.hpp b/sparse/src/KokkosSparse_Utils_rocsparse.hpp index 4b99c96c81..be939d5890 100644 --- a/sparse/src/KokkosSparse_Utils_rocsparse.hpp +++ b/sparse/src/KokkosSparse_Utils_rocsparse.hpp @@ -80,7 +80,7 @@ inline void rocsparse_internal_safe_call(rocsparse_status rocsparseStatus, const // The macro below defines is the public interface for the safe 
cusparse calls. // The functions themselves are protected by impl namespace. -#define KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(call) \ +#define KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(call) \ KokkosSparse::Impl::rocsparse_internal_safe_call(call, #call, __FILE__, __LINE__) inline rocsparse_operation mode_kk_to_rocsparse(const char kk_mode[]) { @@ -169,10 +169,10 @@ struct kokkos_to_rocsparse_type> { // destructed. struct TemporarySetRocsparseStream { TemporarySetRocsparseStream(rocsparse_handle handle_, const Kokkos::HIP& exec_) : handle(handle_) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_stream(handle, exec_.hip_stream())); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(handle, exec_.hip_stream())); } - ~TemporarySetRocsparseStream() { KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_stream(handle, NULL)); } + ~TemporarySetRocsparseStream() { KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(handle, NULL)); } rocsparse_handle handle; }; diff --git a/sparse/src/KokkosSparse_findRelOffset.hpp b/sparse/src/KokkosSparse_findRelOffset.hpp index 6dffcdd3d7..fe9a3968c0 100644 --- a/sparse/src/KokkosSparse_findRelOffset.hpp +++ b/sparse/src/KokkosSparse_findRelOffset.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_SPARSE_FINDRELOFFSET_HPP -#define KOKKOS_SPARSE_FINDRELOFFSET_HPP +#ifndef KOKKOSSPARSE_FINDRELOFFSET_HPP +#define KOKKOSSPARSE_FINDRELOFFSET_HPP /// \file KokkosSparse_findRelOffset.hpp /// \brief Find the relative offset of a column index in a sparse @@ -147,4 +147,4 @@ KOKKOS_FUNCTION OffsetType findRelOffset(const IndexViewType& indsToSearch, cons } // namespace KokkosSparse -#endif // KOKKOS_SPARSE_FINDRELOFFSET_HPP +#endif // KOKKOSSPARSE_FINDRELOFFSET_HPP diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index bf5ee8633b..8531f39bd0 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -208,7 +208,7 @@ class 
PointGaussSeidelHandle : public GaussSeidelHandle scalar_temp_work_view_t; typedef typename Kokkos::View scalar_persistent_work_view_t; - typedef typename Kokkos::View + typedef typename Kokkos::View scalar_persistent_work_view2d_t; typedef typename scalar_persistent_work_view_t::HostMirror scalar_persistent_work_host_view_t; // Host view type @@ -283,7 +283,7 @@ class PointGaussSeidelHandle : public GaussSeidelHandleblock_size; } void choose_default_algorithm() { - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) + if (KokkosKernels::Impl::is_gpu_exec_space_v) this->algorithm_type = GS_TEAM; else this->algorithm_type = GS_PERMUTED; @@ -492,7 +492,7 @@ class ClusterGaussSeidelHandle : public GaussSeidelHandle(); } + bool use_teams() const { return KokkosKernels::Impl::is_gpu_exec_space_v; } ~ClusterGaussSeidelHandle() = default; @@ -532,7 +532,7 @@ class TwoStageGaussSeidelHandle using const_ordinal_t = typename const_entries_view_t::value_type; using const_scalar_t = typename const_values_view_t::value_type; - using vector_view_t = Kokkos::View; + using vector_view_t = Kokkos::View; using GSHandle = GaussSeidelHandle; diff --git a/sparse/src/KokkosSparse_getDiagCopy.hpp b/sparse/src/KokkosSparse_getDiagCopy.hpp index 8f67d8a1c6..bfe612d1cb 100644 --- a/sparse/src/KokkosSparse_getDiagCopy.hpp +++ b/sparse/src/KokkosSparse_getDiagCopy.hpp @@ -17,8 +17,8 @@ /// \file KokkosSparse_getDiagCopy.hpp /// \brief Get a copy of the diagonal entries of a KokkosSparse::CrsMatrix. 
-#ifndef KOKKOS_SPARSE_GETDIAGCOPY_HPP_ -#define KOKKOS_SPARSE_GETDIAGCOPY_HPP_ +#ifndef KOKKOSSPARSE_GETDIAGCOPY_HPP_ +#define KOKKOSSPARSE_GETDIAGCOPY_HPP_ #include "KokkosSparse_getDiagCopyWithOffsets_impl.hpp" #include @@ -55,4 +55,4 @@ void getDiagCopy(const DiagType& D, const OffsetsType& offsets, const CrsMatrixT } // namespace KokkosSparse -#endif // KOKKOS_SPARSE_GETDIAGCOPY_HPP_ +#endif // KOKOS_SPARSE_GETDIAGCOPY_HPP_ diff --git a/sparse/src/KokkosSparse_spgemm_handle.hpp b/sparse/src/KokkosSparse_spgemm_handle.hpp index 9e7679a3a9..1857e0bbc7 100644 --- a/sparse/src/KokkosSparse_spgemm_handle.hpp +++ b/sparse/src/KokkosSparse_spgemm_handle.hpp @@ -135,11 +135,11 @@ class SPGEMMHandle { bufferSize = 0; buffer = nullptr; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr_A)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr_B)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr_C)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr_D)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_info(&info_C)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&descr_A)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&descr_B)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&descr_C)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&descr_D)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_info(&info_C)); rocsparseHandle = kkControls.getRocsparseHandle(); } @@ -671,7 +671,7 @@ class SPGEMMHandle { // them in the handle suggested_vector_size_ = KokkosKernels::Impl::kk_get_suggested_vector_size( nr, nnz, KokkosKernels::Impl::kk_get_exec_space_type()); - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) + if (KokkosKernels::Impl::is_gpu_exec_space_v) suggested_team_size_ = max_allowed_team_size / suggested_vector_size_; else suggested_team_size = max_allowed_team_size; diff --git 
a/sparse/src/KokkosSparse_spmv.hpp b/sparse/src/KokkosSparse_spmv.hpp index e31ff2ef8d..86171b6cc7 100644 --- a/sparse/src/KokkosSparse_spmv.hpp +++ b/sparse/src/KokkosSparse_spmv.hpp @@ -119,8 +119,8 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], const m = A.numRows(); n = A.numCols(); } else { - m = A.numRows() * A.blockDim(); - n = A.numCols() * A.blockDim(); + m = static_cast(A.numRows()) * A.blockDim(); + n = static_cast(A.numCols()) * A.blockDim(); } if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { diff --git a/sparse/src/KokkosSparse_spmv_handle.hpp b/sparse/src/KokkosSparse_spmv_handle.hpp index e91e53d68d..4da6e47551 100644 --- a/sparse/src/KokkosSparse_spmv_handle.hpp +++ b/sparse/src/KokkosSparse_spmv_handle.hpp @@ -148,7 +148,7 @@ struct RocSparse_CRS_SpMV_Data : public TPL_SpMV_Data { ~RocSparse_CRS_SpMV_Data() { // note: hipFree includes an implicit device synchronize KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(buffer)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_spmat_descr(mat)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_spmat_descr(mat)); } rocsparse_spmat_descr mat; @@ -159,9 +159,9 @@ struct RocSparse_CRS_SpMV_Data : public TPL_SpMV_Data { struct RocSparse_BSR_SpMV_Data : public TPL_SpMV_Data { RocSparse_BSR_SpMV_Data(const Kokkos::HIP& exec_) : TPL_SpMV_Data(exec_) {} ~RocSparse_BSR_SpMV_Data() { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_mat_descr(mat)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_mat_descr(mat)); #if (KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400) - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_mat_info(info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_mat_info(info)); #endif } @@ -190,7 +190,7 @@ struct MKL_SpMV_Data : public TPL_SpMV_Data { }; #endif -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#if defined(KOKKOS_ENABLE_SYCL) struct OneMKL_SpMV_Data : public TPL_SpMV_Data { 
OneMKL_SpMV_Data(const Kokkos::Experimental::SYCL& exec_) : TPL_SpMV_Data(exec_) {} ~OneMKL_SpMV_Data() { diff --git a/sparse/src/KokkosSparse_sptrsv_handle.hpp b/sparse/src/KokkosSparse_sptrsv_handle.hpp index b6ca3dacfd..37feace62e 100644 --- a/sparse/src/KokkosSparse_sptrsv_handle.hpp +++ b/sparse/src/KokkosSparse_sptrsv_handle.hpp @@ -471,6 +471,14 @@ class SPTRSVHandle { #endif } +#if defined(__clang__) && defined(KOKKOS_ENABLE_CUDA) + if (algm == SPTRSVAlgorithm::SEQLVLSCHD_TP1 && Kokkos::ArithTraits::isComplex && + std::is_same_v && block_size_ != 0) { + throw(std::runtime_error( + "sptrsv handle: SPTRSV may not work with blocks+clang+cuda+complex due to a compiler bug")); + } +#endif + #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV if (lower_tri) { // lower-triangular is stored in CSC diff --git a/sparse/src/KokkosSparse_sptrsv_supernode.hpp b/sparse/src/KokkosSparse_sptrsv_supernode.hpp index 586e8f3a64..1f67054ca3 100644 --- a/sparse/src/KokkosSparse_sptrsv_supernode.hpp +++ b/sparse/src/KokkosSparse_sptrsv_supernode.hpp @@ -1358,8 +1358,9 @@ void invert_supernodal_columns(KernelHandle *kernelHandle, bool unit_diag, int n char uplo_char = (lower ? 'L' : 'U'); char diag_char = (unit_diag ? 
'U' : 'N'); - // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View viewL(&hv(nnzD), nsrow, nscol); + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft + Kokkos::View viewL( + &hv(nnzD), nsrow, nscol); auto Ljj = Kokkos::subview(viewL, range_type(0, nscol), Kokkos::ALL()); #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE @@ -1403,7 +1404,7 @@ void invert_supernodal_columns(KernelHandle *kernelHandle, bool unit_diag, int n Kokkos::deep_copy(dViewL, viewL); #endif - // NOTE: we currently supports only default_layout = LayoutLeft + // NOTE: we currently supports only KokkosKernels::default_layout = LayoutLeft auto dViewLjj = Kokkos::subview(dViewL, range_type(0, nscol), Kokkos::ALL()); auto dViewLij = Kokkos::subview(dViewL, range_type(nscol, nsrow), Kokkos::ALL()); @@ -1946,7 +1947,7 @@ void split_crsmat(KernelHandle *kernelHandleL, host_crsmat_t superluL) { } // allocate for all the subgraphs row_map_view_t total_rowmap_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, "rowmap_view"), - 2 * nlevels * (nrows + 1)); + static_cast(2 * nlevels) * (nrows + 1)); cols_view_t total_column_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, "colmap_view"), newNnz); values_view_t total_values_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, "values_view"), newNnz); // create host-mirrors diff --git a/sparse/tpls/KokkosKernels_tpl_handles_decl.hpp b/sparse/tpls/KokkosKernels_tpl_handles_decl.hpp index a1cd3c97f5..4a14d43df0 100644 --- a/sparse/tpls/KokkosKernels_tpl_handles_decl.hpp +++ b/sparse/tpls/KokkosKernels_tpl_handles_decl.hpp @@ -30,7 +30,11 @@ struct CusparseSingleton { CusparseSingleton(); + static bool is_initialized(); static CusparseSingleton& singleton(); + + private: + static std::unique_ptr& get_instance(); }; } // namespace Impl @@ -48,7 +52,11 @@ struct RocsparseSingleton { RocsparseSingleton(); + static bool is_initialized(); static RocsparseSingleton& singleton(); + + private: + static std::unique_ptr& 
get_instance(); }; } // namespace Impl diff --git a/sparse/tpls/KokkosKernels_tpl_handles_def.hpp b/sparse/tpls/KokkosKernels_tpl_handles_def.hpp index a88ad12130..d52959a591 100644 --- a/sparse/tpls/KokkosKernels_tpl_handles_def.hpp +++ b/sparse/tpls/KokkosKernels_tpl_handles_def.hpp @@ -25,14 +25,24 @@ namespace KokkosKernels { namespace Impl { -CusparseSingleton::CusparseSingleton() { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreate(&cusparseHandle)); +CusparseSingleton::CusparseSingleton() { KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreate(&cusparseHandle)); } - Kokkos::push_finalize_hook([&]() { cusparseDestroy(cusparseHandle); }); +CusparseSingleton& CusparseSingleton::singleton() { + std::unique_ptr& instance = get_instance(); + if (!instance) { + instance = std::make_unique(); + Kokkos::push_finalize_hook([&]() { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroy(instance->cusparseHandle)); + instance.reset(); + }); + } + return *instance; } -CusparseSingleton& CusparseSingleton::singleton() { - static CusparseSingleton s; +bool CusparseSingleton::is_initialized() { return get_instance() != nullptr; } + +std::unique_ptr& CusparseSingleton::get_instance() { + static std::unique_ptr s; return s; } @@ -47,13 +57,25 @@ namespace KokkosKernels { namespace Impl { RocsparseSingleton::RocsparseSingleton() { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_handle(&rocsparseHandle)); - - Kokkos::push_finalize_hook([&]() { KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_handle(rocsparseHandle)); }); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_handle(&rocsparseHandle)); } RocsparseSingleton& RocsparseSingleton::singleton() { - static RocsparseSingleton s; + std::unique_ptr& instance = get_instance(); + if (!instance) { + instance = std::make_unique(); + Kokkos::push_finalize_hook([&]() { + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_handle(instance->rocsparseHandle)); + instance.reset(); + }); + } + return *instance; +} + +bool 
RocsparseSingleton::is_initialized() { return get_instance() != nullptr; } + +std::unique_ptr& RocsparseSingleton::get_instance() { + static std::unique_ptr s; return s; } diff --git a/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp index 14eac2aee1..eff19977cf 100644 --- a/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp @@ -184,21 +184,21 @@ KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE_EXT(false) auto &rocspHandle = KokkosKernels::Impl::RocsparseSingleton::singleton().rocsparseHandle; \ rocsparse_pointer_mode oldPtrMode; \ \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_stream(rocspHandle, exec.hip_stream())); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_get_pointer_mode(rocspHandle, &oldPtrMode)); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(rocspHandle, exec.hip_stream())); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_get_pointer_mode(rocspHandle, &oldPtrMode)); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( \ rocsparse_set_pointer_mode(rocspHandle, rocsparse_pointer_mode_host)); /* alpha, beta on host*/ \ OFFSET_TYPE nnzA = colidxA.extent(0); \ OFFSET_TYPE nnzB = colidxB.extent(0); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_##TOKEN##csrgeam( \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_##TOKEN##csrgeam( \ rocspHandle, m, n, reinterpret_cast(&alpha), rocData.descrA, nnzA, \ reinterpret_cast(valuesA.data()), rowmapA.data(), colidxA.data(), \ reinterpret_cast(&beta), rocData.descrB, nnzB, \ reinterpret_cast(valuesB.data()), rowmapB.data(), colidxB.data(), rocData.descrC, \ reinterpret_cast(valuesC.data()), const_cast(rowmapC.data()), \ colidxC.data())); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode(rocspHandle, oldPtrMode)); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_stream(rocspHandle, NULL)); \ + 
KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_pointer_mode(rocspHandle, oldPtrMode)); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(rocspHandle, NULL)); \ \ Kokkos::Profiling::popRegion(); \ } \ diff --git a/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp index 514b019f1b..69286e5e98 100644 --- a/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp @@ -160,15 +160,15 @@ KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE_EXT(false) OFFSET_TYPE nnzB = colidxB.extent(0); \ OFFSET_TYPE nnzC = 0; \ \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_stream(rocspHandle, exec.hip_stream())); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&rocData.descrA)); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&rocData.descrB)); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&rocData.descrC)); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_csrgeam_nnz(rocspHandle, m, n, rocData.descrA, nnzA, rowmapA.data(), \ - colidxA.data(), rocData.descrB, nnzB, rowmapB.data(), \ - colidxB.data(), rocData.descrC, rowmapC.data(), &nnzC)); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(rocspHandle, exec.hip_stream())); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&rocData.descrA)); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&rocData.descrB)); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&rocData.descrC)); \ + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_csrgeam_nnz( \ + rocspHandle, m, n, rocData.descrA, nnzA, rowmapA.data(), colidxA.data(), rocData.descrB, nnzB, \ + rowmapB.data(), colidxB.data(), rocData.descrC, rowmapC.data(), &nnzC)); \ addHandle->set_c_nnz(nnzC); \ - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_stream(rocspHandle, NULL)); \ + 
KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_stream(rocspHandle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp index 6609d77a81..bbec0e402a 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp @@ -45,23 +45,23 @@ struct spgemm_numeric_tpl_spec_avail { struct spgemm_numeric_tpl_spec_avail< \ KokkosKernels::Experimental::KokkosKernelsHandle, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits > > { \ enum : bool { value = true }; \ }; @@ -80,30 +80,30 @@ SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE -#define SPGEMM_NUMERIC_AVAIL_ROCSPARSE(SCALAR) \ - template <> \ - struct spgemm_numeric_tpl_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define SPGEMM_NUMERIC_AVAIL_ROCSPARSE(SCALAR) \ + template <> \ + struct 
spgemm_numeric_tpl_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; SPGEMM_NUMERIC_AVAIL_ROCSPARSE(float) @@ -113,30 +113,30 @@ SPGEMM_NUMERIC_AVAIL_ROCSPARSE(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define SPGEMM_NUMERIC_AVAIL_MKL(SCALAR, EXEC) \ - template <> \ - struct spgemm_numeric_tpl_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define SPGEMM_NUMERIC_AVAIL_MKL(SCALAR, EXEC) \ + template <> \ + struct spgemm_numeric_tpl_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #define SPGEMM_NUMERIC_AVAIL_MKL_E(EXEC) \ diff --git 
a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp index 5e636eea0e..c26118fec9 100644 --- a/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_decl.hpp @@ -191,50 +191,53 @@ void spgemm_numeric_cusparse(KernelHandle *handle, lno_t m, lno_t n, lno_t k, co #endif -#define SPGEMM_NUMERIC_DECL_CUSPARSE(SCALAR, MEMSPACE, TPL_AVAIL) \ - template <> \ - struct SPGEMM_NUMERIC, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, TPL_AVAIL> { \ - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; \ - using c_int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using c_scalar_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using scalar_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void spgemm_numeric(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ - typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ - c_int_view_t row_mapA, c_int_view_t entriesA, c_scalar_view_t valuesA, bool, \ - c_int_view_t row_mapB, c_int_view_t entriesB, c_scalar_view_t valuesB, bool, \ - c_int_view_t row_mapC, int_view_t entriesC, scalar_view_t valuesC) { \ - std::string label = "KokkosSparse::spgemm_numeric[TPL_CUSPARSE," + Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spgemm_numeric_cusparse(handle->get_spgemm_handle(), m, n, k, row_mapA, entriesA, valuesA, row_mapB, entriesB, \ - valuesB, row_mapC, entriesC, valuesC); \ - 
Kokkos::Profiling::popRegion(); \ - } \ +#define SPGEMM_NUMERIC_DECL_CUSPARSE(SCALAR, MEMSPACE, TPL_AVAIL) \ + template <> \ + struct SPGEMM_NUMERIC, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, TPL_AVAIL> { \ + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; \ + using c_int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using int_view_t = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using c_scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void spgemm_numeric(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ + typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ + c_int_view_t row_mapA, c_int_view_t entriesA, c_scalar_view_t valuesA, bool, \ + c_int_view_t row_mapB, c_int_view_t entriesB, c_scalar_view_t valuesB, bool, \ + c_int_view_t row_mapC, int_view_t entriesC, scalar_view_t valuesC) { \ + std::string label = "KokkosSparse::spgemm_numeric[TPL_CUSPARSE," + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spgemm_numeric_cusparse(handle->get_spgemm_handle(), m, n, k, row_mapA, entriesA, valuesA, row_mapB, entriesB, \ + valuesB, row_mapC, entriesC, valuesC); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define SPGEMM_NUMERIC_DECL_CUSPARSE_S(SCALAR, TPL_AVAIL) \ @@ -301,18 +304,18 @@ void spgemm_numeric_rocsparse(KernelHandle *handle, typename KernelHandle::nnz_l auto nnz_B = colidxB.extent(0); auto nnz_C = colidxC.extent(0); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_get_pointer_mode(h->rocsparseHandle, &oldPtrMode)); - 
KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode(h->rocsparseHandle, rocsparse_pointer_mode_host)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_get_pointer_mode(h->rocsparseHandle, &oldPtrMode)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_pointer_mode(h->rocsparseHandle, rocsparse_pointer_mode_host)); if (!handle->are_entries_computed()) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_csrgemm_symbolic( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_csrgemm_symbolic( h->rocsparseHandle, h->opA, h->opB, m, k, n, h->descr_A, nnz_A, rowptrA.data(), colidxA.data(), h->descr_B, nnz_B, rowptrB.data(), colidxB.data(), h->descr_D, 0, nullptr, nullptr, h->descr_C, nnz_C, rowptrC.data(), colidxC.data(), h->info_C, h->buffer)); handle->set_computed_entries(); } - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_Xcsrgemm_numeric( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_Xcsrgemm_numeric( h->rocsparseHandle, h->opA, h->opB, m, k, n, reinterpret_cast(&alpha), h->descr_A, nnz_A, reinterpret_cast(valuesA.data()), rowptrA.data(), colidxA.data(), h->descr_B, nnz_B, reinterpret_cast(valuesB.data()), rowptrB.data(), @@ -320,45 +323,49 @@ void spgemm_numeric_rocsparse(KernelHandle *handle, typename KernelHandle::nnz_l h->descr_C, nnz_C, reinterpret_cast(valuesC.data()), rowptrC.data(), colidxC.data(), h->info_C, h->buffer)); // Restore old pointer mode - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode(h->rocsparseHandle, oldPtrMode)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_pointer_mode(h->rocsparseHandle, oldPtrMode)); handle->set_call_numeric(); } #define SPGEMM_NUMERIC_DECL_ROCSPARSE(SCALAR, TPL_AVAIL) \ template <> \ - struct SPGEMM_NUMERIC, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - 
Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, TPL_AVAIL> { \ + struct SPGEMM_NUMERIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, TPL_AVAIL> { \ using KernelHandle = \ KokkosKernels::Experimental::KokkosKernelsHandle; \ - using c_int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ + using c_int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ using c_scalar_view_t = \ - Kokkos::View, \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using scalar_view_t = \ + Kokkos::View, \ Kokkos::MemoryTraits>; \ - using scalar_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ static void spgemm_numeric(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ c_int_view_t row_mapA, c_int_view_t entriesA, c_scalar_view_t valuesA, bool, \ @@ -437,50 +444,54 @@ void spgemm_numeric_mkl(KernelHandle *handle, typename KernelHandle::nnz_lno_t m handle->set_computed_entries(); } -#define SPGEMM_NUMERIC_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ - template <> \ - struct SPGEMM_NUMERIC, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - 
Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, TPL_AVAIL> { \ - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle \ + struct SPGEMM_NUMERIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, TPL_AVAIL> { \ + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; \ - using c_int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using c_scalar_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using scalar_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - static void spgemm_numeric(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ - typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ - c_int_view_t row_mapA, c_int_view_t entriesA, c_scalar_view_t valuesA, bool, \ - c_int_view_t row_mapB, c_int_view_t entriesB, c_scalar_view_t valuesB, bool, \ - c_int_view_t row_mapC, int_view_t entriesC, scalar_view_t valuesC) { \ - std::string label = "KokkosSparse::spgemm_numeric[TPL_MKL," + Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spgemm_numeric_mkl(handle->get_spgemm_handle(), m, n, k, row_mapA, entriesA, valuesA, row_mapB, entriesB, \ - valuesB, row_mapC, entriesC, valuesC); \ - Kokkos::Profiling::popRegion(); \ - } \ + using c_int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using int_view_t = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using c_scalar_view_t = \ + Kokkos::View, \ + 
Kokkos::MemoryTraits>; \ + using scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void spgemm_numeric(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ + typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ + c_int_view_t row_mapA, c_int_view_t entriesA, c_scalar_view_t valuesA, bool, \ + c_int_view_t row_mapB, c_int_view_t entriesB, c_scalar_view_t valuesB, bool, \ + c_int_view_t row_mapC, int_view_t entriesC, scalar_view_t valuesC) { \ + std::string label = "KokkosSparse::spgemm_numeric[TPL_MKL," + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spgemm_numeric_mkl(handle->get_spgemm_handle(), m, n, k, row_mapA, entriesA, valuesA, row_mapB, entriesB, \ + valuesB, row_mapC, entriesC, valuesC); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define SPGEMM_NUMERIC_DECL_MKL_SE(SCALAR, EXEC) \ diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp index 75615623a5..392eb761c5 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp @@ -44,15 +44,15 @@ struct spgemm_symbolic_tpl_spec_avail { struct spgemm_symbolic_tpl_spec_avail< \ KokkosKernels::Experimental::KokkosKernelsHandle, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ + Kokkos::View, \ Kokkos::MemoryTraits > > { \ enum : bool { value = true }; \ }; @@ -70,22 +70,22 @@ SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE -#define SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(SCALAR) \ - template <> \ - struct spgemm_symbolic_tpl_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle, \ - Kokkos::View, \ - 
Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(SCALAR) \ + template <> \ + struct spgemm_symbolic_tpl_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(float) @@ -95,22 +95,22 @@ SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(Kokkos::complex) #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define SPGEMM_SYMBOLIC_AVAIL_MKL(SCALAR, EXEC) \ - template <> \ - struct spgemm_symbolic_tpl_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +#define SPGEMM_SYMBOLIC_AVAIL_MKL(SCALAR, EXEC) \ + template <> \ + struct spgemm_symbolic_tpl_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; #define SPGEMM_SYMBOLIC_AVAIL_MKL_E(EXEC) \ diff --git a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp index 4ac41ca80d..6385fff835 100644 --- a/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp +++ 
b/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_decl.hpp @@ -313,25 +313,27 @@ void spgemm_symbolic_cusparse(KernelHandle *handle, lno_t m, lno_t n, lno_t k, c #define SPGEMM_SYMBOLIC_DECL_CUSPARSE(SCALAR, MEMSPACE, TPL_AVAIL) \ template <> \ - struct SPGEMM_SYMBOLIC, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, TPL_AVAIL> { \ + struct SPGEMM_SYMBOLIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, TPL_AVAIL> { \ using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle; \ - using c_int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ + using c_int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using int_view_t = Kokkos::View, \ + Kokkos::MemoryTraits>; \ static void spgemm_symbolic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ c_int_view_t row_mapA, c_int_view_t entriesA, bool, c_int_view_t row_mapB, \ @@ -409,11 +411,11 @@ void spgemm_symbolic_rocsparse(KernelHandle *handle, typename KernelHandle::nnz_ const auto beta = Kokkos::ArithTraits::zero(); rocsparse_pointer_mode oldPtrMode; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_get_pointer_mode(h->rocsparseHandle, &oldPtrMode)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode(h->rocsparseHandle, rocsparse_pointer_mode_host)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_get_pointer_mode(h->rocsparseHandle, &oldPtrMode)); + 
KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_pointer_mode(h->rocsparseHandle, rocsparse_pointer_mode_host)); // C = alpha * OpA(A) * OpB(B) + beta * D - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_Xcsrgemm_buffer_size( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_Xcsrgemm_buffer_size( h->rocsparseHandle, h->opA, h->opB, m, k, n, reinterpret_cast(&alpha), h->descr_A, nnz_A, rowptrA.data(), colidxA.data(), h->descr_B, nnz_B, rowptrB.data(), colidxB.data(), reinterpret_cast(&beta), h->descr_D, 0, nullptr, nullptr, h->info_C, @@ -422,16 +424,16 @@ void spgemm_symbolic_rocsparse(KernelHandle *handle, typename KernelHandle::nnz_ KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&h->buffer, h->bufferSize)); rocsparse_int nnz_C = 0; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_csrgemm_nnz(h->rocsparseHandle, h->opA, h->opB, m, k, n, h->descr_A, nnz_A, - rowptrA.data(), colidxA.data(), h->descr_B, nnz_B, - rowptrB.data(), colidxB.data(), h->descr_D, 0, nullptr, nullptr, - h->descr_C, rowptrC.data(), &nnz_C, h->info_C, h->buffer)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( + rocsparse_csrgemm_nnz(h->rocsparseHandle, h->opA, h->opB, m, k, n, h->descr_A, nnz_A, rowptrA.data(), + colidxA.data(), h->descr_B, nnz_B, rowptrB.data(), colidxB.data(), h->descr_D, 0, nullptr, + nullptr, h->descr_C, rowptrC.data(), &nnz_C, h->info_C, h->buffer)); // If C has zero rows, its rowptrs are not populated if (m == 0) { KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(rowptrC.data(), 0, rowptrC.extent(0) * sizeof(index_type))); } // Restore previous pointer mode - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode(h->rocsparseHandle, oldPtrMode)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_set_pointer_mode(h->rocsparseHandle, oldPtrMode)); handle->set_c_nnz(nnz_C); handle->set_call_symbolic(); @@ -440,26 +442,29 @@ void spgemm_symbolic_rocsparse(KernelHandle *handle, typename KernelHandle::nnz_ #define SPGEMM_SYMBOLIC_DECL_ROCSPARSE(SCALAR, TPL_AVAIL) \ template <> \ - struct SPGEMM_SYMBOLIC, \ - 
Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, TPL_AVAIL> { \ + struct SPGEMM_SYMBOLIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, TPL_AVAIL> { \ using KernelHandle = \ KokkosKernels::Experimental::KokkosKernelsHandle; \ - using c_int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ + using c_int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ static void spgemm_symbolic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ c_int_view_t row_mapA, c_int_view_t entriesA, bool, c_int_view_t row_mapB, \ @@ -529,25 +534,27 @@ void spgemm_symbolic_mkl(KernelHandle *handle, typename KernelHandle::nnz_lno_t #define SPGEMM_SYMBOLIC_DECL_MKL(SCALAR, EXEC, TPL_AVAIL) \ template <> \ - struct SPGEMM_SYMBOLIC, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, TPL_AVAIL> { \ + struct SPGEMM_SYMBOLIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, TPL_AVAIL> { \ using KernelHandle = 
KokkosKernels::Experimental::KokkosKernelsHandle; \ - using c_int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ - using int_view_t = Kokkos::View, \ - Kokkos::MemoryTraits>; \ + using c_int_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using int_view_t = Kokkos::View, \ + Kokkos::MemoryTraits>; \ static void spgemm_symbolic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, \ typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, \ c_int_view_t row_mapA, c_int_view_t entriesA, bool, c_int_view_t row_mapB, \ diff --git a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 11bf82f7b4..defb13044f 100644 --- a/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -667,26 +667,26 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode } else { subhandle = new KokkosSparse::Impl::RocSparse_BSR_SpMV_Data(exec); handle->tpl_rank1 = subhandle; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&subhandle->mat)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_descr(&subhandle->mat)); // *_ex* functions deprecated in introduced in 6+ #if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_info(&subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_info(&subhandle->info)); if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_sbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info)); } else if constexpr (std::is_same_v) { - 
KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_dbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_cbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_zbsrmv_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); } @@ -694,23 +694,23 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode #elif KOKKOSSPARSE_IMPL_ROCM_VERSION < 50400 // No analysis step in the older versions #else - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_info(&subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_mat_info(&subhandle->info)); if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + 
KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_sbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, subhandle->info)); } else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_dbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, subhandle->info)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_cbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, subhandle->info)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, - block_dim, subhandle->info)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_zbsrmv_ex_analysis(rocsparseHandle, dir, trans, mb, nb, nnzb, + subhandle->mat, bsr_val, bsr_row_ptr, + bsr_col_ind, block_dim, subhandle->info)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); } @@ -720,58 +720,62 @@ void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode // *_ex* functions deprecated in introduced in 6+ #if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info, 
x_, - beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_sbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info, x_, - beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_dbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info, x_, - beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_cbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info, x_, - beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_zbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + subhandle->info, x_, beta_, y_)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); } // *_ex* functions introduced in 5.4.0 #elif KOKKOSSPARSE_IMPL_ROCM_VERSION < 50400 if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); + 
KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_sbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + x_, beta_, y_)); } else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_dbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_cbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, - bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_zbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, + x_, beta_, y_)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); } #else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, - subhandle->info, x_, beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_sbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } 
else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, - subhandle->info, x_, beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_dbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, - subhandle->info, x_, beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_cbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, - subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, - subhandle->info, x_, beta_, y_)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_zbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); } diff --git a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index c4cc3fbc88..bcd367b9bb 100644 --- a/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -331,10 +331,10 @@ void spmv_mv_rocsparse(const Kokkos::HIP &exec, Handle *handle, const char mode[ } rocsparse_dnmat_descr vecX, vecY; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( rocsparse_create_dnmat_descr(&vecX, 
x.extent(0), x.extent(1), x_ld, x_data, rocsparse_compute_type(), x_order)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( rocsparse_create_dnmat_descr(&vecY, y.extent(0), y.extent(1), y_ld, y_data, rocsparse_compute_type(), y_order)); @@ -359,30 +359,30 @@ void spmv_mv_rocsparse(const Kokkos::HIP &exec, Handle *handle, const char mode[ void *csr_col_ind = static_cast(const_cast(A.graph.entries.data())); void *csr_val = static_cast(const_cast(A.values.data())); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_csr_descr( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_csr_descr( &subhandle->mat, A.numRows(), A.numCols(), A.nnz(), csr_row_ptr, csr_col_ind, csr_val, offset_index_type, entry_index_type, rocsparse_index_base_zero, compute_type)); // Size and allocate buffer, and analyze the matrix - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmm(rocsparseHandle, rocsparseOperation, rocsparse_operation_none, - &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, alg, - rocsparse_spmm_stage_buffer_size, &subhandle->bufferSize, nullptr)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_spmm( + rocsparseHandle, rocsparseOperation, rocsparse_operation_none, &alpha, subhandle->mat, vecX, &beta, vecY, + compute_type, alg, rocsparse_spmm_stage_buffer_size, &subhandle->bufferSize, nullptr)); KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&subhandle->buffer, subhandle->bufferSize)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmm( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_spmm( rocsparseHandle, rocsparseOperation, rocsparse_operation_none, &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, alg, rocsparse_spmm_stage_preprocess, &subhandle->bufferSize, subhandle->buffer)); } // Perform the actual computation - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( rocsparse_spmm(rocsparseHandle, rocsparseOperation, rocsparse_operation_none, &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, alg, 
rocsparse_spmm_stage_compute, &subhandle->bufferSize, subhandle->buffer)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnmat_descr(vecY)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnmat_descr(vecX)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_dnmat_descr(vecY)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_dnmat_descr(vecX)); } #define KOKKOSSPARSE_SPMV_MV_ROCSPARSE(SCALAR, XL, YL, MEMSPACE) \ diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 2f5ceca09e..30c760c14e 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -179,7 +179,7 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) #endif -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#if defined(KOKKOS_ENABLE_SYCL) #define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, ORDINAL, MEMSPACE) \ template <> \ struct spmv_tpl_spec_avail< \ diff --git a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 30e790a3ab..8cf1f49e51 100644 --- a/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -301,9 +301,9 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], rocsparse_dnvec_descr vecX, vecY; void* x_data = static_cast(const_cast(x.data())); void* y_data = static_cast(const_cast(y.data())); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_dnvec_descr( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_dnvec_descr( &vecX, x.extent_int(0), x_data, rocsparse_compute_type())); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_dnvec_descr( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_dnvec_descr( &vecY, y.extent_int(0), y_data, rocsparse_compute_type())); // 
Default to using the "stream" algorithm which has almost no setup cost, @@ -332,31 +332,32 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], void* csr_col_ind = static_cast(const_cast(A.graph.entries.data())); void* csr_val = static_cast(const_cast(A.values.data())); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_csr_descr( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_create_csr_descr( &subhandle->mat, A.numRows(), A.numCols(), A.nnz(), csr_row_ptr, csr_col_ind, csr_val, offset_index_type, entry_index_type, rocsparse_index_base_zero, compute_type)); /* Size and allocate buffer, and analyze the matrix */ #if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, - &beta, vecY, compute_type, alg, rocsparse_spmv_stage_buffer_size, - &subhandle->bufferSize, nullptr)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( + rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, + alg, rocsparse_spmv_stage_buffer_size, &subhandle->bufferSize, nullptr)); KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&subhandle->buffer, subhandle->bufferSize)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, - &beta, vecY, compute_type, alg, rocsparse_spmv_stage_preprocess, - &subhandle->bufferSize, subhandle->buffer)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( + rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, + alg, rocsparse_spmv_stage_preprocess, &subhandle->bufferSize, subhandle->buffer)); #elif KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, - vecX, &beta, vecY, compute_type, alg, rocsparse_spmv_stage_auto, - &subhandle->bufferSize, nullptr)); + 
KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( + rocsparse_spmv_ex(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, &beta, vecY, + compute_type, alg, rocsparse_spmv_stage_auto, &subhandle->bufferSize, nullptr)); KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&subhandle->buffer, subhandle->bufferSize)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_spmv_ex( rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, alg, rocsparse_spmv_stage_preprocess, &subhandle->bufferSize, subhandle->buffer)); #else - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, - &beta, vecY, compute_type, alg, &subhandle->bufferSize, nullptr)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, + vecX, &beta, vecY, compute_type, alg, &subhandle->bufferSize, + nullptr)); KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&subhandle->buffer, subhandle->bufferSize)); #endif } @@ -364,21 +365,21 @@ void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], /* Perform the actual computation */ #if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, - &beta, vecY, compute_type, alg, rocsparse_spmv_stage_compute, - &subhandle->bufferSize, subhandle->buffer)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( + rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, + alg, rocsparse_spmv_stage_compute, &subhandle->bufferSize, subhandle->buffer)); #elif KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, - &beta, vecY, compute_type, alg, rocsparse_spmv_stage_compute, - &subhandle->bufferSize, subhandle->buffer)); 
+ KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL( + rocsparse_spmv_ex(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, &beta, vecY, compute_type, + alg, rocsparse_spmv_stage_compute, &subhandle->bufferSize, subhandle->buffer)); #else - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, - &beta, vecY, compute_type, alg, &subhandle->bufferSize, - subhandle->buffer)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, + vecX, &beta, vecY, compute_type, alg, &subhandle->bufferSize, + subhandle->buffer)); #endif - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnvec_descr(vecY)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnvec_descr(vecX)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_dnvec_descr(vecY)); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(rocsparse_destroy_dnvec_descr(vecX)); } #define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, LAYOUT) \ @@ -548,7 +549,7 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP) #undef KOKKOSSPARSE_SPMV_MKL #endif -#if defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +#if defined(KOKKOS_ENABLE_SYCL) inline oneapi::mkl::transpose mode_kk_to_onemkl(char mode_kk) { switch (toupper(mode_kk)) { case 'N': return oneapi::mkl::transpose::nontrans; diff --git a/sparse/unit_test/Test_Sparse_SortCrs.hpp b/sparse/unit_test/Test_Sparse_SortCrs.hpp index 6898b8aa9d..0877665ae0 100644 --- a/sparse/unit_test/Test_Sparse_SortCrs.hpp +++ b/sparse/unit_test/Test_Sparse_SortCrs.hpp @@ -42,11 +42,11 @@ enum : int { } template -void testSortCRS(default_lno_t numRows, default_lno_t numCols, default_size_type nnz, bool doValues, - bool doStructInterface, int howExecSpecified) { - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; +void testSortCRS(KokkosKernels::default_lno_t numRows, 
KokkosKernels::default_lno_t numCols, + KokkosKernels::default_size_type nnz, bool doValues, bool doStructInterface, int howExecSpecified) { + using scalar_t = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; using exec_space = typename device_t::execution_space; using crsMat_t = KokkosSparse::CrsMatrix; // Create a random matrix on device @@ -135,9 +135,9 @@ void testSortCRS(default_lno_t numRows, default_lno_t numCols, default_size_type template void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { // This test is about bug #960. - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; + using scalar_t = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; using exec_space = typename device_t::execution_space; using crsMat_t = KokkosSparse::CrsMatrix, size_type>; @@ -175,9 +175,9 @@ void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { template void testSortAndMerge(bool justGraph, int howExecSpecified, bool doStructInterface, bool inPlace, int testCase) { - using size_type = default_size_type; - using lno_t = default_lno_t; - using scalar_t = default_scalar; + using size_type = KokkosKernels::default_size_type; + using lno_t = KokkosKernels::default_lno_t; + using scalar_t = KokkosKernels::default_scalar; using exec_space = typename device_t::execution_space; using crsMat_t = KokkosSparse::CrsMatrix; using graph_t = typename crsMat_t::staticcrsgraph_type; diff --git a/sparse/unit_test/Test_Sparse_Transpose.hpp b/sparse/unit_test/Test_Sparse_Transpose.hpp index da430c6ca4..d8ddaf500b 100644 --- a/sparse/unit_test/Test_Sparse_Transpose.hpp +++ b/sparse/unit_test/Test_Sparse_Transpose.hpp @@ -44,9 +44,9 @@ template void testTranspose(int numRows, int numCols, bool doValues) { using exec_space = typename device_t::execution_space; using 
range_pol = Kokkos::RangePolicy; - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; + using scalar_t = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; using crsMat_t = typename KokkosSparse::CrsMatrix; using c_rowmap_t = typename crsMat_t::row_map_type; using c_entries_t = typename crsMat_t::index_type; @@ -106,7 +106,7 @@ template void CompareBsrMatrices(bsrMat_t& A, bsrMat_t& B) { using exec_space = typename bsrMat_t::execution_space; using range_pol = Kokkos::RangePolicy; - using size_type = default_size_type; + using size_type = KokkosKernels::default_size_type; using c_rowmap_t = typename bsrMat_t::row_map_type; using c_entries_t = typename bsrMat_t::index_type; using values_t = typename bsrMat_t::values_type::non_const_type; @@ -133,9 +133,9 @@ void CompareBsrMatrices(bsrMat_t& A, bsrMat_t& B) { template void testTransposeBsrRef() { - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; + using scalar_t = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; using bsrMat_t = typename KokkosSparse::Experimental::BsrMatrix; using rowmap_t = typename bsrMat_t::row_map_type::non_const_type; using entries_t = typename bsrMat_t::index_type::non_const_type; @@ -199,9 +199,9 @@ void testTransposeBsrRef() { template void testTransposeBsr(int numRows, int numCols, int blockSize) { - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; + using scalar_t = KokkosKernels::default_scalar; + using lno_t = KokkosKernels::default_lno_t; + using size_type = KokkosKernels::default_size_type; using exec_space = typename device_t::execution_space; using bsrMat_t = typename KokkosSparse::Experimental::BsrMatrix; using c_rowmap_t = typename bsrMat_t::row_map_type; diff --git 
a/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp index 80c23356ce..7c6bddfd4d 100644 --- a/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_block_gauss_seidel.hpp @@ -220,7 +220,7 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth typedef typename crsMat_t::values_type::non_const_type scalar_view_t; typedef typename crsMat_t::StaticCrsGraphType::row_map_type::non_const_type lno_view_t; typedef typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type lno_nnz_view_t; - typedef Kokkos::View scalar_view2d_t; + typedef Kokkos::View scalar_view2d_t; typedef typename Kokkos::ArithTraits::mag_type mag_t; lno_t numCols = numRows; diff --git a/sparse/unit_test/Test_Sparse_bspgemm.hpp b/sparse/unit_test/Test_Sparse_bspgemm.hpp index 32168f1686..f5314e516d 100644 --- a/sparse/unit_test/Test_Sparse_bspgemm.hpp +++ b/sparse/unit_test/Test_Sparse_bspgemm.hpp @@ -169,7 +169,7 @@ void test_bspgemm(lno_t blkDim, lno_t m, lno_t k, lno_t n, size_type nnz, lno_t SPGEMM_KK, SPGEMM_KK_MEMORY /* alias SPGEMM_KK_MEMSPEED */, SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */ }; - if (!KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (!KokkosKernels::Impl::is_gpu_exec_space_v) { // SPGEMM_KK_LP is useful on CPU to cover MultiCoreTag4 functor // (otherwise skipped) but on GPU it's same as SPGEMM_KK, so we can skip it. 
algorithms.push_back(SPGEMM_KK_LP); diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 92109f02dd..3704b63fbd 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -14,6 +14,13 @@ // //@HEADER +#include +#include +#include +#include +#include +#include + #include #include @@ -25,17 +32,13 @@ #include #include #include -#include -#include -#include -#include -#include -#include + #include "KokkosSparse_gauss_seidel.hpp" #include "KokkosSparse_partitioning_impl.hpp" #include "KokkosSparse_sor_sequential_impl.hpp" #include "KokkosSparse_SortCrs.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_TestMatrixUtils.hpp" #include "Test_Sparse_Utils.hpp" // #ifndef kokkos_complex_double @@ -248,8 +251,8 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_ using namespace Test; srand(245); typedef typename KokkosSparse::CrsMatrix crsMat_t; - typedef Kokkos::View scalar_view2d_t; - typedef Kokkos::View host_scalar_view2d_t; + typedef Kokkos::View scalar_view2d_t; + typedef Kokkos::View host_scalar_view2d_t; typedef typename Kokkos::ArithTraits::mag_type mag_t; lno_t numCols = numRows; diff --git a/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp b/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp index 8df02a1d4d..dc2d199f09 100644 --- a/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp +++ b/sparse/unit_test/Test_Sparse_removeCrsMatrixZeros.hpp @@ -217,7 +217,8 @@ void getTestInput(int test, Matrix& A, Matrix& Afiltered_ref) { void testRemoveCrsMatrixZeros(int testCase) { using namespace TestRemoveCrsMatrixZeros; - using Matrix = KokkosSparse::CrsMatrix; + using Matrix = KokkosSparse::CrsMatrix; Matrix A, Afiltered_ref; getTestInput(testCase, A, Afiltered_ref); Matrix Afiltered_actual = KokkosSparse::removeCrsMatrixZeros(A); diff --git a/sparse/unit_test/Test_Sparse_rocsparse.hpp 
b/sparse/unit_test/Test_Sparse_rocsparse.hpp index 379b422d7f..6ec48e6665 100644 --- a/sparse/unit_test/Test_Sparse_rocsparse.hpp +++ b/sparse/unit_test/Test_Sparse_rocsparse.hpp @@ -47,11 +47,11 @@ void test_rocsparse_safe_call() { bool caught_exception = false; rocsparse_status myStatus = rocsparse_status_success; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(myStatus); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(myStatus); try { myStatus = rocsparse_status_internal_error; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(myStatus); + KOKKOSSPARSE_IMPL_ROCSPARSE_SAFE_CALL(myStatus); } catch (std::runtime_error& e) { caught_exception = true; } diff --git a/sparse/unit_test/Test_Sparse_spiluk.hpp b/sparse/unit_test/Test_Sparse_spiluk.hpp index 1fde1ac5ed..89511dcf37 100644 --- a/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -134,12 +134,12 @@ struct SpilukTest { const scalar_t MONE = scalar_t(-1); // Create a reference view e set to all 1's - ValuesType e_one("e_one", nrows * block_size); + ValuesType e_one("e_one", static_cast(nrows) * block_size); Kokkos::deep_copy(e_one, ONE); // Create two views for spmv results - ValuesType bb("bb", nrows * block_size); - ValuesType bb_tmp("bb_tmp", nrows * block_size); + ValuesType bb("bb", static_cast(nrows) * block_size); + ValuesType bb_tmp("bb_tmp", static_cast(nrows) * block_size); // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); @@ -244,8 +244,8 @@ struct SpilukTest { Kokkos::resize(L_entries, spiluk_handle->get_nnzL()); Kokkos::resize(U_entries, spiluk_handle->get_nnzU()); - ValuesType L_values("L_values", spiluk_handle->get_nnzL() * block_items); - ValuesType U_values("U_values", spiluk_handle->get_nnzU() * block_items); + ValuesType L_values("L_values", spiluk_handle->get_nnzL() * static_cast(block_items)); + ValuesType U_values("U_values", spiluk_handle->get_nnzU() * static_cast(block_items)); spiluk_numeric(&kh, fill_lev, row_map, entries, values, 
L_row_map, L_entries, L_values, U_row_map, U_entries, U_values); @@ -579,7 +579,7 @@ struct SpilukTest { RowMapType_hostmirror hrow_map("hrow_map", bnrows + 1); EntriesType_hostmirror hentries("hentries", bnnz); - ValuesType_hostmirror hvalues("hvalues", bnnz * block_items); + ValuesType_hostmirror hvalues("hvalues", static_cast(bnnz) * block_items); Kokkos::deep_copy(hrow_map, brow_map); Kokkos::deep_copy(hentries, bentries); @@ -591,7 +591,7 @@ struct SpilukTest { // Allocate A as input A_row_map_v[i] = RowMapType("A_row_map", bnrows + 1); A_entries_v[i] = EntriesType("A_entries", bnnz); - A_values_v[i] = ValuesType("A_values", bnnz * block_items); + A_values_v[i] = ValuesType("A_values", static_cast(bnnz) * block_items); // Copy from host to device Kokkos::deep_copy(A_row_map_v[i], hrow_map); @@ -619,8 +619,8 @@ struct SpilukTest { Kokkos::resize(L_entries_v[i], spiluk_handle->get_nnzL()); Kokkos::resize(U_entries_v[i], spiluk_handle->get_nnzU()); - L_values_v[i] = ValuesType("L_values", spiluk_handle->get_nnzL() * block_items); - U_values_v[i] = ValuesType("U_values", spiluk_handle->get_nnzU() * block_items); + L_values_v[i] = ValuesType("L_values", spiluk_handle->get_nnzL() * static_cast(block_items)); + U_values_v[i] = ValuesType("U_values", spiluk_handle->get_nnzU() * static_cast(block_items)); } // Done handle creation and spiluk_symbolic on all streams // Numeric phase diff --git a/sparse/unit_test/Test_Sparse_spmv.hpp b/sparse/unit_test/Test_Sparse_spmv.hpp index 43a9bd11e9..200a982d61 100644 --- a/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/sparse/unit_test/Test_Sparse_spmv.hpp @@ -864,9 +864,9 @@ void test_github_issue_101() { // vectors. Include a little extra in case the implementers decide // to strip-mine that. 
constexpr int numVecs = 22; - Kokkos::View X("X", numCols, numVecs); + Kokkos::View X("X", numCols, numVecs); Kokkos::deep_copy(X, static_cast(1.0)); - Kokkos::View Y("Y", numRows, numVecs); + Kokkos::View Y("Y", numRows, numVecs); auto Y_h = Kokkos::create_mirror_view(Y); // we'll want this later // Start with the easy test case, where the matrix and the vectors diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index dbc1fb7e02..53f1de5e56 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -316,8 +316,8 @@ auto random_vecs_for_spmv(const char *mode, const Bsr &a, const bool nans = fals using execution_space = typename Bsr::execution_space; using policy_type = Kokkos::RangePolicy; - size_t nx = a.numCols() * a.blockDim(); - size_t ny = a.numRows() * a.blockDim(); + size_t nx = static_cast(a.numCols()) * a.blockDim(); + size_t ny = static_cast(a.numRows()) * a.blockDim(); if (mode_is_transpose(mode)) { std::swap(nx, ny); } @@ -366,7 +366,7 @@ void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs, size_t ma // Tensor core algorithm temporarily disabled, fails on V100 /* - if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if constexpr (KokkosKernels::Impl::is_gpu_exec_space_v) { #if defined(KOKKOS_ENABLE_CUDA) if constexpr (std::is_same_v) { #if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) @@ -557,8 +557,8 @@ auto random_multivecs_for_spm_mv(const char *mode, const Bsr &a, const size_t nu using execution_space = typename Bsr::execution_space; using policy_type = Kokkos::RangePolicy; - size_t nx = a.numCols() * a.blockDim(); - size_t ny = a.numRows() * a.blockDim(); + size_t nx = static_cast(a.numCols()) * a.blockDim(); + size_t ny = static_cast(a.numRows()) * a.blockDim(); if (mode_is_transpose(mode)) { std::swap(nx, ny); } @@ -607,7 +607,7 @@ void test_spm_mv_combos(const char *mode, const Bsr &a, const Crs &acrs, size_t // 
Tensor core algorithm temporarily disabled, fails on V100 /* - if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if constexpr (KokkosKernels::Impl::is_gpu_exec_space_v) { #if defined(KOKKOS_ENABLE_CUDA) if constexpr (std::is_same_v) { #if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) diff --git a/sparse/unit_test/Test_Sparse_sptrsv.hpp b/sparse/unit_test/Test_Sparse_sptrsv.hpp index 385367bca2..a3f3f4ab7a 100644 --- a/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -179,6 +179,15 @@ struct SptrsvTest { const size_type nrows = row_map.size() - 1; for (auto alg : algs) { + // FIXME CUDA+Clang+Complex seems to expose a compiler bug +#if defined(__clang__) && defined(KOKKOS_ENABLE_CUDA) + if (alg == SPTRSVAlgorithm::SEQLVLSCHD_TP1 && Kokkos::ArithTraits::isComplex && + std::is_same_v && block_size != 0) { + std::cerr << "Skipping TP1 alg test for blocked mtx. There's a compiler bug " + << "for clang+CUDA+complex" << std::endl; + continue; + } +#endif KernelHandle kh; kh.create_sptrsv_handle(alg, nrows, is_lower, block_size); if (alg == SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { diff --git a/test_common/KokkosKernels_TestMatrixUtils.hpp b/test_common/KokkosKernels_TestMatrixUtils.hpp new file mode 100644 index 0000000000..998ccdba90 --- /dev/null +++ b/test_common/KokkosKernels_TestMatrixUtils.hpp @@ -0,0 +1,89 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_TESTMATRIXUTILS_HPP +#define KOKKOSKERNELS_TESTMATRIXUTILS_HPP + +#include + +#include "KokkosKernels_Utils.hpp" +#include "KokkosKernels_IOUtils.hpp" +#include "Kokkos_ArithTraits.hpp" +#include "KokkosBatched_Vector.hpp" +// Make this include-able from all subdirectories + +namespace Test { + +template +crsMat_t symmetrize(crsMat_t A) { + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + typedef typename graph_t::row_map_type::non_const_type lno_view_t; + typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; + auto host_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); + auto host_entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); + auto host_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values); + lno_t numRows = A.numRows(); + // symmetrize as input_mat + input_mat^T, to still have a diagonally dominant + // matrix + typedef std::map Row; + std::vector symRows(numRows); + for (lno_t r = 0; r < numRows; r++) { + auto& row = symRows[r]; + for (size_type i = host_rowmap(r); i < host_rowmap(r + 1); i++) { + lno_t c = host_entries(i); + auto& col = symRows[c]; + auto it = row.find(c); + if (it == row.end()) + row[c] = host_values(i); + else + row[c] += host_values(i); + it = col.find(r); + if (it == col.end()) + col[r] = host_values(i); + else + col[r] += host_values(i); + } + } + // Count entries + Kokkos::View new_host_rowmap("Rowmap", numRows + 1); + size_t accum = 0; + for (lno_t r = 0; r <= numRows; r++) { + new_host_rowmap(r) = accum; + if (r < numRows) accum += symRows[r].size(); + } + // Allocate new entries/values + Kokkos::View new_host_entries("Entries", accum); + Kokkos::View new_host_values("Values", accum); + for (lno_t r = 0; r < numRows; r++) { + auto rowIt = 
symRows[r].begin(); + for (size_type i = new_host_rowmap(r); i < new_host_rowmap(r + 1); i++) { + new_host_entries(i) = rowIt->first; + new_host_values(i) = rowIt->second; + rowIt++; + } + } + lno_view_t new_rowmap("Rowmap", numRows + 1); + lno_nnz_view_t new_entries("Entries", accum); + scalar_view_t new_values("Values", accum); + Kokkos::deep_copy(new_rowmap, new_host_rowmap); + Kokkos::deep_copy(new_entries, new_host_entries); + Kokkos::deep_copy(new_values, new_host_values); + return crsMat_t("SymA", numRows, numRows, accum, new_values, new_rowmap, new_entries); +} + +} // namespace Test +#endif diff --git a/test_common/KokkosKernels_TestStringUtils.hpp b/test_common/KokkosKernels_TestStringUtils.hpp new file mode 100644 index 0000000000..fec1b09361 --- /dev/null +++ b/test_common/KokkosKernels_TestStringUtils.hpp @@ -0,0 +1,38 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_TESTSTRINGUTILS_HPP +#define KOKKOSKERNELS_TESTSTRINGUTILS_HPP + +#include +#include + +namespace Test { + +inline int string_compare_no_case(const char* str1, const char* str2) { + std::string str1_s(str1); + std::string str2_s(str2); + for (size_t i = 0; i < str1_s.size(); i++) str1_s[i] = std::tolower(str1_s[i]); + for (size_t i = 0; i < str2_s.size(); i++) str2_s[i] = std::tolower(str2_s[i]); + return std::strcmp(str1_s.c_str(), str2_s.c_str()); +} + +inline int string_compare_no_case(const std::string& str1, const std::string& str2) { + return string_compare_no_case(str1.c_str(), str2.c_str()); +} + +} // namespace Test +#endif diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 24c07925e5..1d33bc8b75 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -212,153 +212,6 @@ using halfScalarType = Kokkos::Experimental::half_t; using bhalfScalarType = Kokkos::Experimental::bhalf_t; #endif // KOKKOS_BHALF_T_IS_FLOAT -template -struct SharedVanillaGEMM { - bool A_t, B_t, A_c, B_c; - int C_rows, C_cols, A_cols; - ViewTypeA A; - ViewTypeB B; - ViewTypeC C; - - typedef typename ViewTypeA::value_type ScalarA; - typedef typename ViewTypeB::value_type ScalarB; - typedef typename ViewTypeC::value_type ScalarC; - typedef Kokkos::View SubviewTypeA; - typedef Kokkos::View SubviewTypeB; - typedef Kokkos::ArithTraits APT; - typedef typename APT::mag_type mag_type; - ScalarA alpha; - ScalarC beta; - - KOKKOS_INLINE_FUNCTION - void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, C_rows), [&](const int& i) { - // Give each kokkos thread a vector of A - SubviewTypeA a_vec; - if (A_t) - a_vec = Kokkos::subview(A, Kokkos::ALL(), i); - else - a_vec = Kokkos::subview(A, i, Kokkos::ALL()); - - // Have all vector 
lanes perform the dot product - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, C_cols), [&](const int& j) { - SubviewTypeB b_vec; - if (B_t) - b_vec = Kokkos::subview(B, j, Kokkos::ALL()); - else - b_vec = Kokkos::subview(B, Kokkos::ALL(), j); - ScalarC ab = ScalarC(0); - for (int k = 0; k < A_cols; k++) { - auto a = A_c ? APT::conj(a_vec(k)) : a_vec(k); - auto b = B_c ? APT::conj(b_vec(k)) : b_vec(k); - ab += a * b; - } - C(i, j) = beta * C(i, j) + alpha * ab; - }); - }); - } -}; -// C(i,:,:) = alpha * (A(i,:,:) * B(i,:,:)) + beta * C(i,:,:) -template -struct Functor_BatchedVanillaGEMM { - bool A_t, B_t, A_c, B_c, batch_size_last_dim = false; - ViewTypeA A; - ViewTypeB B; - ViewTypeC C; - - using ScalarA = typename ViewTypeA::value_type; - using ScalarB = typename ViewTypeB::value_type; - using ScalarC = typename ViewTypeC::value_type; - using SubviewTypeA = typename Kokkos::View; - using SubviewTypeB = typename Kokkos::View; - using SubviewTypeC = typename Kokkos::View; - - ScalarA alpha; - ScalarC beta; - - KOKKOS_INLINE_FUNCTION - void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { - int i = team.league_rank(); - SubviewTypeA _A; - SubviewTypeB _B; - SubviewTypeC _C; - - if (batch_size_last_dim) { - _A = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), i); - _B = Kokkos::subview(B, Kokkos::ALL(), Kokkos::ALL(), i); - _C = Kokkos::subview(C, Kokkos::ALL(), Kokkos::ALL(), i); - } else { - _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); - _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); - } - struct SharedVanillaGEMM vgemm; - vgemm.A_t = A_t; - vgemm.B_t = B_t; - vgemm.A_c = A_c; - vgemm.B_c = B_c; - vgemm.C_rows = batch_size_last_dim ? C.extent(0) : C.extent(1); - vgemm.C_cols = batch_size_last_dim ? C.extent(1) : C.extent(2); - vgemm.A_cols = batch_size_last_dim ? (A_t ? A.extent(0) : A.extent(1)) : (A_t ? 
A.extent(1) : A.extent(2)); - vgemm.A = _A; - vgemm.B = _B; - vgemm.C = _C; - vgemm.alpha = alpha; - vgemm.beta = beta; - vgemm(team); - } - - inline void run() { - Kokkos::parallel_for( - "Test::VanillaGEMM", - Kokkos::TeamPolicy(batch_size_last_dim ? C.extent(2) : C.extent(0), Kokkos::AUTO, - KokkosKernels::Impl::kk_get_max_vector_size()), - *this); - } -}; - -// Compute C := alpha * AB + beta * C -template -void vanillaGEMM(typename ViewTypeC::non_const_value_type alpha, const ViewTypeA& A, const ViewTypeB& B, - typename ViewTypeC::non_const_value_type beta, const ViewTypeC& C) { - using value_type = typename ViewTypeC::non_const_value_type; - using KAT = Kokkos::ArithTraits; - int m = A.extent(0); - int k = A.extent(1); - int n = B.extent(1); - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - value_type sum = KAT::zero(); - for (int ii = 0; ii < k; ii++) { - sum += A(i, ii) * B(ii, j); - } - C(i, j) = alpha * sum + beta * C(i, j); - } - } -} - -template -KOKKOS_INLINE_FUNCTION void vanillaGEMV(char mode, AlphaType alpha, const ViewTypeA& A, const ViewTypeX& x, - BetaType beta, const ViewTypeY& y) { - using ScalarY = typename ViewTypeY::non_const_value_type; - using KAT_A = Kokkos::ArithTraits; - const bool transposed = mode == 'T' || mode == 'C'; - const bool conjugated = mode == 'C'; - const bool has_beta = beta != Kokkos::ArithTraits::zero(); - int M = A.extent(transposed ? 1 : 0); - int N = A.extent(transposed ? 0 : 1); - for (int i = 0; i < M; i++) { - ScalarY y_i{}; - if (has_beta) y_i = beta * y(i); - for (int j = 0; j < N; j++) { - const auto a = transposed ? A(j, i) : A(i, j); - const auto Aij = conjugated ? 
KAT_A::conj(a) : a; - y_i += alpha * Aij * x(j); - } - y(i) = y_i; - } -} - template class epsilon { public: @@ -367,64 +220,6 @@ class epsilon { using KokkosKernels::Impl::getRandomBounds; -template -crsMat_t symmetrize(crsMat_t A) { - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef typename graph_t::row_map_type::non_const_type lno_view_t; - typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; - auto host_rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.row_map); - auto host_entries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.graph.entries); - auto host_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A.values); - lno_t numRows = A.numRows(); - // symmetrize as input_mat + input_mat^T, to still have a diagonally dominant - // matrix - typedef std::map Row; - std::vector symRows(numRows); - for (lno_t r = 0; r < numRows; r++) { - auto& row = symRows[r]; - for (size_type i = host_rowmap(r); i < host_rowmap(r + 1); i++) { - lno_t c = host_entries(i); - auto& col = symRows[c]; - auto it = row.find(c); - if (it == row.end()) - row[c] = host_values(i); - else - row[c] += host_values(i); - it = col.find(r); - if (it == col.end()) - col[r] = host_values(i); - else - col[r] += host_values(i); - } - } - // Count entries - Kokkos::View new_host_rowmap("Rowmap", numRows + 1); - size_t accum = 0; - for (lno_t r = 0; r <= numRows; r++) { - new_host_rowmap(r) = accum; - if (r < numRows) accum += symRows[r].size(); - } - // Allocate new entries/values - Kokkos::View new_host_entries("Entries", accum); - Kokkos::View new_host_values("Values", accum); - for (lno_t r = 0; r < numRows; r++) { - auto rowIt = symRows[r].begin(); - for (size_type i = new_host_rowmap(r); i < new_host_rowmap(r + 1); i++) { - new_host_entries(i) = rowIt->first; - new_host_values(i) = rowIt->second; - rowIt++; - } - } - lno_view_t 
new_rowmap("Rowmap", numRows + 1); - lno_nnz_view_t new_entries("Entries", accum); - scalar_view_t new_values("Values", accum); - Kokkos::deep_copy(new_rowmap, new_host_rowmap); - Kokkos::deep_copy(new_entries, new_host_entries); - Kokkos::deep_copy(new_values, new_host_values); - return crsMat_t("SymA", numRows, numRows, accum, new_values, new_rowmap, new_entries); -} - // create_random_x_vector and create_random_y_vector can be used together to // generate a random linear system Ax = y. template @@ -485,17 +280,6 @@ std::string value_type_name>() { return "::ComplexDouble"; } -int string_compare_no_case(const char* str1, const char* str2) { - std::string str1_s(str1); - std::string str2_s(str2); - for (size_t i = 0; i < str1_s.size(); i++) str1_s[i] = std::tolower(str1_s[i]); - for (size_t i = 0; i < str2_s.size(); i++) str2_s[i] = std::tolower(str2_s[i]); - return strcmp(str1_s.c_str(), str2_s.c_str()); -} - -int string_compare_no_case(const std::string& str1, const std::string& str2) { - return string_compare_no_case(str1.c_str(), str2.c_str()); -} /// /brief Coo matrix class for testing purposes. /// \tparam ScalarType /// \tparam LayoutType @@ -561,7 +345,7 @@ class RandCooMat { /// \tparam LayoutType /// \tparam Device template + typename Size = KokkosKernels::default_size_type> class RandCsMatrix { public: using value_type = ScalarType; diff --git a/test_common/KokkosKernels_TestVanilla.hpp b/test_common/KokkosKernels_TestVanilla.hpp new file mode 100644 index 0000000000..43ab7a3cf0 --- /dev/null +++ b/test_common/KokkosKernels_TestVanilla.hpp @@ -0,0 +1,177 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_TESTVANILLA_HPP +#define KOKKOSKERNELS_TESTVANILLA_HPP + +#include + +#include "KokkosKernels_Utils.hpp" +#include "KokkosKernels_IOUtils.hpp" +#include "Kokkos_ArithTraits.hpp" +#include "KokkosBatched_Vector.hpp" + +namespace Test { + +template +struct SharedVanillaGEMM { + bool A_t, B_t, A_c, B_c; + int C_rows, C_cols, A_cols; + ViewTypeA A; + ViewTypeB B; + ViewTypeC C; + + typedef typename ViewTypeA::value_type ScalarA; + typedef typename ViewTypeB::value_type ScalarB; + typedef typename ViewTypeC::value_type ScalarC; + typedef Kokkos::View SubviewTypeA; + typedef Kokkos::View SubviewTypeB; + typedef Kokkos::ArithTraits APT; + typedef typename APT::mag_type mag_type; + ScalarA alpha; + ScalarC beta; + + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, C_rows), [&](const int& i) { + // Give each kokkos thread a vector of A + SubviewTypeA a_vec; + if (A_t) + a_vec = Kokkos::subview(A, Kokkos::ALL(), i); + else + a_vec = Kokkos::subview(A, i, Kokkos::ALL()); + + // Have all vector lanes perform the dot product + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, C_cols), [&](const int& j) { + SubviewTypeB b_vec; + if (B_t) + b_vec = Kokkos::subview(B, j, Kokkos::ALL()); + else + b_vec = Kokkos::subview(B, Kokkos::ALL(), j); + ScalarC ab = ScalarC(0); + for (int k = 0; k < A_cols; k++) { + auto a = A_c ? APT::conj(a_vec(k)) : a_vec(k); + auto b = B_c ? 
APT::conj(b_vec(k)) : b_vec(k); + ab += a * b; + } + C(i, j) = beta * C(i, j) + alpha * ab; + }); + }); + } +}; +// C(i,:,:) = alpha * (A(i,:,:) * B(i,:,:)) + beta * C(i,:,:) +template +struct Functor_BatchedVanillaGEMM { + bool A_t, B_t, A_c, B_c, batch_size_last_dim = false; + ViewTypeA A; + ViewTypeB B; + ViewTypeC C; + + using ScalarA = typename ViewTypeA::value_type; + using ScalarB = typename ViewTypeB::value_type; + using ScalarC = typename ViewTypeC::value_type; + using SubviewTypeA = typename Kokkos::View; + using SubviewTypeB = typename Kokkos::View; + using SubviewTypeC = typename Kokkos::View; + + ScalarA alpha; + ScalarC beta; + + KOKKOS_INLINE_FUNCTION + void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { + int i = team.league_rank(); + SubviewTypeA _A; + SubviewTypeB _B; + SubviewTypeC _C; + + if (batch_size_last_dim) { + _A = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), i); + _B = Kokkos::subview(B, Kokkos::ALL(), Kokkos::ALL(), i); + _C = Kokkos::subview(C, Kokkos::ALL(), Kokkos::ALL(), i); + } else { + _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + } + struct SharedVanillaGEMM vgemm; + vgemm.A_t = A_t; + vgemm.B_t = B_t; + vgemm.A_c = A_c; + vgemm.B_c = B_c; + vgemm.C_rows = batch_size_last_dim ? C.extent(0) : C.extent(1); + vgemm.C_cols = batch_size_last_dim ? C.extent(1) : C.extent(2); + vgemm.A_cols = batch_size_last_dim ? (A_t ? A.extent(0) : A.extent(1)) : (A_t ? A.extent(1) : A.extent(2)); + vgemm.A = _A; + vgemm.B = _B; + vgemm.C = _C; + vgemm.alpha = alpha; + vgemm.beta = beta; + vgemm(team); + } + + inline void run() { + Kokkos::parallel_for( + "Test::VanillaGEMM", + Kokkos::TeamPolicy(batch_size_last_dim ? 
C.extent(2) : C.extent(0), Kokkos::AUTO, + KokkosKernels::Impl::kk_get_max_vector_size()), + *this); + } +}; + +// Compute C := alpha * AB + beta * C +template +void vanillaGEMM(typename ViewTypeC::non_const_value_type alpha, const ViewTypeA& A, const ViewTypeB& B, + typename ViewTypeC::non_const_value_type beta, const ViewTypeC& C) { + using value_type = typename ViewTypeC::non_const_value_type; + using KAT = Kokkos::ArithTraits; + int m = A.extent(0); + int k = A.extent(1); + int n = B.extent(1); + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + value_type sum = KAT::zero(); + for (int ii = 0; ii < k; ii++) { + sum += A(i, ii) * B(ii, j); + } + C(i, j) = alpha * sum + beta * C(i, j); + } + } +} + +template +KOKKOS_INLINE_FUNCTION void vanillaGEMV(char mode, AlphaType alpha, const ViewTypeA& A, const ViewTypeX& x, + BetaType beta, const ViewTypeY& y) { + using ScalarY = typename ViewTypeY::non_const_value_type; + using KAT_A = Kokkos::ArithTraits; + const bool transposed = mode == 'T' || mode == 'C'; + const bool conjugated = mode == 'C'; + const bool has_beta = beta != Kokkos::ArithTraits::zero(); + int M = A.extent(transposed ? 1 : 0); + int N = A.extent(transposed ? 0 : 1); + for (int i = 0; i < M; i++) { + ScalarY y_i{}; + if (has_beta) y_i = beta * y(i); + for (int j = 0; j < N; j++) { + const auto a = transposed ? A(j, i) : A(i, j); + const auto Aij = conjugated ? 
KAT_A::conj(a) : a; + y_i += alpha * Aij * x(j); + } + y(i) = y_i; + } +} + +} // namespace Test +#endif diff --git a/test_common/Kokkos_Performance.hpp b/test_common/Kokkos_Performance.hpp index 648f7c5356..10e366bc3c 100644 --- a/test_common/Kokkos_Performance.hpp +++ b/test_common/Kokkos_Performance.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_PERFORMANCE_HPP -#define KOKKOS_PERFORMANCE_HPP +#ifndef KOKKOSKERNELS_KOKKOS_PERFORMANCE_HPP +#define KOKKOSKERNELS_KOKKOS_PERFORMANCE_HPP #include "Kokkos_Core.hpp" #include @@ -555,4 +555,4 @@ void Performance::Tolerance::from_string(const std::string& valtol_str) { } // namespace KokkosKernels -#endif // KOKKOS_PERFORMANCE_HPP +#endif // KOKKOSKERNELS_KOKKOS_PERFORMANCE_HPP