From 9e7232ac347a54e0405a1b0bcfd593930a1b8095 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Thu, 10 Oct 2024 15:38:26 +0200 Subject: [PATCH] Fix bugs in generators --- .github/workflows/codegen.yml | 14 +++- pspamm/codegen/architectures/arm/generator.py | 24 +++--- .../architectures/arm_sve/generator.py | 4 +- pspamm/codegen/architectures/hsw/generator.py | 10 +-- pspamm/codegen/architectures/knl/generator.py | 23 +++-- .../architectures/knl/inlineprinter.py | 5 +- pspamm/codegen/precision.py | 3 + pspamm/matmul.py | 13 +-- tests/sve_testsuite_generator.py | 72 ++++++---------- tests/testsuite_generator.py | 84 +++++++++++-------- tests/unit_tests_arm_sve.py | 19 +++-- tests/unit_tests_hsw.py | 3 +- tests/unit_tests_knl.py | 3 +- 13 files changed, 149 insertions(+), 128 deletions(-) diff --git a/.github/workflows/codegen.yml b/.github/workflows/codegen.yml index 5a3a9ba..8894641 100644 --- a/.github/workflows/codegen.yml +++ b/.github/workflows/codegen.yml @@ -69,12 +69,14 @@ jobs: - name: pspamm-tests-compile run: | cd tests/ - g++ -static -mavx512f build/hsw_testsuite.cpp -o build/hsw-test + g++ -static -mavx2 build/hsw256_testsuite.cpp -o build/hsw256-test + g++ -static -mavx2 build/hsw128_testsuite.cpp -o build/hsw128-test - name: pspamm-tests-run run: | cd tests/ - qemu-x86_64-static -cpu Haswell build/hsw-test + qemu-x86_64-static -cpu Haswell build/hsw256-test + qemu-x86_64-static -cpu Haswell build/hsw128-test pspamm-codegen-avx512-no-run: name: pspamm-codegen-avx512-no-run @@ -107,13 +109,17 @@ jobs: - name: pspamm-tests-compile run: | cd tests/ - g++ -static -mavx512f build/knl_testsuite.cpp -o build/knl-test + g++ -static -mavx512f build/knl512_testsuite.cpp -o build/knl512-test + g++ -static -mavx512f build/knl256_testsuite.cpp -o build/knl256-test + g++ -static -mavx512f build/knl128_testsuite.cpp -o build/knl128-test # disabled, since qemu doesn't support AVX512F (yet) with of Ubuntu 24.04 # - name: pspamm-tests-run # run: | # cd tests/ - # qemu-x86_64-static -cpu Skylake-Server build/knl-test + # qemu-x86_64-static -cpu Skylake-Server build/knl512-test + # qemu-x86_64-static -cpu Skylake-Server build/knl256-test + # qemu-x86_64-static -cpu Skylake-Server build/knl128-test pspamm-codegen-aarch64: name: pspamm-codegen-aarch64 diff --git a/pspamm/codegen/architectures/arm/generator.py b/pspamm/codegen/architectures/arm/generator.py index 89de2e3..fcb6a75 100644 --- a/pspamm/codegen/architectures/arm/generator.py +++ b/pspamm/codegen/architectures/arm/generator.py @@ -32,9 +32,7 @@ class Generator(AbstractGenerator): """ def get_v_size(self): - if self.precision == Precision.DOUBLE: - return 2 - raise NotImplementedError + return 16 // self.precision.size() def get_template(self): return Generator.template @@ -137,7 +135,7 @@ def move_register_block(self, next_offset = [0, 0] if ir+1 < rows: next_offset = [1, 0] - elif ic +1 < rows: + elif ic +1 < cols: next_offset = [0, 1] addr_next, comment_next = cursor.look(cursor_ptr, block_offset, Coords(down=(ir+next_offset[0])*v_size, right=ic+next_offset[1])) @@ -153,16 +151,16 @@ def move_register_block(self, addr.disp = 0 addr.base = additional_regs[0] - if not skipflag: - if store: - asm.add(st(registers[ir,ic], addr, True, comment)) - else: - asm.add(ld(addr, registers[ir,ic], True, comment)) - else: - if store: - asm.add(st(registers[ir,ic], addr, True, comment, registers[ir+next_offset[0],ic+next_offset[1]])) + if not skipflag: + if store: + asm.add(st(registers[ir,ic], addr, True, comment)) + else: + asm.add(ld(addr, 
registers[ir,ic], True, comment)) else: - asm.add(ld(addr, registers[ir,ic], True, comment, registers[ir+next_offset[0],ic+next_offset[1]])) + if store: + asm.add(st(registers[ir,ic], addr, True, comment, registers[ir+next_offset[0],ic+next_offset[1]])) + else: + asm.add(ld(addr, registers[ir,ic], True, comment, registers[ir+next_offset[0],ic+next_offset[1]])) return asm diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index ac56177..61a8217 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -230,7 +230,7 @@ def move_register_block(self, # addr = base "pointer" + relative offset in bytes addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.value * load_offset + addr.disp += self.precision.size() * load_offset # count how many elements we have processed between last step and this step cont_counter = ((addr.disp - prev_disp) // mul_vl) @@ -313,7 +313,7 @@ def make_microkernel(self, cur11 = -1000 Vm = max(self.ceil_div(bm, v_size), 1) - multiple = self.precision.value + multiple = self.precision.size() # for ld1rw (single prec): immediate offset is multiple of 4 in range of 0 to 252 # for ld1rd (double prec): immediate offset is multiple of 8 in range of 0 to 504 # in both cases: instruction encodes the immediate offset within 6 bits diff --git a/pspamm/codegen/architectures/hsw/generator.py b/pspamm/codegen/architectures/hsw/generator.py index 47bceae..912183a 100644 --- a/pspamm/codegen/architectures/hsw/generator.py +++ b/pspamm/codegen/architectures/hsw/generator.py @@ -87,11 +87,11 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: reg_count = 0 - for i in range(1024, min(max(nnz * self.precision.value, m*k*self.precision.value, m*n*self.precision.value),8000), 2048): + for i in range(1024, min(max(nnz * self.precision.size(), m*k*self.precision.size(), m*n*self.precision.size()),8000), 2048): additional_regs.append(available_regs[reg_count]) reg_count += 1 - for i in range(8192, min(nnz * self.precision.value, 33000), 8192): + for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): additional_regs.append(available_regs[reg_count]) reg_count += 1 @@ -134,7 +134,7 @@ def make_b_pointers(self, reg_count = 5 - for i in range(8192, min(nnz * self.precision.value, 33000), 8192): + for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): asm.add(lea(B_reg, additional_regs[reg_count], i)) reg_count += 1 @@ -183,7 +183,7 @@ def move_register_block(self, if (mask is None) or (mask[ir,ic]): cell_offset = Coords(down=ir*v_size, right=ic) addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.value * load_offset + addr.disp += self.precision.size() * load_offset if store: asm.add(mov(registers[ir,ic], addr, True, comment)) if prefetching == 'BL2viaC': @@ -212,7 +212,7 @@ def move_register_single(self, if (mask is None) or (mask[ir,ic]): cell_offset = Coords(down=ir*v_size, right=ic) addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.value * load_offset + addr.disp += self.precision.size() * load_offset asm.add(mov(addr, registers[ir,ic], True, comment)) return asm diff --git a/pspamm/codegen/architectures/knl/generator.py b/pspamm/codegen/architectures/knl/generator.py index ad8a6ae..9530ef8 100644 --- a/pspamm/codegen/architectures/knl/generator.py +++ 
b/pspamm/codegen/architectures/knl/generator.py @@ -44,7 +44,14 @@ def use_broadcast(self): return False def has_masks(self): - return True + return False # for now + + def pred_n_trues(self, count, v_size, mode): + # a bit hacky at the moment (won't work for all masks) + if count < v_size and count > 0: + return mask(0) + else: + return None def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int): vm = self.ceil_div(bm, v_size) @@ -74,15 +81,15 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n: reg_count = 0 - for i in range(1024, min(max(nnz * self.precision.value, m*k*self.precision.value, m*n*self.precision.value),8000), 2048): + for i in range(1024, min(max(nnz * self.precision.size(), m*k*self.precision.size(), m*n*self.precision.size()),8000), 2048): additional_regs.append(available_regs[reg_count]) reg_count += 1 - for i in range(8192, min(nnz * self.precision.value, 33000), 8192): + for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): additional_regs.append(available_regs[reg_count]) reg_count += 1 - loop_reg = [r(12), r(13), r(14)] + loop_regs = [r(12), r(13), r(14)] return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs @@ -93,8 +100,8 @@ def init_mask(self, bm, v_size, tempreg, maskregs): else: asm = block("Set mask register") restval = (1 << rest) - 1 - asm.add(mov(restval, tempreg)) - asm.add(mov(tempreg, maskreg[0])) + asm.add(mov(restval, tempreg, False)) + asm.add(mov(tempreg, maskregs[0], False)) return asm def bcst_alpha_beta(self, @@ -131,7 +138,7 @@ def make_b_pointers(self, reg_count = 5 - for i in range(8192, min(nnz * self.precision.value, 33000), 8192): + for i in range(8192, min(nnz * self.precision.size(), 33000), 8192): asm.add(lea(B_reg, additional_regs[reg_count], i)) reg_count += 1 @@ -180,7 +187,7 @@ def move_register_block(self, if (mask is None) or (mask[ir,ic]): cell_offset = Coords(down=ir*v_size, right=ic) addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) - addr.disp += self.precision.value * load_offset + addr.disp += self.precision.size() * load_offset if store: asm.add(mov(registers[ir,ic], addr, True, comment)) if prefetching == 'BL2viaC': diff --git a/pspamm/codegen/architectures/knl/inlineprinter.py b/pspamm/codegen/architectures/knl/inlineprinter.py index fe8aa1c..33b313f 100644 --- a/pspamm/codegen/architectures/knl/inlineprinter.py +++ b/pspamm/codegen/architectures/knl/inlineprinter.py @@ -54,7 +54,8 @@ def addLine(self, stmt: str, comment: str): self.output.append(line) - + def maskformat(self, pred): + pass def visitFma(self, stmt: FmaStmt): b = stmt.bcast_src.ugly @@ -75,7 +76,7 @@ def visitMul(self, stmt: MulStmt): b = stmt.src.ugly m = stmt.mult_src.ugly a = stmt.dest.ugly - regsize = stmt.add_dest.size() // 16 + regsize = stmt.dest.size() // 16 if stmt.mult_src.typeinfo == AsmType.i64: # in this case, m is a Register that points to alpha/beta; manually format to be a memory address s = "vmulp{} 0({})%{{1to{}%}}, {}, {}".format(self.precision, m, regsize * self.broadcast_multiplier, b, a) diff --git a/pspamm/codegen/precision.py b/pspamm/codegen/precision.py index df5153e..417c9a6 100644 --- a/pspamm/codegen/precision.py +++ b/pspamm/codegen/precision.py @@ -10,6 +10,9 @@ class Precision(Enum): def getCType(cls, precision): ctype = {cls.DOUBLE: 'double', cls.SINGLE: 'float', cls.HALF: 'uint16_t', cls.BFLOAT16: 'uint16_t'} return ctype[precision] + + def ctype(self): + return 
self.getCType(self) def size(self): return { diff --git a/pspamm/matmul.py b/pspamm/matmul.py index 0d5d6e1..6b43b9f 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -218,17 +218,18 @@ def __init__(self, if lda == 0: blocks, patterns, mtx_overhead = decompose_pattern(self.m, self.k, apattern, self.bm, self.bk) - self.A = BlockCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.value, blocks, patterns, mtx_overhead) + self.A = BlockCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.size(), blocks, patterns, mtx_overhead) + self.nnz += sum(mtx_overhead) else: - self.A = DenseCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.value) + self.A = DenseCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.size()) if ldb == 0: blocks, patterns, mtx_overhead = decompose_pattern(self.k, self.n, bpattern, self.bk, self.bn) - self.B = BlockCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.value, blocks, patterns, mtx_overhead) + self.B = BlockCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.size(), blocks, patterns, mtx_overhead) self.nnz += sum(mtx_overhead) else: - self.B = DenseCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.value) - self.C = DenseCursor("C", self.starting_regs[2], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.value) - self.C_pf = DenseCursor("C_pf", prefetchReg, self.m, self.n, self.ldc, self.bm, self.bn, self.precision.value) if prefetchReg else None + self.B = DenseCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.size()) + self.C = DenseCursor("C", self.starting_regs[2], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.size()) + self.C_pf = DenseCursor("C_pf", prefetchReg, self.m, self.n, self.ldc, self.bm, self.bn, self.precision.size()) if prefetchReg else None self.unroll = ldb == 0 or lda == 0 diff --git a/tests/sve_testsuite_generator.py b/tests/sve_testsuite_generator.py index d3832bf..3281ef2 100644 --- a/tests/sve_testsuite_generator.py +++ b/tests/sve_testsuite_generator.py @@ -31,6 +31,8 @@ def make(kernels, arch): f.write(test_generator.head_of_testsuite) + testcases = [] + for kern in kernels: arguments = ['pspamm-generator', str(kern.m), str(kern.n), str(kern.k), str(kern.lda), str(kern.ldb), str(kern.ldc), str(kern.alpha), str(kern.beta)] @@ -41,12 +43,12 @@ def make(kernels, arch): prec = 's' if kern.precision == Precision.SINGLE else 'd' arguments += ['--precision', prec] - block_sizes = list(set(kern.block_sizes)) + block_sizes = list(set(bs if len(bs) > 2 else (bs[0], bs[1], 1) for bs in kern.block_sizes)) for bs in block_sizes: bm = bs[0] bn = bs[1] - bk = bs[2] if len(bs) > 2 else 1 + bk = bs[2] if arch == "knl": assert (bm % 8 == 0 and (bn + 1) * (bm / 8) <= 32) @@ -58,11 +60,10 @@ def make(kernels, arch): reglen = veclen // 128 v_len = (16 // kern.precision.size()) * reglen # this should be the same assertion as in ../scripts/max_arm_sve.py - bk = 1 # ceiling division - vm = -(bm // -v_len) + vm = -(bm // -v_len) if not ((bn + bk) * vm + bn * bk <= 32): - print(f'Skipping block size {bm}x{bn} for {arch}') + print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') continue name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk) @@ -78,50 +79,15 @@ def make(kernels, 
arch): f.write('#include "' + arch + '/' + name + '.h"\n') - f.write('\n') - # necessary functions are defined in testsuite_generator.py - f.write(test_generator.function_definitions) - f.write(setup_prefetching) - f.write(test_generator.setup_main) - # add variable declarations for single precision test cases - f.write(""" std::tuple fpointers; - float falpha; float fbeta; - double* prefetch; - float* fprefetch; - """) - - for kern in kernels: - - block_sizes = list(set(kern.block_sizes)) - - for bs in block_sizes: - bm = bs[0] - bn = bs[1] - bk = bs[2] if len(bs) > 2 else 1 - - if arch.startswith("arm_sve"): - veclen = int(arch[7:]) - assert veclen % 128 == 0 and veclen <= 2048 - reglen = veclen // 128 - v_len = (16 // kern.precision.size()) * reglen - # this should be the same assertion as in ../scripts/max_arm_sve.py - bk = 1 - # ceiling division - vm = -( bm // -v_len) - if not ((bn + bk) * vm + bn * bk <= 32): - # print(f'Skipping block size {bm}x{bn} for {arch}') - continue - - name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk) - if isinstance(kern, SparseKernel): mtx = kern.mtx else: mtx = "" - # for double precision: set prec to '' to conform to test_generator.function_definitions - prec = 'f' if kern.precision == Precision.SINGLE else '' + + prec2 = 'f' if kern.precision == Precision.SINGLE else '' - f.write(""" + testcases += [ + """ {p}alpha = {alpha}; {p}beta = {beta}; ldb = {ldb}; {p}pointers = pre<{T}>({m}, {n}, {k}, {lda}, ldb, {ldc}, "{mtx}"); setup_prefetch({p}prefetch, std::get<3>({p}pointers), {n}, {ldc}); @@ -130,6 +96,22 @@ def make(kernels, arch): results.push_back(std::make_tuple("{name}", result)); free(std::get<0>({p}pointers)); free(std::get<1>({p}pointers)); free(std::get<2>({p}pointers)); free(std::get<3>({p}pointers)); free(std::get<4>({p}pointers)); free({p}prefetch); """.format(m=kern.m, n=kern.n, k=kern.k, lda=kern.lda, ldb=kern.ldb, ldc=kern.ldc, alpha=kern.alpha, beta=kern.beta, - mtx=mtx, delta=kern.delta, name=name, sparse=2 if kern.ldb == 0 else 1, p=prec, T="float" if prec == 'f' else "double")) + mtx=mtx, delta=kern.delta, name=name, sparse=2 if kern.ldb == 0 else 1, p=prec2, T="float" if prec == 'f' else "double") + ] + + f.write('\n') + # necessary functions are defined in testsuite_generator.py + f.write(test_generator.function_definitions) + f.write(setup_prefetching) + f.write(test_generator.setup_main) + # add variable declarations for single precision test cases + f.write(""" std::tuple fpointers; + float falpha; float fbeta; + double* prefetch; + float* fprefetch; + """) + + for testcase in testcases: + f.write(testcase) f.write(test_generator.end_of_testsuite) diff --git a/tests/testsuite_generator.py b/tests/testsuite_generator.py index e4bc9be..86f09a3 100755 --- a/tests/testsuite_generator.py +++ b/tests/testsuite_generator.py @@ -4,6 +4,7 @@ import random import sys import os.path +from pspamm.codegen.precision import * BASEDIR = 'build' @@ -145,22 +146,20 @@ int main() { std::vector> results; - std::tuple pointers; - int result; - - // A compiler related issue makes it necessary to store certain values in variables before using them - unsigned ldb; - double alpha; double beta; """ setup_single_testcase = """ - ldb = {ldb}; alpha = {alpha}; beta = {beta}; - pointers = pre({m}, {n}, {k}, {lda}, ldb, {ldc}, "{mtx}"); +{{ + const unsigned ldb = {ldb}; + const {precision} alpha = {alpha}; + const {precision} beta = {beta}; + auto pointers = pre<{precision}>({m}, {n}, {k}, {lda}, ldb, {ldc}, "{mtx}"); 
{name}(std::get<0>(pointers), std::get<{sparse}>(pointers), std::get<3>(pointers), {alpha}, {beta}, nullptr); - result = post({m}, {n}, {k}, {lda}, &ldb, {ldc}, &alpha, &beta, std::get<0>(pointers), std::get<1>(pointers), std::get<3>(pointers), std::get<4>(pointers), {delta:.7f}); + const auto result = post<{precision}>({m}, {n}, {k}, {lda}, &ldb, {ldc}, &alpha, &beta, std::get<0>(pointers), std::get<1>(pointers), std::get<3>(pointers), std::get<4>(pointers), {delta:.7f}); results.push_back(std::make_tuple("{name}", result)); free(std::get<0>(pointers)); free(std::get<1>(pointers)); free(std::get<2>(pointers)); free(std::get<3>(pointers)); free(std::get<4>(pointers)); +}} """ end_of_testsuite = """ @@ -219,6 +218,8 @@ def make(kernels, arch): f.write(head_of_testsuite) + testcases = [] + for kern in kernels: arguments = ['pspamm-generator', str(kern.m), str(kern.n), str(kern.k), str(kern.lda), str(kern.ldb), @@ -227,17 +228,38 @@ def make(kernels, arch): if isinstance(kern, SparseKernel): arguments += ['--mtx_filename', kern.mtx] - block_sizes = list(set(kern.block_sizes)) + prec = 's' if kern.precision == Precision.SINGLE else 'd' + arguments += ['--precision', prec] + + block_sizes = list(set(bs if len(bs) > 2 else (bs[0], bs[1], 1) for bs in kern.block_sizes)) for bs in block_sizes: bm = bs[0] bn = bs[1] - bk = bs[2] if len(bs) > 2 else 1 - - if arch == "knl": - assert (bm % 8 == 0 and (bn + 1) * (bm / 8) <= 32) - elif arch == "arm": - assert (bm % 2 == 0 and (bn + 1) * (bm / 2) + bn <= 32) + bk = bs[2] + + veclen = int(arch[3:]) if arch[3:] != '' else 128 + assert veclen % 128 == 0 + reglen = veclen // 128 + v_len = (16 // kern.precision.size()) * reglen + # this should be the same assertion as in ../scripts/max_arm_sve.py + # ceiling division + vm = -(bm // -v_len) + v_size = v_len + + if arch.startswith("knl"): + print(f'{bn} {bk} {vm} {bm} {v_size}') + if not ((bn+bk) * vm <= 32) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: + print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') + continue + elif arch.startswith("hsw"): + if not ((bn+bk) * vm + bn * bk <= 16) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: + print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') + continue + elif arch.startswith("arm"): + if not ((bn+bk) * vm + bn * bk <= 32) or not (kern.m % v_size) == 0 or not (bm % v_size) == 0: + print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}') + continue name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk) @@ -253,28 +275,24 @@ def make(kernels, arch): f.write('#include "' + arch + '/' + name + '.h"\n') - f.write('\n') - - f.write(function_definitions) - f.write(setup_main) - - for kern in kernels: - - block_sizes = list(set(kern.block_sizes)) - - for bs in block_sizes: - bm = bs[0] - bn = bs[1] - bk = bs[2] if len(bs) > 2 else 1 - name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk) - if isinstance(kern, SparseKernel): mtx = kern.mtx else: mtx = "" - f.write(setup_single_testcase.format( + testcases += [ + setup_single_testcase.format( m=kern.m, n=kern.n, k=kern.k, lda=kern.lda, ldb=kern.ldb, ldc=kern.ldc, alpha=kern.alpha, - beta=kern.beta, mtx=mtx, delta=kern.delta, name=name, sparse=2 if kern.ldb == 0 else 1)) + beta=kern.beta, mtx=mtx, delta=kern.delta, name=name, sparse=2 if kern.ldb == 0 else 1, + precision=kern.precision.ctype()) + ] + + f.write('\n') + + f.write(function_definitions) + f.write(setup_main) + + for testcase in testcases: + f.write(testcase) f.write(end_of_testsuite) diff 
--git a/tests/unit_tests_arm_sve.py b/tests/unit_tests_arm_sve.py index ad4a0fb..b57b2f9 100644 --- a/tests/unit_tests_arm_sve.py +++ b/tests/unit_tests_arm_sve.py @@ -16,6 +16,7 @@ v_size = lambda prec: (16 // prec.size()) * v_len v_size_d = v_size(Precision.DOUBLE) v_size_s = v_size(Precision.SINGLE) +v_size_h = v_size(Precision.HALF) bitlen = v_len * 128 kernels = [] @@ -67,13 +68,15 @@ kernels.append(generator.SparseKernel("sve_single_prec_test_S7", Precision.SINGLE, 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp)) kernels.append(generator.SparseKernel("sve_single_prec_test_S8", Precision.SINGLE, 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp)) -kernels.append(generator.DenseKernel("sve_half_prec_test_S1", Precision.HALF, 9, 9, 9, 9, 9, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernel("sve_half_prec_test_S2", Precision.HALF, 15, 15, 15, 15, 15, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernel("sve_half_prec_test_S3", Precision.HALF, 23, 23, 23, 23, 23, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.DenseKernel("sve_half_prec_test_S4", Precision.HALF, 23, 31, 13, 23, 13, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], delta_sp)) -kernels.append(generator.SparseKernel("sve_half_prec_test_S5", Precision.HALF, 9, 9, 9, 9, 0, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(9, 9, 8), delta_sp)) -kernels.append(generator.SparseKernel("sve_half_prec_test_S6", Precision.HALF, 15, 15, 15, 15, 0, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(15, 15, 22), delta_sp)) -kernels.append(generator.SparseKernel("sve_half_prec_test_S7", Precision.HALF, 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp)) -kernels.append(generator.SparseKernel("sve_half_prec_test_S8", Precision.HALF, 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_s) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp)) +""" +kernels.append(generator.DenseKernel("sve_half_prec_test_S1", Precision.HALF, 9, 9, 9, 9, 9, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_h) for x in blocksize_algs], delta_sp)) +kernels.append(generator.DenseKernel("sve_half_prec_test_S2", Precision.HALF, 15, 15, 15, 15, 15, 15, -3.14, 6.28, [x.getBlocksize(15, 15, 1, v_size_h) for x in blocksize_algs], delta_sp)) +kernels.append(generator.DenseKernel("sve_half_prec_test_S3", Precision.HALF, 23, 23, 23, 23, 23, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_h) for x in blocksize_algs], delta_sp)) +kernels.append(generator.DenseKernel("sve_half_prec_test_S4", Precision.HALF, 23, 31, 13, 23, 13, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_h) for x in blocksize_algs], delta_sp)) +kernels.append(generator.SparseKernel("sve_half_prec_test_S5", Precision.HALF, 9, 9, 9, 9, 0, 9, 1.24, 0.87, [x.getBlocksize(9, 9, 1, v_size_h) for x in blocksize_algs], generator.generateMTX(9, 9, 8), delta_sp)) +kernels.append(generator.SparseKernel("sve_half_prec_test_S6", Precision.HALF, 15, 15, 15, 15, 0, 15, -3.14, 
6.28, [x.getBlocksize(15, 15, 1, v_size_h) for x in blocksize_algs], generator.generateMTX(15, 15, 22), delta_sp)) +kernels.append(generator.SparseKernel("sve_half_prec_test_S7", Precision.HALF, 23, 23, 23, 23, 0, 23, 1.5, -0.66, [x.getBlocksize(23, 23, 1, v_size_h) for x in blocksize_algs], generator.generateMTX(23, 23, 52), delta_sp)) +kernels.append(generator.SparseKernel("sve_half_prec_test_S8", Precision.HALF, 23, 31, 13, 23, 0, 23, 2.0, 0.0, [x.getBlocksize(23, 31, 1, v_size_h) for x in blocksize_algs], generator.generateMTX(13, 31, 40), delta_sp)) +""" generator.make(kernels, f"arm_sve{bitlen}") diff --git a/tests/unit_tests_hsw.py b/tests/unit_tests_hsw.py index 47316e5..92fa298 100755 --- a/tests/unit_tests_hsw.py +++ b/tests/unit_tests_hsw.py @@ -30,6 +30,7 @@ kernels.append(generator.DenseKernel(f"hsw_only_test13_{precision}", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) kernels.append(generator.DenseKernel(f"hsw_only_test14_{precision}", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) -generator.make(kernels, "hsw") +for arch in ('hsw256', 'hsw128'): + generator.make(kernels, arch) diff --git a/tests/unit_tests_knl.py b/tests/unit_tests_knl.py index 1795301..b673be3 100755 --- a/tests/unit_tests_knl.py +++ b/tests/unit_tests_knl.py @@ -32,6 +32,7 @@ kernels.append(generator.DenseKernel(f"knl_only_test13_{precision}", precision, 8, 1, 1, 16, 1, 56, 0.0, 123.0, [(8, 1)] + [x.getBlocksize(8, 1, 2) for x in blocksize_algs], 0.0000001)) kernels.append(generator.DenseKernel(f"knl_only_test14_{precision}", precision, 8, 24, 40, 8, 41, 8, 2.0, 1.0, [(8, 24)] + [x.getBlocksize(8, 24, 2) for x in blocksize_algs], 0.0000001)) -generator.make(kernels, "knl") +for arch in ('knl512', 'knl256', 'knl128'): + generator.make(kernels, arch)
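
Note on the Precision helpers used throughout the patch: every `self.precision.value` becomes `self.precision.size()`, and `Precision.ctype()` is added for the test generators. For reference, the sketch below shows how those helpers presumably fit together. It is not part of the patch: the enum member values, the byte-size table in size(), and the @classmethod decoration of getCType are assumptions; the diff only shows the getCType mapping, that size() exists, and that ctype() forwards to getCType.

    from enum import Enum, auto

    class Precision(Enum):
        # member values are placeholders; the element size lives in size(), not in .value
        DOUBLE = auto()
        SINGLE = auto()
        HALF = auto()
        BFLOAT16 = auto()

        @classmethod
        def getCType(cls, precision):
            # C type emitted by the test generators (mapping taken from the diff context)
            ctype = {cls.DOUBLE: 'double', cls.SINGLE: 'float',
                     cls.HALF: 'uint16_t', cls.BFLOAT16: 'uint16_t'}
            return ctype[precision]

        def ctype(self):
            # instance-level shortcut added by this patch, used by tests/testsuite_generator.py
            return self.getCType(self)

        def size(self):
            # element width in bytes (assumed table), used for address displacements and v_size
            return {Precision.DOUBLE: 8, Precision.SINGLE: 4,
                    Precision.HALF: 2, Precision.BFLOAT16: 2}[self]

    # elements per 128-bit register, matching the new arm get_v_size() = 16 // precision.size()
    assert 16 // Precision.DOUBLE.size() == 2
    assert 16 // Precision.SINGLE.size() == 4

Under these assumptions the arm generator's `16 // self.precision.size()` yields 2 doubles or 4 floats per NEON register, and the SVE/HSW/KNL generators compute byte displacements as `precision.size() * load_offset`, consistent with the hunks above.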