Skip to content

Commit

Permalink
Fix bugs in generators
Browse files Browse the repository at this point in the history
  • Loading branch information
davschneller committed Oct 11, 2024
1 parent dae3e37 commit 9e7232a
Show file tree
Hide file tree
Showing 13 changed files with 149 additions and 128 deletions.
14 changes: 10 additions & 4 deletions .github/workflows/codegen.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,14 @@ jobs:
- name: pspamm-tests-compile
run: |
cd tests/
g++ -static -mavx512f build/hsw_testsuite.cpp -o build/hsw-test
g++ -static -mavx2 build/hsw256_testsuite.cpp -o build/hsw256-test
g++ -static -mavx2 build/hsw128_testsuite.cpp -o build/hsw128-test
- name: pspamm-tests-run
run: |
cd tests/
qemu-x86_64-static -cpu Haswell build/hsw-test
qemu-x86_64-static -cpu Haswell build/hsw256-test
qemu-x86_64-static -cpu Haswell build/hsw128-test
pspamm-codegen-avx512-no-run:
name: pspamm-codegen-avx512-no-run
Expand Down Expand Up @@ -107,13 +109,17 @@ jobs:
- name: pspamm-tests-compile
run: |
cd tests/
g++ -static -mavx512f build/knl_testsuite.cpp -o build/knl-test
g++ -static -mavx512f build/knl512_testsuite.cpp -o build/knl512-test
g++ -static -mavx512f build/knl256_testsuite.cpp -o build/knl256-test
g++ -static -mavx512f build/knl128_testsuite.cpp -o build/knl128-test
# disabled, since qemu doesn't support AVX512F (yet) as of Ubuntu 24.04
# - name: pspamm-tests-run
# run: |
# cd tests/
# qemu-x86_64-static -cpu Skylake-Server build/knl-test
# qemu-x86_64-static -cpu Skylake-Server build/knl512-test
# qemu-x86_64-static -cpu Skylake-Server build/knl256-test
# qemu-x86_64-static -cpu Skylake-Server build/knl128-test

pspamm-codegen-aarch64:
name: pspamm-codegen-aarch64
Expand Down
24 changes: 11 additions & 13 deletions pspamm/codegen/architectures/arm/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,7 @@ class Generator(AbstractGenerator):
"""

def get_v_size(self):
if self.precision == Precision.DOUBLE:
return 2
raise NotImplementedError
return 16 // self.precision.size()

def get_template(self):
return Generator.template
Expand Down Expand Up @@ -137,7 +135,7 @@ def move_register_block(self,
next_offset = [0, 0]
if ir+1 < rows:
next_offset = [1, 0]
elif ic +1 < rows:
elif ic +1 < cols:
next_offset = [0, 1]

addr_next, comment_next = cursor.look(cursor_ptr, block_offset, Coords(down=(ir+next_offset[0])*v_size, right=ic+next_offset[1]))
Expand All @@ -153,16 +151,16 @@ def move_register_block(self,
addr.disp = 0
addr.base = additional_regs[0]

if not skipflag:
if store:
asm.add(st(registers[ir,ic], addr, True, comment))
else:
asm.add(ld(addr, registers[ir,ic], True, comment))
else:
if store:
asm.add(st(registers[ir,ic], addr, True, comment, registers[ir+next_offset[0],ic+next_offset[1]]))
if not skipflag:
if store:
asm.add(st(registers[ir,ic], addr, True, comment))
else:
asm.add(ld(addr, registers[ir,ic], True, comment))
else:
asm.add(ld(addr, registers[ir,ic], True, comment, registers[ir+next_offset[0],ic+next_offset[1]]))
if store:
asm.add(st(registers[ir,ic], addr, True, comment, registers[ir+next_offset[0],ic+next_offset[1]]))
else:
asm.add(ld(addr, registers[ir,ic], True, comment, registers[ir+next_offset[0],ic+next_offset[1]]))

return asm

Expand Down
4 changes: 2 additions & 2 deletions pspamm/codegen/architectures/arm_sve/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def move_register_block(self,

# addr = base "pointer" + relative offset in bytes
addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset)
addr.disp += self.precision.value * load_offset
addr.disp += self.precision.size() * load_offset

# count how many elements we have processed between last step and this step
cont_counter = ((addr.disp - prev_disp) // mul_vl)
Expand Down Expand Up @@ -313,7 +313,7 @@ def make_microkernel(self,
cur11 = -1000
Vm = max(self.ceil_div(bm, v_size), 1)

multiple = self.precision.value
multiple = self.precision.size()
# for ld1rw (single prec): immediate offset is multiple of 4 in range of 0 to 252
# for ld1rd (double prec): immediate offset is multiple of 8 in range of 0 to 504
# in both cases: instruction encodes the immediate offset within 6 bits
Expand Down
10 changes: 5 additions & 5 deletions pspamm/codegen/architectures/hsw/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,11 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:

reg_count = 0

for i in range(1024, min(max(nnz * self.precision.value, m*k*self.precision.value, m*n*self.precision.value),8000), 2048):
for i in range(1024, min(max(nnz * self.precision.size(), m*k*self.precision.size(), m*n*self.precision.size()),8000), 2048):
additional_regs.append(available_regs[reg_count])
reg_count += 1

for i in range(8192, min(nnz * self.precision.value, 33000), 8192):
for i in range(8192, min(nnz * self.precision.size(), 33000), 8192):
additional_regs.append(available_regs[reg_count])
reg_count += 1

Expand Down Expand Up @@ -134,7 +134,7 @@ def make_b_pointers(self,

reg_count = 5

for i in range(8192, min(nnz * self.precision.value, 33000), 8192):
for i in range(8192, min(nnz * self.precision.size(), 33000), 8192):
asm.add(lea(B_reg, additional_regs[reg_count], i))
reg_count += 1

Expand Down Expand Up @@ -183,7 +183,7 @@ def move_register_block(self,
if (mask is None) or (mask[ir,ic]):
cell_offset = Coords(down=ir*v_size, right=ic)
addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset)
addr.disp += self.precision.value * load_offset
addr.disp += self.precision.size() * load_offset
if store:
asm.add(mov(registers[ir,ic], addr, True, comment))
if prefetching == 'BL2viaC':
Expand Down Expand Up @@ -212,7 +212,7 @@ def move_register_single(self,
if (mask is None) or (mask[ir,ic]):
cell_offset = Coords(down=ir*v_size, right=ic)
addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset)
addr.disp += self.precision.value * load_offset
addr.disp += self.precision.size() * load_offset
asm.add(mov(addr, registers[ir,ic], True, comment))
return asm

Expand Down
23 changes: 15 additions & 8 deletions pspamm/codegen/architectures/knl/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,14 @@ def use_broadcast(self):
return False

def has_masks(self):
return True
return False # for now

def pred_n_trues(self, count, v_size, mode):
# a bit hacky at the moment (won't work for all masks)
if count < v_size and count > 0:
return mask(0)
else:
return None

def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:int, k:int):
vm = self.ceil_div(bm, v_size)
Expand Down Expand Up @@ -74,15 +81,15 @@ def make_reg_blocks(self, bm:int, bn:int, bk:int, v_size:int, nnz:int, m:int, n:

reg_count = 0

for i in range(1024, min(max(nnz * self.precision.value, m*k*self.precision.value, m*n*self.precision.value),8000), 2048):
for i in range(1024, min(max(nnz * self.precision.size(), m*k*self.precision.size(), m*n*self.precision.size()),8000), 2048):
additional_regs.append(available_regs[reg_count])
reg_count += 1

for i in range(8192, min(nnz * self.precision.value, 33000), 8192):
for i in range(8192, min(nnz * self.precision.size(), 33000), 8192):
additional_regs.append(available_regs[reg_count])
reg_count += 1

loop_reg = [r(12), r(13), r(14)]
loop_regs = [r(12), r(13), r(14)]

return A_regs, B_regs, C_regs, starting_regs, alpha_reg, beta_reg, loop_regs, additional_regs, mask_regs

Expand All @@ -93,8 +100,8 @@ def init_mask(self, bm, v_size, tempreg, maskregs):
else:
asm = block("Set mask register")
restval = (1 << rest) - 1
asm.add(mov(restval, tempreg))
asm.add(mov(tempreg, maskreg[0]))
asm.add(mov(restval, tempreg, False))
asm.add(mov(tempreg, maskregs[0], False))
return asm

def bcst_alpha_beta(self,
Expand Down Expand Up @@ -131,7 +138,7 @@ def make_b_pointers(self,

reg_count = 5

for i in range(8192, min(nnz * self.precision.value, 33000), 8192):
for i in range(8192, min(nnz * self.precision.size(), 33000), 8192):
asm.add(lea(B_reg, additional_regs[reg_count], i))
reg_count += 1

Expand Down Expand Up @@ -180,7 +187,7 @@ def move_register_block(self,
if (mask is None) or (mask[ir,ic]):
cell_offset = Coords(down=ir*v_size, right=ic)
addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset)
addr.disp += self.precision.value * load_offset
addr.disp += self.precision.size() * load_offset
if store:
asm.add(mov(registers[ir,ic], addr, True, comment))
if prefetching == 'BL2viaC':
Expand Down
5 changes: 3 additions & 2 deletions pspamm/codegen/architectures/knl/inlineprinter.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ def addLine(self, stmt: str, comment: str):

self.output.append(line)


def maskformat(self, pred):
pass

def visitFma(self, stmt: FmaStmt):
b = stmt.bcast_src.ugly
Expand All @@ -75,7 +76,7 @@ def visitMul(self, stmt: MulStmt):
b = stmt.src.ugly
m = stmt.mult_src.ugly
a = stmt.dest.ugly
regsize = stmt.add_dest.size() // 16
regsize = stmt.dest.size() // 16
if stmt.mult_src.typeinfo == AsmType.i64:
# in this case, m is a Register that points to alpha/beta; manually format to be a memory address
s = "vmulp{} 0({})%{{1to{}%}}, {}, {}".format(self.precision, m, regsize * self.broadcast_multiplier, b, a)
Expand Down
3 changes: 3 additions & 0 deletions pspamm/codegen/precision.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ class Precision(Enum):
def getCType(cls, precision):
ctype = {cls.DOUBLE: 'double', cls.SINGLE: 'float', cls.HALF: 'uint16_t', cls.BFLOAT16: 'uint16_t'}
return ctype[precision]

def ctype(self):
return self.getCType(self)

def size(self):
return {
Expand Down
13 changes: 7 additions & 6 deletions pspamm/matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,17 +218,18 @@ def __init__(self,

if lda == 0:
blocks, patterns, mtx_overhead = decompose_pattern(self.m, self.k, apattern, self.bm, self.bk)
self.A = BlockCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.value, blocks, patterns, mtx_overhead)
self.A = BlockCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.size(), blocks, patterns, mtx_overhead)
self.nnz += sum(mtx_overhead)
else:
self.A = DenseCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.value)
self.A = DenseCursor("A", self.starting_regs[0], self.m, self.k, self.lda, self.bm, self.bk, self.precision.size())
if ldb == 0:
blocks, patterns, mtx_overhead = decompose_pattern(self.k, self.n, bpattern, self.bk, self.bn)
self.B = BlockCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.value, blocks, patterns, mtx_overhead)
self.B = BlockCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.size(), blocks, patterns, mtx_overhead)
self.nnz += sum(mtx_overhead)
else:
self.B = DenseCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.value)
self.C = DenseCursor("C", self.starting_regs[2], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.value)
self.C_pf = DenseCursor("C_pf", prefetchReg, self.m, self.n, self.ldc, self.bm, self.bn, self.precision.value) if prefetchReg else None
self.B = DenseCursor("B", self.starting_regs[1], self.k, self.n, self.ldb, self.bk, self.bn, self.precision.size())
self.C = DenseCursor("C", self.starting_regs[2], self.m, self.n, self.ldc, self.bm, self.bn, self.precision.size())
self.C_pf = DenseCursor("C_pf", prefetchReg, self.m, self.n, self.ldc, self.bm, self.bn, self.precision.size()) if prefetchReg else None

self.unroll = ldb == 0 or lda == 0

Expand Down
72 changes: 27 additions & 45 deletions tests/sve_testsuite_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ def make(kernels, arch):

f.write(test_generator.head_of_testsuite)

testcases = []

for kern in kernels:
arguments = ['pspamm-generator', str(kern.m), str(kern.n), str(kern.k), str(kern.lda),
str(kern.ldb), str(kern.ldc), str(kern.alpha), str(kern.beta)]
Expand All @@ -41,12 +43,12 @@ def make(kernels, arch):
prec = 's' if kern.precision == Precision.SINGLE else 'd'
arguments += ['--precision', prec]

block_sizes = list(set(kern.block_sizes))
block_sizes = list(set(bs if len(bs) > 2 else (bs[0], bs[1], 1) for bs in kern.block_sizes))

for bs in block_sizes:
bm = bs[0]
bn = bs[1]
bk = bs[2] if len(bs) > 2 else 1
bk = bs[2]

if arch == "knl":
assert (bm % 8 == 0 and (bn + 1) * (bm / 8) <= 32)
Expand All @@ -58,11 +60,10 @@ def make(kernels, arch):
reglen = veclen // 128
v_len = (16 // kern.precision.size()) * reglen
# this should be the same assertion as in ../scripts/max_arm_sve.py
bk = 1
# ceiling division
vm = -(bm // -v_len)
vm = -(bm // -v_len)
if not ((bn + bk) * vm + bn * bk <= 32):
print(f'Skipping block size {bm}x{bn} for {arch}')
print(f'Skipping block size {bm}x{bn}x{bk} for {arch} / {prec}')
continue

name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk)
Expand All @@ -78,50 +79,15 @@ def make(kernels, arch):

f.write('#include "' + arch + '/' + name + '.h"\n')

f.write('\n')
# necessary functions are defined in testsuite_generator.py
f.write(test_generator.function_definitions)
f.write(setup_prefetching)
f.write(test_generator.setup_main)
# add variable declarations for single precision test cases
f.write(""" std::tuple<float*, float*, float*, float*, float*> fpointers;
float falpha; float fbeta;
double* prefetch;
float* fprefetch;
""")

for kern in kernels:

block_sizes = list(set(kern.block_sizes))

for bs in block_sizes:
bm = bs[0]
bn = bs[1]
bk = bs[2] if len(bs) > 2 else 1

if arch.startswith("arm_sve"):
veclen = int(arch[7:])
assert veclen % 128 == 0 and veclen <= 2048
reglen = veclen // 128
v_len = (16 // kern.precision.size()) * reglen
# this should be the same assertion as in ../scripts/max_arm_sve.py
bk = 1
# ceiling division
vm = -( bm // -v_len)
if not ((bn + bk) * vm + bn * bk <= 32):
# print(f'Skipping block size {bm}x{bn} for {arch}')
continue

name = kern.name + '_' + str(bm) + '_' + str(bn) + '_' + str(bk)

if isinstance(kern, SparseKernel):
mtx = kern.mtx
else:
mtx = ""
# for double precision: set prec to '' to conform to test_generator.function_definitions
prec = 'f' if kern.precision == Precision.SINGLE else ''

prec2 = 'f' if kern.precision == Precision.SINGLE else ''

f.write("""
testcases += [
"""
{p}alpha = {alpha}; {p}beta = {beta}; ldb = {ldb};
{p}pointers = pre<{T}>({m}, {n}, {k}, {lda}, ldb, {ldc}, "{mtx}");
setup_prefetch({p}prefetch, std::get<3>({p}pointers), {n}, {ldc});
Expand All @@ -130,6 +96,22 @@ def make(kernels, arch):
results.push_back(std::make_tuple("{name}", result));
free(std::get<0>({p}pointers)); free(std::get<1>({p}pointers)); free(std::get<2>({p}pointers)); free(std::get<3>({p}pointers)); free(std::get<4>({p}pointers)); free({p}prefetch);
""".format(m=kern.m, n=kern.n, k=kern.k, lda=kern.lda, ldb=kern.ldb, ldc=kern.ldc, alpha=kern.alpha, beta=kern.beta,
mtx=mtx, delta=kern.delta, name=name, sparse=2 if kern.ldb == 0 else 1, p=prec, T="float" if prec == 'f' else "double"))
mtx=mtx, delta=kern.delta, name=name, sparse=2 if kern.ldb == 0 else 1, p=prec2, T="float" if prec == 'f' else "double")
]

f.write('\n')
# necessary functions are defined in testsuite_generator.py
f.write(test_generator.function_definitions)
f.write(setup_prefetching)
f.write(test_generator.setup_main)
# add variable declarations for single precision test cases
f.write(""" std::tuple<float*, float*, float*, float*, float*> fpointers;
float falpha; float fbeta;
double* prefetch;
float* fprefetch;
""")

for testcase in testcases:
f.write(testcase)

f.write(test_generator.end_of_testsuite)
Loading

0 comments on commit 9e7232a

Please sign in to comment.