Skip to content

Commit

Permalink
Add Cube and MaxK scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
davschneller committed Oct 20, 2024
1 parent c480249 commit 2d52952
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 5 deletions.
27 changes: 27 additions & 0 deletions pspamm/codegen/architectures/arm/blocksize.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,4 +94,31 @@ def ARM_condition(cls, bm, bn, bk, v_size, elem128):
vk = -(bk // -elem128)
return (bn+bk) * vm + bn*vk <= 32

class Cube:
    """Block-size strategy that maximises the block volume ``bm * bn * bk``.

    All candidate triples are checked against :meth:`ARM_condition`; the
    admissible triple with the largest product wins.
    """

    @classmethod
    def getBlocksize(cls, m, n, bk, v_size, prec):
        """Return the block size ``(bm, bn, bk)`` with the largest volume.

        Args:
            m: upper bound for ``bm``; candidates are the multiples of
                ``v_size`` up to and including ``m``.
            n: upper bound for ``bn``; candidates are ``1 .. n``.
            bk: fallback depth, returned only when no candidate triple
                satisfies :meth:`ARM_condition`.
            v_size: step of the ``bm`` candidates and divisor used by
                :meth:`ARM_condition`.
            prec: object providing ``size()``; ``16 // prec.size()`` is
                forwarded to :meth:`ARM_condition` as ``elem128``.

        Returns:
            Tuple ``(bm, bn, bk)``. When at least one candidate is admissible,
            all three values come from the same admissible candidate, so the
            returned triple itself satisfies :meth:`ARM_condition`. Otherwise
            ``(2, 1, bk)`` with the caller's ``bk`` is returned.
        """
        bm = 2
        bn = 1
        maxval = 0

        # presumably the number of elements per 16-byte (128-bit) register,
        # assuming prec.size() is the element size in bytes -- TODO confirm
        elem128 = 16 // prec.size()

        for i in range(v_size, m + 1, v_size):
            for j in range(1, n + 1):
                for k in range(1, 200):
                    if cls.ARM_condition(i, j, k, v_size, elem128):
                        volume = i * j * k
                        # strict comparison: the first triple reaching the
                        # maximal volume is kept
                        if volume > maxval:
                            maxval = volume
                            bm = i
                            bn = j
                            # keep the depth that belongs to this (bm, bn);
                            # without it the result would not match the
                            # volume that was maximised
                            bk = k

        return (bm, bn, bk)

    @classmethod
    def ARM_condition(cls, bm, bn, bk, v_size, elem128):
        """Return True if ``(bn + bk) * vm + bn * vk <= 32``.

        ``vm`` is ``ceil(bm / v_size)`` and ``vk`` is ``ceil(bk / elem128)``.
        The bound 32 is presumably the number of available vector
        registers -- TODO confirm.
        """
        # ceiling division
        vm = -(bm // -v_size)
        vk = -(bk // -elem128)
        return (bn + bk) * vm + bn * vk <= 32

# Module-level alias: `Default` is bound to the same object as `Max`.
Default = Max
47 changes: 46 additions & 1 deletion pspamm/codegen/architectures/arm_sve/blocksize.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,57 @@ def getBlocksize(cls, m, n, bk, v_size, prec):

return (bm, bn, bk)

@classmethod
def ARM_condition(cls, bm, bn, bk, v_size, elem128):
    """Return True if ``(bn + bk) * vm + bn * vk <= 32``.

    ``vm`` is ``ceil(bm / v_size)``. ``vk`` is ``ceil(bk / elem128)`` as long
    as ``bn * ceil(bk / elem128)`` stays below a threshold (16 when
    ``elem128 == 2``, otherwise 8); beyond that threshold ``vk`` is ``bk``
    itself. The bound 32 is presumably the number of available vector
    registers -- TODO confirm.
    """
    # ceiling divisions
    vm = -(bm // -v_size)
    vkext = -(bk // -elem128)

    # the packed count is only used while it stays under the threshold
    threshold = 16 if elem128 == 2 else 8
    vk = vkext if bn * vkext < threshold else bk

    return (bn + bk) * vm + bn * vk <= 32

@classmethod
def tileable(cls, m, bm):
    """Return True if ``m`` is an exact multiple of ``bm``."""
    remainder = m % bm
    return remainder == 0

class Cube:
    """Block-size strategy scoring candidates by the volume ``bm * bn * bk``.

    ``bm`` and ``bn`` are taken from the best-scoring candidate; ``bk`` is
    then grown from the caller's value for as long as :meth:`ARM_condition`
    still accepts the next larger depth.
    """

    @classmethod
    def getBlocksize(cls, m, n, bk, v_size, prec):
        """Return the selected block size ``(bm, bn, bk)``.

        Raises:
            RuntimeError: if no candidate passes both :meth:`ARM_condition`
                and :meth:`tileable`.
        """
        # v_size default is 2, however for SVE that parameter will always be larger
        best_m, best_n = 2, 1
        best_volume = 0

        elem128 = 16 // prec.size()

        for rows in range(1, m + 1):
            # ceiling division: rows expressed in units of v_size
            vectors = -(rows // -v_size)
            for cols in range(1, n + 1):
                for depth in range(1, 200):
                    fits = cls.ARM_condition(vectors, cols, depth, v_size, elem128)
                    if not (fits and cls.tileable(m, rows)):
                        continue
                    volume = rows * cols * depth
                    # ">=": on equal volume the later candidate replaces the earlier one
                    if volume >= best_volume:
                        best_volume = volume
                        best_m = rows
                        best_n = cols

        if best_volume == 0:
            raise RuntimeError("Could not find an appropriate block size. We suggest padding the matrix dimensions")

        # extend the depth as far as the condition allows
        while cls.ARM_condition(best_m, best_n, bk + 1, v_size, elem128):
            bk = bk + 1

        return (best_m, best_n, bk)

    @classmethod
    def ARM_condition(cls, bm, bn, bk, v_size, elem128):
        """Return True if ``(bn + bk) * vm + bn * vk <= 32``.

        ``vm`` is ``ceil(bm / v_size)``; ``vk`` is ``ceil(bk / elem128)`` while
        ``bn * ceil(bk / elem128)`` is below 16 (``elem128 == 2``) or below 8
        (any other ``elem128``), and ``bk`` otherwise.
        """
        # ceiling division
        packed_k = -(bk // -elem128)
        if elem128 == 2:
            use_packed = bn * packed_k < 16
        else:
            use_packed = bn * packed_k < 8
        vm = -(bm // -v_size)
        if use_packed:
            vk = packed_k
        else:
            vk = bk
        return (bn + bk) * vm + bn * vk <= 32

    @classmethod
    def tileable(cls, m, bm):
        """Return True if ``bm`` divides ``m`` without remainder."""
        remainder = m % bm
        return remainder == 0

Expand Down
34 changes: 34 additions & 0 deletions pspamm/codegen/architectures/hsw/blocksize.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,5 +71,39 @@ def HSW_condition_extended(cls, bm, bn, bk, v_size):
vm = -(bm // -v_size)
return bn * vm + bn * bk + 1 <= 16

class Cube:
    """Block-size strategy scoring candidates by the volume ``bm * bn * bk``.

    ``bm`` and ``bn`` come from the best-scoring candidate; ``bk`` is then
    grown from the caller's value while :meth:`HSW_condition` still accepts
    the next larger depth.
    """

    @classmethod
    def getBlocksize(cls, m, n, bk, v_size, prec):
        """Return the selected block size ``(bm, bn, bk)``.

        ``prec`` is accepted but not used by this strategy.
        """
        best_m, best_n = 4, 1
        best_volume = 0

        for rows in range(v_size, m + 1, v_size):
            for cols in range(1, n + 1):
                for depth in range(1, 200):
                    # can be replaced by cls.HSW_condition_extended here
                    # (but that seemed to be slower in the end)
                    if not cls.HSW_condition(rows, cols, bk, v_size):
                        continue
                    volume = rows * cols * depth
                    if volume <= best_volume:
                        continue
                    # the candidate depth only has to pass the condition
                    # when there is a single column
                    if cls.HSW_condition(rows, cols, depth, v_size) or cols > 1:
                        best_volume = volume
                        best_m = rows
                        best_n = cols

        # extend the depth as far as the condition allows
        while cls.HSW_condition(best_m, best_n, bk + 1, v_size):
            bk = bk + 1

        return (best_m, best_n, bk)

    @classmethod
    def HSW_condition(cls, bm, bn, bk, v_size):
        """Return True if ``(bn + bk) * vm + bn * bk <= 16`` with ``vm = ceil(bm / v_size)``."""
        # ceiling division
        vm = -(bm // -v_size)
        total = (bn + bk) * vm + bn * bk
        return total <= 16

    @classmethod
    def HSW_condition_extended(cls, bm, bn, bk, v_size):
        """Return True if ``bn * vm + bn * bk + 1 <= 16`` with ``vm = ceil(bm / v_size)``."""
        # ceiling division
        vm = -(bm // -v_size)
        total = bn * vm + bn * bk + 1
        return total <= 16

# Module-level alias: `Default` is bound to the same object as `Max`.
Default = Max
25 changes: 25 additions & 0 deletions pspamm/codegen/architectures/knl/blocksize.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,4 +91,29 @@ def KNL_condition(cls, bm, bn, bk, v_size):
vm = -(bm // -v_size)
return (bn+bk) * vm <= 32

class CubeBn:
    """Block-size strategy that fixes ``bm = v_size`` and maximises ``bn * bk``."""

    @classmethod
    def getBlocksize(cls, m, n, bk, v_size, prec):
        """Return ``(bm, bn, bk)`` with ``bm = v_size`` and the largest admissible ``bn * bk``.

        ``m`` and ``prec`` are accepted but not used. If no pair
        ``(bn, bk)`` passes :meth:`KNL_condition`, ``bn`` is 1 and the
        caller's ``bk`` is returned unchanged.
        """
        bm = v_size
        best_n = 1
        best_area = 0

        for cols in range(1, n + 1):
            for depth in range(1, 200):
                if not cls.KNL_condition(bm, cols, depth, v_size):
                    continue
                area = cols * depth
                # strict comparison: the first pair reaching the maximum is kept
                if area > best_area:
                    best_area = area
                    best_n, bk = cols, depth

        return (bm, best_n, bk)

    @classmethod
    def KNL_condition(cls, bm, bn, bk, v_size):
        """Return True if ``(bn + bk) * vm <= 32`` with ``vm = ceil(bm / v_size)``."""
        # ceiling division
        vm = -(bm // -v_size)
        total = (bn + bk) * vm
        return total <= 32

# Module-level alias: `Default` is bound to the same object as `MaxBn`.
Default = MaxBn
8 changes: 4 additions & 4 deletions tests/unit_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
# Load the block-size module of the architecture under test.
blocksize = import_module("pspamm.codegen.architectures." + archname + ".blocksize")

# Block-size strategies to exercise, keyed by architecture name. Each entry is
# a lambda so that the attribute lookups only run for the selected
# architecture's module. Every key appears exactly once; a repeated key in a
# dict literal would silently discard the earlier entry.
scripts = {
    "arm": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.MaxK, blocksize.Cube],
    "arm_sve": lambda blocksize: [blocksize.Max, blocksize.MaxK, blocksize.Cube],
    "knl": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.MaxBn, blocksize.CubeBn],
    "hsw": lambda blocksize: [blocksize.Old, blocksize.Max, blocksize.Cube],
}

# The module's `Default` strategy is always appended to the list.
blocksize_algs = scripts[archname](blocksize) + [blocksize.Default]
Expand Down

0 comments on commit 2d52952

Please sign in to comment.