Skip to content

Commit

Permalink
FIRESTARTER 1.7
Browse files (browse the repository at this point in the history)
  • Loading branch information
rschoene committed Oct 19, 2017
1 parent 9509daf commit acf384f
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 5 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ $$ TODO Version 2.0
$$ TODO - add results verification
$$ TODO - improve Haswell/Broadwell, KNL, and Skylake support

Version 1.7
- added Skylake-SP support
- minor fixes

Version 1.6
- added Windows version

Expand Down
17 changes: 13 additions & 4 deletions config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ enable_mac=0
[ISA_AVX512]
template= avx512
feature_req= avx512
fallback= func_knl_xeonphi_avx512_4t
fallback= func_skl_xeonep_avx512_1t, func_skl_xeonep_avx512_2t
flags = -mavx512f
win64_incl = 1

Expand Down Expand Up @@ -121,8 +121,18 @@ lines= 1536
instr_groups= RAM_L,L3_LS_256,L2_LS_256,L1_2LS_256,REG
proportion= 3,5,18,78,40

# TODO Skylake server
# - use AVX-512
# Skylake server
[Skylake-SP]
arch= skl
model= xeonep
threads= 1,2
isa= avx512
cpu_family= 6
cpu_model= 85
buffer_sizes= 32768,1048576,1441792,1048576000
lines= 1536
instr_groups= RAM_S,RAM_P,L3_S,L3_P,L2_S,L2_L,L1_S,L1_L,L1_BROADCAST,REG
proportion= 3,1,1,1,4,70,0,40,120,160

# Haswell/Broadwell desktop
[Haswell]
Expand Down Expand Up @@ -215,4 +225,3 @@ buffer_sizes= 16384,1048576,786432,104857600
lines= 1536
instr_groups= RAM_L,L3_L,L2_LS,L1_L,REG
proportion= 1,1,5,90,45

34 changes: 33 additions & 1 deletion templates/avx512_functions_c.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ def init_functions(file,architectures):
flops=32 # two 512 bit FMA operations
elif each.instr_groups[i] == 'L1_L':
flops=32 # two 512 bit FMA operations
elif each.instr_groups[i] == 'L1_BROADCAST':
flops=16 # one 512 bit FMA operation
elif each.instr_groups[i] == 'L1_S':
flops=16 # one 512 bit FMA operation
elif each.instr_groups[i] == 'L1_LS':
Expand Down Expand Up @@ -234,6 +236,16 @@ def work_functions(file,architectures,version):
d3_inst = 'xor %%'+str(shift_reg[(shift_pos+nr_shift_regs-1)%nr_shift_regs])+', %%'+str(temp_reg)+';'
comment = '// REG ops only'
mov_dst = mov_dst +1
elif item == 'L1_BROADCAST':
d0_inst = 'vfmadd231pd %%zmm'+str(add_start+(add_dest-add_start+add_regs+1)%add_regs)+', %%zmm0, %%zmm'+str(add_dest)+';'
d1_inst = 'vbroadcastsd 64(%%'+l1_addr+'), %%zmm'+str(add_dest)+';'
l1_offset = l1_offset + each.cl_size
if l1_offset < l1_size*each.l1_cover:
d3_inst = 'add %%'+offset_reg+', %%'+l1_addr+';'
else:
l1_offset = 0
d3_inst = 'mov %%'+pointer_reg+', %%'+l1_addr+';'
comment = '// L1 packed single load'
elif item == 'L1_L':
d0_inst = 'vfmadd231pd %%zmm'+str(add_start+(add_dest-add_start+add_regs+1)%add_regs)+', %%zmm0, %%zmm'+str(add_dest)+';'
d1_inst = 'vfmadd231pd 64(%%'+l1_addr+'), %%zmm1, %%zmm'+str(add_dest)+';'
Expand Down Expand Up @@ -279,7 +291,27 @@ def work_functions(file,architectures,version):
d1_inst = 'vfmadd231pd 128(%%'+l2_addr+'), %%zmm0, %%zmm'+str(add_dest)+';'
d3_inst = 'add %%'+str(offset_reg)+', %%'+l2_addr+';'
comment = '// L2 load, L2 store'
elif item == 'RAM_L':
elif item == 'L3_L':
d0_inst = 'vfmadd231pd %%zmm'+str(add_start+(add_dest-add_start+add_regs+1)%add_regs)+', %%zmm0, %%zmm'+str(add_dest)+';'
d1_inst = 'vfmadd231pd 64(%%'+l3_addr+'), %%zmm1, %%zmm'+str(add_dest)+';'
d3_inst = 'add %%'+str(offset_reg)+', %%'+l3_addr+';'
comment = '// L3 load'
elif item == 'L3_S':
d0_inst = 'vmovapd %%zmm'+str(add_dest)+', 64(%%'+l3_addr+');'
d1_inst = 'vfmadd231pd %%zmm'+str(add_start+(add_dest-add_start+add_regs+1)%add_regs)+', %%zmm0, %%zmm'+str(add_dest)+';'
d3_inst = 'add %%'+str(offset_reg)+', %%'+l3_addr+';'
comment = '// L3 store'
elif item == 'L3_LS':
d0_inst = 'vmovapd %%zmm'+str(add_dest)+', 64(%%'+l2_addr+');'
d1_inst = 'vfmadd231pd 128(%%'+l3_addr+'), %%zmm0, %%zmm'+str(add_dest)+';'
d3_inst = 'add %%'+str(offset_reg)+', %%'+l3_addr+';'
comment = '// L3 load, L3 store'
elif item == 'L3_P':
d0_inst = 'vfmadd231pd 64(%%'+l1_addr+'), %%zmm0, %%zmm'+str(add_dest)+';'
d1_inst = 'prefetcht2 (%%'+l3_addr+');'
d3_inst = 'add %%'+str(offset_reg)+', %%'+l3_addr+';'
comment = '// L3 prefetch'
elif item == 'RAM_L':
d0_inst = 'vfmadd231pd %%zmm'+str(add_start+(add_dest-add_start+add_regs+1)%add_regs)+', %%zmm0, %%zmm'+str(add_dest)+';'
d1_inst = 'vfmadd231pd 64(%%'+ram_addr+'), %%zmm1, %%'+str(ram_reg)+';'
d3_inst = 'add %%'+str(offset_reg)+', %%'+ram_addr+';'
Expand Down

0 comments on commit acf384f

Please sign in to comment.