Skip to content

Commit

Permalink
Remove clear_mem and Op_ClearArray related changes
Browse files Browse the repository at this point in the history
Summary: 8257772 ("Vectorizing clear memory operation using AVX-512 masked operations") is intentionally not being backported,
so the clear_mem and Op_ClearArray related changes are removed from 8262355.
  • Loading branch information
JoshuaZhuwj committed Nov 1, 2023
1 parent 630752b commit db0dd20
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 418 deletions.
131 changes: 1 addition & 130 deletions src/hotspot/cpu/x86/macroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6194,79 +6194,7 @@ void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp
BIND(L_end);
}

// Clearing constant sized memory using YMM/ZMM registers.
//
// Emits a straight-line (loop-free) store sequence for a size that is known
// when the code is generated.
//   base - register holding the start address of the region to clear
//   cnt  - size of the region; every group of 8 units is cleared by one
//          64-byte fill, so one unit corresponds to 8 bytes (a qword)
//   rtmp - scratch general purpose register, only used to build a lane mask
//   xtmp - vector register; zeroed here and then used as the store source
//   mask - opmask register, only written when a partial vector store is needed
void MacroAssembler::clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask) {
  assert(UseAVX > 2 && VM_Version::supports_avx512vlbw(), "");
  const bool use64byteVector = MaxVectorSize > 32 && AVX3Threshold == 0;

  const int full_chunks = (cnt & (~0x7)) >> 3;  // whole 64-byte chunks
  int tail_qwords = cnt & 0x7;                  // 0..7 qwords left over
  int tail_disp = full_chunks * 64;             // byte offset of the tail

  // Zero the source register, then clear every whole 64-byte chunk.
  vpxor(xtmp, xtmp, xtmp, use64byteVector ? AVX_512bit : AVX_256bit);
  for (int chunk = 0; chunk < full_chunks; chunk++) {
    fill64_avx(base, chunk * 64, xtmp, use64byteVector);
  }

  // Nothing left once the size is a multiple of 64 bytes.
  if (tail_qwords == 0) {
    return;
  }

  // With 64-byte vectors a tail of 5..7 qwords is written by one 512-bit
  // store under a mask with the low 'tail_qwords' bits set
  // (0x1F, 0x3F or 0x7F; presumably one bit per qword lane).
  if (use64byteVector && tail_qwords > 4) {
    movl(rtmp, (1 << tail_qwords) - 1);
    kmovwl(mask, rtmp);
    evmovdqu(T_LONG, mask, Address(base, tail_disp), xtmp, Assembler::AVX_512bit);
    return;
  }

  // Otherwise a tail of 4 or more qwords starts with one full 256-bit store
  // (k0 is passed as the mask here, presumably meaning "no masking"),
  // leaving 0..3 qwords to clear 32 bytes further on.
  if (tail_qwords >= 4) {
    evmovdqu(T_LONG, k0, Address(base, tail_disp), xtmp, Assembler::AVX_256bit);
    tail_disp += 32;
    tail_qwords -= 4;
  }

  switch (tail_qwords) {
    case 0:
      break;
    case 1:
      // Single qword store.
      movq(Address(base, tail_disp), xtmp);
      break;
    case 2:
      // Two qwords: one 128-bit store.
      evmovdqu(T_LONG, k0, Address(base, tail_disp), xtmp, Assembler::AVX_128bit);
      break;
    case 3:
      // Three qwords: 256-bit store with mask 0x7.
      movl(rtmp, 0x7);
      kmovwl(mask, rtmp);
      evmovdqu(T_LONG, mask, Address(base, tail_disp), xtmp, Assembler::AVX_256bit);
      break;
    default:
      fatal("Unexpected length : %d\n", tail_qwords);
      break;
  }
}

void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp,
bool is_large, KRegister mask) {
void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) {
// cnt - number of qwords (8-byte words).
// base - start address, qword aligned.
// is_large - if optimizers know cnt is larger than InitArrayShortSize
Expand Down Expand Up @@ -10752,63 +10680,6 @@ void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMR
}
}

#if COMPILER2_OR_JVMCI


// Set memory operation for length "less than" 64 bytes.
//
//   shift  - selects the element type from { T_BYTE, T_SHORT, T_INT, T_LONG };
//            0 is rejected by the assert below
//   dst    - base address register
//   disp   - byte displacement added to dst
//   xmm    - vector register used as the store source
//   mask   - opmask register, overwritten here
//   length - register holding the element count; it is decremented by
//            32 >> shift on the path that does not use a 64-byte vector
//   temp   - scratch register, overwritten while building the mask
//   use64byteVector - when true, emit a single masked 512-bit store
void MacroAssembler::fill64_masked_avx(uint shift, Register dst, int disp,
                                       XMMRegister xmm, KRegister mask, Register length,
                                       Register temp, bool use64byteVector) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  assert(shift != 0, "shift value should be 1 (short),2(int) or 3(long)");
  BasicType elem_type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};

  if (use64byteVector) {
    assert(MaxVectorSize == 64, "vector length != 64");
    // temp = (1 << length) - 1, i.e. the low 'length' bits set.
    movl(temp, 1);
    shlxl(temp, temp, length);
    subptr(temp, 1);
    // NOTE(review): kmovwl presumably transfers only the low 16 bits of temp,
    // so a length of 16 or more (possible for shift == 1) would not be fully
    // represented in the mask - confirm callers only pass smaller lengths.
    kmovwl(mask, temp);
    evmovdqu(elem_type[shift], mask, Address(dst, disp), xmm, Assembler::AVX_512bit);
    return;
  }

  // 32-byte vectors: one full 32-byte fill, then a masked fill of whatever
  // remains, 32 bytes further on.
  fill32_avx(dst, disp, xmm);
  subptr(length, 32 >> shift);
  fill32_masked_avx(shift, dst, disp + 32, xmm, mask, length, temp);
}


// Masked 256-bit fill: stores 'xmm' to [dst + disp] under a mask whose low
// 'length' bits are set.
//
//   shift  - selects the element type from { T_BYTE, T_SHORT, T_INT, T_LONG };
//            0 is rejected by the assert below
//   mask   - opmask register, overwritten here
//   length - register holding the number of low mask bits to set
//   temp   - scratch register, overwritten while building the mask
void MacroAssembler::fill32_masked_avx(uint shift, Register dst, int disp,
                                       XMMRegister xmm, KRegister mask, Register length,
                                       Register temp) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  assert(shift != 0, "shift value should be 1 (short), 2(int) or 3(long)");
  BasicType elem_type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};

  // temp = (1 << length) - 1
  movl(temp, 1);
  shlxl(temp, temp, length);
  subptr(temp, 1);
  kmovwl(mask, temp);

  Address target(dst, disp);
  evmovdqu(elem_type[shift], mask, target, xmm, Assembler::AVX_256bit);
}


// Unmasked fill: stores 'xmm' to [dst + disp] with vmovdqu
// (presumably a 32-byte store, going by the name and the MaxVectorSize assert).
void MacroAssembler::fill32_avx(Register dst, int disp, XMMRegister xmm) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  Address target(dst, disp);
  vmovdqu(target, xmm);
}

// Unmasked 64-byte fill of [dst + disp, dst + disp + 64) from 'xmm'.
//
//   dst  - base address register
//   disp - byte displacement added to dst
//   xmm  - vector register used as the store source
//   use64byteVector - when true, emit one 512-bit store; otherwise emit two
//                     fill32_avx stores at disp and disp + 32
void MacroAssembler::fill64_avx(Register dst, int disp, XMMRegister xmm, bool use64byteVector) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  if (!use64byteVector) {
    fill32_avx(dst, disp, xmm);
    fill32_avx(dst, disp + 32, xmm);
  } else {
    evmovdquq(Address(dst, disp), xmm, Assembler::AVX_512bit);
  }
}

#endif //COMPILER2_OR_JVMCI

Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
switch (cond) {
// Note some conditions are synonyms for others
Expand Down
17 changes: 1 addition & 16 deletions src/hotspot/cpu/x86/macroAssembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1750,10 +1750,7 @@ class MacroAssembler: public Assembler {

// clear memory of size 'cnt' qwords, starting at 'base';
// if 'is_large' is set, do not try to produce short loop
void clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, bool is_large, KRegister mask=knoreg);

// clear memory initialization sequence for constant size;
void clear_mem(Register base, int cnt, Register rtmp, XMMRegister xtmp, KRegister mask=knoreg);
void clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, bool is_large);

// clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
void xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp);
Expand Down Expand Up @@ -1921,18 +1918,6 @@ class MacroAssembler: public Assembler {
void byte_array_inflate(Register src, Register dst, Register len,
XMMRegister tmp1, Register tmp2, KRegister mask = knoreg);

void fill64_masked_avx(uint shift, Register dst, int disp,
XMMRegister xmm, KRegister mask, Register length,
Register temp, bool use64byteVector = false);

void fill32_masked_avx(uint shift, Register dst, int disp,
XMMRegister xmm, KRegister mask, Register length,
Register temp);

void fill32_avx(Register dst, int disp, XMMRegister xmm);

void fill64_avx(Register dst, int dis, XMMRegister xmm, bool use64byteVector = false);

void vallones(XMMRegister dst, int vector_len);
};

Expand Down
138 changes: 4 additions & 134 deletions src/hotspot/cpu/x86/x86_32.ad
Original file line number Diff line number Diff line change
Expand Up @@ -11733,10 +11733,8 @@ instruct MoveL2D_reg_reg_sse(regD dst, eRegL src, regD tmp) %{

// =======================================================================
// fast clearing of an array
// Small ClearArray non-AVX512.
instruct rep_stos(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
predicate(!((ClearArrayNode*)n)->is_large() &&
(UseAVX <= 2 || !VM_Version::supports_avx512vlbw()));
predicate(!((ClearArrayNode*)n)->is_large());
match(Set dummy (ClearArray cnt base));
effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);

Expand Down Expand Up @@ -11789,76 +11787,13 @@ instruct rep_stos(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Universe du
%}
ins_encode %{
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
$tmp$$XMMRegister, false, knoreg);
$tmp$$XMMRegister, false);
%}
ins_pipe( pipe_slow );
%}

// Small ClearArray AVX512 non-constant length.
// Selected for a ClearArray node that is not flagged large when UseAVX > 2,
// the CPU reports avx512vlbw support, and the count input (in(2)) is not a
// compile-time integer constant.
// Both cnt and base are consumed and destroyed (USE_KILL); tmp and ktmp are
// temporaries; zero and the flags register are killed.
// The format template below only describes the expected instruction sequence
// for the UseFastStosb / UseXMMForObjInit / default variants; the code itself
// is produced by MacroAssembler::clear_mem, called with is_large == false and
// ktmp as the opmask register.
instruct rep_stos_evex(eCXRegI cnt, eDIRegP base, regD tmp, kReg ktmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
predicate(!((ClearArrayNode*)n)->is_large() &&
UseAVX > 2 && VM_Version::supports_avx512vlbw() &&
!n->in(2)->bottom_type()->is_int()->is_con());
match(Set dummy (ClearArray cnt base));
effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);

format %{ $$template
$$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
$$emit$$"CMP InitArrayShortSize,rcx\n\t"
$$emit$$"JG LARGE\n\t"
$$emit$$"SHL ECX, 1\n\t"
$$emit$$"DEC ECX\n\t"
$$emit$$"JS DONE\t# Zero length\n\t"
$$emit$$"MOV EAX,(EDI,ECX,4)\t# LOOP\n\t"
$$emit$$"DEC ECX\n\t"
$$emit$$"JGE LOOP\n\t"
$$emit$$"JMP DONE\n\t"
$$emit$$"# LARGE:\n\t"
if (UseFastStosb) {
$$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t"
$$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
} else if (UseXMMForObjInit) {
$$emit$$"MOV RDI,RAX\n\t"
$$emit$$"VPXOR YMM0,YMM0,YMM0\n\t"
$$emit$$"JMPQ L_zero_64_bytes\n\t"
$$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
$$emit$$"VMOVDQU YMM0,(RAX)\n\t"
$$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t"
$$emit$$"ADD 0x40,RAX\n\t"
$$emit$$"# L_zero_64_bytes:\n\t"
$$emit$$"SUB 0x8,RCX\n\t"
$$emit$$"JGE L_loop\n\t"
$$emit$$"ADD 0x4,RCX\n\t"
$$emit$$"JL L_tail\n\t"
$$emit$$"VMOVDQU YMM0,(RAX)\n\t"
$$emit$$"ADD 0x20,RAX\n\t"
$$emit$$"SUB 0x4,RCX\n\t"
$$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
$$emit$$"ADD 0x4,RCX\n\t"
$$emit$$"JLE L_end\n\t"
$$emit$$"DEC RCX\n\t"
$$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
$$emit$$"VMOVQ XMM0,(RAX)\n\t"
$$emit$$"ADD 0x8,RAX\n\t"
$$emit$$"DEC RCX\n\t"
$$emit$$"JGE L_sloop\n\t"
$$emit$$"# L_end:\n\t"
} else {
$$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t"
$$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
}
$$emit$$"# DONE"
%}
ins_encode %{
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
$tmp$$XMMRegister, false, $ktmp$$KRegister);
%}
ins_pipe( pipe_slow );
%}

// Large ClearArray non-AVX512.
instruct rep_stos_large(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
predicate(UseAVX <= 2 && ((ClearArrayNode*)n)->is_large());
predicate(((ClearArrayNode*)n)->is_large());
match(Set dummy (ClearArray cnt base));
effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr);
format %{ $$template
Expand Down Expand Up @@ -11901,76 +11836,11 @@ instruct rep_stos_large(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Unive
%}
ins_encode %{
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
$tmp$$XMMRegister, true, knoreg);
$tmp$$XMMRegister, true);
%}
ins_pipe( pipe_slow );
%}

// Large ClearArray AVX512.
// Selected for a ClearArray node flagged large when UseAVX > 2.
// NOTE(review): unlike the small-array EVEX rules, this predicate does not
// test VM_Version::supports_avx512vlbw() - confirm that is intentional.
// Both cnt and base are consumed and destroyed (USE_KILL); tmp and ktmp are
// temporaries; zero and the flags register are killed.
// The format template below only describes the expected instruction sequence
// for the UseFastStosb / UseXMMForObjInit / default variants; the code itself
// is produced by MacroAssembler::clear_mem, called with is_large == true and
// ktmp as the opmask register.
instruct rep_stos_large_evex(eCXRegI cnt, eDIRegP base, regD tmp, kReg ktmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
predicate(UseAVX > 2 && ((ClearArrayNode*)n)->is_large());
match(Set dummy (ClearArray cnt base));
effect(USE_KILL cnt, USE_KILL base, TEMP tmp, TEMP ktmp, KILL zero, KILL cr);
format %{ $$template
if (UseFastStosb) {
$$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
$$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t"
$$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t"
} else if (UseXMMForObjInit) {
$$emit$$"MOV RDI,RAX\t# ClearArray:\n\t"
$$emit$$"VPXOR YMM0,YMM0,YMM0\n\t"
$$emit$$"JMPQ L_zero_64_bytes\n\t"
$$emit$$"# L_loop:\t# 64-byte LOOP\n\t"
$$emit$$"VMOVDQU YMM0,(RAX)\n\t"
$$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t"
$$emit$$"ADD 0x40,RAX\n\t"
$$emit$$"# L_zero_64_bytes:\n\t"
$$emit$$"SUB 0x8,RCX\n\t"
$$emit$$"JGE L_loop\n\t"
$$emit$$"ADD 0x4,RCX\n\t"
$$emit$$"JL L_tail\n\t"
$$emit$$"VMOVDQU YMM0,(RAX)\n\t"
$$emit$$"ADD 0x20,RAX\n\t"
$$emit$$"SUB 0x4,RCX\n\t"
$$emit$$"# L_tail:\t# Clearing tail bytes\n\t"
$$emit$$"ADD 0x4,RCX\n\t"
$$emit$$"JLE L_end\n\t"
$$emit$$"DEC RCX\n\t"
$$emit$$"# L_sloop:\t# 8-byte short loop\n\t"
$$emit$$"VMOVQ XMM0,(RAX)\n\t"
$$emit$$"ADD 0x8,RAX\n\t"
$$emit$$"DEC RCX\n\t"
$$emit$$"JGE L_sloop\n\t"
$$emit$$"# L_end:\n\t"
} else {
$$emit$$"XOR EAX,EAX\t# ClearArray:\n\t"
$$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t"
$$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t"
}
$$emit$$"# DONE"
%}
ins_encode %{
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register,
$tmp$$XMMRegister, true, $ktmp$$KRegister);
%}
ins_pipe( pipe_slow );
%}

// Small ClearArray AVX512 constant length.
// Selected for a ClearArray node that is not flagged large when UseAVX > 2,
// the CPU reports avx512vlbw support, and the count input (in(2)) is a
// compile-time integer constant.
// The count is an immediate operand, so base and cnt are not killed; tmp, zero
// and ktmp are temporaries and the flags register is killed.
// The encoding passes the constant count to the clear_mem overload that takes
// an immediate size, with zero as the scratch GPR and ktmp as the opmask.
instruct rep_stos_im(immI cnt, kReg ktmp, eRegP base, regD tmp, rRegI zero, Universe dummy, eFlagsReg cr)
%{
predicate(!((ClearArrayNode*)n)->is_large() &&
(UseAVX > 2 && VM_Version::supports_avx512vlbw() &&
n->in(2)->bottom_type()->is_int()->is_con()));
match(Set dummy (ClearArray cnt base));
effect(TEMP tmp, TEMP zero, TEMP ktmp, KILL cr);
format %{ "clear_mem_imm $base , $cnt \n\t" %}
ins_encode %{
__ clear_mem($base$$Register, $cnt$$constant, $zero$$Register, $tmp$$XMMRegister, $ktmp$$KRegister);
%}
ins_pipe(pipe_slow);
%}

instruct string_compareL(eDIRegP str1, eCXRegI cnt1, eSIRegP str2, eDXRegI cnt2,
eAXRegI result, regD tmp1, eFlagsReg cr) %{
predicate(UseAVX <= 2 && ((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL);
Expand Down
Loading

0 comments on commit db0dd20

Please sign in to comment.