diff --git a/asm_amd64.s b/asm_amd64.s index d7cb631..0077bce 100644 --- a/asm_amd64.s +++ b/asm_amd64.s @@ -1826,29 +1826,98 @@ TEXT ·asmAVX2EqualsUint64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VPCMPEQQ Y1, Y0, Y1 VPCMPEQQ Y2, Y0, Y2 + VPCMPEQQ Y3, Y0, Y3 + VPCMPEQQ Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXEqualsUint64(dstMask []byte, b uint64, rows []uint64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXEqualsUint64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VPCMPEQQ X1, X0, X1 + VPCMPEQQ X2, X0, X2 + VPCMPEQQ X3, X0, X3 + VPCMPEQQ X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -1871,29 +1940,98 @@ TEXT ·asmAVX2EqualsFloat64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VCMPPD $0x08, Y1, Y0, Y1 VCMPPD $0x08, Y2, Y0, Y2 + VCMPPD $0x08, Y3, Y0, Y3 + VCMPPD $0x08, Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXEqualsFloat64(dstMask []byte, b float64, rows []float64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXEqualsFloat64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VCMPPD $0x08, X1, X0, X1 + VCMPPD $0x08, X2, X0, X2 + VCMPPD $0x08, X3, X0, X3 + VCMPPD $0x08, X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -1916,29 +2054,98 @@ TEXT ·asmAVX2GreaterThanUint64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VPCMPGTQ Y1, Y0, Y1 VPCMPGTQ Y2, Y0, Y2 + VPCMPGTQ Y3, Y0, Y3 + VPCMPGTQ Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXGreaterThanUint64(dstMask []byte, b uint64, rows []uint64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXGreaterThanUint64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VPCMPGTQ X1, X0, X1 + VPCMPGTQ X2, X0, X2 + VPCMPGTQ X3, X0, X3 + VPCMPGTQ X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -1961,29 +2168,98 @@ TEXT ·asmAVX2GreaterThanFloat64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VCMPPD $0x16, Y1, Y0, Y1 VCMPPD $0x16, Y2, Y0, Y2 + VCMPPD $0x16, Y3, Y0, Y3 + VCMPPD $0x16, Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXGreaterThanFloat64(dstMask []byte, b float64, rows []float64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXGreaterThanFloat64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VCMPPD $0x16, X1, X0, X1 + VCMPPD $0x16, X2, X0, X2 + VCMPPD $0x16, X3, X0, X3 + VCMPPD $0x16, X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -2006,29 +2282,98 @@ TEXT ·asmAVX2LessThanUint64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VPCMPGTQ Y0, Y1, Y1 VPCMPGTQ Y0, Y2, Y2 + VPCMPGTQ Y0, Y3, Y3 + VPCMPGTQ Y0, Y4, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXLessThanUint64(dstMask []byte, b uint64, rows []uint64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXLessThanUint64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VPCMPGTQ X0, X1, X1 + VPCMPGTQ X0, X2, X2 + VPCMPGTQ X0, X3, X3 + VPCMPGTQ X0, X4, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -2051,29 +2396,98 @@ TEXT ·asmAVX2LessThanFloat64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VCMPPD $0x11, Y1, Y0, Y1 VCMPPD $0x11, Y2, Y0, Y2 + VCMPPD $0x11, Y3, Y0, Y3 + VCMPPD $0x11, Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXLessThanFloat64(dstMask []byte, b float64, rows []float64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXLessThanFloat64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VCMPPD $0x11, X1, X0, X1 + VCMPPD $0x11, X2, X0, X2 + VCMPPD $0x11, X3, X0, X3 + VCMPPD $0x11, X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -2096,24 +2510,32 @@ TEXT ·asmAVX2GreaterEqualsUint64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VPCMPGTQ Y0, Y1, Y1 VPCMPGTQ Y0, Y2, Y2 + VPCMPGTQ Y0, Y3, Y3 + VPCMPGTQ Y0, Y4, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // To get GreaterEquals semantics, we flipped the arguments of VPCMPGT and now invert the result // Each register contains 4 bits, so we first combine pairs before writing them back @@ -2121,6 +2543,70 @@ loop: ORB SI, BL NOTB BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + NOTB DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXGreaterEqualsUint64(dstMask []byte, b uint64, rows []uint64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXGreaterEqualsUint64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VPCMPGTQ X0, X1, X1 + VPCMPGTQ X0, X2, X2 + VPCMPGTQ X0, X3, X3 + VPCMPGTQ X0, X4, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // To get GreaterEquals semantics, we flipped the arguments of VPCMPGT and now invert the result + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + NOTB BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -2143,29 +2629,98 @@ TEXT ·asmAVX2GreaterEqualsFloat64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VCMPPD $0x15, Y1, Y0, Y1 VCMPPD $0x15, Y2, Y0, Y2 + VCMPPD $0x15, Y3, Y0, Y3 + VCMPPD $0x15, Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXGreaterEqualsFloat64(dstMask []byte, b float64, rows []float64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXGreaterEqualsFloat64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VCMPPD $0x15, X1, X0, X1 + VCMPPD $0x15, X2, X0, X2 + VCMPPD $0x15, X3, X0, X3 + VCMPPD $0x15, X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -2188,24 +2743,32 @@ TEXT ·asmAVX2LesserEqualsUint64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VPCMPGTQ Y1, Y0, Y1 VPCMPGTQ Y2, Y0, Y2 + VPCMPGTQ Y3, Y0, Y3 + VPCMPGTQ Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // To get LesserEquals semantics, we flipped the arguments of VPCMPGT and now invert the result // Each register contains 4 bits, so we first combine pairs before writing them back @@ -2213,6 +2776,70 @@ loop: ORB SI, BL NOTB BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + NOTB DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXLesserEqualsUint64(dstMask []byte, b uint64, rows []uint64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXLesserEqualsUint64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VPCMPGTQ X1, X0, X1 + VPCMPGTQ X2, X0, X2 + VPCMPGTQ X3, X0, X3 + VPCMPGTQ X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // To get LesserEquals semantics, we flipped the arguments of VPCMPGT and now invert the result + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + NOTB BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -2235,29 +2862,98 @@ TEXT ·asmAVX2LesserEqualsFloat64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VCMPPD $0x12, Y1, Y0, Y1 VCMPPD $0x12, Y2, Y0, Y2 + VCMPPD $0x12, Y3, Y0, Y3 + VCMPPD $0x12, Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXLesserEqualsFloat64(dstMask []byte, b float64, rows []float64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXLesserEqualsFloat64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VCMPPD $0x12, X1, X0, X1 + VCMPPD $0x12, X2, X0, X2 + VCMPPD $0x12, X3, X0, X3 + VCMPPD $0x12, X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX diff --git a/dispatch_amd64.go b/dispatch_amd64.go index 78473df..cdcbd9b 100644 --- a/dispatch_amd64.go +++ b/dispatch_amd64.go @@ -245,80 +245,120 @@ func VectorLesserEqualsFloat32(dstMask []byte, b float32, rows []float32) { goVectorLesserEquals(dstMask, b, rows) } func VectorEquals64(dstMask []byte, b uint64, rows []uint64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2EqualsUint64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2EqualsUint64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXEqualsUint64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorEquals(dstMask, b, rows) } func VectorEqualsFloat64(dstMask []byte, b float64, rows []float64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2EqualsFloat64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2EqualsFloat64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXEqualsFloat64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorEquals(dstMask, b, 
rows) } func VectorGreaterThan64(dstMask []byte, b uint64, rows []uint64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2GreaterThanUint64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2GreaterThanUint64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXGreaterThanUint64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorGreaterThan(dstMask, b, rows) } func VectorGreaterThanFloat64(dstMask []byte, b float64, rows []float64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2GreaterThanFloat64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2GreaterThanFloat64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXGreaterThanFloat64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorGreaterThan(dstMask, b, rows) } func VectorLessThan64(dstMask []byte, b uint64, rows []uint64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2LessThanUint64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2LessThanUint64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXLessThanUint64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorLessThan(dstMask, b, rows) } func VectorLessThanFloat64(dstMask []byte, b float64, rows []float64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2LessThanFloat64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2LessThanFloat64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXLessThanFloat64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorLessThan(dstMask, b, rows) } func VectorGreaterEquals64(dstMask []byte, b uint64, rows []uint64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2GreaterEqualsUint64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2GreaterEqualsUint64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXGreaterEqualsUint64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorGreaterEquals(dstMask, b, rows) } func VectorGreaterEqualsFloat64(dstMask []byte, b float64, rows []float64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2GreaterEqualsFloat64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2GreaterEqualsFloat64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXGreaterEqualsFloat64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorGreaterEquals(dstMask, b, rows) } func VectorLesserEquals64(dstMask []byte, b uint64, rows []uint64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2LesserEqualsUint64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + 
asmAVX2LesserEqualsUint64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXLesserEqualsUint64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorLesserEquals(dstMask, b, rows) } func VectorLesserEqualsFloat64(dstMask []byte, b float64, rows []float64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2LesserEqualsFloat64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2LesserEqualsFloat64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXLesserEqualsFloat64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } diff --git a/internal/avo/avo.go b/internal/avo/avo.go index e41615e..223e132 100644 --- a/internal/avo/avo.go +++ b/internal/avo/avo.go @@ -16,8 +16,6 @@ import ( "github.com/mmcloughlin/avo/reg" ) -const rounds = 2 - var ( constants = GLOBL("constants", RODATA|NOPTR) const_zeroes Mem @@ -151,25 +149,25 @@ func fastFilter(width int, cmpOp CmpOp, isfp IsFloating) { if isfp { infix = "Float" } + rounds := 2 + if width == 64 { + rounds = 4 + } fmt.Fprintf(dispatcherBoth, "func Vector%s%s%d(dstMask []byte, b %s, rows []%s) {\n", cmpOp, infix, width, isfp.GoType(width), isfp.GoType(width)) fmt.Fprintf(&dispatcherAmd64, " if hasAVX2() && len(rows) >= %d {\n", 256*rounds/width) fmt.Fprintf(&dispatcherAmd64, " asmAVX2%s%s(dstMask, b, rows[:len(rows) & ^%d])\n", cmpOp, isfp.GoName(width), 256*rounds/width-1) fmt.Fprintf(&dispatcherAmd64, " dstMask = dstMask[(len(rows) & ^%d) / 8:]\n", 256*rounds/width-1) fmt.Fprintf(&dispatcherAmd64, " rows = rows[len(rows) & ^%d:]\n", 256*rounds/width-1) - if width < 64 { - fmt.Fprintf(&dispatcherAmd64, " } else if hasAVX() && len(rows) >= %d {\n", 128*rounds/width) - fmt.Fprintf(&dispatcherAmd64, " asmAVX%s%s(dstMask, b, rows[:len(rows) & ^%d])\n", cmpOp, isfp.GoName(width), 128*rounds/width-1) - fmt.Fprintf(&dispatcherAmd64, " dstMask = dstMask[(len(rows) & ^%d) / 8:]\n", 128*rounds/width-1) - fmt.Fprintf(&dispatcherAmd64, " rows = rows[len(rows) & ^%d:]\n", 128*rounds/width-1) - } + fmt.Fprintf(&dispatcherAmd64, " } else if hasAVX() && len(rows) >= %d {\n", 128*rounds/width) + fmt.Fprintf(&dispatcherAmd64, " asmAVX%s%s(dstMask, b, rows[:len(rows) & ^%d])\n", cmpOp, isfp.GoName(width), 128*rounds/width-1) + fmt.Fprintf(&dispatcherAmd64, " dstMask = dstMask[(len(rows) & ^%d) / 8:]\n", 128*rounds/width-1) + fmt.Fprintf(&dispatcherAmd64, " rows = rows[len(rows) & ^%d:]\n", 128*rounds/width-1) fmt.Fprintf(&dispatcherAmd64, " }\n") fmt.Fprintf(dispatcherBoth, " goVector%s(dstMask, b, rows)\n", cmpOp) fmt.Fprintf(dispatcherBoth, "}\n") - fastFilterImpl(AVX2, width, cmpOp, isfp) - if width < 64 { - fastFilterImpl(AVX, width, cmpOp, isfp) - } + fastFilterImpl(AVX2, width, cmpOp, isfp, rounds) + fastFilterImpl(AVX, width, cmpOp, isfp, rounds) fmt.Fprintf(&generatedTest, "func TestVector%s%s%d(t *testing.T) {\n", cmpOp, infix, width) fmt.Fprintf(&generatedTest, " t.Parallel()\n") @@ -205,7 +203,7 @@ func fastFilter(width int, cmpOp CmpOp, isfp IsFloating) { fmt.Fprintf(&generatedTest, "}\n") } -func fastFilterImpl(avxLevel AVXLevel, width int, cmpOp CmpOp, isfp IsFloating) { +func fastFilterImpl(avxLevel AVXLevel, width int, cmpOp CmpOp, isfp IsFloating, rounds int) { TEXT(fmt.Sprintf("asm%s%s%s", avxLevel, cmpOp, 
isfp.GoName(width)), NOSPLIT, fmt.Sprintf("func(dstMask []byte, b %s, rows []%s)", isfp.GoType(width), isfp.GoType(width))) Pragma("noescape") @@ -382,6 +380,8 @@ func fastFilterImpl(avxLevel AVXLevel, width int, cmpOp CmpOp, isfp IsFloating) NOTW(r.As16()) case 8: NOTB(r.As8()) + case 4, 2: + // Handled below after combining them } } } @@ -409,7 +409,19 @@ func fastFilterImpl(avxLevel AVXLevel, width int, cmpOp CmpOp, isfp IsFloating) MOVB(intermediates[2*i+0].As8(), Mem{Base: dstMask, Disp: 1 * i}) } case 2: - // TODO: Implement this for AVX with 64bits widths, yielding 2 bits + Comment("Each register contains 2 bits, so we first combine them back into bytes before writing them back") + for i := 0; rounds/4 > i; i++ { + SHLB(U8(2), intermediates[4*i+1].As8()) + SHLB(U8(4), intermediates[4*i+2].As8()) + SHLB(U8(6), intermediates[4*i+3].As8()) + ORB(intermediates[4*i+1].As8(), intermediates[4*i+0].As8()) + ORB(intermediates[4*i+3].As8(), intermediates[4*i+2].As8()) + ORB(intermediates[4*i+2].As8(), intermediates[4*i+0].As8()) + if implementedWithInversion { + NOTB(intermediates[4*i+0].As8()) + } + MOVB(intermediates[4*i+0].As8(), Mem{Base: dstMask, Disp: 1 * i}) + } } Comment("Update our offsets into rows and dstMask") diff --git a/stubs_amd64.go b/stubs_amd64.go index eb39d8f..b125650 100644 --- a/stubs_amd64.go +++ b/stubs_amd64.go @@ -127,29 +127,59 @@ func asmAVXLesserEqualsFloat32(dstMask []byte, b float32, rows []float32) //go:noescape func asmAVX2EqualsUint64(dstMask []byte, b uint64, rows []uint64) +//go:noescape +func asmAVXEqualsUint64(dstMask []byte, b uint64, rows []uint64) + //go:noescape func asmAVX2EqualsFloat64(dstMask []byte, b float64, rows []float64) +//go:noescape +func asmAVXEqualsFloat64(dstMask []byte, b float64, rows []float64) + //go:noescape func asmAVX2GreaterThanUint64(dstMask []byte, b uint64, rows []uint64) +//go:noescape +func asmAVXGreaterThanUint64(dstMask []byte, b uint64, rows []uint64) + //go:noescape func asmAVX2GreaterThanFloat64(dstMask []byte, b float64, rows []float64) +//go:noescape +func asmAVXGreaterThanFloat64(dstMask []byte, b float64, rows []float64) + //go:noescape func asmAVX2LessThanUint64(dstMask []byte, b uint64, rows []uint64) +//go:noescape +func asmAVXLessThanUint64(dstMask []byte, b uint64, rows []uint64) + //go:noescape func asmAVX2LessThanFloat64(dstMask []byte, b float64, rows []float64) +//go:noescape +func asmAVXLessThanFloat64(dstMask []byte, b float64, rows []float64) + //go:noescape func asmAVX2GreaterEqualsUint64(dstMask []byte, b uint64, rows []uint64) +//go:noescape +func asmAVXGreaterEqualsUint64(dstMask []byte, b uint64, rows []uint64) + //go:noescape func asmAVX2GreaterEqualsFloat64(dstMask []byte, b float64, rows []float64) +//go:noescape +func asmAVXGreaterEqualsFloat64(dstMask []byte, b float64, rows []float64) + //go:noescape func asmAVX2LesserEqualsUint64(dstMask []byte, b uint64, rows []uint64) +//go:noescape +func asmAVXLesserEqualsUint64(dstMask []byte, b uint64, rows []uint64) + //go:noescape func asmAVX2LesserEqualsFloat64(dstMask []byte, b float64, rows []float64) + +//go:noescape +func asmAVXLesserEqualsFloat64(dstMask []byte, b float64, rows []float64)
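
For reference, a minimal Go model of one loop iteration of the new 64-bit AVX kernels (not part of the diff; pext and packAVXUint64Equals are illustrative names). Each of the four 128-bit VPCMPEQQ results becomes a 16-bit VPMOVMSKB value, PEXT with the repeating 00000001 constant keeps bit 0 of every byte (2 useful bits per register), and the SHLB $2/$4/$6 plus ORB sequence merges the four 2-bit groups into the single mask byte written to dstMask:

package main

import "fmt"

// pext is a software model of the BMI2 PEXT instruction: it gathers the bits
// of x selected by mask into the low bits of the result.
func pext(x, mask uint32) uint32 {
	var out uint32
	i := 0
	for ; mask != 0; mask &= mask - 1 {
		if x&(mask&-mask) != 0 {
			out |= 1 << i
		}
		i++
	}
	return out
}

// packAVXUint64Equals models one iteration of the asmAVXEqualsUint64 loop:
// 8 uint64 rows in, 1 mask byte out, first row in the least significant bit.
func packAVXUint64Equals(b uint64, rows [8]uint64) byte {
	// The 32-bit constant loaded from constants<>+8(SB); assumed here to be
	// the "00000001..." pattern, i.e. bit 0 set in every byte.
	const keepBit0OfEachByte = 0x01010101

	var groups [4]uint32
	for g := 0; g < 4; g++ {
		// VPCMPEQQ: each 64-bit lane becomes all-ones on a match, zero otherwise.
		// VPMOVMSKB then yields 8 identical mask bits per lane (16 bits total).
		var movemask uint32
		for lane := 0; lane < 2; lane++ {
			if rows[2*g+lane] == b {
				movemask |= 0xff << (8 * lane)
			}
		}
		// PEXT keeps bit 0 of every byte, so 2 meaningful bits survive per register.
		groups[g] = pext(movemask, keepBit0OfEachByte)
	}

	// SHLB $2/$4/$6 + ORB: combine the four 2-bit groups into one byte.
	return byte(groups[0] | groups[1]<<2 | groups[2]<<4 | groups[3]<<6)
}

func main() {
	rows := [8]uint64{7, 1, 7, 7, 0, 7, 2, 7}
	fmt.Printf("%08b\n", packAVXUint64Equals(7, rows)) // prints 10101101
}

The AVX2 path is the same idea at twice the width: four 256-bit compares give 32-bit movemasks, PEXT leaves 4 bits per register, and pairs are combined into the two mask bytes stored per iteration.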
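
The dispatcher changes raise the 64-bit AVX2 cut-over from 8 to 16 rows and add an AVX tier at 8 rows, matching what one iteration of each kernel now consumes (4 YMM loads = 16 qwords = 2 mask bytes; 4 XMM loads = 8 qwords = 1 mask byte); anything past the largest aligned prefix still falls through to the scalar goVector* loop. A small standalone worked example (not code from the diff) of how a 41-row call is split:

package main

import "fmt"

func main() {
	n := 41 // len(rows)

	// On an AVX2 machine: asmAVX2*Uint64 gets rows[:n&^15], writing
	// (n&^15)/8 mask bytes; the scalar fallback finishes the tail.
	fmt.Printf("AVX2: %d SIMD rows, %d mask bytes, %d scalar rows\n",
		n&^15, (n&^15)/8, n-(n&^15))

	// On an AVX-only machine: asmAVX*Uint64 gets rows[:n&^7] instead.
	fmt.Printf("AVX:  %d SIMD rows, %d mask bytes, %d scalar rows\n",
		n&^7, (n&^7)/8, n-(n&^7))
}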