diff --git a/asm_amd64.s b/asm_amd64.s index d7cb631..0077bce 100644 --- a/asm_amd64.s +++ b/asm_amd64.s @@ -1826,29 +1826,98 @@ TEXT ·asmAVX2EqualsUint64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VPCMPEQQ Y1, Y0, Y1 VPCMPEQQ Y2, Y0, Y2 + VPCMPEQQ Y3, Y0, Y3 + VPCMPEQQ Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXEqualsUint64(dstMask []byte, b uint64, rows []uint64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXEqualsUint64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VPCMPEQQ X1, X0, X1 + VPCMPEQQ X2, X0, X2 + VPCMPEQQ X3, X0, X3 + VPCMPEQQ X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -1871,29 +1940,98 @@ TEXT ·asmAVX2EqualsFloat64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VCMPPD $0x08, Y1, Y0, Y1 VCMPPD $0x08, Y2, Y0, Y2 + VCMPPD $0x08, Y3, Y0, Y3 + VCMPPD $0x08, Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXEqualsFloat64(dstMask []byte, b float64, rows []float64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXEqualsFloat64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VCMPPD $0x08, X1, X0, X1 + VCMPPD $0x08, X2, X0, X2 + VCMPPD $0x08, X3, X0, X3 + VCMPPD $0x08, X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -1916,29 +2054,98 @@ TEXT ·asmAVX2GreaterThanUint64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VPCMPGTQ Y1, Y0, Y1 VPCMPGTQ Y2, Y0, Y2 + VPCMPGTQ Y3, Y0, Y3 + VPCMPGTQ Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXGreaterThanUint64(dstMask []byte, b uint64, rows []uint64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXGreaterThanUint64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VPCMPGTQ X1, X0, X1 + VPCMPGTQ X2, X0, X2 + VPCMPGTQ X3, X0, X3 + VPCMPGTQ X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -1961,29 +2168,98 @@ TEXT ·asmAVX2GreaterThanFloat64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VCMPPD $0x16, Y1, Y0, Y1 VCMPPD $0x16, Y2, Y0, Y2 + VCMPPD $0x16, Y3, Y0, Y3 + VCMPPD $0x16, Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXGreaterThanFloat64(dstMask []byte, b float64, rows []float64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXGreaterThanFloat64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VCMPPD $0x16, X1, X0, X1 + VCMPPD $0x16, X2, X0, X2 + VCMPPD $0x16, X3, X0, X3 + VCMPPD $0x16, X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -2006,29 +2282,98 @@ TEXT ·asmAVX2LessThanUint64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VPCMPGTQ Y0, Y1, Y1 VPCMPGTQ Y0, Y2, Y2 + VPCMPGTQ Y0, Y3, Y3 + VPCMPGTQ Y0, Y4, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXLessThanUint64(dstMask []byte, b uint64, rows []uint64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXLessThanUint64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VPCMPGTQ X0, X1, X1 + VPCMPGTQ X0, X2, X2 + VPCMPGTQ X0, X3, X3 + VPCMPGTQ X0, X4, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -2051,29 +2396,98 @@ TEXT ·asmAVX2LessThanFloat64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VCMPPD $0x11, Y1, Y0, Y1 VCMPPD $0x11, Y2, Y0, Y2 + VCMPPD $0x11, Y3, Y0, Y3 + VCMPPD $0x11, Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXLessThanFloat64(dstMask []byte, b float64, rows []float64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXLessThanFloat64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VCMPPD $0x11, X1, X0, X1 + VCMPPD $0x11, X2, X0, X2 + VCMPPD $0x11, X3, X0, X3 + VCMPPD $0x11, X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -2096,24 +2510,32 @@ TEXT ·asmAVX2GreaterEqualsUint64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VPCMPGTQ Y0, Y1, Y1 VPCMPGTQ Y0, Y2, Y2 + VPCMPGTQ Y0, Y3, Y3 + VPCMPGTQ Y0, Y4, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // To get GreaterEquals semantics, we flipped the arguments of VPCMPGT and now invert the result // Each register contains 4 bits, so we first combine pairs before writing them back @@ -2121,6 +2543,70 @@ loop: ORB SI, BL NOTB BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + NOTB DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXGreaterEqualsUint64(dstMask []byte, b uint64, rows []uint64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXGreaterEqualsUint64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VPCMPGTQ X0, X1, X1 + VPCMPGTQ X0, X2, X2 + VPCMPGTQ X0, X3, X3 + VPCMPGTQ X0, X4, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // To get GreaterEquals semantics, we flipped the arguments of VPCMPGT and now invert the result + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + NOTB BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -2143,29 +2629,98 @@ TEXT ·asmAVX2GreaterEqualsFloat64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VCMPPD $0x15, Y1, Y0, Y1 VCMPPD $0x15, Y2, Y0, Y2 + VCMPPD $0x15, Y3, Y0, Y3 + VCMPPD $0x15, Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXGreaterEqualsFloat64(dstMask []byte, b float64, rows []float64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXGreaterEqualsFloat64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VCMPPD $0x15, X1, X0, X1 + VCMPPD $0x15, X2, X0, X2 + VCMPPD $0x15, X3, X0, X3 + VCMPPD $0x15, X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -2188,24 +2743,32 @@ TEXT ·asmAVX2LesserEqualsUint64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VPCMPGTQ Y1, Y0, Y1 VPCMPGTQ Y2, Y0, Y2 + VPCMPGTQ Y3, Y0, Y3 + VPCMPGTQ Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // To get LesserEquals semantics, we flipped the arguments of VPCMPGT and now invert the result // Each register contains 4 bits, so we first combine pairs before writing them back @@ -2213,6 +2776,70 @@ loop: ORB SI, BL NOTB BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + NOTB DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXLesserEqualsUint64(dstMask []byte, b uint64, rows []uint64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXLesserEqualsUint64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VPCMPGTQ X1, X0, X1 + VPCMPGTQ X2, X0, X2 + VPCMPGTQ X3, X0, X3 + VPCMPGTQ X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // To get LesserEquals semantics, we flipped the arguments of VPCMPGT and now invert the result + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + NOTB BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX @@ -2235,29 +2862,98 @@ TEXT ·asmAVX2LesserEqualsFloat64(SB), NOSPLIT, $0-56 VPBROADCASTQ b+24(FP), Y0 // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits - MOVL constants<>+8(SB), DI + MOVL constants<>+8(SB), R9 loop: - // Load 2 256-bit chunks into YMM registers + // Load 4 256-bit chunks into YMM registers VMOVDQU (CX), Y1 VMOVDQU 32(CX), Y2 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 // Compare all bytes in each YMM register to b. 
Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) VCMPPD $0x12, Y1, Y0, Y1 VCMPPD $0x12, Y2, Y0, Y2 + VCMPPD $0x12, Y3, Y0, Y3 + VCMPPD $0x12, Y4, Y0, Y4 // Take one bit of each byte and pack it into an R32 VPMOVMSKB Y1, BX VPMOVMSKB Y2, SI + VPMOVMSKB Y3, DI + VPMOVMSKB Y4, R8 // Drop every second-seventh bit from these registers - PEXTL DI, BX, BX - PEXTL DI, SI, SI + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 // Each register contains 4 bits, so we first combine pairs before writing them back SHLB $0x04, SI ORB SI, BL MOVB BL, (AX) + SHLB $0x04, R8 + ORB R8, DI + MOVB DI, 1(AX) + + // Update our offsets into rows and dstMask + ADDQ $0x00000080, CX + ADDQ $0x00000002, AX + + // Decrement loop counter + SUBQ $0x00000010, DX + JNZ loop + VZEROALL + RET + +// func asmAVXLesserEqualsFloat64(dstMask []byte, b float64, rows []float64) +// Requires: AVX, BMI2, SSE2 +TEXT ·asmAVXLesserEqualsFloat64(SB), NOSPLIT, $0-56 + MOVQ dstMask_base+0(FP), AX + MOVQ rows_base+32(FP), CX + MOVQ rows_len+40(FP), DX + + // Read param b into XMM register. If b is 0x07, YMM becomes {0x07, 0x07, 0x07...} + MOVQ b+24(FP), X0 + VPSHUFB const_seven_through_zero<>+0(SB), X0, X0 + + // Load the mask 00000001... which we will use with PEXT to drop 7/8th of the bits + MOVL constants<>+8(SB), R9 + +loop: + // Load 4 128-bit chunks into XMM registers + VMOVDQU (CX), X1 + VMOVDQU 16(CX), X2 + VMOVDQU 32(CX), X3 + VMOVDQU 48(CX), X4 + + // Compare all bytes in each XMM register to b. Each byte in the YMMs becomes 0x00 (mismatch) or 0xff (match) + VCMPPD $0x12, X1, X0, X1 + VCMPPD $0x12, X2, X0, X2 + VCMPPD $0x12, X3, X0, X3 + VCMPPD $0x12, X4, X0, X4 + + // Take one bit of each byte and pack it into an R32 + VPMOVMSKB X1, BX + VPMOVMSKB X2, SI + VPMOVMSKB X3, DI + VPMOVMSKB X4, R8 + + // Drop every second-seventh bit from these registers + PEXTL R9, BX, BX + PEXTL R9, SI, SI + PEXTL R9, DI, DI + PEXTL R9, R8, R8 + + // Each register contains 2 bits, so we first combine them back into bytes before writing them back + SHLB $0x02, SI + SHLB $0x04, DI + SHLB $0x06, R8 + ORB SI, BL + ORB R8, DI + ORB DI, BL + MOVB BL, (AX) // Update our offsets into rows and dstMask ADDQ $0x00000040, CX diff --git a/dispatch_amd64.go b/dispatch_amd64.go index 78473df..cdcbd9b 100644 --- a/dispatch_amd64.go +++ b/dispatch_amd64.go @@ -245,80 +245,120 @@ func VectorLesserEqualsFloat32(dstMask []byte, b float32, rows []float32) { goVectorLesserEquals(dstMask, b, rows) } func VectorEquals64(dstMask []byte, b uint64, rows []uint64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2EqualsUint64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2EqualsUint64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXEqualsUint64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorEquals(dstMask, b, rows) } func VectorEqualsFloat64(dstMask []byte, b float64, rows []float64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2EqualsFloat64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2EqualsFloat64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXEqualsFloat64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorEquals(dstMask, b, 
rows) } func VectorGreaterThan64(dstMask []byte, b uint64, rows []uint64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2GreaterThanUint64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2GreaterThanUint64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXGreaterThanUint64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorGreaterThan(dstMask, b, rows) } func VectorGreaterThanFloat64(dstMask []byte, b float64, rows []float64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2GreaterThanFloat64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2GreaterThanFloat64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXGreaterThanFloat64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorGreaterThan(dstMask, b, rows) } func VectorLessThan64(dstMask []byte, b uint64, rows []uint64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2LessThanUint64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2LessThanUint64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXLessThanUint64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorLessThan(dstMask, b, rows) } func VectorLessThanFloat64(dstMask []byte, b float64, rows []float64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2LessThanFloat64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2LessThanFloat64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXLessThanFloat64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorLessThan(dstMask, b, rows) } func VectorGreaterEquals64(dstMask []byte, b uint64, rows []uint64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2GreaterEqualsUint64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2GreaterEqualsUint64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXGreaterEqualsUint64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorGreaterEquals(dstMask, b, rows) } func VectorGreaterEqualsFloat64(dstMask []byte, b float64, rows []float64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2GreaterEqualsFloat64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2GreaterEqualsFloat64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXGreaterEqualsFloat64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorGreaterEquals(dstMask, b, rows) } func VectorLesserEquals64(dstMask []byte, b uint64, rows []uint64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2LesserEqualsUint64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + 
asmAVX2LesserEqualsUint64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXLesserEqualsUint64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } goVectorLesserEquals(dstMask, b, rows) } func VectorLesserEqualsFloat64(dstMask []byte, b float64, rows []float64) { - if hasAVX2() && len(rows) >= 8 { - asmAVX2LesserEqualsFloat64(dstMask, b, rows[:len(rows) & ^7]) + if hasAVX2() && len(rows) >= 16 { + asmAVX2LesserEqualsFloat64(dstMask, b, rows[:len(rows) & ^15]) + dstMask = dstMask[(len(rows) & ^15)/8:] + rows = rows[len(rows) & ^15:] + } else if hasAVX() && len(rows) >= 8 { + asmAVXLesserEqualsFloat64(dstMask, b, rows[:len(rows) & ^7]) dstMask = dstMask[(len(rows) & ^7)/8:] rows = rows[len(rows) & ^7:] } diff --git a/internal/avo/avo.go b/internal/avo/avo.go index e41615e..223e132 100644 --- a/internal/avo/avo.go +++ b/internal/avo/avo.go @@ -16,8 +16,6 @@ import ( "github.com/mmcloughlin/avo/reg" ) -const rounds = 2 - var ( constants = GLOBL("constants", RODATA|NOPTR) const_zeroes Mem @@ -151,25 +149,25 @@ func fastFilter(width int, cmpOp CmpOp, isfp IsFloating) { if isfp { infix = "Float" } + rounds := 2 + if width == 64 { + rounds = 4 + } fmt.Fprintf(dispatcherBoth, "func Vector%s%s%d(dstMask []byte, b %s, rows []%s) {\n", cmpOp, infix, width, isfp.GoType(width), isfp.GoType(width)) fmt.Fprintf(&dispatcherAmd64, " if hasAVX2() && len(rows) >= %d {\n", 256*rounds/width) fmt.Fprintf(&dispatcherAmd64, " asmAVX2%s%s(dstMask, b, rows[:len(rows) & ^%d])\n", cmpOp, isfp.GoName(width), 256*rounds/width-1) fmt.Fprintf(&dispatcherAmd64, " dstMask = dstMask[(len(rows) & ^%d) / 8:]\n", 256*rounds/width-1) fmt.Fprintf(&dispatcherAmd64, " rows = rows[len(rows) & ^%d:]\n", 256*rounds/width-1) - if width < 64 { - fmt.Fprintf(&dispatcherAmd64, " } else if hasAVX() && len(rows) >= %d {\n", 128*rounds/width) - fmt.Fprintf(&dispatcherAmd64, " asmAVX%s%s(dstMask, b, rows[:len(rows) & ^%d])\n", cmpOp, isfp.GoName(width), 128*rounds/width-1) - fmt.Fprintf(&dispatcherAmd64, " dstMask = dstMask[(len(rows) & ^%d) / 8:]\n", 128*rounds/width-1) - fmt.Fprintf(&dispatcherAmd64, " rows = rows[len(rows) & ^%d:]\n", 128*rounds/width-1) - } + fmt.Fprintf(&dispatcherAmd64, " } else if hasAVX() && len(rows) >= %d {\n", 128*rounds/width) + fmt.Fprintf(&dispatcherAmd64, " asmAVX%s%s(dstMask, b, rows[:len(rows) & ^%d])\n", cmpOp, isfp.GoName(width), 128*rounds/width-1) + fmt.Fprintf(&dispatcherAmd64, " dstMask = dstMask[(len(rows) & ^%d) / 8:]\n", 128*rounds/width-1) + fmt.Fprintf(&dispatcherAmd64, " rows = rows[len(rows) & ^%d:]\n", 128*rounds/width-1) fmt.Fprintf(&dispatcherAmd64, " }\n") fmt.Fprintf(dispatcherBoth, " goVector%s(dstMask, b, rows)\n", cmpOp) fmt.Fprintf(dispatcherBoth, "}\n") - fastFilterImpl(AVX2, width, cmpOp, isfp) - if width < 64 { - fastFilterImpl(AVX, width, cmpOp, isfp) - } + fastFilterImpl(AVX2, width, cmpOp, isfp, rounds) + fastFilterImpl(AVX, width, cmpOp, isfp, rounds) fmt.Fprintf(&generatedTest, "func TestVector%s%s%d(t *testing.T) {\n", cmpOp, infix, width) fmt.Fprintf(&generatedTest, " t.Parallel()\n") @@ -205,7 +203,7 @@ func fastFilter(width int, cmpOp CmpOp, isfp IsFloating) { fmt.Fprintf(&generatedTest, "}\n") } -func fastFilterImpl(avxLevel AVXLevel, width int, cmpOp CmpOp, isfp IsFloating) { +func fastFilterImpl(avxLevel AVXLevel, width int, cmpOp CmpOp, isfp IsFloating, rounds int) { TEXT(fmt.Sprintf("asm%s%s%s", avxLevel, cmpOp, 
isfp.GoName(width)), NOSPLIT, fmt.Sprintf("func(dstMask []byte, b %s, rows []%s)", isfp.GoType(width), isfp.GoType(width))) Pragma("noescape") @@ -382,6 +380,8 @@ func fastFilterImpl(avxLevel AVXLevel, width int, cmpOp CmpOp, isfp IsFloating) NOTW(r.As16()) case 8: NOTB(r.As8()) + case 4, 2: + // Handled below after combining them } } } @@ -409,7 +409,19 @@ func fastFilterImpl(avxLevel AVXLevel, width int, cmpOp CmpOp, isfp IsFloating) MOVB(intermediates[2*i+0].As8(), Mem{Base: dstMask, Disp: 1 * i}) } case 2: - // TODO: Implement this for AVX with 64bits widths, yielding 2 bits + Comment("Each register contains 2 bits, so we first combine them back into bytes before writing them back") + for i := 0; rounds/4 > i; i++ { + SHLB(U8(2), intermediates[4*i+1].As8()) + SHLB(U8(4), intermediates[4*i+2].As8()) + SHLB(U8(6), intermediates[4*i+3].As8()) + ORB(intermediates[4*i+1].As8(), intermediates[4*i+0].As8()) + ORB(intermediates[4*i+3].As8(), intermediates[4*i+2].As8()) + ORB(intermediates[4*i+2].As8(), intermediates[4*i+0].As8()) + if implementedWithInversion { + NOTB(intermediates[4*i+0].As8()) + } + MOVB(intermediates[4*i+0].As8(), Mem{Base: dstMask, Disp: 1 * i}) + } } Comment("Update our offsets into rows and dstMask") diff --git a/stubs_amd64.go b/stubs_amd64.go index eb39d8f..b125650 100644 --- a/stubs_amd64.go +++ b/stubs_amd64.go @@ -127,29 +127,59 @@ func asmAVXLesserEqualsFloat32(dstMask []byte, b float32, rows []float32) //go:noescape func asmAVX2EqualsUint64(dstMask []byte, b uint64, rows []uint64) +//go:noescape +func asmAVXEqualsUint64(dstMask []byte, b uint64, rows []uint64) + //go:noescape func asmAVX2EqualsFloat64(dstMask []byte, b float64, rows []float64) +//go:noescape +func asmAVXEqualsFloat64(dstMask []byte, b float64, rows []float64) + //go:noescape func asmAVX2GreaterThanUint64(dstMask []byte, b uint64, rows []uint64) +//go:noescape +func asmAVXGreaterThanUint64(dstMask []byte, b uint64, rows []uint64) + //go:noescape func asmAVX2GreaterThanFloat64(dstMask []byte, b float64, rows []float64) +//go:noescape +func asmAVXGreaterThanFloat64(dstMask []byte, b float64, rows []float64) + //go:noescape func asmAVX2LessThanUint64(dstMask []byte, b uint64, rows []uint64) +//go:noescape +func asmAVXLessThanUint64(dstMask []byte, b uint64, rows []uint64) + //go:noescape func asmAVX2LessThanFloat64(dstMask []byte, b float64, rows []float64) +//go:noescape +func asmAVXLessThanFloat64(dstMask []byte, b float64, rows []float64) + //go:noescape func asmAVX2GreaterEqualsUint64(dstMask []byte, b uint64, rows []uint64) +//go:noescape +func asmAVXGreaterEqualsUint64(dstMask []byte, b uint64, rows []uint64) + //go:noescape func asmAVX2GreaterEqualsFloat64(dstMask []byte, b float64, rows []float64) +//go:noescape +func asmAVXGreaterEqualsFloat64(dstMask []byte, b float64, rows []float64) + //go:noescape func asmAVX2LesserEqualsUint64(dstMask []byte, b uint64, rows []uint64) +//go:noescape +func asmAVXLesserEqualsUint64(dstMask []byte, b uint64, rows []uint64) + //go:noescape func asmAVX2LesserEqualsFloat64(dstMask []byte, b float64, rows []float64) + +//go:noescape +func asmAVXLesserEqualsFloat64(dstMask []byte, b float64, rows []float64)
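
For reference, a minimal Go model of one loop iteration of the new 64-bit AVX kernels (not part of the diff; pext and packAVXUint64Equals are illustrative names). Each of the four 128-bit VPCMPEQQ results becomes a 16-bit VPMOVMSKB value, PEXT with the repeating 00000001 constant keeps bit 0 of every byte (2 useful bits per register), and the SHLB $2/$4/$6 plus ORB sequence merges the four 2-bit groups into the single mask byte written to dstMask:

package main

import "fmt"

// pext is a software model of the BMI2 PEXT instruction: it gathers the bits
// of x selected by mask into the low bits of the result.
func pext(x, mask uint32) uint32 {
	var out uint32
	i := 0
	for ; mask != 0; mask &= mask - 1 {
		if x&(mask&-mask) != 0 {
			out |= 1 << i
		}
		i++
	}
	return out
}

// packAVXUint64Equals models one iteration of the asmAVXEqualsUint64 loop:
// 8 uint64 rows in, 1 mask byte out, first row in the least significant bit.
func packAVXUint64Equals(b uint64, rows [8]uint64) byte {
	// The 32-bit constant loaded from constants<>+8(SB); assumed here to be
	// the "00000001..." pattern, i.e. bit 0 set in every byte.
	const keepBit0OfEachByte = 0x01010101

	var groups [4]uint32
	for g := 0; g < 4; g++ {
		// VPCMPEQQ: each 64-bit lane becomes all-ones on a match, zero otherwise.
		// VPMOVMSKB then yields 8 identical mask bits per lane (16 bits total).
		var movemask uint32
		for lane := 0; lane < 2; lane++ {
			if rows[2*g+lane] == b {
				movemask |= 0xff << (8 * lane)
			}
		}
		// PEXT keeps bit 0 of every byte, so 2 meaningful bits survive per register.
		groups[g] = pext(movemask, keepBit0OfEachByte)
	}

	// SHLB $2/$4/$6 + ORB: combine the four 2-bit groups into one byte.
	return byte(groups[0] | groups[1]<<2 | groups[2]<<4 | groups[3]<<6)
}

func main() {
	rows := [8]uint64{7, 1, 7, 7, 0, 7, 2, 7}
	fmt.Printf("%08b\n", packAVXUint64Equals(7, rows)) // prints 10101101
}

The AVX2 path is the same idea at twice the width: four 256-bit compares give 32-bit movemasks, PEXT leaves 4 bits per register, and pairs are combined into the two mask bytes stored per iteration.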
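
The dispatcher changes raise the 64-bit AVX2 cut-over from 8 to 16 rows and add an AVX tier at 8 rows, matching what one iteration of each kernel now consumes (4 YMM loads = 16 qwords = 2 mask bytes; 4 XMM loads = 8 qwords = 1 mask byte); anything past the largest aligned prefix still falls through to the scalar goVector* loop. A small standalone worked example (not code from the diff) of how a 41-row call is split:

package main

import "fmt"

func main() {
	n := 41 // len(rows)

	// On an AVX2 machine: asmAVX2*Uint64 gets rows[:n&^15], writing
	// (n&^15)/8 mask bytes; the scalar fallback finishes the tail.
	fmt.Printf("AVX2: %d SIMD rows, %d mask bytes, %d scalar rows\n",
		n&^15, (n&^15)/8, n-(n&^15))

	// On an AVX-only machine: asmAVX*Uint64 gets rows[:n&^7] instead.
	fmt.Printf("AVX:  %d SIMD rows, %d mask bytes, %d scalar rows\n",
		n&^7, (n&^7)/8, n-(n&^7))
}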