-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Tweak EncodeToUtf16_Vector128
#110203
base: main
Are you sure you want to change the base?
Tweak EncodeToUtf16_Vector128
#110203
Conversation
7c5f8fe
to
2de9837
Compare
The PR is marked as "ready-for-review", any details why this is profitable? |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Copilot reviewed 1 out of 1 changed files in this pull request and generated no comments.
updated the description |
No codegen difference in MihuBot it seems |
https://csharp.godbolt.org/z/MscnhYv83 @MihaZupan @EgorBo do you know why MihuBot might not be showing any diffs for this change? |
No idea, if there is an issue it must be inside jit-utils (PMI mode) rather than in MihuBot itself, would be nice to know what happened here. Overall it seems like this code doesn't save anything for this function - it is still same 50 bytes of codegen inside the main loop, it's just that |
The bot will only show you diffs for methods where code size changed, as diffs are otherwise less likely to be interesting. Do you have a microbenchmark that demonstrates that changes like this matter in practice for this method? |
Good point, I missed the fact that total size is the same |
Extracting the artifacts shows zero diffs. Assembly listing; Assembly listing for method System.HexConverter:EncodeToUtf16_Vector128(System.ReadOnlySpan`1[ubyte],System.Span`1[ushort],uint) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 7 single block inlinees; 4 inlinees without PGO data
; Final local variable assignments
;
;* V00 arg0 [V00 ] ( 0, 0 ) struct (16) zero-ref multireg-arg ld-addr-op single-def <System.ReadOnlySpan`1[ubyte]>
;* V01 arg1 [V01 ] ( 0, 0 ) struct (16) zero-ref multireg-arg single-def <System.Span`1[ushort]>
; V02 arg2 [V02,T05] ( 3, 3 ) int -> r8 single-def
; V03 loc0 [V03,T03] ( 2, 9 ) byref -> rdi single-def
; V04 loc1 [V04,T04] ( 2, 9 ) byref -> rdx single-def
; V05 loc2 [V05,T15] ( 2, 9 ) simd16 -> mm0 <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V06 loc3 [V06,T00] ( 8, 49 ) long -> rax
; V07 loc4 [V07,T02] ( 3, 9 ) long -> rsi
;# V08 OutArgs [V08 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
; V09 tmp1 [V09,T17] ( 3, 2 ) simd16 -> mm0
;* V10 tmp2 [V10 ] ( 0, 0 ) struct (32) zero-ref "location for address-of(RValue)" <System.ValueTuple`2[System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte]]>
;* V11 tmp3 [V11 ] ( 0, 0 ) struct (32) zero-ref "location for address-of(RValue)" <System.ValueTuple`2[System.Runtime.Intrinsics.Vector128`1[ushort],System.Runtime.Intrinsics.Vector128`1[ushort]]>
;* V12 tmp4 [V12 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inlining Arg" <System.ReadOnlySpan`1[ubyte]>
;* V13 tmp5 [V13 ] ( 0, 0 ) struct (16) zero-ref ld-addr-op "Inlining Arg" <System.Span`1[ushort]>
; V14 tmp6 [V14,T10] ( 3, 48 ) simd16 -> mm2 "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V15 tmp7 [V15,T11] ( 2, 32 ) simd16 -> mm3 "spilled call-like call argument"
; V16 tmp8 [V16,T14] ( 2, 16 ) simd16 -> mm2 "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V17 tmp9 [V17 ] ( 0, 0 ) simd16 -> zero-ref "Inline stloc first use temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
; V18 tmp10 [V18,T12] ( 2, 32 ) simd16 -> mm2 "impAppendStmt"
;* V19 tmp11 [V19 ] ( 0, 0 ) struct (32) zero-ref ld-addr-op "NewObj constructor temp" <System.ValueTuple`2[System.Runtime.Intrinsics.Vector128`1[ubyte],System.Runtime.Intrinsics.Vector128`1[ubyte]]>
;* V20 tmp12 [V20 ] ( 0, 0 ) simd16 -> zero-ref "spilled call-like call argument"
;* V21 tmp13 [V21 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V22 tmp14 [V22 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V23 tmp15 [V23 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V24 tmp16 [V24 ] ( 0, 0 ) simd16 -> zero-ref "Inline return value spill temp" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V25 tmp17 [V25 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V26 tmp18 [V26 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ubyte]>
;* V27 tmp19 [V27 ] ( 0, 0 ) struct (32) zero-ref ld-addr-op "NewObj constructor temp" <System.ValueTuple`2[System.Runtime.Intrinsics.Vector128`1[ushort],System.Runtime.Intrinsics.Vector128`1[ushort]]>
; V28 tmp20 [V28,T13] ( 2, 32 ) simd16 -> mm2 "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
;* V29 tmp21 [V29 ] ( 0, 0 ) simd16 -> zero-ref "Inlining Arg" <System.Runtime.Intrinsics.Vector128`1[ushort]>
; V30 tmp22 [V30,T06] ( 2, 2 ) byref -> rdi single-def "field V00._reference (fldOffset=0x0)" P-INDEP
; V31 tmp23 [V31,T08] ( 2, 2 ) int -> rsi single-def "field V00._length (fldOffset=0x8)" P-INDEP
; V32 tmp24 [V32,T07] ( 2, 2 ) byref -> rdx single-def "field V01._reference (fldOffset=0x0)" P-INDEP
; V33 tmp25 [V33,T09] ( 1, 1 ) int -> rcx single-def "field V01._length (fldOffset=0x8)" P-INDEP
;* V34 tmp26 [V34 ] ( 0, 0 ) simd16 -> zero-ref "field V10.Item1 (fldOffset=0x0)" P-INDEP
;* V35 tmp27 [V35 ] ( 0, 0 ) simd16 -> zero-ref "field V10.Item2 (fldOffset=0x10)" P-INDEP
;* V36 tmp28 [V36 ] ( 0, 0 ) simd16 -> zero-ref "field V11.Item1 (fldOffset=0x0)" P-INDEP
;* V37 tmp29 [V37 ] ( 0, 0 ) simd16 -> zero-ref "field V11.Item2 (fldOffset=0x10)" P-INDEP
;* V38 tmp30 [V38 ] ( 0, 0 ) byref -> zero-ref single-def "field V12._reference (fldOffset=0x0)" P-INDEP
;* V39 tmp31 [V39 ] ( 0, 0 ) int -> zero-ref "field V12._length (fldOffset=0x8)" P-INDEP
;* V40 tmp32 [V40 ] ( 0, 0 ) byref -> zero-ref single-def "field V13._reference (fldOffset=0x0)" P-INDEP
;* V41 tmp33 [V41 ] ( 0, 0 ) int -> zero-ref "field V13._length (fldOffset=0x8)" P-INDEP
;* V42 tmp34 [V42 ] ( 0, 0 ) simd16 -> zero-ref "field V19.Item1 (fldOffset=0x0)" P-INDEP
;* V43 tmp35 [V43 ] ( 0, 0 ) simd16 -> zero-ref "field V19.Item2 (fldOffset=0x10)" P-INDEP
;* V44 tmp36 [V44 ] ( 0, 0 ) simd16 -> zero-ref "field V27.Item1 (fldOffset=0x0)" P-INDEP
;* V45 tmp37 [V45 ] ( 0, 0 ) simd16 -> zero-ref "field V27.Item2 (fldOffset=0x10)" P-INDEP
; V46 cse0 [V46,T16] ( 2, 9 ) simd16 -> mm1 hoist "CSE #02: aggressive"
; V47 cse1 [V47,T01] ( 3, 10 ) long -> rcx "CSE #01: aggressive"
;
; Lcl frame size = 0
G_M10695_IG01:
push rbp
mov rbp, rsp
;; size=4 bbWeight=1 PerfScore 1.25
G_M10695_IG02:
test r8d, r8d
jne SHORT G_M10695_IG04
;; size=5 bbWeight=1 PerfScore 1.25
G_M10695_IG03:
vmovups xmm0, xmmword ptr [reloc @RWD00]
jmp SHORT G_M10695_IG05
align [0 bytes for IG06]
;; size=10 bbWeight=0.50 PerfScore 2.50
G_M10695_IG04:
vmovups xmm0, xmmword ptr [reloc @RWD16]
;; size=8 bbWeight=0.50 PerfScore 1.50
G_M10695_IG05:
xor eax, eax
mov ecx, esi
lea rsi, [rcx-0x04]
vmovups xmm1, xmmword ptr [reloc @RWD32]
jmp SHORT G_M10695_IG07
;; size=18 bbWeight=1 PerfScore 6.00
G_M10695_IG06:
cmp rax, rsi
jbe SHORT G_M10695_IG07
mov rax, rsi
;; size=8 bbWeight=4 PerfScore 6.00
G_M10695_IG07:
vmovd xmm2, dword ptr [rdi+rax]
vpsrlq xmm3, xmm2, 4
vpunpcklbw xmm2, xmm3, xmm2
vpand xmm2, xmm1, xmm2
vpshufb xmm2, xmm0, xmm2
vpmovzxbw xmm2, xmm2
vmovups xmmword ptr [rdx+4*rax], xmm2
add rax, 4
cmp rax, rcx
jne SHORT G_M10695_IG06
;; size=42 bbWeight=8 PerfScore 94.67
G_M10695_IG08:
pop rbp
ret
;; size=2 bbWeight=1 PerfScore 1.50
RWD00 dq 3736353433323130h, 4645444342413938h
RWD16 dq 3736353433323130h, 6665646362613938h
RWD32 dq 0F0F0F0F0F0F0F0Fh, 0F0F0F0F0F0F0F0Fh
; Total bytes of code 97, prolog size 4, PerfScore 114.67, instruction count 28, allocated bytes for code 97 (MethodHash=925ed638) for method System.HexConverter:EncodeToUtf16_Vector128(System.ReadOnlySpan`1[ubyte],System.Span`1[ushort],uint) (FullOpts)
; ============================================================ |
My basic change detection missed that the |
Here's the updated diff: https://www.diffchecker.com/dqqicPKx/ (still same code size) I agree with Egor that this doesn't seem beneficial unless an actual benchmark shows a difference. |
This is a micro-optimisation that: