Simplify RangeInclusive::next[_back] #48057

scottmcm · 2018-02-07T19:21:48Z

Matching on an Option<Ordering> seems to cause some confusion for LLVM; switching to just using comparison operators removes a few jumps from the simple for loops I was trying.

cc #45222 #28237 (comment)

Example:

/// Sums `i ^ (i - 1)` over every `i` in the inclusive range `x`.
///
/// Written as an external `for` loop on purpose: the example exists to show
/// the machine code produced when a `RangeInclusive<u64>` is driven through
/// its iterator one element at a time.
///
/// Note: `i - 1` overflows when `i == 0`, and `sum += ...` can overflow for
/// long ranges. With overflow checks enabled that panics; without them it
/// wraps.
// `#[no_mangle]` keeps the symbol name `coresum` so it is easy to find in the
// generated assembly.
#[no_mangle]
pub fn coresum(x: std::ops::RangeInclusive<u64>) -> u64 {
    // Running total; returned unchanged (0) when the range is empty.
    let mut sum = 0;
    for i in x {
        sum += i ^ (i-1);
    }
    sum
}

Today:

// coresum, as generated before the change (Intel syntax).
// Register roles as used below: rdi = current start of the range,
// rsi = current end, rax = running sum. (rdi/rsi appear to be the incoming
// start/end arguments.)
// Each iteration materialises a three-way comparison of start vs end in rcx
// (-1 / 0 / 1) and then branches on that value, which is where the three
// numbered jumps come from.
coresum:
    xor r8d, r8d                // r8 = 0  (comparison result for "equal")
    mov r9, -1                  // r9 = -1 (comparison result for "less")
    xor eax, eax                // sum = 0
    jmp .LBB0_1
.LBB0_4:                        // loop body: yield i = rdi, then install the next range state
    lea rcx, [rdi - 1]
    xor rcx, rdi                // rcx = i ^ (i - 1)
    add rax, rcx                // sum += rcx
    mov rsi, rdx                // end   = next end chosen below
    mov rdi, r10                // start = next start chosen below
.LBB0_1:
    cmp rdi, rsi                // compare start with end (unsigned)
    mov ecx, 1                  // default: "greater"; mov leaves the flags from cmp intact
    cmovb   rcx, r9             // start <  end -> rcx = -1
    cmove   rcx, r8             // start == end -> rcx = 0
    test    rcx, rcx
    mov edx, 0                  // next end   = 0 (used on the "equal" path; mov keeps flags)
    mov r10d, 1                 // next start = 1 (used on the "equal" path)
    je  .LBB0_4         // 1: start == end -> yield it; next state start=1, end=0 is an empty range
    cmp rcx, -1
    jne .LBB0_5         // 2: start > end -> range exhausted, return sum
    lea r10, [rdi + 1]          // start < end: next start = start + 1
    mov rdx, rsi                // end is unchanged
    jmp .LBB0_4         // 3: yield the current start
.LBB0_5:
    ret

With this PR:

// coresum, as generated with this change (Intel syntax).
// Register roles as used below: rcx = current start of the range,
// rdx = current end, rax = running sum. (rcx/rdx appear to be the incoming
// start/end arguments.)
// The per-iteration state update is done with conditional moves, leaving a
// single conditional back-edge in the loop.
coresum:
	cmp	rcx, rdx
	jbe	.LBB0_2                 // start <= end (unsigned): at least one element
	xor	eax, eax                // empty range: return 0
	ret
.LBB0_2:
	xor	r8d, r8d                // r8 = 0, the end value installed after the last element
	mov	r9d, 1                  // r9 = 1, the start value installed after the last element
	xor	eax, eax                // sum = 0
	.p2align	4, 0x90
.LBB0_3:
	lea	r10, [rcx + 1]          // tentative next start = start + 1 (lea leaves flags alone)
	cmp	rcx, rdx
	cmovae	rdx, r8                 // start >= end: this is the last element, so end = 0 ...
	cmovae	r10, r9                 // ... and next start = 1, i.e. the range becomes empty (1 > 0)
	lea	r11, [rcx - 1]
	xor	r11, rcx                // r11 = i ^ (i - 1)
	add	rax, r11                // sum += r11
	mov	rcx, r10                // advance start
	cmp	r10, rdx
	jbe	.LBB0_3         // Just this: loop while start <= end
	ret

Though using internal iteration (`.map(|i| i ^ (i-1)).sum()`) is still shorter to type, and lets the compiler unroll it:

// coresum_inner: the internal-iteration variant (Intel syntax, with Windows
// SEH unwind directives).
// Register roles as used below: rcx = start of the range, rdx = end,
// rax = return value. (rcx/rdx appear to be the incoming start/end arguments.)
// Layout:
//   prologue    - save xmm6..xmm15 (callee-saved in the Microsoft x64 ABI)
//   .LBB1_9     - vector loop, 8 elements per iteration
//   .LBB1_6     - scalar loop for the elements before `end` not covered above
//   .LBB1_12    - adds the term for the last element (`end` itself)
//   .LBB1_13    - epilogue, restores xmm6..xmm15
coresum_inner:
.Lcfi0:
.seh_proc coresum_inner
	sub	rsp, 168                // 10 * 16 bytes of XMM save area + 8; the extra 8 presumably realigns rsp to 16 for vmovdqa
.Lcfi1:
	.seh_stackalloc 168
	vmovdqa	xmmword ptr [rsp + 144], xmm15
.Lcfi2:
	.seh_savexmm 15, 144
	vmovdqa	xmmword ptr [rsp + 128], xmm14
.Lcfi3:
	.seh_savexmm 14, 128
	vmovdqa	xmmword ptr [rsp + 112], xmm13
.Lcfi4:
	.seh_savexmm 13, 112
	vmovdqa	xmmword ptr [rsp + 96], xmm12
.Lcfi5:
	.seh_savexmm 12, 96
	vmovdqa	xmmword ptr [rsp + 80], xmm11
.Lcfi6:
	.seh_savexmm 11, 80
	vmovdqa	xmmword ptr [rsp + 64], xmm10
.Lcfi7:
	.seh_savexmm 10, 64
	vmovdqa	xmmword ptr [rsp + 48], xmm9
.Lcfi8:
	.seh_savexmm 9, 48
	vmovdqa	xmmword ptr [rsp + 32], xmm8
.Lcfi9:
	.seh_savexmm 8, 32
	vmovdqa	xmmword ptr [rsp + 16], xmm7
.Lcfi10:
	.seh_savexmm 7, 16
	vmovdqa	xmmword ptr [rsp], xmm6
.Lcfi11:
	.seh_savexmm 6, 0
.Lcfi12:
	.seh_endprologue
	cmp	rdx, rcx
	jae	.LBB1_2                 // end >= start (unsigned): range is non-empty
	xor	eax, eax                // empty range: result is 0
	jmp	.LBB1_13
.LBB1_2:
	mov	r8, rdx
	sub	r8, rcx                 // r8 = end - start = number of elements before `end`
	jbe	.LBB1_3                 // zero (start == end): only the final element remains
	cmp	r8, 7
	jbe	.LBB1_5                 // fewer than 8: scalar loop only
	mov	rax, r8
	and	rax, -8
	mov	r9, r8
	and	r9, -8                  // r9 = r8 rounded down to a multiple of 8 = elements for the vector loop
	je	.LBB1_5
	add	rax, rcx                // rax = start + r9 = first index left for the scalar loop
	vmovq	xmm0, rcx
	vpshufd	xmm0, xmm0, 68          // copy the low qword into both lanes: xmm0 = {start, start}
	mov	ecx, 1
	vmovq	xmm1, rcx
	vpslldq	xmm1, xmm1, 8           // xmm1 = {0, 1}
	vpaddq	xmm1, xmm0, xmm1        // xmm1 = {i, i + 1}, starting at i = start
	vpxor	xmm0, xmm0, xmm0        // accumulator 0 = 0
	vpcmpeqd	xmm11, xmm11, xmm11 // all bits set: -1 in each qword lane
	// The __xmm@... constants are defined outside this listing; judging by
	// their symbol names each holds the same small integer in both lanes.
	vmovdqa	xmm12, xmmword ptr [rip + __xmm@00000000000000010000000000000001]
	vmovdqa	xmm13, xmmword ptr [rip + __xmm@00000000000000030000000000000003]
	vmovdqa	xmm14, xmmword ptr [rip + __xmm@00000000000000050000000000000005]
	vmovdqa	xmm15, xmmword ptr [rip + __xmm@00000000000000080000000000000008]
	mov	rcx, r9                 // rcx = remaining vector-loop element count
	vpxor	xmm4, xmm4, xmm4        // accumulators 1..3 = 0
	vpxor	xmm5, xmm5, xmm5
	vpxor	xmm6, xmm6, xmm6
	.p2align	4, 0x90
.LBB1_9:
	// With xmm1 = {i, i + 1}, one pass accumulates (j - 1) ^ j for
	// j = i .. i + 7 into four independent accumulators.
	vpaddq	xmm7, xmm1, xmmword ptr [rip + __xmm@00000000000000020000000000000002]
	vpaddq	xmm9, xmm1, xmmword ptr [rip + __xmm@00000000000000040000000000000004]
	vpaddq	xmm10, xmm1, xmmword ptr [rip + __xmm@00000000000000060000000000000006]
	vpaddq	xmm8, xmm1, xmm12
	vpxor	xmm7, xmm8, xmm7        // (x + 1) ^ (x + 2) per lane
	vpaddq	xmm2, xmm1, xmm13
	vpxor	xmm8, xmm2, xmm9        // (x + 3) ^ (x + 4) per lane
	vpaddq	xmm3, xmm1, xmm14
	vpxor	xmm3, xmm3, xmm10       // (x + 5) ^ (x + 6) per lane
	vpaddq	xmm2, xmm1, xmm11
	vpxor	xmm2, xmm2, xmm1        // (x - 1) ^ x per lane
	vpaddq	xmm0, xmm2, xmm0
	vpaddq	xmm4, xmm7, xmm4
	vpaddq	xmm5, xmm8, xmm5
	vpaddq	xmm6, xmm3, xmm6
	vpaddq	xmm1, xmm1, xmm15       // advance both lanes by 8
	add	rcx, -8
	jne	.LBB1_9
	vpaddq	xmm0, xmm4, xmm0        // fold the four accumulators together
	vpaddq	xmm0, xmm5, xmm0
	vpaddq	xmm0, xmm6, xmm0
	vpshufd	xmm1, xmm0, 78          // swap the two qword lanes
	vpaddq	xmm0, xmm0, xmm1        // horizontal add
	vmovq	r10, xmm0               // r10 = partial sum from the vector loop
	cmp	r8, r9
	jne	.LBB1_6                 // leftover elements before `end`: finish them in the scalar loop
	jmp	.LBB1_11
.LBB1_3:
	xor	r10d, r10d              // start == end: partial sum = 0, rcx still holds start (== end)
	jmp	.LBB1_12
.LBB1_5:
	xor	r10d, r10d              // scalar-only path: partial sum = 0
	mov	rax, rcx                // rax = start
	.p2align	4, 0x90
.LBB1_6:
	lea	rcx, [rax - 1]
	xor	rcx, rax                // rcx = i ^ (i - 1) for i = rax
	inc	rax
	add	r10, rcx
	cmp	rdx, rax
	jne	.LBB1_6                 // stop once rax == end; `end` itself is handled below
.LBB1_11:
	mov	rcx, rdx                // rcx = end
.LBB1_12:
	lea	rax, [rcx - 1]
	xor	rax, rcx                // term for the final element
	add	rax, r10                // result = partial sum + final term
.LBB1_13:
	vmovaps	xmm6, xmmword ptr [rsp]
	vmovaps	xmm7, xmmword ptr [rsp + 16]
	vmovaps	xmm8, xmmword ptr [rsp + 32]
	vmovaps	xmm9, xmmword ptr [rsp + 48]
	vmovaps	xmm10, xmmword ptr [rsp + 64]
	vmovaps	xmm11, xmmword ptr [rsp + 80]
	vmovaps	xmm12, xmmword ptr [rsp + 96]
	vmovaps	xmm13, xmmword ptr [rsp + 112]
	vmovaps	xmm14, xmmword ptr [rsp + 128]
	vmovaps	xmm15, xmmword ptr [rsp + 144]
	add	rsp, 168
	ret
	.seh_handlerdata
	.section	.text,"xr",one_only,coresum_inner
.Lcfi13:
	.seh_endproc

`match`ing on an `Option<Ordering>` seems to cause some confusion for LLVM; switching to just using comparison operators removes a few jumps from the simple `for` loops I was trying.

rust-highfive · 2018-02-07T19:22:01Z

r? @Mark-Simulacrum

(rust_highfive has picked a reviewer for you, use r? to override)

Mark-Simulacrum · 2018-02-08T00:53:18Z

r? @dtolnay but generally looks good

dtolnay · 2018-02-08T01:22:46Z

@bors r+

bors · 2018-02-08T01:22:47Z

📌 Commit 27d4d51 has been approved by dtolnay

bors · 2018-02-08T01:47:13Z

⌛ Testing commit 27d4d51 with merge bee10b387796d018fe48bc71bac6d74b6bf0ed9e...

bors · 2018-02-08T05:47:13Z

💥 Test timed out

kennytm · 2018-02-08T06:38:28Z

@bors retry

bors · 2018-02-08T06:38:37Z

⌛ Testing commit 27d4d51 with merge 932c736...

Simplify RangeInclusive::next[_back] `match`ing on an `Option<Ordering>` seems cause some confusion for LLVM; switching to just using comparison operators removes a few jumps from the simple `for` loops I was trying. cc #45222 #28237 (comment) Example: ```rust #[no_mangle] pub fn coresum(x: std::ops::RangeInclusive<u64>) -> u64 { let mut sum = 0; for i in x { sum += i ^ (i-1); } sum } ``` Today: ```asm coresum: xor r8d, r8d mov r9, -1 xor eax, eax jmp .LBB0_1 .LBB0_4: lea rcx, [rdi - 1] xor rcx, rdi add rax, rcx mov rsi, rdx mov rdi, r10 .LBB0_1: cmp rdi, rsi mov ecx, 1 cmovb rcx, r9 cmove rcx, r8 test rcx, rcx mov edx, 0 mov r10d, 1 je .LBB0_4 // 1 cmp rcx, -1 jne .LBB0_5 // 2 lea r10, [rdi + 1] mov rdx, rsi jmp .LBB0_4 // 3 .LBB0_5: ret ``` With this PR: ```asm coresum: cmp rcx, rdx jbe .LBB0_2 xor eax, eax ret .LBB0_2: xor r8d, r8d mov r9d, 1 xor eax, eax .p2align 4, 0x90 .LBB0_3: lea r10, [rcx + 1] cmp rcx, rdx cmovae rdx, r8 cmovae r10, r9 lea r11, [rcx - 1] xor r11, rcx add rax, r11 mov rcx, r10 cmp r10, rdx jbe .LBB0_3 // Just this ret ``` <details><summary>Though using internal iteration (`.map(|i| i ^ (i-1)).sum()`) is still shorter to type, and lets the compiler unroll it</summary> ```asm coresum_inner: .Lcfi0: .seh_proc coresum_inner sub rsp, 168 .Lcfi1: .seh_stackalloc 168 vmovdqa xmmword ptr [rsp + 144], xmm15 .Lcfi2: .seh_savexmm 15, 144 vmovdqa xmmword ptr [rsp + 128], xmm14 .Lcfi3: .seh_savexmm 14, 128 vmovdqa xmmword ptr [rsp + 112], xmm13 .Lcfi4: .seh_savexmm 13, 112 vmovdqa xmmword ptr [rsp + 96], xmm12 .Lcfi5: .seh_savexmm 12, 96 vmovdqa xmmword ptr [rsp + 80], xmm11 .Lcfi6: .seh_savexmm 11, 80 vmovdqa xmmword ptr [rsp + 64], xmm10 .Lcfi7: .seh_savexmm 10, 64 vmovdqa xmmword ptr [rsp + 48], xmm9 .Lcfi8: .seh_savexmm 9, 48 vmovdqa xmmword ptr [rsp + 32], xmm8 .Lcfi9: .seh_savexmm 8, 32 vmovdqa xmmword ptr [rsp + 16], xmm7 .Lcfi10: .seh_savexmm 7, 16 vmovdqa xmmword ptr [rsp], xmm6 .Lcfi11: .seh_savexmm 6, 0 .Lcfi12: .seh_endprologue cmp rdx, rcx 
jae .LBB1_2 xor eax, eax jmp .LBB1_13 .LBB1_2: mov r8, rdx sub r8, rcx jbe .LBB1_3 cmp r8, 7 jbe .LBB1_5 mov rax, r8 and rax, -8 mov r9, r8 and r9, -8 je .LBB1_5 add rax, rcx vmovq xmm0, rcx vpshufd xmm0, xmm0, 68 mov ecx, 1 vmovq xmm1, rcx vpslldq xmm1, xmm1, 8 vpaddq xmm1, xmm0, xmm1 vpxor xmm0, xmm0, xmm0 vpcmpeqd xmm11, xmm11, xmm11 vmovdqa xmm12, xmmword ptr [rip + __xmm@00000000000000010000000000000001] vmovdqa xmm13, xmmword ptr [rip + __xmm@00000000000000030000000000000003] vmovdqa xmm14, xmmword ptr [rip + __xmm@00000000000000050000000000000005] vmovdqa xmm15, xmmword ptr [rip + __xmm@00000000000000080000000000000008] mov rcx, r9 vpxor xmm4, xmm4, xmm4 vpxor xmm5, xmm5, xmm5 vpxor xmm6, xmm6, xmm6 .p2align 4, 0x90 .LBB1_9: vpaddq xmm7, xmm1, xmmword ptr [rip + __xmm@00000000000000020000000000000002] vpaddq xmm9, xmm1, xmmword ptr [rip + __xmm@00000000000000040000000000000004] vpaddq xmm10, xmm1, xmmword ptr [rip + __xmm@00000000000000060000000000000006] vpaddq xmm8, xmm1, xmm12 vpxor xmm7, xmm8, xmm7 vpaddq xmm2, xmm1, xmm13 vpxor xmm8, xmm2, xmm9 vpaddq xmm3, xmm1, xmm14 vpxor xmm3, xmm3, xmm10 vpaddq xmm2, xmm1, xmm11 vpxor xmm2, xmm2, xmm1 vpaddq xmm0, xmm2, xmm0 vpaddq xmm4, xmm7, xmm4 vpaddq xmm5, xmm8, xmm5 vpaddq xmm6, xmm3, xmm6 vpaddq xmm1, xmm1, xmm15 add rcx, -8 jne .LBB1_9 vpaddq xmm0, xmm4, xmm0 vpaddq xmm0, xmm5, xmm0 vpaddq xmm0, xmm6, xmm0 vpshufd xmm1, xmm0, 78 vpaddq xmm0, xmm0, xmm1 vmovq r10, xmm0 cmp r8, r9 jne .LBB1_6 jmp .LBB1_11 .LBB1_3: xor r10d, r10d jmp .LBB1_12 .LBB1_5: xor r10d, r10d mov rax, rcx .p2align 4, 0x90 .LBB1_6: lea rcx, [rax - 1] xor rcx, rax inc rax add r10, rcx cmp rdx, rax jne .LBB1_6 .LBB1_11: mov rcx, rdx .LBB1_12: lea rax, [rcx - 1] xor rax, rcx add rax, r10 .LBB1_13: vmovaps xmm6, xmmword ptr [rsp] vmovaps xmm7, xmmword ptr [rsp + 16] vmovaps xmm8, xmmword ptr [rsp + 32] vmovaps xmm9, xmmword ptr [rsp + 48] vmovaps xmm10, xmmword ptr [rsp + 64] vmovaps xmm11, xmmword ptr [rsp + 80] vmovaps xmm12, xmmword ptr 
[rsp + 96] vmovaps xmm13, xmmword ptr [rsp + 112] vmovaps xmm14, xmmword ptr [rsp + 128] vmovaps xmm15, xmmword ptr [rsp + 144] add rsp, 168 ret .seh_handlerdata .section .text,"xr",one_only,coresum_inner .Lcfi13: .seh_endproc ``` </details>

bors · 2018-02-08T09:37:23Z

☀️ Test successful - status-appveyor, status-travis
Approved by: dtolnay
Pushing 932c736 to master...

Simplify RangeInclusive::next[_back]

27d4d51

`match`ing on an `Option<Ordering>` seems to cause some confusion for LLVM; switching to just using comparison operators removes a few jumps from the simple `for` loops I was trying.

rust-highfive assigned Mark-Simulacrum Feb 7, 2018

scottmcm mentioned this pull request Feb 7, 2018

Tracking issue for ..= inclusive ranges (RFC #1192) -- originally ... #28237

Closed

8 tasks

BatmanAoD added the S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. label Feb 7, 2018

rust-highfive assigned dtolnay and unassigned Mark-Simulacrum Feb 8, 2018

dtolnay approved these changes Feb 8, 2018

View reviewed changes

bors added S-waiting-on-bors Status: Waiting on bors to run and complete tests. Bors will change the label on completion. and removed S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. labels Feb 8, 2018

bors added S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. and removed S-waiting-on-bors Status: Waiting on bors to run and complete tests. Bors will change the label on completion. labels Feb 8, 2018

bors added S-waiting-on-bors Status: Waiting on bors to run and complete tests. Bors will change the label on completion. and removed S-waiting-on-review Status: Awaiting review from the assignee but also interested parties. labels Feb 8, 2018

bors merged commit 27d4d51 into rust-lang:master Feb 8, 2018

scottmcm deleted the less-match-more-compare branch February 8, 2018 10:01

scottmcm mentioned this pull request Feb 9, 2018

Big performance problem with closed intervals looping #45222

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Simplify RangeInclusive::next[_back] #48057

Simplify RangeInclusive::next[_back] #48057

scottmcm commented Feb 7, 2018

rust-highfive commented Feb 7, 2018

Mark-Simulacrum commented Feb 8, 2018

dtolnay commented Feb 8, 2018

bors commented Feb 8, 2018

bors commented Feb 8, 2018

bors commented Feb 8, 2018

kennytm commented Feb 8, 2018

bors commented Feb 8, 2018

bors commented Feb 8, 2018

Simplify RangeInclusive::next[_back] #48057

Simplify RangeInclusive::next[_back] #48057

Conversation

scottmcm commented Feb 7, 2018

rust-highfive commented Feb 7, 2018

Mark-Simulacrum commented Feb 8, 2018

dtolnay commented Feb 8, 2018

bors commented Feb 8, 2018

bors commented Feb 8, 2018

bors commented Feb 8, 2018

kennytm commented Feb 8, 2018

bors commented Feb 8, 2018

bors commented Feb 8, 2018