Message247132
FWIW, my approach is to look at the most important code
paths to see if there is any work being done that isn't
essential for the result being computed.
Next, I look at the generated assembly to estimate speed
by counting memory accesses (and whether each one hits
recently used data that is likely still in cache, or is a
random access to stale data that is likely a cache miss),
and I look at the branches (and whether they are
predictable or not).
The table=so->table assignment was being done for all code
paths but was only needed around the rich compare. Here
is the before and after for the most important path
(the first lookup). Note that the change saves one memory
spill and one reload.
BEFORE
------
_set_add_entry:
pushq %r15
pushq %r14
movq %rdx, %r14
pushq %r13
pushq %r12
movq %rdi, %r12
pushq %rbp
movq %rsi, %rbp
pushq %rbx
subq $56, %rsp
movq 40(%rdi), %rax
addq $1, (%rsi)
movq %rax, 16(%rsp) <-- spill
movq 32(%r12), %rdx
movq %rdx, %r15
andq %r14, %r15
movq %r15, %rbx
salq $4, %rbx
addq 16(%rsp), %rbx <-- reload
movq (%rbx), %rcx
testq %rcx, %rcx
je L430
AFTER
-----
_set_add_entry:
pushq %r15
movq %rdx, %r15
pushq %r14
pushq %r13
pushq %r12
movq %rdi, %r12
pushq %rbp
movq %rsi, %rbp
pushq %rbx
subq $56, %rsp
movq 40(%rdi), %rdx
addq $1, (%rsi) <-- no spill
movq %rdx, %r11
L428:
movq 32(%r12), %rcx
movq %rcx, %r13
andq %r15, %r13
movq %r13, %rbx
salq $4, %rbx
addq %r11, %rbx <-- from register
movq (%rbx), %r14
testq %r14, %r14
je L429
The code around the rich compare used to do memory
loads that weren't necessary for the most likely case
(since the 64-bit hash values match, it is very likely
that the comparison will report a match).
BEFORE
------
call _PyObject_RichCompareBool
movq 24(%rsp), %rcx
movq (%rcx), %rdi
leaq -1(%rdi), %rdx
testq %rdx, %rdx
movq %rdx, (%rcx)
je L489
testl %eax, %eax
js L437 <--- predictable error branch
movq 40(%r12), %rdx <--- memory load
cmpq 16(%rsp), %rdx <--- memory load
jne L460
cmpq (%rbx), %rcx <--- memory load
jne L429 <--- predictable restart branch
testl %eax, %eax <--- predictable found_active branch
jne L432 <--- most common exit point
movq 32(%r12), %rdx
AFTER
-----
call _PyObject_RichCompareBool
movq 16(%rsp), %rcx
movq (%rcx), %rdi
leaq -1(%rdi), %rdx
testq %rdx, %rdx
movq %rdx, (%rcx)
je L485
cmpl $0, %eax
jg L431 <-- common exit before the memory loads!
L490:
jne L434
movq 40(%r12), %rdx <--- memory load
cmpq %rdx, 24(%rsp) <--- memory load
movq %rdx, %r11
jne L428
cmpq (%rbx), %rcx <--- memory load
jne L428

Date | User | Action | Args |
2015-07-22 16:47:34 | rhettinger | set | recipients: + rhettinger, vstinner, r.david.murray, serhiy.storchaka |
2015-07-22 16:47:34 | rhettinger | set | messageid: <1437583654.13.0.190471091461.issue24681@psf.upfronthosting.co.za> |
2015-07-22 16:47:34 | rhettinger | link | issue24681 messages |
2015-07-22 16:47:33 | rhettinger | create | |