Ramana Radhakrishnan
2014-10-02 08:31:09 UTC
Hi,
I've been digging into why on AArch64 we generate pretty bad code
for the following testcase.
void g2(float, float, float, float, float, float, float, float);
void f2a(void)
{
float x0 = 1.0, x1 = 2.0, x2 = 3.0, x3 = 4.0, x4 = 5.0, x5 = 6.0, x6
= 7.0, x7 = 8.0;
float x8 = 0.5, x9 = 1.5, x10 = 2.5, x11 = 0.25, x12 = 0.125, x13 =
3.5, x14 = 0.75, x15 = 1.25;
g2(x0, x1, x2, x3, x4, x5, x6, x7);
g2(x8, x9, x10, x11, x12, x13, x14, x15);
g2(x0, x1, x2, x3, x4, x5, x6, x7);
g2(x8, x9, x10, x11, x12, x13, x14, x15);
}
And a couple of items caught my attention.
For one the backend doesn't set the costs of a reg-reg move to be the
same as a reg-const move. In the AArch64 backend the approach appears to
be in line with the documentation which is to set the costs of various
instructions relative to a fast integer instruction. The hack to
aarch64.c in the attached patch is for setting the cost properly for a
reg-reg move of the appropriate mode and is only for demonstration
purposes. I expect this to be replaced by an equivalent bit of code in
the backend to achieve the same thing.
However the code in precompute_register_parameters assumes that the
value is forced into a register if the set_src_cost of a constant is >
COSTS_N_INSNS(1). Now this appears to be checking the cost of a set of
an FP immediate constant to a register and the backend not unreasonably
sets it to an appropriate cost. Now to assume that this number should
always be less than 1 is really not appropriate.
The same can be achieved with removing the fpconst case in
aarch64.c:rtx_costs but ...
Instead of putting in what's effectively a lie in the backend, should we
just be moving the midend to a world where all such numbers are compared
to costs from the backend rather than relying on magic numbers. The
costs comparison logic is similar to whats used in lower-subreg. The
thought was to move this to a common area (Richard Sandiford suggested
expmed.h in a private conversation) so that we had common APIs to check
the cost of a SET in this form rather than relying on the rtx_cost
interface.
Some of you might ask what's the impact on other backends, I still need
to do the due diligence there with various test programs but my
expectation based on reading the code is that a sample of backends (x86,
mips, aarch64 and powerpc) handle the reg-reg move cost with a "set"
already and I would expect the same to be in line with other costs.
AArch32 does not but I'll take care of that in a follow up.
Longer term should we move towards cleaning up such magic numbers from
the mid-end and that would make for a more maintainable compiler IMHO.
Thoughts ?
Lightly tested with the testcase and nothing more.
regards
Ramana
2a:
fmov s23, 8.0e+0
fmov s22, 7.0e+0
fmov s21, 6.0e+0
fmov s20, 5.0e+0
fmov s19, 4.0e+0
fmov s18, 3.0e+0
fmov s17, 2.0e+0
fmov s16, 1.0e+0
stp x29, x30, [sp, -112]!
fmov s7, s23
fmov s6, s22
add x29, sp, 0
fmov s5, s21
fmov s4, s20
stp d8, d9, [sp, 16]
fmov s3, s19
stp d10, d11, [sp, 32]
fmov s2, s18
stp d12, d13, [sp, 48]
fmov s1, s17
stp d14, d15, [sp, 64]
fmov s0, s16
fmov s15, 1.25e+0
fmov s14, 7.5e-1
fmov s13, 3.5e+0
fmov s12, 1.25e-1
fmov s11, 2.5e-1
fmov s10, 2.5e+0
fmov s9, 1.5e+0
fmov s8, 5.0e-1
str s23, [x29, 80]
str s22, [x29, 84]
str s21, [x29, 88]
str s20, [x29, 92]
str s19, [x29, 96]
str s18, [x29, 100]
str s17, [x29, 104]
str s16, [x29, 108]
bl g2
fmov s7, s15
fmov s6, s14
fmov s5, s13
fmov s4, s12
fmov s3, s11
fmov s2, s10
fmov s1, s9
fmov s0, s8
bl g2
ldr s23, [x29, 80]
TO
f2a:
stp x29, x30, [sp, -16]!
fmov s7, 8.0e+0
add x29, sp, 0
fmov s6, 7.0e+0
fmov s5, 6.0e+0
fmov s4, 5.0e+0
fmov s3, 4.0e+0
fmov s2, 3.0e+0
fmov s1, 2.0e+0
fmov s0, 1.0e+0
bl g2
fmov s7, 1.25e+0
fmov s6, 7.5e-1
fmov s5, 3.5e+0
fmov s4, 1.25e-1
fmov s3, 2.5e-1
fmov s2, 2.5e+0
fmov s1, 1.5e+0
fmov s0, 5.0e-1
bl g2
fmov s7, 8.0e+0
fmov s6, 7.0e+0
fmov s5, 6.0e+0
fmov s4, 5.0e+0
fmov s3, 4.0e+0
fmov s2, 3.0e+0
fmov s1, 2.0e+0
fmov s0, 1.0e+0
bl g2
ldp x29, x30, [sp], 16
fmov s7, 1.25e+0
fmov s6, 7.5e-1
fmov s5, 3.5e+0
fmov s4, 1.25e-1
fmov s3, 2.5e-1
fmov s2, 2.5e+0
fmov s1, 1.5e+0
fmov s0, 5.0e-1
b g2
I've been digging into why on AArch64 we generate pretty bad code
for the following testcase.
void g2(float, float, float, float, float, float, float, float);
void f2a(void)
{
float x0 = 1.0, x1 = 2.0, x2 = 3.0, x3 = 4.0, x4 = 5.0, x5 = 6.0, x6
= 7.0, x7 = 8.0;
float x8 = 0.5, x9 = 1.5, x10 = 2.5, x11 = 0.25, x12 = 0.125, x13 =
3.5, x14 = 0.75, x15 = 1.25;
g2(x0, x1, x2, x3, x4, x5, x6, x7);
g2(x8, x9, x10, x11, x12, x13, x14, x15);
g2(x0, x1, x2, x3, x4, x5, x6, x7);
g2(x8, x9, x10, x11, x12, x13, x14, x15);
}
And a couple of items caught my attention.
For one the backend doesn't set the costs of a reg-reg move to be the
same as a reg-const move. In the AArch64 backend the approach appears to
be in line with the documentation which is to set the costs of various
instructions relative to a fast integer instruction. The hack to
aarch64.c in the attached patch is for setting the cost properly for a
reg-reg move of the appropriate mode and is only for demonstration
purposes. I expect this to be replaced by an equivalent bit of code in
the backend to achieve the same thing.
However the code in precompute_register_parameters assumes that the
value is forced into a register if the set_src_cost of a constant is >
COSTS_N_INSNS(1). Now this appears to be checking the cost of a set of
an FP immediate constant to a register and the backend not unreasonably
sets it to an appropriate cost. Now to assume that this number should
always be less than 1 is really not appropriate.
The same can be achieved with removing the fpconst case in
aarch64.c:rtx_costs but ...
Instead of putting in what's effectively a lie in the backend, should we
just be moving the midend to a world where all such numbers are compared
to costs from the backend rather than relying on magic numbers. The
costs comparison logic is similar to whats used in lower-subreg. The
thought was to move this to a common area (Richard Sandiford suggested
expmed.h in a private conversation) so that we had common APIs to check
the cost of a SET in this form rather than relying on the rtx_cost
interface.
Some of you might ask what's the impact on other backends, I still need
to do the due diligence there with various test programs but my
expectation based on reading the code is that a sample of backends (x86,
mips, aarch64 and powerpc) handle the reg-reg move cost with a "set"
already and I would expect the same to be in line with other costs.
AArch32 does not but I'll take care of that in a follow up.
Longer term should we move towards cleaning up such magic numbers from
the mid-end and that would make for a more maintainable compiler IMHO.
Thoughts ?
Lightly tested with the testcase and nothing more.
regards
Ramana
2a:
fmov s23, 8.0e+0
fmov s22, 7.0e+0
fmov s21, 6.0e+0
fmov s20, 5.0e+0
fmov s19, 4.0e+0
fmov s18, 3.0e+0
fmov s17, 2.0e+0
fmov s16, 1.0e+0
stp x29, x30, [sp, -112]!
fmov s7, s23
fmov s6, s22
add x29, sp, 0
fmov s5, s21
fmov s4, s20
stp d8, d9, [sp, 16]
fmov s3, s19
stp d10, d11, [sp, 32]
fmov s2, s18
stp d12, d13, [sp, 48]
fmov s1, s17
stp d14, d15, [sp, 64]
fmov s0, s16
fmov s15, 1.25e+0
fmov s14, 7.5e-1
fmov s13, 3.5e+0
fmov s12, 1.25e-1
fmov s11, 2.5e-1
fmov s10, 2.5e+0
fmov s9, 1.5e+0
fmov s8, 5.0e-1
str s23, [x29, 80]
str s22, [x29, 84]
str s21, [x29, 88]
str s20, [x29, 92]
str s19, [x29, 96]
str s18, [x29, 100]
str s17, [x29, 104]
str s16, [x29, 108]
bl g2
fmov s7, s15
fmov s6, s14
fmov s5, s13
fmov s4, s12
fmov s3, s11
fmov s2, s10
fmov s1, s9
fmov s0, s8
bl g2
ldr s23, [x29, 80]
TO
f2a:
stp x29, x30, [sp, -16]!
fmov s7, 8.0e+0
add x29, sp, 0
fmov s6, 7.0e+0
fmov s5, 6.0e+0
fmov s4, 5.0e+0
fmov s3, 4.0e+0
fmov s2, 3.0e+0
fmov s1, 2.0e+0
fmov s0, 1.0e+0
bl g2
fmov s7, 1.25e+0
fmov s6, 7.5e-1
fmov s5, 3.5e+0
fmov s4, 1.25e-1
fmov s3, 2.5e-1
fmov s2, 2.5e+0
fmov s1, 1.5e+0
fmov s0, 5.0e-1
bl g2
fmov s7, 8.0e+0
fmov s6, 7.0e+0
fmov s5, 6.0e+0
fmov s4, 5.0e+0
fmov s3, 4.0e+0
fmov s2, 3.0e+0
fmov s1, 2.0e+0
fmov s0, 1.0e+0
bl g2
ldp x29, x30, [sp], 16
fmov s7, 1.25e+0
fmov s6, 7.5e-1
fmov s5, 3.5e+0
fmov s4, 1.25e-1
fmov s3, 2.5e-1
fmov s2, 2.5e+0
fmov s1, 1.5e+0
fmov s0, 5.0e-1
b g2