Discussion:
[PATCH, rs6000] Generate LE code for vec_lvsl and vec_lvsr that is compatible with BE code
Bill Schmidt
2014-09-29 22:26:14 UTC
Hi,

Until now we have not attempted to generate code for LE usage of
vec_lvsl and vec_lvsr that is compatible with the expected BE usage.
The LE code sequence corresponding to the lvsl/vperm idiom is
inefficient, and we encourage programmers to convert those sequences to
use direct assignment and the type system for unaligned loads (a sketch
follows below). However, the issue comes up frequently enough that it
seems best to provide a BE-compatible sequence, together with the
deprecation warning added in a previous patch submission, to avoid
confusion.
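
For illustration only (the function names and the pointer 'p' below are
made up for the example, and the replacement assumes a VSX-capable
target where plain vector assignment handles unaligned addresses), the
deprecated idiom and the kind of code we suggest instead look roughly
like this:

#include <altivec.h>

/* Deprecated idiom: two aligned loads combined via vec_lvsl/vec_perm;
   'p' may be misaligned.  */
vector unsigned char
load_unaligned_old (const unsigned char *p)
{
  vector unsigned char msq = vec_ld (0, p);    /* aligned block holding p */
  vector unsigned char lsq = vec_ld (15, p);   /* next aligned block */
  vector unsigned char sel = vec_lvsl (0, p);  /* permute control from p's offset */
  return vec_perm (msq, lsq, sel);
}

/* Suggested replacement: ordinary assignment through the vector type,
   letting the compiler generate an unaligned-capable load.  */
vector unsigned char
load_unaligned_new (const unsigned char *p)
{
  return *(const vector unsigned char *) p;
}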

The method used in this patch is to perform a byte-reversal of the
result of the lvsl/lvsr. This is accomplished by loading the vector
char constant {0,1,...,15}, which will appear in the register from left
to right as {15,...,1,0}. A vperm instruction (which uses BE element
ordering) is applied to the result of the lvsl/lvsr using the loaded
constant as the permute control vector.
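
In effect (a minimal C model of what the expansion computes, not code
from the patch), the value handed back to the user is the raw lvsl/lvsr
result with its sixteen bytes reversed:

/* Model of the LE expansion: dst is src with its bytes reversed.  */
static void
byte_reverse_v16qi (unsigned char dst[16], const unsigned char src[16])
{
  for (int i = 0; i < 16; i++)
    dst[i] = src[15 - i];
}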

Bootstrapped and tested on powerpc64le-unknown-linux-gnu with no
regressions. Is this ok for trunk?

Thanks,
Bill


[gcc]

2014-09-29 Bill Schmidt <***@linux.vnet.ibm.com>

* altivec.md (altivec_lvsl): New define_expand.
(altivec_lvsl_direct): Rename define_insn from altivec_lvsl.
(altivec_lvsr): New define_expand.
(altivec_lvsr_direct): Rename define_insn from altivec_lvsr.
* rs6000.c (rs6000_expand_builtin): Change to use
altivec_lvs[lr]_direct; remove commented-out code.

[gcc/testsuite]

2014-09-29 Bill Schmidt <***@linux.vnet.ibm.com>

* gcc.target/powerpc/lvsl-lvsr.c: New test.


Index: gcc/config/rs6000/altivec.md
===================================================================
--- gcc/config/rs6000/altivec.md (revision 215689)
+++ gcc/config/rs6000/altivec.md (working copy)
@@ -2297,7 +2297,32 @@
"dststt %0,%1,%2"
[(set_attr "type" "vecsimple")])

-(define_insn "altivec_lvsl"
+(define_expand "altivec_lvsl"
+ [(use (match_operand:V16QI 0 "register_operand" ""))
+ (use (match_operand:V16QI 1 "memory_operand" "Z"))]
+ "TARGET_ALTIVEC"
+ "
+{
+ if (VECTOR_ELT_ORDER_BIG)
+ emit_insn (gen_altivec_lvsl_direct (operands[0], operands[1]));
+ else
+ {
+ int i;
+ rtx mask, perm[16], constv, vperm;
+ mask = gen_reg_rtx (V16QImode);
+ emit_insn (gen_altivec_lvsl_direct (mask, operands[1]));
+ for (i = 0; i < 16; ++i)
+ perm[i] = GEN_INT (i);
+ constv = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, perm));
+ constv = force_reg (V16QImode, constv);
+ vperm = gen_rtx_UNSPEC (V16QImode, gen_rtvec (3, mask, mask, constv),
+ UNSPEC_VPERM);
+ emit_insn (gen_rtx_SET (VOIDmode, operands[0], vperm));
+ }
+ DONE;
+}")
+
+(define_insn "altivec_lvsl_direct"
[(set (match_operand:V16QI 0 "register_operand" "=v")
(unspec:V16QI [(match_operand:V16QI 1 "memory_operand" "Z")]
UNSPEC_LVSL))]
@@ -2305,7 +2330,32 @@
"lvsl %0,%y1"
[(set_attr "type" "vecload")])

-(define_insn "altivec_lvsr"
+(define_expand "altivec_lvsr"
+ [(use (match_operand:V16QI 0 "register_operand" ""))
+ (use (match_operand:V16QI 1 "memory_operand" "Z"))]
+ "TARGET_ALTIVEC"
+ "
+{
+ if (VECTOR_ELT_ORDER_BIG)
+ emit_insn (gen_altivec_lvsr_direct (operands[0], operands[1]));
+ else
+ {
+ int i;
+ rtx mask, perm[16], constv, vperm;
+ mask = gen_reg_rtx (V16QImode);
+ emit_insn (gen_altivec_lvsr_direct (mask, operands[1]));
+ for (i = 0; i < 16; ++i)
+ perm[i] = GEN_INT (i);
+ constv = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, perm));
+ constv = force_reg (V16QImode, constv);
+ vperm = gen_rtx_UNSPEC (V16QImode, gen_rtvec (3, mask, mask, constv),
+ UNSPEC_VPERM);
+ emit_insn (gen_rtx_SET (VOIDmode, operands[0], vperm));
+ }
+ DONE;
+}")
+
+(define_insn "altivec_lvsr_direct"
[(set (match_operand:V16QI 0 "register_operand" "=v")
(unspec:V16QI [(match_operand:V16QI 1 "memory_operand" "Z")]
UNSPEC_LVSR))]
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c (revision 215689)
+++ gcc/config/rs6000/rs6000.c (working copy)
@@ -13898,8 +13898,8 @@ rs6000_expand_builtin (tree exp, rtx target, rtx s
case ALTIVEC_BUILTIN_MASK_FOR_LOAD:
case ALTIVEC_BUILTIN_MASK_FOR_STORE:
{
- int icode = (BYTES_BIG_ENDIAN ? (int) CODE_FOR_altivec_lvsr
- : (int) CODE_FOR_altivec_lvsl);
+ int icode = (BYTES_BIG_ENDIAN ? (int) CODE_FOR_altivec_lvsr_direct
+ : (int) CODE_FOR_altivec_lvsl_direct);
enum machine_mode tmode = insn_data[icode].operand[0].mode;
enum machine_mode mode = insn_data[icode].operand[1].mode;
tree arg;
@@ -13927,7 +13927,6 @@ rs6000_expand_builtin (tree exp, rtx target, rtx s
|| ! (*insn_data[icode].operand[0].predicate) (target, tmode))
target = gen_reg_rtx (tmode);

- /*pat = gen_altivec_lvsr (target, op);*/
pat = GEN_FCN (icode) (target, op);
if (!pat)
return 0;
Index: gcc/testsuite/gcc.target/powerpc/lvsl-lvsr.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/lvsl-lvsr.c (revision 0)
+++ gcc/testsuite/gcc.target/powerpc/lvsl-lvsr.c (working copy)
@@ -0,0 +1,19 @@
+/* Test expected code generation for lvsl and lvsr on little endian. */
+
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-options "-O0 -Wno-deprecated" } */
+/* { dg-final { scan-assembler-times "lvsl" 1 } } */
+/* { dg-final { scan-assembler-times "lvsr" 1 } } */
+/* { dg-final { scan-assembler-times "lxvd2x" 2 } } */
+/* { dg-final { scan-assembler-times "vperm" 2 } } */
+
+
+#include <altivec.h>
+
+float f[20];
+
+void foo ()
+{
+ vector unsigned char a = vec_lvsl (4, f);
+ vector unsigned char b = vec_lvsr (8, f);
+}
Segher Boessenkool
2014-09-30 14:50:24 UTC
Post by Bill Schmidt
The method used in this patch is to perform a byte-reversal of the
result of the lvsl/lvsr. This is accomplished by loading the vector
char constant {0,1,...,15}, which will appear in the register from left
to right as {15,...,1,0}. A vperm instruction (which uses BE element
ordering) is applied to the result of the lvsl/lvsr using the loaded
constant as the permute control vector.
It would be nice if you could arrange the generated sequence such that
for the common case where the vec_lvsl feeds a vperm it results in
just lvsr;vnot machine instructions. Not so easy to do though :-(
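
To spell out why that would be enough when the result only feeds a
vperm (an illustrative scalar check, not code from this patch): vperm
reads only the low 5 bits of each control byte, and in those bits the
complement of the raw lvsr bytes equals the byte-reversed lvsl bytes:

#include <assert.h>

int
main (void)
{
  for (int sh = 0; sh < 16; sh++)      /* alignment offset */
    for (int i = 0; i < 16; i++)       /* byte position, BE order */
      {
        unsigned char lvsl_rev = sh + 15 - i;    /* byte-reversed lvsl */
        unsigned char lvsr     = 16 - sh + i;    /* raw lvsr byte */
        assert ((unsigned char) (~lvsr & 0x1F) == lvsl_rev);
      }
  return 0;
}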

Some minor comments...
Post by Bill Schmidt
-(define_insn "altivec_lvsl"
+(define_expand "altivec_lvsl"
+ [(use (match_operand:V16QI 0 "register_operand" ""))
+ (use (match_operand:V16QI 1 "memory_operand" "Z"))]
A define_expand should not have constraints.
Post by Bill Schmidt
+ "TARGET_ALTIVEC"
+ "
No need for the quotes.
Post by Bill Schmidt
+{
+ if (VECTOR_ELT_ORDER_BIG)
+ emit_insn (gen_altivec_lvsl_direct (operands[0], operands[1]));
+ else
+ {
+ int i;
+ rtx mask, perm[16], constv, vperm;
+ mask = gen_reg_rtx (V16QImode);
+ emit_insn (gen_altivec_lvsl_direct (mask, operands[1]));
+ for (i = 0; i < 16; ++i)
i++ is the common style.


Segher
Bill Schmidt
2014-09-30 15:24:23 UTC
Post by Segher Boessenkool
Post by Bill Schmidt
The method used in this patch is to perform a byte-reversal of the
result of the lvsl/lvsr. This is accomplished by loading the vector
char constant {0,1,...,15}, which will appear in the register from left
to right as {15,...,1,0}. A vperm instruction (which uses BE element
ordering) is applied to the result of the lvsl/lvsr using the loaded
constant as the permute control vector.
It would be nice if you could arrange the generated sequence such that
for the common case where the vec_lvsl feeds a vperm it results in
just lvsr;vnot machine instructions. Not so easy to do though :-(
Yes -- as you note, that only works when feeding a vperm, which is what
we expect, but it is generally a lot of work to prove. Again, this is
deprecated usage, so it seems not worth spending the effort on this...
Post by Segher Boessenkool
Some minor comments...
Post by Bill Schmidt
-(define_insn "altivec_lvsl"
+(define_expand "altivec_lvsl"
+ [(use (match_operand:V16QI 0 "register_operand" ""))
+ (use (match_operand:V16QI 1 "memory_operand" "Z"))]
A define_expand should not have constraints.
Thanks for catching this -- that one slipped through (pasto).
Post by Segher Boessenkool
Post by Bill Schmidt
+ "TARGET_ALTIVEC"
+ "
No need for the quotes.
Ok.
Post by Segher Boessenkool
Post by Bill Schmidt
+{
+ if (VECTOR_ELT_ORDER_BIG)
+ emit_insn (gen_altivec_lvsl_direct (operands[0], operands[1]));
+ else
+ {
+ int i;
+ rtx mask, perm[16], constv, vperm;
+ mask = gen_reg_rtx (V16QImode);
+ emit_insn (gen_altivec_lvsl_direct (mask, operands[1]));
+ for (i = 0; i < 16; ++i)
i++ is the common style.
Now that we're being compiled as C++, ++i is the common style there --
is there guidance about this for gcc style these days?

Thanks,
Bill
Post by Segher Boessenkool
Segher
Segher Boessenkool
2014-09-30 16:04:44 UTC
Post by Bill Schmidt
Post by Segher Boessenkool
Post by Bill Schmidt
The method used in this patch is to perform a byte-reversal of the
result of the lvsl/lvsr. This is accomplished by loading the vector
char constant {0,1,...,15}, which will appear in the register from left
to right as {15,...,1,0}. A vperm instruction (which uses BE element
ordering) is applied to the result of the lvsl/lvsr using the loaded
constant as the permute control vector.
It would be nice if you could arrange the generated sequence such that
for the common case where the vec_lvsl feeds a vperm it results in
just lvsr;vnot machine instructions. Not so easy to do though :-(
Yes -- as you note, that only works when feeding a vperm, which is what
we expect but generally a lot of work to prove.
I meant generating a sequence that just "falls out" as you want it after
optimisation. E.g. lvsr;vnot;vand(splat8(31));vperm can have the vand
absorbed by the vperm. But that splat is nasty when not optimised away :-(
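
The reason the vand can be absorbed (a sketch, not code from the
thread): vperm indexes the 32-byte concatenation of its two inputs
using only the low 5 bits of each control byte, so pre-masking the
control with splat8(31) selects exactly the same bytes. A scalar model
of one result byte, in BE element order:

/* vperm ignores the top 3 bits of each control byte, so 'ctrl' and
   'ctrl & 0x1F' behave identically here.  */
static unsigned char
vperm_one_byte (const unsigned char a[16], const unsigned char b[16],
                unsigned char ctrl)
{
  unsigned int idx = ctrl & 0x1F;
  return idx < 16 ? a[idx] : b[idx - 16];
}
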
Post by Bill Schmidt
Again, this is
deprecated usage so it seems not worth spending the effort on this...
There is that yes :-)
Post by Bill Schmidt
Post by Segher Boessenkool
i++ is the common style.
Now that we're being compiled as C++, ++i is the common style there --
The GCC source code didn't magically change to say "++i" everywhere it
said "i++" before, when we started compiling it with ++C :-P
Post by Bill Schmidt
is there guidance about this for gcc style these days?
codingconventions.html doesn't say.

grep | wc in rs6000/ shows 317 vs. 86; so a lot of stuff has already
leaked in (and in gcc/*.c it is 6227 vs. 793). Some days I think the
world has gone insane :-(

To me "++i" reads as "danger, pre-increment!" Old habits I suppose.
I'll shut up now.


Segher
Bill Schmidt
2014-09-30 16:18:39 UTC
Post by Segher Boessenkool
Post by Bill Schmidt
Post by Segher Boessenkool
Post by Bill Schmidt
The method used in this patch is to perform a byte-reversal of the
result of the lvsl/lvsr. This is accomplished by loading the vector
char constant {0,1,...,15}, which will appear in the register from left
to right as {15,...,1,0}. A vperm instruction (which uses BE element
ordering) is applied to the result of the lvsl/lvsr using the loaded
constant as the permute control vector.
It would be nice if you could arrange the generated sequence such that
for the common case where the vec_lvsl feeds a vperm it results in
just lvsr;vnot machine instructions. Not so easy to do though :-(
Yes -- as you note, that only works when feeding a vperm, which is what
we expect but generally a lot of work to prove.
I meant generating a sequence that just "falls out" as you want it after
optimisation. E.g. lvsr;vnot;vand(splat8(31));vperm can have the vand
absorbed by the vperm. But that splat is nasty when not optimised away :-(
Especially since splat8(31) requires vsub(splat8(15),splat8(-16)),
given that vspltisb's signed immediate only covers -16..15...

To get something that is correct with and without feeding a vperm and
actually performs well just ain't happening here...
Post by Segher Boessenkool
Post by Bill Schmidt
Again, this is
deprecated usage so it seems not worth spending the effort on this...
There is that yes :-)
Post by Bill Schmidt
Post by Segher Boessenkool
i++ is the common style.
Now that we're being compiled as C++, ++i is the common style there --
The GCC source code didn't magically change to say "++i" everywhere it
said "i++" before, when we started compiling it with ++C :-P
Post by Bill Schmidt
is there guidance about this for gcc style these days?
codingconventions.html doesn't say.
grep | wc in rs6000/ shows 317 vs. 86; so a lot of stuff has already
leaked in (and in gcc/*.c it is 6227 vs. 793). Some days I think the
world has gone insane :-(
To me "++i" reads as "danger, pre-increment!" Old habits I suppose.
I'll shut up now.
Heh. I have to go back and forth between C and C++ a lot these days and
find it's best for my sanity to just stick with the preincrement form
now...

Thanks,
Bill
Post by Segher Boessenkool
Segher
Segher Boessenkool
2014-09-30 20:37:22 UTC
Post by Bill Schmidt
Post by Segher Boessenkool
I meant generating a sequence that just "falls out" as you want it after
optimisation. E.g. lvsr;vnot;vand(splat8(31));vperm can have the vand
absorbed by the vperm. But that splat is nasty when not optimised away :-(
Especially since splat8(31) requires vsub(splat8(15),splat8(-16))...
vspltisb vT,-5 ; vsrb vD,vT,vT # :-)
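
For the record, why that two-instruction trick gives splat8(31):
vspltisb -5 puts 0xFB in every byte, and vsrb shifts each byte right by
the low 3 bits of the corresponding shift-amount byte, i.e. by 3, and
0xFB >> 3 is 0x1F (31). A scalar check of the same arithmetic (just an
illustration, not part of any patch):

#include <assert.h>

int
main (void)
{
  unsigned char t = (unsigned char) -5;   /* 0xFB, as vspltisb -5 splats */
  unsigned char d = t >> (t & 7);         /* per-byte vsrb by low 3 bits */
  assert (d == 31);
  return 0;
}
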
Post by Bill Schmidt
To get something that is correct with and without feeding a vperm and
actually performs well just ain't happening here...
Yeah.


Segher
Bill Schmidt
2014-10-02 19:20:17 UTC
Hi,

Here's a revised version of the patch that addresses Segher's comments.
Bootstrapped and tested on powerpc64le-unknown-linux-gnu. Ok for trunk?

Thanks,
Bill


[gcc]

2014-10-02 Bill Schmidt <***@linux.vnet.ibm.com>

* altivec.md (altivec_lvsl): New define_expand.
(altivec_lvsl_direct): Rename define_insn from altivec_lvsl.
(altivec_lvsr): New define_expand.
(altivec_lvsr_direct): Rename define_insn from altivec_lvsr.
* rs6000.c (rs6000_expand_builtin): Change to use
altivec_lvs[lr]_direct; remove commented-out code.

[gcc/testsuite]

2014-10-02 Bill Schmidt <***@linux.vnet.ibm.com>

* gcc.target/powerpc/lvsl-lvsr.c: New test.


Index: gcc/config/rs6000/altivec.md
===================================================================
--- gcc/config/rs6000/altivec.md (revision 215689)
+++ gcc/config/rs6000/altivec.md (working copy)
@@ -2297,7 +2297,31 @@
"dststt %0,%1,%2"
[(set_attr "type" "vecsimple")])

-(define_insn "altivec_lvsl"
+(define_expand "altivec_lvsl"
+ [(use (match_operand:V16QI 0 "register_operand" ""))
+ (use (match_operand:V16QI 1 "memory_operand" ""))]
+ "TARGET_ALTIVEC"
+{
+ if (VECTOR_ELT_ORDER_BIG)
+ emit_insn (gen_altivec_lvsl_direct (operands[0], operands[1]));
+ else
+ {
+ int i;
+ rtx mask, perm[16], constv, vperm;
+ mask = gen_reg_rtx (V16QImode);
+ emit_insn (gen_altivec_lvsl_direct (mask, operands[1]));
+ for (i = 0; i < 16; ++i)
+ perm[i] = GEN_INT (i);
+ constv = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, perm));
+ constv = force_reg (V16QImode, constv);
+ vperm = gen_rtx_UNSPEC (V16QImode, gen_rtvec (3, mask, mask, constv),
+ UNSPEC_VPERM);
+ emit_insn (gen_rtx_SET (VOIDmode, operands[0], vperm));
+ }
+ DONE;
+})
+
+(define_insn "altivec_lvsl_direct"
[(set (match_operand:V16QI 0 "register_operand" "=v")
(unspec:V16QI [(match_operand:V16QI 1 "memory_operand" "Z")]
UNSPEC_LVSL))]
@@ -2305,7 +2329,31 @@
"lvsl %0,%y1"
[(set_attr "type" "vecload")])

-(define_insn "altivec_lvsr"
+(define_expand "altivec_lvsr"
+ [(use (match_operand:V16QI 0 "register_operand" ""))
+ (use (match_operand:V16QI 1 "memory_operand" ""))]
+ "TARGET_ALTIVEC"
+{
+ if (VECTOR_ELT_ORDER_BIG)
+ emit_insn (gen_altivec_lvsr_direct (operands[0], operands[1]));
+ else
+ {
+ int i;
+ rtx mask, perm[16], constv, vperm;
+ mask = gen_reg_rtx (V16QImode);
+ emit_insn (gen_altivec_lvsr_direct (mask, operands[1]));
+ for (i = 0; i < 16; ++i)
+ perm[i] = GEN_INT (i);
+ constv = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, perm));
+ constv = force_reg (V16QImode, constv);
+ vperm = gen_rtx_UNSPEC (V16QImode, gen_rtvec (3, mask, mask, constv),
+ UNSPEC_VPERM);
+ emit_insn (gen_rtx_SET (VOIDmode, operands[0], vperm));
+ }
+ DONE;
+})
+
+(define_insn "altivec_lvsr_direct"
[(set (match_operand:V16QI 0 "register_operand" "=v")
(unspec:V16QI [(match_operand:V16QI 1 "memory_operand" "Z")]
UNSPEC_LVSR))]
Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c (revision 215689)
+++ gcc/config/rs6000/rs6000.c (working copy)
@@ -13898,8 +13898,8 @@ rs6000_expand_builtin (tree exp, rtx target, rtx s
case ALTIVEC_BUILTIN_MASK_FOR_LOAD:
case ALTIVEC_BUILTIN_MASK_FOR_STORE:
{
- int icode = (BYTES_BIG_ENDIAN ? (int) CODE_FOR_altivec_lvsr
- : (int) CODE_FOR_altivec_lvsl);
+ int icode = (BYTES_BIG_ENDIAN ? (int) CODE_FOR_altivec_lvsr_direct
+ : (int) CODE_FOR_altivec_lvsl_direct);
enum machine_mode tmode = insn_data[icode].operand[0].mode;
enum machine_mode mode = insn_data[icode].operand[1].mode;
tree arg;
@@ -13927,7 +13927,6 @@ rs6000_expand_builtin (tree exp, rtx target, rtx s
|| ! (*insn_data[icode].operand[0].predicate) (target, tmode))
target = gen_reg_rtx (tmode);

- /*pat = gen_altivec_lvsr (target, op);*/
pat = GEN_FCN (icode) (target, op);
if (!pat)
return 0;
Index: gcc/testsuite/gcc.target/powerpc/lvsl-lvsr.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/lvsl-lvsr.c (revision 0)
+++ gcc/testsuite/gcc.target/powerpc/lvsl-lvsr.c (working copy)
@@ -0,0 +1,21 @@
+/* Test expected code generation for lvsl and lvsr on little endian.
+ Note that lvsl and lvsr are each produced once, but the filename
+ causes them to appear twice in the file. */
+
+/* { dg-do compile { target { powerpc64le-*-* } } } */
+/* { dg-options "-O0 -Wno-deprecated" } */
+/* { dg-final { scan-assembler-times "lvsl" 2 } } */
+/* { dg-final { scan-assembler-times "lvsr" 2 } } */
+/* { dg-final { scan-assembler-times "lxvd2x" 2 } } */
+/* { dg-final { scan-assembler-times "vperm" 2 } } */
+
+
+#include <altivec.h>
+
+float f[20];
+
+void foo ()
+{
+ vector unsigned char a = vec_lvsl (4, f);
+ vector unsigned char b = vec_lvsr (8, f);
+}
David Edelsohn
2014-10-03 18:18:48 UTC
On Thu, Oct 2, 2014 at 3:20 PM, Bill Schmidt
Post by Bill Schmidt
Hi,
Here's a revised version of the patch that addresses Segher's comments.
Bootstrapped and tested on powerpc64le-unknown-linux-gnu. Ok for trunk?
Thanks,
Bill
[gcc]
* altivec.md (altivec_lvsl): New define_expand.
(altivec_lvsl_direct): Rename define_insn from altivec_lvsl.
(altivec_lvsr): New define_expand.
(altivec_lvsr_direct): Rename define_insn from altivec_lvsr.
* rs6000.c (rs6000_expand_builtin): Change to use
altivec_lvs[lr]_direct; remove commented-out code.
[gcc/testsuite]
* gcc.target/powerpc/lvsl-lvsr.c: New test.
Okay.

Thanks, Segher, for providing a second set of eyes.

Thanks, David
