Discussion:
[PATCH 0/4] [AARCH64,NEON] Improve various NEON load/store intrinsics
Charles Baylis
2014-09-18 19:38:25 UTC
This patch series improves the code generation for NEON structure loads and
stores.
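
As an illustrative example (the function and variable names here are
arbitrary), the kind of user code affected looks like this:

#include <arm_neon.h>

/* Loads lane 1 of each of two int32x4_t vectors from p, keeping the
   other lanes of acc, via the vld2q_lane_s32 intrinsic.  */
int32x4x2_t
load_pair_lane (const int32_t *p, int32x4x2_t acc)
{
  return vld2q_lane_s32 (p, acc, 1);
}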

Tested with make check on aarch64-oe-linux with qemu, and also passes clyon's
NEON intrinsics tests.

Charles Baylis (4):
[AARCH64,NEON] Add patterns + builtins for vld[234](q?)_lane_*
intrinsics
[AARCH64,NEON] Convert arm_neon.h to use new builtins for
vld[234](q?)_lane_*
[AARCH64,NEON] Fix unnecessary moves in vld[234]q_* intrinsics
[AARCH64,NEON] Fix unnecessary moves in vst[234]q_* intrinsics

gcc/config/aarch64/aarch64-builtins.c | 5 +
gcc/config/aarch64/aarch64-simd-builtins.def | 4 +
gcc/config/aarch64/aarch64-simd.md | 95 +++
gcc/config/aarch64/aarch64.md | 3 +
gcc/config/aarch64/arm_neon.h | 1043 ++++++++++++--------------
5 files changed, 596 insertions(+), 554 deletions(-)
--
1.9.1
Charles Baylis
2014-09-18 19:38:26 UTC
This patch adds new patterns and builtins to represent single-lane structure
load instructions, which will be used to implement the vld[234](q?)_lane_*
intrinsics.

Tested (with the rest of the patch series) with make check on aarch64-oe-linux
with qemu, and also causes no regressions in clyon's NEON intrinsics tests.
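
For illustration (names and register numbers are arbitrary), the new builtins
can be called directly like this, and the new insn patterns emit the
single-lane form of LD2/LD3/LD4:

/* Loads lane 1 of each of the two V4SI vectors held in regs from *p;
   expected to assemble to something like "ld2 {v0.s - v1.s}[1], [x0]".  */
__builtin_aarch64_simd_oi
ld2_lane_example (const __builtin_aarch64_simd_si *p,
                  __builtin_aarch64_simd_oi regs)
{
  return __builtin_aarch64_ld2_lanev4si (p, regs, 1);
}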

<DATE> Charles Baylis <***@linaro.org>
* config/aarch64/aarch64-builtins.c
(aarch64_types_loadstruct_lane_qualifiers): Define.
* config/aarch64/aarch64-simd-builtins.def (ld2_lane, ld3_lane,
ld4_lane): New builtins.
* config/aarch64/aarch64-simd.md (vec_load_lanesoi_lane<mode>): New
pattern.
(vec_load_lanesci_lane<mode>): Likewise.
(vec_load_lanesxi_lane<mode>): Likewise.
(aarch64_ld2_lane<VQ:mode>): New expand.
(aarch64_ld3_lane<VQ:mode>): Likewise.
(aarch64_ld4_lane<VQ:mode>): Likewise.

Change-Id: I205ab46aa3f3f2486cc163b93e1da080a87c3419
---
gcc/config/aarch64/aarch64-builtins.c | 5 ++
gcc/config/aarch64/aarch64-simd-builtins.def | 4 ++
gcc/config/aarch64/aarch64-simd.md | 95 ++++++++++++++++++++++++++++
gcc/config/aarch64/aarch64.md | 3 +
4 files changed, 107 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 395b4ec..818729c 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -201,6 +201,11 @@ aarch64_types_load1_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_none, qualifier_const_pointer_map_mode };
#define TYPES_LOAD1 (aarch64_types_load1_qualifiers)
#define TYPES_LOADSTRUCT (aarch64_types_load1_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_loadstruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_const_pointer_map_mode,
+ qualifier_none, qualifier_none };
+#define TYPES_LOADSTRUCT_LANE (aarch64_types_loadstruct_lane_qualifiers)

static enum aarch64_type_qualifiers
aarch64_types_bsl_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index de264c4..5d3e122 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -83,6 +83,10 @@
BUILTIN_VQ (LOADSTRUCT, ld2, 0)
BUILTIN_VQ (LOADSTRUCT, ld3, 0)
BUILTIN_VQ (LOADSTRUCT, ld4, 0)
+ /* Implemented by aarch64_ld<VSTRUCT:nregs>_lane<VQ:mode>. */
+ BUILTIN_VQ (LOADSTRUCT_LANE, ld2_lane, 0)
+ BUILTIN_VQ (LOADSTRUCT_LANE, ld3_lane, 0)
+ BUILTIN_VQ (LOADSTRUCT_LANE, ld4_lane, 0)
/* Implemented by aarch64_st<VSTRUCT:nregs><VDC:mode>. */
BUILTIN_VDC (STORESTRUCT, st2, 0)
BUILTIN_VDC (STORESTRUCT, st3, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 493e886..f6c4018 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4003,6 +4003,18 @@
[(set_attr "type" "neon_load2_2reg<q>")]
)

+(define_insn "vec_load_lanesoi_lane<mode>"
+ [(set (match_operand:OI 0 "register_operand" "=w")
+ (unspec:OI [(match_operand:<V_TWO_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
+ (match_operand:OI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY) ]
+ UNSPEC_LD2_LANE))]
+ "TARGET_SIMD"
+ "ld2\\t{%S0.<Vetype> - %T0.<Vetype>}[%3], %1"
+ [(set_attr "type" "neon_load2_one_lane<q>")]
+)
+
(define_insn "vec_store_lanesoi<mode>"
[(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv")
(unspec:OI [(match_operand:OI 1 "register_operand" "w")
@@ -4034,6 +4046,18 @@
[(set_attr "type" "neon_load3_3reg<q>")]
)

+(define_insn "vec_load_lanesci_lane<mode>"
+ [(set (match_operand:CI 0 "register_operand" "=w")
+ (unspec:CI [(match_operand:<V_THREE_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
+ (match_operand:CI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_LD3_LANE))]
+ "TARGET_SIMD"
+ "ld3\\t{%S0.<Vetype> - %U0.<Vetype>}[%3], %1"
+ [(set_attr "type" "neon_load3_one_lane<q>")]
+)
+
(define_insn "vec_store_lanesci<mode>"
[(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv")
(unspec:CI [(match_operand:CI 1 "register_operand" "w")
@@ -4065,6 +4089,18 @@
[(set_attr "type" "neon_load4_4reg<q>")]
)

+(define_insn "vec_load_lanesxi_lane<mode>"
+ [(set (match_operand:XI 0 "register_operand" "=w")
+ (unspec:XI [(match_operand:<V_FOUR_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
+ (match_operand:XI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_LD4_LANE))]
+ "TARGET_SIMD"
+ "ld4\\t{%S0.<Vetype> - %V0.<Vetype>}[%3], %1"
+ [(set_attr "type" "neon_load4_one_lane<q>")]
+)
+
(define_insn "vec_store_lanesxi<mode>"
[(set (match_operand:XI 0 "aarch64_simd_struct_operand" "=Utv")
(unspec:XI [(match_operand:XI 1 "register_operand" "w")
@@ -4378,6 +4414,65 @@
DONE;
})

+(define_expand "aarch64_ld2_lane<VQ:mode>"
+ [(match_operand:OI 0 "register_operand" "=w")
+ (match_operand:DI 1 "register_operand" "w")
+ (match_operand:OI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_SIMD"
+{
+ enum machine_mode mode = <V_TWO_ELEM>mode;
+ rtx mem = gen_rtx_MEM (mode, operands[1]);
+ operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+
+ emit_insn (gen_vec_load_lanesoi_lane<VQ:mode> (operands[0],
+ mem,
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_expand "aarch64_ld3_lane<VQ:mode>"
+ [(match_operand:CI 0 "register_operand" "=w")
+ (match_operand:DI 1 "register_operand" "w")
+ (match_operand:CI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_SIMD"
+{
+ enum machine_mode mode = <V_THREE_ELEM>mode;
+ rtx mem = gen_rtx_MEM (mode, operands[1]);
+ operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+
+ emit_insn (gen_vec_load_lanesci_lane<VQ:mode> (operands[0],
+ mem,
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_expand "aarch64_ld4_lane<VQ:mode>"
+ [(match_operand:XI 0 "register_operand" "=w")
+ (match_operand:DI 1 "register_operand" "w")
+ (match_operand:XI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_SIMD"
+{
+ enum machine_mode mode = <V_FOUR_ELEM>mode;
+ rtx mem = gen_rtx_MEM (mode, operands[1]);
+ operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+
+ emit_insn (gen_vec_load_lanesxi_lane<VQ:mode> (operands[0],
+ mem,
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+
+
;; Expanders for builtins to extract vector registers from large
;; opaque integer modes.

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c60038a..ea924ab 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -92,6 +92,9 @@
UNSPEC_LD2
UNSPEC_LD3
UNSPEC_LD4
+ UNSPEC_LD2_LANE
+ UNSPEC_LD3_LANE
+ UNSPEC_LD4_LANE
UNSPEC_MB
UNSPEC_NOP
UNSPEC_PRLG_STK
--
1.9.1
Kyrill Tkachov
2014-09-19 08:40:38 UTC
Hi Charles,

Good to see these intrinsics being brought into the modern world :)
Some style comments inline.
Post by Charles Baylis
This patch adds new patterns and builtins to represent single-lane structure
load instructions, which will be used to implement the vld[234](q?)_lane_*
intrinsics.
Tested (with the rest of the patch series) with make check on aarch64-oe-linux
with qemu, and also causes no regressions in clyon's NEON intrinsics tests.
* config/aarch64/aarch64-builtins.c
(aarch64_types_loadstruct_lane_qualifiers): Define.
* config/aarch64/aarch64-simd-builtins.def (ld2_lane, ld3_lane,
ld4_lane): New builtins.
* config/aarch64/aarch64-simd.md (vec_load_lanesoi_lane<mode>): New
pattern.
(vec_load_lanesci_lane<mode>): Likewise.
(vec_load_lanesxi_lane<mode>): Likewise.
(aarch64_ld2_lane<VQ:mode>): New expand.
(aarch64_ld3_lane<VQ:mode>): Likewise.
(aarch64_ld4_lane<VQ:mode>): Likewise.
This is missing an entry for the config/aarch64/aarch64.md hunk.
Post by Charles Baylis
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 493e886..f6c4018 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4003,6 +4003,18 @@
[(set_attr "type" "neon_load2_2reg<q>")]
)
+(define_insn "vec_load_lanesoi_lane<mode>"
+ [(set (match_operand:OI 0 "register_operand" "=w")
+ (unspec:OI [(match_operand:<V_TWO_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
+ (match_operand:OI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY) ]
+ UNSPEC_LD2_LANE))]
+ "TARGET_SIMD"
+ "ld2\\t{%S0.<Vetype> - %T0.<Vetype>}[%3], %1"
+ [(set_attr "type" "neon_load2_one_lane<q>")]
+)
The VQ mode iterator covers only the 128-bit modes, so the "type"
attribute here will always be neon_load2_one_lane_q. Using the <q> mode
attribute is still correct, but personally I think it makes it just that
little bit harder to figure out for a newbie, who will have to open
iterators.md to work out its meaning, or for someone who's not
sure whether the 'q' is added with an underscore or without. I would
just use neon_load2_one_lane_q.
Post by Charles Baylis
+(define_insn "vec_load_lanesci_lane<mode>"
+ [(set (match_operand:CI 0 "register_operand" "=w")
+ (unspec:CI [(match_operand:<V_THREE_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
+ (match_operand:CI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_LD3_LANE))]
+ "TARGET_SIMD"
+ "ld3\\t{%S0.<Vetype> - %U0.<Vetype>}[%3], %1"
+ [(set_attr "type" "neon_load3_one_lane<q>")]
+)
Likewise.
Post by Charles Baylis
+(define_insn "vec_load_lanesxi_lane<mode>"
+ [(set (match_operand:XI 0 "register_operand" "=w")
+ (unspec:XI [(match_operand:<V_FOUR_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
+ (match_operand:XI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_LD4_LANE))]
+ "TARGET_SIMD"
+ "ld4\\t{%S0.<Vetype> - %V0.<Vetype>}[%3], %1"
+ [(set_attr "type" "neon_load4_one_lane<q>")]
+)
Same here.
Post by Charles Baylis
+(define_expand "aarch64_ld2_lane<VQ:mode>"
+ [(match_operand:OI 0 "register_operand" "=w")
+ (match_operand:DI 1 "register_operand" "w")
+ (match_operand:OI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_SIMD"
+{
+ enum machine_mode mode = <V_TWO_ELEM>mode;
+ rtx mem = gen_rtx_MEM (mode, operands[1]);
+ operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+
+ emit_insn (gen_vec_load_lanesoi_lane<VQ:mode> (operands[0],
+ mem,
+ operands[2],
+ operands[3]));
+ DONE;
+})
I think saying <VQ:mode> is redundant since VQ is the only mode iterator
in the pattern.
Just <mode> should work, right?
Post by Charles Baylis
+
+(define_expand "aarch64_ld3_lane<VQ:mode>"
+ [(match_operand:CI 0 "register_operand" "=w")
+ (match_operand:DI 1 "register_operand" "w")
+ (match_operand:CI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_SIMD"
+{
+ enum machine_mode mode = <V_THREE_ELEM>mode;
+ rtx mem = gen_rtx_MEM (mode, operands[1]);
+ operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+
+ emit_insn (gen_vec_load_lanesci_lane<VQ:mode> (operands[0],
+ mem,
+ operands[2],
+ operands[3]));
+ DONE;
+})
Likewise.
Post by Charles Baylis
+
+(define_expand "aarch64_ld4_lane<VQ:mode>"
+ [(match_operand:XI 0 "register_operand" "=w")
+ (match_operand:DI 1 "register_operand" "w")
+ (match_operand:XI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_SIMD"
+{
+ enum machine_mode mode = <V_FOUR_ELEM>mode;
+ rtx mem = gen_rtx_MEM (mode, operands[1]);
+ operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+
+ emit_insn (gen_vec_load_lanesxi_lane<VQ:mode> (operands[0],
+ mem,
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
Likewise.
Post by Charles Baylis
+
;; Expanders for builtins to extract vector registers from large
;; opaque integer modes.
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c60038a..ea924ab 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -92,6 +92,9 @@
UNSPEC_LD2
UNSPEC_LD3
UNSPEC_LD4
+ UNSPEC_LD2_LANE
+ UNSPEC_LD3_LANE
+ UNSPEC_LD4_LANE
UNSPEC_MB
UNSPEC_NOP
UNSPEC_PRLG_STK
Tejas Belagod
2014-09-19 10:45:54 UTC
Post by Charles Baylis
+(define_expand "aarch64_ld2_lane<VQ:mode>"
+ [(match_operand:OI 0 "register_operand" "=w")
+ (match_operand:DI 1 "register_operand" "w")
+ (match_operand:OI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_SIMD"
+{
+ enum machine_mode mode = <V_TWO_ELEM>mode;
+ rtx mem = gen_rtx_MEM (mode, operands[1]);
+ operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+
The endianness lane correction breaks this for BE.

You don't need the endianness lane correction here - we always call NEON
intrinsics with the architectural lane number - irrespective of
endianness. Unless of course you flip it somewhere to make it part of
RTL vec_select lane patterns, which you don't here.

You could also do some lane-bounds checking here in the expander.
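
A minimal sketch of both suggestions applied to the ld2 expander body (this
assumes the existing aarch64_simd_lane_bounds helper in aarch64.c; its exact
signature may differ):

{
  enum machine_mode mode = <V_TWO_ELEM>mode;
  rtx mem = gen_rtx_MEM (mode, operands[1]);

  /* Range-check the lane number; no ENDIAN_LANE_N flip, since the
     intrinsics always use architectural lane numbering.  */
  aarch64_simd_lane_bounds (operands[3], 0, GET_MODE_NUNITS (<MODE>mode));

  emit_insn (gen_vec_load_lanesoi_lane<mode> (operands[0], mem,
                                              operands[2], operands[3]));
  DONE;
}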
Post by Charles Baylis
+ emit_insn (gen_vec_load_lanesoi_lane<VQ:mode> (operands[0],
+ mem,
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_expand "aarch64_ld3_lane<VQ:mode>"
+ [(match_operand:CI 0 "register_operand" "=w")
+ (match_operand:DI 1 "register_operand" "w")
+ (match_operand:CI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_SIMD"
+{
+ enum machine_mode mode = <V_THREE_ELEM>mode;
+ rtx mem = gen_rtx_MEM (mode, operands[1]);
+ operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+
No endianness correction for lanes necessary.
Post by Charles Baylis
+ emit_insn (gen_vec_load_lanesci_lane<VQ:mode> (operands[0],
+ mem,
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_expand "aarch64_ld4_lane<VQ:mode>"
+ [(match_operand:XI 0 "register_operand" "=w")
+ (match_operand:DI 1 "register_operand" "w")
+ (match_operand:XI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_SIMD"
+{
+ enum machine_mode mode = <V_FOUR_ELEM>mode;
+ rtx mem = gen_rtx_MEM (mode, operands[1]);
+ operands[3] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[3])));
+
Same.
Post by Charles Baylis
+ emit_insn (gen_vec_load_lanesxi_lane<VQ:mode> (operands[0],
+ mem,
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+
+
;; Expanders for builtins to extract vector registers from large
;; opaque integer modes.
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c60038a..ea924ab 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -92,6 +92,9 @@
UNSPEC_LD2
UNSPEC_LD3
UNSPEC_LD4
+ UNSPEC_LD2_LANE
+ UNSPEC_LD3_LANE
+ UNSPEC_LD4_LANE
UNSPEC_MB
UNSPEC_NOP
UNSPEC_PRLG_STK
Thanks,
Tejas.
Charles Baylis
2014-09-24 16:36:36 UTC
Kyrill, Tejas,

Thanks for the review. I agree with all points and will respin v2 accordingly.

Charles
Charles Baylis
2014-09-18 19:38:27 UTC
This patch replaces the inline assembler implementations of the
vld[234](q?)_lane_* intrinsics with new versions which exploit the new builtin
functions added in patch 1.

Tested (with the rest of the patch series) with make check on aarch64-oe-linux
with qemu, and also causes no regressions in clyon's NEON intrinsics tests.
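
To make the dense macros easier to review: for the d-register s32 case, the
new __LD2_LANE_FUNC below expands (roughly, modulo formatting) to

__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
vld2_lane_s32 (const int32_t * __ptr, int32x2x2_t __b, const int __c)
{
  __builtin_aarch64_simd_oi __o;
  int32x4x2_t __temp;
  __temp.val[0] = vcombine_s32 (__b.val[0], vcreate_s32 (0));
  __temp.val[1] = vcombine_s32 (__b.val[1], vcreate_s32 (0));
  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0);
  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1);
  __o = __builtin_aarch64_ld2_lanev4si ((__builtin_aarch64_simd_si *) __ptr,
                                        __o, __c);
  __b.val[0] = (int32x2_t) __builtin_aarch64_get_dregoidi (__o, 0);
  __b.val[1] = (int32x2_t) __builtin_aarch64_get_dregoidi (__o, 1);
  return __b;
}

i.e. the D-register halves are widened into Q registers, the opaque OImode
value is loaded through the new builtin, and the low halves are extracted
again afterwards.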

<DATE> Charles Baylis <***@linaro.org>

* config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Rewrite using builtins,
update uses to use new macro arguments.
(__LD3_LANE_FUNC): Likewise.
(__LD4_LANE_FUNC): Likewise.

Change-Id: I3bd5934b5c4f6127088193c1ab12848144d5540a
---
gcc/config/aarch64/arm_neon.h | 359 ++++++++++++++++++++++++++++--------------
1 file changed, 237 insertions(+), 122 deletions(-)

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index e62c783..c1fcb47 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -11805,47 +11805,79 @@ __LD2R_FUNC (uint16x8x2_t, uint16x2_t, uint16_t, 8h, u16, q)
__LD2R_FUNC (uint32x4x2_t, uint32x2_t, uint32_t, 4s, u32, q)
__LD2R_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, 2d, u64, q)

-#define __LD2_LANE_FUNC(rettype, ptrtype, regsuffix, \
- lnsuffix, funcsuffix, Q) \
- __extension__ static __inline rettype \
- __attribute__ ((__always_inline__)) \
- vld2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \
- rettype b, const int c) \
- { \
- rettype result; \
- __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \
- "ld2 {v16." #lnsuffix ", v17." #lnsuffix "}[%3], %2\n\t" \
- "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t" \
- : "=Q"(result) \
- : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \
- : "memory", "v16", "v17"); \
- return result; \
- }
-
-__LD2_LANE_FUNC (int8x8x2_t, uint8_t, 8b, b, s8,)
-__LD2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,)
-__LD2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,)
-__LD2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,)
-__LD2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,)
-__LD2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,)
-__LD2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,)
-__LD2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,)
-__LD2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,)
-__LD2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,)
-__LD2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,)
-__LD2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,)
-__LD2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q)
-__LD2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q)
-__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q)
-__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q)
-__LD2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q)
-__LD2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q)
-__LD2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q)
-__LD2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q)
-__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q)
-__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q)
-__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q)
-__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
+#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, \
+ mode, ptrmode, funcsuffix, signedtype) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ __builtin_aarch64_simd_oi __o; \
+ largetype __temp; \
+ __temp.val[0] = \
+ vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
+ __temp.val[1] = \
+ vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
+ __o = __builtin_aarch64_set_qregoi##mode (__o, \
+ (signedtype) __temp.val[0], \
+ 0); \
+ __o = __builtin_aarch64_set_qregoi##mode (__o, \
+ (signedtype) __temp.val[1], \
+ 1); \
+ __o = __builtin_aarch64_ld2_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+ __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \
+ __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \
+ return __b; \
+}
+
+__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf,
+ sf, f32, float32x4_t)
+__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, v2df,
+ df, f64, float64x2_t)
+__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v16qi, qi, p8,
+ int8x16_t)
+__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v8hi, hi,
+ p16, int16x8_t)
+__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v16qi, qi, s8,
+ int8x16_t)
+__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v8hi, hi, s16,
+ int16x8_t)
+__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v4si, si, s32,
+ int32x4_t)
+__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, v2di, di, s64,
+ int64x2_t)
+__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v16qi, qi, u8,
+ int8x16_t)
+__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v8hi, hi,
+ u16, int16x8_t)
+__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v4si, si,
+ u32, int32x4_t)
+__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, v2di, di,
+ u64, int64x2_t)
+
+#undef __LD2_LANE_FUNC
+#define __LD2_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ union { intype __i; \
+ __builtin_aarch64_simd_oi __o; } __temp = { __b }; \
+ __temp.__o = __builtin_aarch64_ld2_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c); \
+ return __temp.__i; \
+}
+
+__LD2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
+__LD2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
+__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
+__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
+__LD2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
+__LD2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
+__LD2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
+__LD2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
+__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
+__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
+__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
+__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)

#define __LD3R_FUNC(rettype, structtype, ptrtype, \
regsuffix, funcsuffix, Q) \
@@ -11887,47 +11919,85 @@ __LD3R_FUNC (uint16x8x3_t, uint16x3_t, uint16_t, 8h, u16, q)
__LD3R_FUNC (uint32x4x3_t, uint32x3_t, uint32_t, 4s, u32, q)
__LD3R_FUNC (uint64x2x3_t, uint64x3_t, uint64_t, 2d, u64, q)

-#define __LD3_LANE_FUNC(rettype, ptrtype, regsuffix, \
- lnsuffix, funcsuffix, Q) \
- __extension__ static __inline rettype \
- __attribute__ ((__always_inline__)) \
- vld3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \
- rettype b, const int c) \
- { \
- rettype result; \
- __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t" \
- "ld3 {v16." #lnsuffix " - v18." #lnsuffix "}[%3], %2\n\t" \
- "st1 {v16." #regsuffix " - v18." #regsuffix "}, %0\n\t" \
- : "=Q"(result) \
- : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \
- : "memory", "v16", "v17", "v18"); \
- return result; \
- }
-
-__LD3_LANE_FUNC (int8x8x3_t, uint8_t, 8b, b, s8,)
-__LD3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,)
-__LD3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,)
-__LD3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,)
-__LD3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,)
-__LD3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,)
-__LD3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,)
-__LD3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,)
-__LD3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,)
-__LD3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,)
-__LD3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,)
-__LD3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,)
-__LD3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q)
-__LD3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q)
-__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q)
-__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q)
-__LD3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q)
-__LD3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q)
-__LD3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q)
-__LD3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q)
-__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q)
-__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q)
-__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q)
-__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
+#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, \
+ mode, ptrmode, funcsuffix, signedtype) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ __builtin_aarch64_simd_ci __o; \
+ largetype __temp; \
+ __temp.val[0] = \
+ vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
+ __temp.val[1] = \
+ vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
+ __temp.val[2] = \
+ vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
+ __o = __builtin_aarch64_set_qregci##mode (__o, \
+ (signedtype) __temp.val[0], \
+ 0); \
+ __o = __builtin_aarch64_set_qregci##mode (__o, \
+ (signedtype) __temp.val[1], \
+ 1); \
+ __o = __builtin_aarch64_set_qregci##mode (__o, \
+ (signedtype) __temp.val[2], \
+ 2); \
+ __o = __builtin_aarch64_ld3_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+ __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \
+ __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \
+ __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \
+ return __b; \
+}
+
+__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf,
+ sf, f32, float32x4_t)
+__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, v2df,
+ df, f64, float64x2_t)
+__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v16qi, qi, p8,
+ int8x16_t)
+__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v8hi, hi,
+ p16, int16x8_t)
+__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v16qi, qi, s8,
+ int8x16_t)
+__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v8hi, hi, s16,
+ int16x8_t)
+__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v4si, si, s32,
+ int32x4_t)
+__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, v2di, di, s64,
+ int64x2_t)
+__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v16qi, qi, u8,
+ int8x16_t)
+__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v8hi, hi,
+ u16, int16x8_t)
+__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v4si, si,
+ u32, int32x4_t)
+__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, v2di, di,
+ u64, int64x2_t)
+
+#undef __LD3_LANE_FUNC
+#define __LD3_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ union { intype __i; \
+ __builtin_aarch64_simd_ci __o; } __temp = { __b }; \
+ __temp.__o = __builtin_aarch64_ld3_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c); \
+ return __temp.__i; \
+}
+
+__LD3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
+__LD3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
+__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
+__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
+__LD3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
+__LD3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
+__LD3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
+__LD3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
+__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
+__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
+__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
+__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)

#define __LD4R_FUNC(rettype, structtype, ptrtype, \
regsuffix, funcsuffix, Q) \
@@ -11969,47 +12039,92 @@ __LD4R_FUNC (uint16x8x4_t, uint16x4_t, uint16_t, 8h, u16, q)
__LD4R_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, 4s, u32, q)
__LD4R_FUNC (uint64x2x4_t, uint64x4_t, uint64_t, 2d, u64, q)

-#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix, \
- lnsuffix, funcsuffix, Q) \
- __extension__ static __inline rettype \
- __attribute__ ((__always_inline__)) \
- vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \
- rettype b, const int c) \
- { \
- rettype result; \
- __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \
- "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t" \
- "st1 {v16." #regsuffix " - v19." #regsuffix "}, %0\n\t" \
- : "=Q"(result) \
- : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \
- : "memory", "v16", "v17", "v18", "v19"); \
- return result; \
- }

-__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,)
-__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,)
-__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,)
-__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,)
-__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,)
-__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,)
-__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,)
-__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,)
-__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,)
-__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,)
-__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,)
-__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,)
-__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q)
-__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q)
-__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q)
-__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q)
-__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q)
-__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q)
-__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q)
-__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q)
-__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q)
-__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
-__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
-__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
+#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, \
+ mode, ptrmode, funcsuffix, signedtype) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ __builtin_aarch64_simd_xi __o; \
+ largetype __temp; \
+ __temp.val[0] = \
+ vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
+ __temp.val[1] = \
+ vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
+ __temp.val[2] = \
+ vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
+ __temp.val[3] = \
+ vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[0], \
+ 0); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[1], \
+ 1); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[2], \
+ 2); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[3], \
+ 3); \
+ __o = __builtin_aarch64_ld4_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+ __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \
+ __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \
+ __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \
+ __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \
+ return __b; \
+}
+
+__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf,
+ sf, f32, float32x4_t)
+__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, v2df,
+ df, f64, float64x2_t)
+__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v16qi, qi, p8,
+ int8x16_t)
+__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v8hi, hi,
+ p16, int16x8_t)
+__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v16qi, qi, s8,
+ int8x16_t)
+__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v8hi, hi, s16,
+ int16x8_t)
+__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v4si, si, s32,
+ int32x4_t)
+__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, v2di, di, s64,
+ int64x2_t)
+__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v16qi, qi, u8,
+ int8x16_t)
+__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v8hi, hi,
+ u16, int16x8_t)
+__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v4si, si,
+ u32, int32x4_t)
+__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, v2di, di,
+ u64, int64x2_t)
+
+#undef __LD4_LANE_FUNC
+#define __LD4_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ union { intype __i; \
+ __builtin_aarch64_simd_xi __o; } __temp = { __b }; \
+ __temp.__o = __builtin_aarch64_ld4_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c); \
+ return __temp.__i; \
+}
+
+__LD4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
+__LD4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
+__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
+__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
+__LD4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
+__LD4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
+__LD4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
+__LD4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
+__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
+__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
+__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
+__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)

#define __ST2_LANE_FUNC(intype, largetype, ptrtype, \
mode, ptr_mode, funcsuffix, signedtype) \
--
1.9.1
Tejas Belagod
2014-09-19 11:21:32 UTC
Post by Charles Baylis
This patch replaces the inline assembler implementations of the
vld[234](q?)_lane_* intrinsics with new versions which exploit the new builtin
functions added in patch 1.
Tested (with the rest of the patch series) with make check on aarch64-oe-linux
with qemu, and also causes no regressions in clyon's NEON intrinsics tests.
* config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Rewrite using builtins,
update uses to use new macro arguments.
(__LD3_LANE_FUNC): Likewise.
(__LD4_LANE_FUNC): Likewise.
Change-Id: I3bd5934b5c4f6127088193c1ab12848144d5540a
---
gcc/config/aarch64/arm_neon.h | 359 ++++++++++++++++++++++++++++--------------
1 file changed, 237 insertions(+), 122 deletions(-)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index e62c783..c1fcb47 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -11805,47 +11805,79 @@ __LD2R_FUNC (uint16x8x2_t, uint16x2_t, uint16_t, 8h, u16, q)
__LD2R_FUNC (uint32x4x2_t, uint32x2_t, uint32_t, 4s, u32, q)
__LD2R_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, 2d, u64, q)
-#define __LD2_LANE_FUNC(rettype, ptrtype, regsuffix, \
- lnsuffix, funcsuffix, Q) \
- __extension__ static __inline rettype \
- __attribute__ ((__always_inline__)) \
- vld2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \
- rettype b, const int c) \
- { \
- rettype result; \
- __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \
- "ld2 {v16." #lnsuffix ", v17." #lnsuffix "}[%3], %2\n\t" \
- "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t" \
- : "=Q"(result) \
- : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \
- : "memory", "v16", "v17"); \
- return result; \
- }
-
-__LD2_LANE_FUNC (int8x8x2_t, uint8_t, 8b, b, s8,)
-__LD2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,)
-__LD2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,)
-__LD2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,)
-__LD2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,)
-__LD2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,)
-__LD2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,)
-__LD2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,)
-__LD2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,)
-__LD2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,)
-__LD2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,)
-__LD2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,)
-__LD2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q)
-__LD2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q)
-__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q)
-__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q)
-__LD2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q)
-__LD2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q)
-__LD2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q)
-__LD2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q)
-__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q)
-__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q)
-__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q)
-__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
+#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, \
+ mode, ptrmode, funcsuffix, signedtype) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ __builtin_aarch64_simd_oi __o; \
+ largetype __temp; \
+ __temp.val[0] = \
+ vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
+ __temp.val[1] = \
+ vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
+ __o = __builtin_aarch64_set_qregoi##mode (__o, \
+ (signedtype) __temp.val[0], \
+ 0); \
+ __o = __builtin_aarch64_set_qregoi##mode (__o, \
+ (signedtype) __temp.val[1], \
+ 1); \
+ __o = __builtin_aarch64_ld2_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+ __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \
+ __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \
+ return __b; \
+}
+
+__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf,
+ sf, f32, float32x4_t)
+__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, v2df,
+ df, f64, float64x2_t)
+__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v16qi, qi, p8,
+ int8x16_t)
+__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v8hi, hi,
+ p16, int16x8_t)
+__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v16qi, qi, s8,
+ int8x16_t)
+__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v8hi, hi, s16,
+ int16x8_t)
+__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v4si, si, s32,
+ int32x4_t)
+__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, v2di, di, s64,
+ int64x2_t)
+__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v16qi, qi, u8,
+ int8x16_t)
+__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v8hi, hi,
+ u16, int16x8_t)
+__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v4si, si,
+ u32, int32x4_t)
+__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, v2di, di,
+ u64, int64x2_t)
+
+#undef __LD2_LANE_FUNC
+#define __LD2_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ union { intype __i; \
+ __builtin_aarch64_simd_oi __o; } __temp = { __b }; \
+ __temp.__o = __builtin_aarch64_ld2_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c); \
+ return __temp.__i; \
+}
+
+__LD2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
+__LD2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
+__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
+__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
+__LD2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
+__LD2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
+__LD2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
+__LD2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
+__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
+__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
+__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
+__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
#define __LD3R_FUNC(rettype, structtype, ptrtype, \
regsuffix, funcsuffix, Q) \
@@ -11887,47 +11919,85 @@ __LD3R_FUNC (uint16x8x3_t, uint16x3_t, uint16_t, 8h, u16, q)
__LD3R_FUNC (uint32x4x3_t, uint32x3_t, uint32_t, 4s, u32, q)
__LD3R_FUNC (uint64x2x3_t, uint64x3_t, uint64_t, 2d, u64, q)
-#define __LD3_LANE_FUNC(rettype, ptrtype, regsuffix, \
- lnsuffix, funcsuffix, Q) \
- __extension__ static __inline rettype \
- __attribute__ ((__always_inline__)) \
- vld3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \
- rettype b, const int c) \
- { \
- rettype result; \
- __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t" \
- "ld3 {v16." #lnsuffix " - v18." #lnsuffix "}[%3], %2\n\t" \
- "st1 {v16." #regsuffix " - v18." #regsuffix "}, %0\n\t" \
- : "=Q"(result) \
- : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \
- : "memory", "v16", "v17", "v18"); \
- return result; \
- }
-
-__LD3_LANE_FUNC (int8x8x3_t, uint8_t, 8b, b, s8,)
-__LD3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,)
-__LD3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,)
-__LD3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,)
-__LD3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,)
-__LD3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,)
-__LD3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,)
-__LD3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,)
-__LD3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,)
-__LD3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,)
-__LD3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,)
-__LD3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,)
-__LD3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q)
-__LD3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q)
-__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q)
-__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q)
-__LD3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q)
-__LD3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q)
-__LD3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q)
-__LD3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q)
-__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q)
-__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q)
-__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q)
-__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
+#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, \
+ mode, ptrmode, funcsuffix, signedtype) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ __builtin_aarch64_simd_ci __o; \
+ largetype __temp; \
+ __temp.val[0] = \
+ vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
+ __temp.val[1] = \
+ vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
+ __temp.val[2] = \
+ vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
+ __o = __builtin_aarch64_set_qregci##mode (__o, \
+ (signedtype) __temp.val[0], \
+ 0); \
+ __o = __builtin_aarch64_set_qregci##mode (__o, \
+ (signedtype) __temp.val[1], \
+ 1); \
+ __o = __builtin_aarch64_set_qregci##mode (__o, \
+ (signedtype) __temp.val[2], \
+ 2); \
+ __o = __builtin_aarch64_ld3_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+ __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \
+ __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \
+ __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \
+ return __b; \
+}
+
+__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf,
+ sf, f32, float32x4_t)
+__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, v2df,
+ df, f64, float64x2_t)
+__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v16qi, qi, p8,
+ int8x16_t)
+__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v8hi, hi,
+ p16, int16x8_t)
+__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v16qi, qi, s8,
+ int8x16_t)
+__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v8hi, hi, s16,
+ int16x8_t)
+__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v4si, si, s32,
+ int32x4_t)
+__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, v2di, di, s64,
+ int64x2_t)
+__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v16qi, qi, u8,
+ int8x16_t)
+__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v8hi, hi,
+ u16, int16x8_t)
+__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v4si, si,
+ u32, int32x4_t)
+__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, v2di, di,
+ u64, int64x2_t)
+
+#undef __LD3_LANE_FUNC
+#define __LD3_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ union { intype __i; \
+ __builtin_aarch64_simd_ci __o; } __temp = { __b }; \
+ __temp.__o = __builtin_aarch64_ld3_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c); \
+ return __temp.__i; \
+}
+
+__LD3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
+__LD3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
+__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
+__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
+__LD3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
+__LD3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
+__LD3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
+__LD3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
+__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
+__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
+__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
+__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
#define __LD4R_FUNC(rettype, structtype, ptrtype, \
regsuffix, funcsuffix, Q) \
@@ -11969,47 +12039,92 @@ __LD4R_FUNC (uint16x8x4_t, uint16x4_t, uint16_t, 8h, u16, q)
__LD4R_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, 4s, u32, q)
__LD4R_FUNC (uint64x2x4_t, uint64x4_t, uint64_t, 2d, u64, q)
-#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix, \
- lnsuffix, funcsuffix, Q) \
- __extension__ static __inline rettype \
- __attribute__ ((__always_inline__)) \
- vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \
- rettype b, const int c) \
- { \
- rettype result; \
- __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \
- "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t" \
- "st1 {v16." #regsuffix " - v19." #regsuffix "}, %0\n\t" \
- : "=Q"(result) \
- : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \
- : "memory", "v16", "v17", "v18", "v19"); \
- return result; \
- }
-__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,)
-__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,)
-__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,)
-__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,)
-__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,)
-__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,)
-__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,)
-__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,)
-__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,)
-__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,)
-__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,)
-__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,)
-__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q)
-__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q)
-__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q)
-__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q)
-__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q)
-__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q)
-__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q)
-__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q)
-__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q)
-__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
-__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
-__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
+#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, \
+ mode, ptrmode, funcsuffix, signedtype) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ __builtin_aarch64_simd_xi __o; \
+ largetype __temp; \
+ __temp.val[0] = \
+ vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
+ __temp.val[1] = \
+ vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
+ __temp.val[2] = \
+ vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
+ __temp.val[3] = \
+ vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[0], \
+ 0); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[1], \
+ 1); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[2], \
+ 2); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[3], \
+ 3); \
+ __o = __builtin_aarch64_ld4_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+ __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \
+ __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \
+ __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \
+ __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \
+ return __b; \
+}
+
+__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf,
+ sf, f32, float32x4_t)
+__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, v2df,
+ df, f64, float64x2_t)
+__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v16qi, qi, p8,
+ int8x16_t)
+__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v8hi, hi,
+ p16, int16x8_t)
+__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v16qi, qi, s8,
+ int8x16_t)
+__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v8hi, hi, s16,
+ int16x8_t)
+__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v4si, si, s32,
+ int32x4_t)
+__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, v2di, di, s64,
+ int64x2_t)
+__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v16qi, qi, u8,
+ int8x16_t)
+__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v8hi, hi,
+ u16, int16x8_t)
+__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v4si, si,
+ u32, int32x4_t)
+__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, v2di, di,
+ u64, int64x2_t)
+
+#undef __LD4_LANE_FUNC
+#define __LD4_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ union { intype __i; \
+ __builtin_aarch64_simd_xi __o; } __temp = { __b }; \
+ __temp.__o = __builtin_aarch64_ld4_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c); \
+ return __temp.__i; \
+}
+
The reason we avoided using type-punning using unions was that reload
would get confused with potential subreg(mem) that could be introduced
because of memory xfer caused by unions and large int modes. As a
result, we would get incorrect or sub-optimal code. But this seems to
have fixed itself. :-)

Because this involves xfers between large int modes and
CANNOT_CHANGE_MODE_CLASS has some impact on it, it would be good to test
what impact your patch has with C_C_M_C removed, so that it will be
easier to fix the fallout once we remove C_C_M_C eventually. To test
this you will need Richard's patch set
https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01440.html.

Same for your other 2 patches in this series(3,4).

Thanks,
Tejas.
Post by Charles Baylis
+__LD4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
+__LD4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
+__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
+__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
+__LD4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
+__LD4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
+__LD4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
+__LD4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
+__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
+__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
+__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
+__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
#define __ST2_LANE_FUNC(intype, largetype, ptrtype, \
mode, ptr_mode, funcsuffix, signedtype) \
--
1.9.1
Charles Baylis
2014-09-26 01:16:28 UTC
Post by Tejas Belagod
The reason we avoided using type-punning using unions was that reload would
get confused with potential subreg(mem) that could be introduced because of
memory xfer caused by unions and large int modes. As a result, we would get
incorrect or sub-optimal code. But this seems to have fixed itself. :-)
Because this involves xfers between large int modes and
CANNOT_CHANGE_MODE_CLASS has some impact on it, it would be good to test
what impact your patch has with C_C_M_C removed, so that it will be easier
to fix the fallout once we remove C_C_M_C eventually. To test this you will
need Richard's patch set
https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01440.html.
Same for your other 2 patches in this series(3,4).
I tried those patches, and altered aarch64_cannot_change_mode_class to
return false for all cases.
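
Concretely, that was a local hack along these lines in aarch64.c (signature
from memory; it may differ slightly):

bool
aarch64_cannot_change_mode_class (enum machine_mode from ATTRIBUTE_UNUSED,
                                  enum machine_mode to ATTRIBUTE_UNUSED,
                                  enum reg_class rclass ATTRIBUTE_UNUSED)
{
  /* For this experiment, allow every subreg mode change.  */
  return false;
}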

However, this does not avoid the unnecessary moves.

Taking a really simple test case:

#include <arm_neon.h>

int32x2x2_t xvld2_s32(int32_t *__a)
{
  int32x2x2_t ret;
  __builtin_aarch64_simd_oi __o;
  __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
  ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
  return ret;
}

(disabling scheduling for clarity)
$ aarch64-oe-linux-gcc -O2 -S -o - simd.c -fno-schedule-insns
-fno-schedule-insns2
...
xvld2_s32:
ld2 {v2.2s - v3.2s}, [x0]
orr v0.8b, v2.8b, v2.8b
orr v1.8b, v3.8b, v3.8b
ret
...


The reason is apparent in the RTL dump from IRA:
...
Allocno a0r73 of FP_REGS(32) has 31 avail. regs 33-63, node:
33-63 (confl regs = 0-32 64 65)
...
(insn 2 4 3 2 (set (reg/v/f:DI 79 [ __a ])
(reg:DI 0 x0 [ __a ])) simd.c:5 34 {*movdi_aarch64}
(expr_list:REG_DEAD (reg:DI 0 x0 [ __a ])
(nil)))
(note 3 2 6 2 NOTE_INSN_FUNCTION_BEG)
(insn 6 3 20 2 (set (reg/v:OI 73 [ __o ])
(subreg:OI (vec_concat:V8SI (vec_concat:V4SI (unspec:V2SI [
(mem:TI (reg/v/f:DI 79 [ __a ]) [0 S16 A8])
] UNSPEC_LD2)
(vec_duplicate:V2SI (const_int 0 [0])))
(vec_concat:V4SI (unspec:V2SI [
(mem:TI (reg/v/f:DI 79 [ __a ]) [0 S16 A8])
] UNSPEC_LD2)
(vec_duplicate:V2SI (const_int 0 [0])))) 0))
simd.c:8 2149 {aarch64_ld2v2si_dreg}
(expr_list:REG_DEAD (reg/v/f:DI 79 [ __a ])
(nil)))
(insn 20 6 21 2 (set (reg:V2SI 32 v0)
(subreg:V2SI (reg/v:OI 73 [ __o ]) 0)) simd.c:12 778
{*aarch64_simd_movv2si}
(nil))
(insn 21 20 22 2 (set (reg:V2SI 33 v1)
(subreg:V2SI (reg/v:OI 73 [ __o ]) 16)) simd.c:12 778
{*aarch64_simd_movv2si}
(expr_list:REG_DEAD (reg/v:OI 73 [ __o ])
(nil)))
(insn 22 21 23 2 (use (reg:V2SI 32 v0)) simd.c:12 -1
(nil))
(insn 23 22 0 2 (use (reg:V2SI 33 v1)) simd.c:12 -1
(nil))

The register allocator considers r73 to conflict with v0, because they
are simultaneously live after insn 20. Without the second use of r73 (e.g.
if the write to ret.val[1] is replaced with vdup_n_s32(0)), the
allocator does do the right thing with the subreg and allocates r73 to
{v0,v1}.

I haven't read all of the old threads relating to Richard's patches
yet, but I don't see why they would affect this issue.

I don't think the register allocator is able to resolve this unless
the conversion between the __builtin_simd type and the int32x2x2_t
type is done as a single operation.

However, type-punning is not possible with the arrays of 64-bit
vectors, as the arrays are not the same size as the corresponding
__builtin_simd types, and any solution for those would probably help
with the q variants too. Maybe the solution is to pass the NEON
intrinsic types directly to the builtins? Is there a reason that it
wasn't done that way before?
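
To make the size mismatch concrete, here is a rough sketch (the sizes
are my assumption, based on OImode being 256 bits as in the ld2 pattern
above, so treat it as illustrative only):

#include <arm_neon.h>

/* The D-register struct is half the size of the OImode builtin type:
   sizeof (int32x2x2_t) is 16 bytes (two 8-byte D vectors), while
   __builtin_aarch64_simd_oi (OImode) is 32 bytes, so a union of the two
   cannot alias them the way the q variants can.  */
union __d_reg_pun_mismatch
{
  int32x2x2_t __i;
  __builtin_aarch64_simd_oi __o;
};

/* This typedef only compiles because the two sizes differ.  */
typedef char __sizes_differ
  [sizeof (int32x2x2_t) != sizeof (__builtin_aarch64_simd_oi) ? 1 : -1];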

Thanks
Charles
Tejas Belagod
2014-09-26 12:47:15 UTC
Permalink
Post by Charles Baylis
The reason we avoided using type-punning using unions was that reload would
get confused with potential subreg(mem) that could be introduced because of
memory xfer caused by unions and large int modes. As a result, we would get
incorrect or sub-optimal code. But this seems to have fixed itself. :-)
Because this involves xfers between large int modes and
CANNOT_CHANGE_MODE_CLASS has some impact on it, it would be good to test
what impact your patch has with C_C_M_C removed, so that it will be easier
to fix the fallout once we remove C_C_M_C eventually. To test this you will
need Richard's patch set
https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01440.html.
Same for your other 2 patches in this series(3,4).
I tried those patches, and altered aarch64_cannot_change_mode_class to
return false for all cases.
However, this does not avoid the unnecessary moves.
#include <arm_neon.h>
int32x2x2_t xvld2_s32(int32_t *__a)
{
int32x2x2_t ret;
__builtin_aarch64_simd_oi __o;
__o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
return ret;
}
(disabling scheduling for clarity)
$ aarch64-oe-linux-gcc -O2 -S -o - simd.c -fno-schedule-insns
-fno-schedule-insns2
...
ld2 {v2.2s - v3.2s}, [x0]
orr v0.8b, v2.8b, v2.8b
orr v1.8b, v3.8b, v3.8b
ret
...
...
33-63 (confl regs = 0-32 64 65)
...
(insn 2 4 3 2 (set (reg/v/f:DI 79 [ __a ])
(reg:DI 0 x0 [ __a ])) simd.c:5 34 {*movdi_aarch64}
(expr_list:REG_DEAD (reg:DI 0 x0 [ __a ])
(nil)))
(note 3 2 6 2 NOTE_INSN_FUNCTION_BEG)
(insn 6 3 20 2 (set (reg/v:OI 73 [ __o ])
(subreg:OI (vec_concat:V8SI (vec_concat:V4SI (unspec:V2SI [
(mem:TI (reg/v/f:DI 79 [ __a ]) [0 S16 A8])
] UNSPEC_LD2)
(vec_duplicate:V2SI (const_int 0 [0])))
(vec_concat:V4SI (unspec:V2SI [
(mem:TI (reg/v/f:DI 79 [ __a ]) [0 S16 A8])
] UNSPEC_LD2)
(vec_duplicate:V2SI (const_int 0 [0])))) 0))
simd.c:8 2149 {aarch64_ld2v2si_dreg}
(expr_list:REG_DEAD (reg/v/f:DI 79 [ __a ])
(nil)))
(insn 20 6 21 2 (set (reg:V2SI 32 v0)
(subreg:V2SI (reg/v:OI 73 [ __o ]) 0)) simd.c:12 778
{*aarch64_simd_movv2si}
(nil))
(insn 21 20 22 2 (set (reg:V2SI 33 v1)
(subreg:V2SI (reg/v:OI 73 [ __o ]) 16)) simd.c:12 778
{*aarch64_simd_movv2si}
(expr_list:REG_DEAD (reg/v:OI 73 [ __o ])
(nil)))
(insn 22 21 23 2 (use (reg:V2SI 32 v0)) simd.c:12 -1
(nil))
(insn 23 22 0 2 (use (reg:V2SI 33 v1)) simd.c:12 -1
(nil))
The register allocator considers r73 to conflict with v0, because they
are simultaneously live after insn 20. Without the second use of r73
(e.g. if the write to ret.val[1] is replaced with vdup_n_s32 (0)), the
allocator does do the right thing with the subreg and allocates r73 to
{v0, v1}.
I haven't read all of the old threads relating to Richard's patches
yet, but I don't see why they would affect this issue.
I don't think the register allocator is able to resolve this unless
the conversion between the __builtin_simd type and the int32x4x2_t
type is done as a single operation.
For this piece of code,

#include "arm_neon.h"

int32x2x2_t xvld2_s32 (int32_t *__a)
{
  union { int32x2x2_t __i;
          __builtin_aarch64_simd_oi __o; } __temp;
  __temp.__o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
  return __temp.__i;
}

int32x2x2_t yvld2_s32 (int32_t *__a)
{
  int32x2x2_t ret;
  __builtin_aarch64_simd_oi __o;
  __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
  ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
  return ret;
}

currently my gcc HEAD generates at -O3:

xvld2_s32:
ld2 {v0.2s - v1.2s}, [x0]
sub sp, sp, #64
st1 {v0.16b - v1.16b}, [sp]
ldr x1, [sp]
ldr x0, [sp, 8]
add sp, sp, 64
ins v0.d[0], x1
ins v1.d[0], x0
ret
....
yvld2_s32:
ld2 {v2.2s - v3.2s}, [x0]
orr v1.8b, v3.8b, v3.8b
orr v0.8b, v2.8b, v2.8b
ret

If we use type-punning, unnecessary spills are generated, which is also
incorrect for big-endian because of the way we spill (st1
{v0.16b - v1.16b}, [sp]) and restore. The implementation without
type-punning seems to give a more optimal result. Did your patches
improve on the spills for the type-punning solution?
Post by Charles Baylis
However, type-punning is not possible with the arrays of 64 bit
vectors, as the arrays are not the same size as the corresponding
__builtin_simd types, and any solution for those would probably help
with the q variants too.
That is because we fill a zero-extended D-reg value into a 128-bit reg
and pack those into a large int mode (e.g. OI). We don't have large int
modes made up purely of D-regs because we would run into ambiguities
such as 4 D-regs being an OImode while 2 Q-regs is also an OImode.
Post by Charles Baylis
Maybe the solution is to pass the NEON
intrinsic types directly to the builtins? Is there a reason that it
wasn't done that way before?
How do you mean? Do you mean pass a loaded int32x2x2_t value into a
__builtin? How would that work?

If you mean why we don't pass an int32x2x2_t into a builtin as a
structure, I don't think that would work, as it is a struct type which
would correspond to a BLK mode, but we need RTL patterns with reg-lists
operating on large int modes for the register allocator to allocate
consecutive regs for the reglists.

Thanks,
Tejas.
Charles Baylis
2014-10-08 18:47:15 UTC
Permalink
Charles Baylis
2014-09-18 19:38:28 UTC
Permalink
This patch improves code generation of vld[234]q_* intrinsics by avoiding use
of the __builtin_aarch64_get_qreg_* builtins to generate a temporary result
variable. Instead, a union is used for type-punning, which avoids generation of
some unnecessary move instructions. This idiom is already used in several other
intrinsics.
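
As a minimal illustration of the idiom (taken from the vld2q_s32 hunk in
the diff below), the loaded __builtin_aarch64_simd_oi value is
reinterpreted through a union in one step, rather than copied out one
vector at a time via __builtin_aarch64_get_qregoiv4si:

__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
vld2q_s32 (const int32_t * __a)
{
  union { int32x4x2_t __i;
          __builtin_aarch64_simd_oi __o; } __temp;
  __temp.__o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
  return __temp.__i;
}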

This patch is independent of the previous patches in the series.

Tested (with the rest of the patch series) with make check on aarch64-oe-linux
with qemu, and also causes no regressions in clyon's NEON intrinsics tests.

<DATE> Charles Baylis <***@linaro.org>

* config/aarch64/arm_neon.h (vld2q_s8, vld2q_p8, vld2q_s16, vld2q_p16,
vld2q_s32, vld2q_s64, vld2q_u8, vld2q_u16, vld2q_u32, vld2q_u64,
vld2q_f32, vld2q_f64, vld3q_s8, vld3q_p8, vld3q_s16, vld3q_p16,
vld3q_s32, vld3q_s64, vld3q_u8, vld3q_u16, vld3q_u32, vld3q_u64,
vld3q_f32, vld3q_f64, vld4q_s8, vld4q_p8, vld4q_s16, vld4q_p16,
vld4q_s32, vld4q_s64, vld4q_u8, vld4q_u16, vld4q_u32, vld4q_u64,
vld4q_f32, vld4q_f64): Use type-punning to convert between NEON
intrinsic types and __builtin_aarch64_simd* types.

Change-Id: I61efa29138b13c7a83679885343211d604a73b15
---
gcc/config/aarch64/arm_neon.h | 396 +++++++++++++++---------------------------
1 file changed, 144 insertions(+), 252 deletions(-)

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index c1fcb47..87e3baf 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -16969,133 +16969,109 @@ vld2_f32 (const float32_t * __a)
__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
vld2q_s8 (const int8_t * __a)
{
- int8x16x2_t ret;
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
- ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
- ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
- return ret;
+ union { int8x16x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
+ return __temp.__i;
}

__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
vld2q_p8 (const poly8_t * __a)
{
- poly8x16x2_t ret;
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
- ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
- ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
- return ret;
+ union { poly8x16x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
+ return __temp.__i;
}

__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
vld2q_s16 (const int16_t * __a)
{
- int16x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
- ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
- ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
- return ret;
+ union { int16x8x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
+ return __temp.__i;
}

__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
vld2q_p16 (const poly16_t * __a)
{
- poly16x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
- ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
- ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
- return ret;
+ union { poly16x8x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
+ return __temp.__i;
}

__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
vld2q_s32 (const int32_t * __a)
{
- int32x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
- ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
- ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
- return ret;
+ union { int32x4x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
+ return __temp.__i;
}

__extension__ static __inline int64x2x2_t __attribute__ ((__always_inline__))
vld2q_s64 (const int64_t * __a)
{
- int64x2x2_t ret;
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
- ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
- ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
- return ret;
+ union { int64x2x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
+ return __temp.__i;
}

__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
vld2q_u8 (const uint8_t * __a)
{
- uint8x16x2_t ret;
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
- ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 0);
- ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregoiv16qi (__o, 1);
- return ret;
+ union { uint8x16x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld2v16qi ((const __builtin_aarch64_simd_qi *) __a);
+ return __temp.__i;
}

__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
vld2q_u16 (const uint16_t * __a)
{
- uint16x8x2_t ret;
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
- ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 0);
- ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregoiv8hi (__o, 1);
- return ret;
+ union { uint16x8x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld2v8hi ((const __builtin_aarch64_simd_hi *) __a);
+ return __temp.__i;
}

__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
vld2q_u32 (const uint32_t * __a)
{
- uint32x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
- ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 0);
- ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregoiv4si (__o, 1);
- return ret;
+ union { uint32x4x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld2v4si ((const __builtin_aarch64_simd_si *) __a);
+ return __temp.__i;
}

__extension__ static __inline uint64x2x2_t __attribute__ ((__always_inline__))
vld2q_u64 (const uint64_t * __a)
{
- uint64x2x2_t ret;
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
- ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 0);
- ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregoiv2di (__o, 1);
- return ret;
+ union { uint64x2x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld2v2di ((const __builtin_aarch64_simd_di *) __a);
+ return __temp.__i;
}

__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
vld2q_f32 (const float32_t * __a)
{
- float32x4x2_t ret;
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a);
- ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 0);
- ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregoiv4sf (__o, 1);
- return ret;
+ union { float32x4x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld2v4sf ((const __builtin_aarch64_simd_sf *) __a);
+ return __temp.__i;
}

__extension__ static __inline float64x2x2_t __attribute__ ((__always_inline__))
vld2q_f64 (const float64_t * __a)
{
- float64x2x2_t ret;
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a);
- ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 0);
- ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregoiv2df (__o, 1);
- return ret;
+ union { float64x2x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld2v2df ((const __builtin_aarch64_simd_df *) __a);
+ return __temp.__i;
}

__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
@@ -17245,145 +17221,109 @@ vld3_f32 (const float32_t * __a)
__extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
vld3q_s8 (const int8_t * __a)
{
- int8x16x3_t ret;
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
- ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
- ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
- ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
- return ret;
+ union { int8x16x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+ return __temp.__i;
}

__extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__))
vld3q_p8 (const poly8_t * __a)
{
- poly8x16x3_t ret;
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
- ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
- ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
- ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
- return ret;
+ union { poly8x16x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+ return __temp.__i;
}

__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
vld3q_s16 (const int16_t * __a)
{
- int16x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
- ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
- ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
- ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
- return ret;
+ union { int16x8x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+ return __temp.__i;
}

__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
vld3q_p16 (const poly16_t * __a)
{
- poly16x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
- ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
- ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
- ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
- return ret;
+ union { poly16x8x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+ return __temp.__i;
}

__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
vld3q_s32 (const int32_t * __a)
{
- int32x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
- ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
- ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
- ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
- return ret;
+ union { int32x4x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
+ return __temp.__i;
}

__extension__ static __inline int64x2x3_t __attribute__ ((__always_inline__))
vld3q_s64 (const int64_t * __a)
{
- int64x2x3_t ret;
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
- ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
- ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
- ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
- return ret;
+ union { int64x2x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
+ return __temp.__i;
}

__extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__))
vld3q_u8 (const uint8_t * __a)
{
- uint8x16x3_t ret;
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
- ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 0);
- ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 1);
- ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregciv16qi (__o, 2);
- return ret;
+ union { uint8x16x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld3v16qi ((const __builtin_aarch64_simd_qi *) __a);
+ return __temp.__i;
}

__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
vld3q_u16 (const uint16_t * __a)
{
- uint16x8x3_t ret;
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
- ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 0);
- ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 1);
- ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregciv8hi (__o, 2);
- return ret;
+ union { uint16x8x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld3v8hi ((const __builtin_aarch64_simd_hi *) __a);
+ return __temp.__i;
}

__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
vld3q_u32 (const uint32_t * __a)
{
- uint32x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
- ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 0);
- ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 1);
- ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregciv4si (__o, 2);
- return ret;
+ union { uint32x4x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld3v4si ((const __builtin_aarch64_simd_si *) __a);
+ return __temp.__i;
}

__extension__ static __inline uint64x2x3_t __attribute__ ((__always_inline__))
vld3q_u64 (const uint64_t * __a)
{
- uint64x2x3_t ret;
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
- ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 0);
- ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 1);
- ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregciv2di (__o, 2);
- return ret;
+ union { uint64x2x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld3v2di ((const __builtin_aarch64_simd_di *) __a);
+ return __temp.__i;
}

__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
vld3q_f32 (const float32_t * __a)
{
- float32x4x3_t ret;
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a);
- ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 0);
- ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 1);
- ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregciv4sf (__o, 2);
- return ret;
+ union { float32x4x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld3v4sf ((const __builtin_aarch64_simd_sf *) __a);
+ return __temp.__i;
}

__extension__ static __inline float64x2x3_t __attribute__ ((__always_inline__))
vld3q_f64 (const float64_t * __a)
{
- float64x2x3_t ret;
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a);
- ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 0);
- ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 1);
- ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregciv2df (__o, 2);
- return ret;
+ union { float64x2x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld3v2df ((const __builtin_aarch64_simd_df *) __a);
+ return __temp.__i;
}

__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
@@ -17545,157 +17485,109 @@ vld4_f32 (const float32_t * __a)
__extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
vld4q_s8 (const int8_t * __a)
{
- int8x16x4_t ret;
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
- ret.val[0] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
- ret.val[1] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
- ret.val[2] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
- ret.val[3] = (int8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
- return ret;
+ union { int8x16x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
+ return __temp.__i;
}

__extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__))
vld4q_p8 (const poly8_t * __a)
{
- poly8x16x4_t ret;
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
- ret.val[0] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
- ret.val[1] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
- ret.val[2] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
- ret.val[3] = (poly8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
- return ret;
+ union { poly8x16x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
+ return __temp.__i;
}

__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
vld4q_s16 (const int16_t * __a)
{
- int16x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
- ret.val[0] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
- ret.val[1] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
- ret.val[2] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
- ret.val[3] = (int16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
- return ret;
+ union { int16x8x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
+ return __temp.__i;
}

__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
vld4q_p16 (const poly16_t * __a)
{
- poly16x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
- ret.val[0] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
- ret.val[1] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
- ret.val[2] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
- ret.val[3] = (poly16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
- return ret;
+ union { poly16x8x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
+ return __temp.__i;
}

__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
vld4q_s32 (const int32_t * __a)
{
- int32x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
- ret.val[0] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
- ret.val[1] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
- ret.val[2] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
- ret.val[3] = (int32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
- return ret;
+ union { int32x4x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
+ return __temp.__i;
}

__extension__ static __inline int64x2x4_t __attribute__ ((__always_inline__))
vld4q_s64 (const int64_t * __a)
{
- int64x2x4_t ret;
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
- ret.val[0] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
- ret.val[1] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
- ret.val[2] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
- ret.val[3] = (int64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
- return ret;
+ union { int64x2x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
+ return __temp.__i;
}

__extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__))
vld4q_u8 (const uint8_t * __a)
{
- uint8x16x4_t ret;
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
- ret.val[0] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 0);
- ret.val[1] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 1);
- ret.val[2] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 2);
- ret.val[3] = (uint8x16_t) __builtin_aarch64_get_qregxiv16qi (__o, 3);
- return ret;
+ union { uint8x16x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld4v16qi ((const __builtin_aarch64_simd_qi *) __a);
+ return __temp.__i;
}

__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
vld4q_u16 (const uint16_t * __a)
{
- uint16x8x4_t ret;
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
- ret.val[0] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 0);
- ret.val[1] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 1);
- ret.val[2] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 2);
- ret.val[3] = (uint16x8_t) __builtin_aarch64_get_qregxiv8hi (__o, 3);
- return ret;
+ union { uint16x8x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld4v8hi ((const __builtin_aarch64_simd_hi *) __a);
+ return __temp.__i;
}

__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
vld4q_u32 (const uint32_t * __a)
{
- uint32x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
- ret.val[0] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 0);
- ret.val[1] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 1);
- ret.val[2] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 2);
- ret.val[3] = (uint32x4_t) __builtin_aarch64_get_qregxiv4si (__o, 3);
- return ret;
+ union { uint32x4x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld4v4si ((const __builtin_aarch64_simd_si *) __a);
+ return __temp.__i;
}

__extension__ static __inline uint64x2x4_t __attribute__ ((__always_inline__))
vld4q_u64 (const uint64_t * __a)
{
- uint64x2x4_t ret;
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
- ret.val[0] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 0);
- ret.val[1] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 1);
- ret.val[2] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 2);
- ret.val[3] = (uint64x2_t) __builtin_aarch64_get_qregxiv2di (__o, 3);
- return ret;
+ union { uint64x2x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld4v2di ((const __builtin_aarch64_simd_di *) __a);
+ return __temp.__i;
}

__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
vld4q_f32 (const float32_t * __a)
{
- float32x4x4_t ret;
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_ld4v4sf ((const __builtin_aarch64_simd_sf *) __a);
- ret.val[0] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 0);
- ret.val[1] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 1);
- ret.val[2] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 2);
- ret.val[3] = (float32x4_t) __builtin_aarch64_get_qregxiv4sf (__o, 3);
- return ret;
+ union { float32x4x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld4v4sf ((const __builtin_aarch64_simd_sf *) __a);
+ return __temp.__i;
}

__extension__ static __inline float64x2x4_t __attribute__ ((__always_inline__))
vld4q_f64 (const float64_t * __a)
{
- float64x2x4_t ret;
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_ld4v2df ((const __builtin_aarch64_simd_df *) __a);
- ret.val[0] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 0);
- ret.val[1] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 1);
- ret.val[2] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 2);
- ret.val[3] = (float64x2_t) __builtin_aarch64_get_qregxiv2df (__o, 3);
- return ret;
+ union { float64x2x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp;
+ __temp.__o = __builtin_aarch64_ld4v2df ((const __builtin_aarch64_simd_df *) __a);
+ return __temp.__i;
}

/* vmax */
--
1.9.1
Charles Baylis
2014-09-18 19:38:29 UTC
Permalink
This patch improves code generation of vst[234]q_* intrinsics by avoiding use
of the __builtin_aarch64_set_qreg_* builtins to generate a temporary
__builtin_aarch64_simd_XX variable. Instead, a union is used for type-punning,
which avoids generation of some unnecessary move instructions. This idiom is
already used in several other intrinsics.
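
As a minimal illustration of the store-side idiom (matching the
vst2q_s32 hunk in the diff below), the incoming structure initialises
the union's first member and the store builtin then reads the aliased
__builtin_aarch64_simd_oi value:

__extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_s32 (int32_t * __a, int32x4x2_t val)
{
  union { int32x4x2_t __i;
          __builtin_aarch64_simd_oi __o; } __temp = { val };
  __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
}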

This patch is independent of the previous patches in the series.

Tested (with the rest of the patch series) with make check on aarch64-oe-linux
with qemu, and also causes no regressions in clyon's NEON intrinsics tests.

<DATE> Charles Baylis <***@linaro.org>

* config/aarch64/arm_neon.h (vst2q_s8, vst2q_p8, vst2q_s16, vst2q_p16,
vst2q_s32, vst2q_s64, vst2q_u8, vst2q_u16, vst2q_u32, vst2q_u64,
vst2q_f32, vst2q_f64, vst3q_s8, vst3q_p8, vst3q_s16, vst3q_p16,
vst3q_s32, vst3q_s64, vst3q_u8, vst3q_u16, vst3q_u32, vst3q_u64,
vst3q_f32, vst3q_f64, vst4q_s8, vst4q_p8, vst4q_s16, vst4q_p16,
vst4q_s32, vst4q_s64, vst4q_u8, vst4q_u16, vst4q_u32, vst4q_u64,
vst4q_f32, vst4q_f64): Use type-punning to convert between NEON
intrinsic types and __builtin_aarch64_simd* types.

Change-Id: I789c68fc8d9458638eb00a15ffa28073bdc969a8
---
gcc/config/aarch64/arm_neon.h | 288 ++++++++++++++++--------------------------
1 file changed, 108 insertions(+), 180 deletions(-)

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 87e3baf..3292ce0 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -22493,109 +22493,97 @@ vst2_f32 (float32_t * __a, float32x2x2_t val)
__extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_s8 (int8_t * __a, int8x16x2_t val)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
- __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+ union { int8x16x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp = { val };
+ __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_p8 (poly8_t * __a, poly8x16x2_t val)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
- __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+ union { poly8x16x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp = { val };
+ __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_s16 (int16_t * __a, int16x8x2_t val)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
- __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+ union { int16x8x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp = { val };
+ __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_p16 (poly16_t * __a, poly16x8x2_t val)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
- __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+ union { poly16x8x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp = { val };
+ __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_s32 (int32_t * __a, int32x4x2_t val)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
- __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+ union { int32x4x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp = { val };
+ __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_s64 (int64_t * __a, int64x2x2_t val)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
- __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+ union { int64x2x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp = { val };
+ __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_u8 (uint8_t * __a, uint8x16x2_t val)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv16qi (__o, (int8x16_t) val.val[1], 1);
- __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+ union { uint8x16x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp = { val };
+ __builtin_aarch64_st2v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_u16 (uint16_t * __a, uint16x8x2_t val)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv8hi (__o, (int16x8_t) val.val[1], 1);
- __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+ union { uint16x8x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp = { val };
+ __builtin_aarch64_st2v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_u32 (uint32_t * __a, uint32x4x2_t val)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) val.val[1], 1);
- __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __o);
+ union { uint32x4x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp = { val };
+ __builtin_aarch64_st2v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_u64 (uint64_t * __a, uint64x2x2_t val)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv2di (__o, (int64x2_t) val.val[1], 1);
- __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
+ union { uint64x2x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp = { val };
+ __builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_f32 (float32_t * __a, float32x4x2_t val)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv4sf (__o, (float32x4_t) val.val[1], 1);
- __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+ union { float32x4x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp = { val };
+ __builtin_aarch64_st2v4sf ((__builtin_aarch64_simd_sf *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_f64 (float64_t * __a, float64x2x2_t val)
{
- __builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv2df (__o, (float64x2_t) val.val[1], 1);
- __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __o);
+ union { float64x2x2_t __i;
+ __builtin_aarch64_simd_oi __o; } __temp = { val };
+ __builtin_aarch64_st2v2df ((__builtin_aarch64_simd_df *) __a, __temp.__o);
}

__extension__ static __inline void
@@ -22769,121 +22757,97 @@ vst3_f32 (float32_t * __a, float32x2x3_t val)
__extension__ static __inline void __attribute__ ((__always_inline__))
vst3q_s8 (int8_t * __a, int8x16x3_t val)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
- __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+ union { int8x16x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp = { val };
+ __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst3q_p8 (poly8_t * __a, poly8x16x3_t val)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
- __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+ union { poly8x16x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp = { val };
+ __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst3q_s16 (int16_t * __a, int16x8x3_t val)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
- __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+ union { int16x8x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp = { val };
+ __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst3q_p16 (poly16_t * __a, poly16x8x3_t val)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
- __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+ union { poly16x8x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp = { val };
+ __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst3q_s32 (int32_t * __a, int32x4x3_t val)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
- __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+ union { int32x4x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp = { val };
+ __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst3q_s64 (int64_t * __a, int64x2x3_t val)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
- __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+ union { int64x2x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp = { val };
+ __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst3q_u8 (uint8_t * __a, uint8x16x3_t val)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregciv16qi (__o, (int8x16_t) val.val[2], 2);
- __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+ union { uint8x16x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp = { val };
+ __builtin_aarch64_st3v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst3q_u16 (uint16_t * __a, uint16x8x3_t val)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregciv8hi (__o, (int16x8_t) val.val[2], 2);
- __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+ union { uint16x8x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp = { val };
+ __builtin_aarch64_st3v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst3q_u32 (uint32_t * __a, uint32x4x3_t val)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) val.val[2], 2);
- __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __o);
+ union { uint32x4x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp = { val };
+ __builtin_aarch64_st3v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst3q_u64 (uint64_t * __a, uint64x2x3_t val)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregciv2di (__o, (int64x2_t) val.val[2], 2);
- __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
+ union { uint64x2x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp = { val };
+ __builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst3q_f32 (float32_t * __a, float32x4x3_t val)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregciv4sf (__o, (float32x4_t) val.val[2], 2);
- __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+ union { float32x4x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp = { val };
+ __builtin_aarch64_st3v4sf ((__builtin_aarch64_simd_sf *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst3q_f64 (float64_t * __a, float64x2x3_t val)
{
- __builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregciv2df (__o, (float64x2_t) val.val[2], 2);
- __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __o);
+ union { float64x2x3_t __i;
+ __builtin_aarch64_simd_ci __o; } __temp = { val };
+ __builtin_aarch64_st3v2df ((__builtin_aarch64_simd_df *) __a, __temp.__o);
}

__extension__ static __inline void
@@ -23081,133 +23045,97 @@ vst4_f32 (float32_t * __a, float32x2x4_t val)
__extension__ static __inline void __attribute__ ((__always_inline__))
vst4q_s8 (int8_t * __a, int8x16x4_t val)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
- __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+ union { int8x16x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp = { val };
+ __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst4q_p8 (poly8_t * __a, poly8x16x4_t val)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
- __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+ union { poly8x16x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp = { val };
+ __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst4q_s16 (int16_t * __a, int16x8x4_t val)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
- __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+ union { int16x8x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp = { val };
+ __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst4q_p16 (poly16_t * __a, poly16x8x4_t val)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
- __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+ union { poly16x8x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp = { val };
+ __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst4q_s32 (int32_t * __a, int32x4x4_t val)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3);
- __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
+ union { int32x4x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp = { val };
+ __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst4q_s64 (int64_t * __a, int64x2x4_t val)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3);
- __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
+ union { int64x2x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp = { val };
+ __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst4q_u8 (uint8_t * __a, uint8x16x4_t val)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv16qi (__o, (int8x16_t) val.val[3], 3);
- __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __o);
+ union { uint8x16x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp = { val };
+ __builtin_aarch64_st4v16qi ((__builtin_aarch64_simd_qi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst4q_u16 (uint16_t * __a, uint16x8x4_t val)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv8hi (__o, (int16x8_t) val.val[3], 3);
- __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __o);
+ union { uint16x8x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp = { val };
+ __builtin_aarch64_st4v8hi ((__builtin_aarch64_simd_hi *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst4q_u32 (uint32_t * __a, uint32x4x4_t val)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) val.val[3], 3);
- __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __o);
+ union { uint32x4x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp = { val };
+ __builtin_aarch64_st4v4si ((__builtin_aarch64_simd_si *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst4q_u64 (uint64_t * __a, uint64x2x4_t val)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv2di (__o, (int64x2_t) val.val[3], 3);
- __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
+ union { uint64x2x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp = { val };
+ __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst4q_f32 (float32_t * __a, float32x4x4_t val)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv4sf (__o, (float32x4_t) val.val[3], 3);
- __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __o);
+ union { float32x4x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp = { val };
+ __builtin_aarch64_st4v4sf ((__builtin_aarch64_simd_sf *) __a, __temp.__o);
}

__extension__ static __inline void __attribute__ ((__always_inline__))
vst4q_f64 (float64_t * __a, float64x2x4_t val)
{
- __builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv2df (__o, (float64x2_t) val.val[3], 3);
- __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __o);
+ union { float64x2x4_t __i;
+ __builtin_aarch64_simd_xi __o; } __temp = { val };
+ __builtin_aarch64_st4v2df ((__builtin_aarch64_simd_df *) __a, __temp.__o);
}

/* vsub */
--
1.9.1