[PATCH 0/2] [AARCH64,NEON] Improve vld[234](q?)_lane intrinsics v2
c***@linaro.org
2014-10-08 17:27:14 UTC
From: Charles Baylis <***@linaro.org>

This patch series converts the vld[234](q?)_lane intrinsics to use builtin
functions instead of the previous inline assembler syntax.
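(For reference, a minimal usage sketch of one of the affected intrinsics;
illustrative only, not part of the series, and the wrapper function name is
hypothetical.)

#include <arm_neon.h>

/* vld2q_lane_s16 loads one pair of int16_t values from ptr into lane 3 of
   a de-interleaved pair of vectors, leaving the other lanes of src
   unchanged.  The lane index must be a compile-time constant.  */
int16x8x2_t
load_pair_lane3 (const int16_t *ptr, int16x8x2_t src)
{
  return vld2q_lane_s16 (ptr, src, 3);
}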

Changes since v1:
. the type-punning to change between the array of vector types and the internal
builtin types has been removed, as this is a separate, more complex problem.
(patches 3&4 dropped, patch 2 reworked)
. iterator style cleanups (patch 1)
. removed the broken big-endian lane number conversion (patch 1)

Tested with make check on aarch64-oe-linux with qemu, and also passes clyon's
NEON intrinsics tests.


Charles Baylis (2):
[AARCH64,NEON] Add patterns + builtins for vld[234](q?)_lane_*
intrinsics
[AARCH64,NEON] Convert arm_neon.h to use new builtins for
vld[234](q?)_lane_*

gcc/config/aarch64/aarch64-builtins.c | 5 +
gcc/config/aarch64/aarch64-simd-builtins.def | 4 +
gcc/config/aarch64/aarch64-simd.md | 95 +++++++
gcc/config/aarch64/aarch64.md | 3 +
gcc/config/aarch64/arm_neon.h | 377 ++++++++++++++++++---------
5 files changed, 362 insertions(+), 122 deletions(-)
--
1.9.1
c***@linaro.org
2014-10-08 17:27:15 UTC
From: Charles Baylis <***@linaro.org>

This patch adds new patterns and builtins to represent single-lane structure
load instructions, which will be used to implement the vld[234](q?)_lane_*
intrinsics.
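
As a sketch of how the new builtins are driven (this mirrors their use in
arm_neon.h in patch 2; the wrapper function here is hypothetical, but the
builtin names, tuple type and lane bound are taken from the patch):

#include <arm_neon.h>

/* Load lane 1 of a 2-vector int32x4 tuple via the new v4si builtin.  The
   OI mode carries the 2x128-bit register tuple; the lane index must be a
   constant in [0, 4), which the aarch64_ld2_lanev4si expander checks via
   aarch64_simd_lane_bounds.  */
static int32x4x2_t
ld2_lane1_s32_sketch (const int32_t *ptr, int32x4x2_t b)
{
  __builtin_aarch64_simd_oi __o;
  __o = __builtin_aarch64_set_qregoiv4si (__o, b.val[0], 0);
  __o = __builtin_aarch64_set_qregoiv4si (__o, b.val[1], 1);
  __o = __builtin_aarch64_ld2_lanev4si ((__builtin_aarch64_simd_si *) ptr,
					__o, 1);
  b.val[0] = __builtin_aarch64_get_qregoiv4si (__o, 0);
  b.val[1] = __builtin_aarch64_get_qregoiv4si (__o, 1);
  return b;
}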

Tested (with the rest of the patch series) with make check on aarch64-oe-linux
with qemu, and also causes no regressions in clyon's NEON intrinsics tests.

<DATE> Charles Baylis <***@linaro.org>
* config/aarch64/aarch64-builtins.c
(aarch64_types_loadstruct_lane_qualifiers): Define.
* config/aarch64/aarch64-simd-builtins.def (ld2_lane, ld3_lane,
ld4_lane): New builtins.
* config/aarch64/aarch64-simd.md (vec_load_lanesoi_lane<mode>): New
pattern.
(vec_load_lanesci_lane<mode>): Likewise.
(vec_load_lanesxi_lane<mode>): Likewise.
(aarch64_ld2_lane<mode>): New expand.
(aarch64_ld3_lane<mode>): Likewise.
(aarch64_ld4_lane<mode>): Likewise.
* config/aarch64/aarch64.md (define_c_enum "unspec"): Add
UNSPEC_LD2_LANE, UNSPEC_LD3_LANE, UNSPEC_LD4_LANE.
---
gcc/config/aarch64/aarch64-builtins.c | 5 ++
gcc/config/aarch64/aarch64-simd-builtins.def | 4 ++
gcc/config/aarch64/aarch64-simd.md | 95 ++++++++++++++++++++++++++++
gcc/config/aarch64/aarch64.md | 3 +
4 files changed, 107 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 3dba1b2..368d3a7 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -201,6 +201,11 @@ aarch64_types_load1_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_none, qualifier_const_pointer_map_mode };
#define TYPES_LOAD1 (aarch64_types_load1_qualifiers)
#define TYPES_LOADSTRUCT (aarch64_types_load1_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_loadstruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_none, qualifier_const_pointer_map_mode,
+ qualifier_none, qualifier_none };
+#define TYPES_LOADSTRUCT_LANE (aarch64_types_loadstruct_lane_qualifiers)

static enum aarch64_type_qualifiers
aarch64_types_bsl_p_qualifiers[SIMD_MAX_BUILTIN_ARGS]
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 2367436..348f0d2 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -83,6 +83,10 @@
BUILTIN_VQ (LOADSTRUCT, ld2, 0)
BUILTIN_VQ (LOADSTRUCT, ld3, 0)
BUILTIN_VQ (LOADSTRUCT, ld4, 0)
+ /* Implemented by aarch64_ld<VSTRUCT:nregs>_lane<VQ:mode>. */
+ BUILTIN_VQ (LOADSTRUCT_LANE, ld2_lane, 0)
+ BUILTIN_VQ (LOADSTRUCT_LANE, ld3_lane, 0)
+ BUILTIN_VQ (LOADSTRUCT_LANE, ld4_lane, 0)
/* Implemented by aarch64_st<VSTRUCT:nregs><VDC:mode>. */
BUILTIN_VDC (STORESTRUCT, st2, 0)
BUILTIN_VDC (STORESTRUCT, st3, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index cab26a3..ff71291 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3991,6 +3991,18 @@
[(set_attr "type" "neon_load2_2reg<q>")]
)

+(define_insn "vec_load_lanesoi_lane<mode>"
+ [(set (match_operand:OI 0 "register_operand" "=w")
+ (unspec:OI [(match_operand:<V_TWO_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
+ (match_operand:OI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY) ]
+ UNSPEC_LD2_LANE))]
+ "TARGET_SIMD"
+ "ld2\\t{%S0.<Vetype> - %T0.<Vetype>}[%3], %1"
+ [(set_attr "type" "neon_load2_one_lane")]
+)
+
(define_insn "vec_store_lanesoi<mode>"
[(set (match_operand:OI 0 "aarch64_simd_struct_operand" "=Utv")
(unspec:OI [(match_operand:OI 1 "register_operand" "w")
@@ -4022,6 +4034,18 @@
[(set_attr "type" "neon_load3_3reg<q>")]
)

+(define_insn "vec_load_lanesci_lane<mode>"
+ [(set (match_operand:CI 0 "register_operand" "=w")
+ (unspec:CI [(match_operand:<V_THREE_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
+ (match_operand:CI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_LD3_LANE))]
+ "TARGET_SIMD"
+ "ld3\\t{%S0.<Vetype> - %U0.<Vetype>}[%3], %1"
+ [(set_attr "type" "neon_load3_one_lane")]
+)
+
(define_insn "vec_store_lanesci<mode>"
[(set (match_operand:CI 0 "aarch64_simd_struct_operand" "=Utv")
(unspec:CI [(match_operand:CI 1 "register_operand" "w")
@@ -4053,6 +4077,18 @@
[(set_attr "type" "neon_load4_4reg<q>")]
)

+(define_insn "vec_load_lanesxi_lane<mode>"
+ [(set (match_operand:XI 0 "register_operand" "=w")
+ (unspec:XI [(match_operand:<V_FOUR_ELEM> 1 "aarch64_simd_struct_operand" "Utv")
+ (match_operand:XI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_LD4_LANE))]
+ "TARGET_SIMD"
+ "ld4\\t{%S0.<Vetype> - %V0.<Vetype>}[%3], %1"
+ [(set_attr "type" "neon_load4_one_lane")]
+)
+
(define_insn "vec_store_lanesxi<mode>"
[(set (match_operand:XI 0 "aarch64_simd_struct_operand" "=Utv")
(unspec:XI [(match_operand:XI 1 "register_operand" "w")
@@ -4366,6 +4402,65 @@
DONE;
})

+(define_expand "aarch64_ld2_lane<mode>"
+ [(match_operand:OI 0 "register_operand" "=w")
+ (match_operand:DI 1 "register_operand" "w")
+ (match_operand:OI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_SIMD"
+{
+ enum machine_mode mode = <V_TWO_ELEM>mode;
+ rtx mem = gen_rtx_MEM (mode, operands[1]);
+
+ aarch64_simd_lane_bounds (operands[3], 0, GET_MODE_NUNITS (<VCONQ>mode));
+ emit_insn (gen_vec_load_lanesoi_lane<mode> (operands[0],
+ mem,
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_expand "aarch64_ld3_lane<mode>"
+ [(match_operand:CI 0 "register_operand" "=w")
+ (match_operand:DI 1 "register_operand" "w")
+ (match_operand:CI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_SIMD"
+{
+ enum machine_mode mode = <V_THREE_ELEM>mode;
+ rtx mem = gen_rtx_MEM (mode, operands[1]);
+
+ aarch64_simd_lane_bounds (operands[3], 0, GET_MODE_NUNITS (<VCONQ>mode));
+ emit_insn (gen_vec_load_lanesci_lane<mode> (operands[0],
+ mem,
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+(define_expand "aarch64_ld4_lane<mode>"
+ [(match_operand:XI 0 "register_operand" "=w")
+ (match_operand:DI 1 "register_operand" "w")
+ (match_operand:XI 2 "register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_SIMD"
+{
+ enum machine_mode mode = <V_FOUR_ELEM>mode;
+ rtx mem = gen_rtx_MEM (mode, operands[1]);
+
+ aarch64_simd_lane_bounds (operands[3], 0, GET_MODE_NUNITS (<VCONQ>mode));
+ emit_insn (gen_vec_load_lanesxi_lane<mode> (operands[0],
+ mem,
+ operands[2],
+ operands[3]));
+ DONE;
+})
+
+
+
;; Expanders for builtins to extract vector registers from large
;; opaque integer modes.

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 74b554e..6b5f51f 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -92,6 +92,9 @@
UNSPEC_LD2
UNSPEC_LD3
UNSPEC_LD4
+ UNSPEC_LD2_LANE
+ UNSPEC_LD3_LANE
+ UNSPEC_LD4_LANE
UNSPEC_MB
UNSPEC_NOP
UNSPEC_PRLG_STK
--
1.9.1
Tejas Belagod
2014-10-09 15:03:04 UTC
Post by c***@linaro.org
+(define_insn "vec_load_lanesoi_lane<mode>"
Best to prepend "aarch64_" to the pattern name, IMHO, else it looks like a
standard pattern name (e.g. vec_load_lanes<m><n>) at first glance.

Otherwise, LGTM (but I can't approve it). Thanks for this patch.

Thanks,
Tejas.
c***@linaro.org
2014-10-08 17:27:16 UTC
From: Charles Baylis <***@linaro.org>

This patch replaces the inline assembler implementations of the
vld[234](q?)_lane_* intrinsics with new versions that use the new builtin
functions added in patch 1.
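
The user-visible behaviour is intended to be unchanged; a usage sketch
(illustrative only, the function name is hypothetical):

#include <arm_neon.h>

/* vld4_lane_u8 replaces lane 5 of each of the four uint8x8_t vectors in v
   with four consecutive bytes loaded from p.  With the builtin-based
   implementation the compiler allocates registers itself, rather than the
   old inline asm's hard-coded v16-v19 and its ld1/st1 round trip through
   memory.  */
uint8x8x4_t
update_lane5 (const uint8_t *p, uint8x8x4_t v)
{
  return vld4_lane_u8 (p, v, 5);
}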

Tested (with the rest of the patch series) with make check on aarch64-oe-linux
with qemu, and also causes no regressions in clyon's NEON intrinsics tests.

<DATE> Charles Baylis <***@linaro.org>

* config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Rewrite using builtins,
update uses to use new macro arguments.
(__LD3_LANE_FUNC): Likewise.
(__LD4_LANE_FUNC): Likewise.

Change-Id: I3bd5934b5c4f6127088193c1ab12848144d5540a
---
gcc/config/aarch64/arm_neon.h | 377 ++++++++++++++++++++++++++++--------------
1 file changed, 255 insertions(+), 122 deletions(-)

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 9b1873f..19ce261 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -11805,47 +11805,83 @@ __LD2R_FUNC (uint16x8x2_t, uint16x2_t, uint16_t, 8h, u16, q)
__LD2R_FUNC (uint32x4x2_t, uint32x2_t, uint32_t, 4s, u32, q)
__LD2R_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, 2d, u64, q)

-#define __LD2_LANE_FUNC(rettype, ptrtype, regsuffix, \
- lnsuffix, funcsuffix, Q) \
- __extension__ static __inline rettype \
- __attribute__ ((__always_inline__)) \
- vld2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \
- rettype b, const int c) \
- { \
- rettype result; \
- __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \
- "ld2 {v16." #lnsuffix ", v17." #lnsuffix "}[%3], %2\n\t" \
- "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t" \
- : "=Q"(result) \
- : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \
- : "memory", "v16", "v17"); \
- return result; \
- }
-
-__LD2_LANE_FUNC (int8x8x2_t, uint8_t, 8b, b, s8,)
-__LD2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,)
-__LD2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,)
-__LD2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,)
-__LD2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,)
-__LD2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,)
-__LD2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,)
-__LD2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,)
-__LD2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,)
-__LD2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,)
-__LD2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,)
-__LD2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,)
-__LD2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q)
-__LD2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q)
-__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q)
-__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q)
-__LD2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q)
-__LD2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q)
-__LD2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q)
-__LD2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q)
-__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q)
-__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q)
-__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q)
-__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
+#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, \
+ mode, ptrmode, funcsuffix, signedtype) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ __builtin_aarch64_simd_oi __o; \
+ largetype __temp; \
+ __temp.val[0] = \
+ vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
+ __temp.val[1] = \
+ vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
+ __o = __builtin_aarch64_set_qregoi##mode (__o, \
+ (signedtype) __temp.val[0], \
+ 0); \
+ __o = __builtin_aarch64_set_qregoi##mode (__o, \
+ (signedtype) __temp.val[1], \
+ 1); \
+ __o = __builtin_aarch64_ld2_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+ __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \
+ __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \
+ return __b; \
+}
+
+__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf,
+ sf, f32, float32x4_t)
+__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, v2df,
+ df, f64, float64x2_t)
+__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v16qi, qi, p8,
+ int8x16_t)
+__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v8hi, hi,
+ p16, int16x8_t)
+__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v16qi, qi, s8,
+ int8x16_t)
+__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v8hi, hi, s16,
+ int16x8_t)
+__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v4si, si, s32,
+ int32x4_t)
+__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, v2di, di, s64,
+ int64x2_t)
+__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v16qi, qi, u8,
+ int8x16_t)
+__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v8hi, hi,
+ u16, int16x8_t)
+__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v4si, si,
+ u32, int32x4_t)
+__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, v2di, di,
+ u64, int64x2_t)
+
+#undef __LD2_LANE_FUNC
+#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ __builtin_aarch64_simd_oi __o; \
+ intype ret; \
+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[0], 0); \
+ __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __b.val[1], 1); \
+ __o = __builtin_aarch64_ld2_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+ ret.val[0] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 0); \
+ ret.val[1] = (vtype) __builtin_aarch64_get_qregoiv4si (__o, 1); \
+ return ret; \
+}
+
+__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
+__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
+__LD2_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64)
+__LD2_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)

#define __LD3R_FUNC(rettype, structtype, ptrtype, \
regsuffix, funcsuffix, Q) \
@@ -11887,47 +11923,91 @@ __LD3R_FUNC (uint16x8x3_t, uint16x3_t, uint16_t, 8h, u16, q)
__LD3R_FUNC (uint32x4x3_t, uint32x3_t, uint32_t, 4s, u32, q)
__LD3R_FUNC (uint64x2x3_t, uint64x3_t, uint64_t, 2d, u64, q)

-#define __LD3_LANE_FUNC(rettype, ptrtype, regsuffix, \
- lnsuffix, funcsuffix, Q) \
- __extension__ static __inline rettype \
- __attribute__ ((__always_inline__)) \
- vld3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \
- rettype b, const int c) \
- { \
- rettype result; \
- __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t" \
- "ld3 {v16." #lnsuffix " - v18." #lnsuffix "}[%3], %2\n\t" \
- "st1 {v16." #regsuffix " - v18." #regsuffix "}, %0\n\t" \
- : "=Q"(result) \
- : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \
- : "memory", "v16", "v17", "v18"); \
- return result; \
- }
-
-__LD3_LANE_FUNC (int8x8x3_t, uint8_t, 8b, b, s8,)
-__LD3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,)
-__LD3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,)
-__LD3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,)
-__LD3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,)
-__LD3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,)
-__LD3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,)
-__LD3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,)
-__LD3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,)
-__LD3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,)
-__LD3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,)
-__LD3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,)
-__LD3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q)
-__LD3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q)
-__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q)
-__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q)
-__LD3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q)
-__LD3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q)
-__LD3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q)
-__LD3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q)
-__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q)
-__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q)
-__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q)
-__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
+#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, \
+ mode, ptrmode, funcsuffix, signedtype) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ __builtin_aarch64_simd_ci __o; \
+ largetype __temp; \
+ __temp.val[0] = \
+ vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
+ __temp.val[1] = \
+ vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
+ __temp.val[2] = \
+ vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
+ __o = __builtin_aarch64_set_qregci##mode (__o, \
+ (signedtype) __temp.val[0], \
+ 0); \
+ __o = __builtin_aarch64_set_qregci##mode (__o, \
+ (signedtype) __temp.val[1], \
+ 1); \
+ __o = __builtin_aarch64_set_qregci##mode (__o, \
+ (signedtype) __temp.val[2], \
+ 2); \
+ __o = __builtin_aarch64_ld3_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+ __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \
+ __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \
+ __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \
+ return __b; \
+}
+
+__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf,
+ sf, f32, float32x4_t)
+__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, v2df,
+ df, f64, float64x2_t)
+__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v16qi, qi, p8,
+ int8x16_t)
+__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v8hi, hi,
+ p16, int16x8_t)
+__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v16qi, qi, s8,
+ int8x16_t)
+__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v8hi, hi, s16,
+ int16x8_t)
+__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v4si, si, s32,
+ int32x4_t)
+__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, v2di, di, s64,
+ int64x2_t)
+__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v16qi, qi, u8,
+ int8x16_t)
+__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v8hi, hi,
+ u16, int16x8_t)
+__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v4si, si,
+ u32, int32x4_t)
+__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, v2di, di,
+ u64, int64x2_t)
+
+#undef __LD3_LANE_FUNC
+#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ __builtin_aarch64_simd_ci __o; \
+ intype ret; \
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[0], 0); \
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[1], 1); \
+ __o = __builtin_aarch64_set_qregciv4si (__o, (int32x4_t) __b.val[2], 2); \
+ __o = __builtin_aarch64_ld3_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+ ret.val[0] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 0); \
+ ret.val[1] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 1); \
+ ret.val[2] = (vtype) __builtin_aarch64_get_qregciv4si (__o, 2); \
+ return ret; \
+}
+
+__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
+__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
+__LD3_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64)
+__LD3_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)

#define __LD4R_FUNC(rettype, structtype, ptrtype, \
regsuffix, funcsuffix, Q) \
@@ -11969,47 +12049,100 @@ __LD4R_FUNC (uint16x8x4_t, uint16x4_t, uint16_t, 8h, u16, q)
__LD4R_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, 4s, u32, q)
__LD4R_FUNC (uint64x2x4_t, uint64x4_t, uint64_t, 2d, u64, q)

-#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix, \
- lnsuffix, funcsuffix, Q) \
- __extension__ static __inline rettype \
- __attribute__ ((__always_inline__)) \
- vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \
- rettype b, const int c) \
- { \
- rettype result; \
- __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \
- "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t" \
- "st1 {v16." #regsuffix " - v19." #regsuffix "}, %0\n\t" \
- : "=Q"(result) \
- : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \
- : "memory", "v16", "v17", "v18", "v19"); \
- return result; \
- }

-__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,)
-__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,)
-__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,)
-__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,)
-__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,)
-__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,)
-__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,)
-__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,)
-__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,)
-__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,)
-__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,)
-__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,)
-__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q)
-__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q)
-__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q)
-__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q)
-__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q)
-__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q)
-__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q)
-__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q)
-__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q)
-__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
-__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
-__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
+#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, \
+ mode, ptrmode, funcsuffix, signedtype) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ __builtin_aarch64_simd_xi __o; \
+ largetype __temp; \
+ __temp.val[0] = \
+ vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \
+ __temp.val[1] = \
+ vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \
+ __temp.val[2] = \
+ vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \
+ __temp.val[3] = \
+ vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[0], \
+ 0); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[1], \
+ 1); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[2], \
+ 2); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[3], \
+ 3); \
+ __o = __builtin_aarch64_ld4_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+ __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \
+ __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \
+ __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \
+ __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \
+ return __b; \
+}
+
+__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf,
+ sf, f32, float32x4_t)
+__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, v2df,
+ df, f64, float64x2_t)
+__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v16qi, qi, p8,
+ int8x16_t)
+__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v8hi, hi,
+ p16, int16x8_t)
+__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v16qi, qi, s8,
+ int8x16_t)
+__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v8hi, hi, s16,
+ int16x8_t)
+__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v4si, si, s32,
+ int32x4_t)
+__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, v2di, di, s64,
+ int64x2_t)
+__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v16qi, qi, u8,
+ int8x16_t)
+__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v8hi, hi,
+ u16, int16x8_t)
+__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v4si, si,
+ u32, int32x4_t)
+__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, v2di, di,
+ u64, int64x2_t)
+
+#undef __LD4_LANE_FUNC
+#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+__extension__ static __inline intype __attribute__ ((__always_inline__)) \
+vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
+{ \
+ __builtin_aarch64_simd_xi __o; \
+ intype ret; \
+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[0], 0); \
+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[1], 1); \
+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[2], 2); \
+ __o = __builtin_aarch64_set_qregxiv4si (__o, (int32x4_t) __b.val[3], 3); \
+ __o = __builtin_aarch64_ld4_lane##mode ( \
+ (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \
+ ret.val[0] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 0); \
+ ret.val[1] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 1); \
+ ret.val[2] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 2); \
+ ret.val[3] = (vtype) __builtin_aarch64_get_qregxiv4si (__o, 3); \
+ return ret; \
+}
+
+__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
+__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32)
+__LD4_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64)
+__LD4_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)

#define __ST2_LANE_FUNC(intype, largetype, ptrtype, \
mode, ptr_mode, funcsuffix, signedtype) \
--
1.9.1
Tejas Belagod
2014-10-09 15:09:14 UTC
Post by c***@linaro.org
This patch replaces the inline assembler implementations of the
vld[234](q?)_lane_* intrinsics with new versions that use the new builtin
functions added in patch 1.
LGTM (but I can't approve it). Thanks for this patch.

Tejas.
