Kirill Yukhin
2014-10-06 12:55:28 UTC
Hello,
This patch extends permutations for AVX-512*.
Comments are welcome!
Bootstrapped.
AVX-512* tests on top of the patch set all pass
under the simulator.
Is it ok for trunk?
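For reviewers, here is a scalar model of the V64QImode expansion below
(a sketch for illustration only, not compiler code; the function and
parameter names are invented): the byte selector is reinterpreted as 32
words, two vpermi2w operations fetch the words holding the even- and
odd-position result bytes, and a byte blend under mask 0xAA...AA
interleaves them.

#include <stdint.h>

/* Scalar model of the emitted V64QI permute sequence.  */
void
v64qi_perm_model (uint8_t dst[64], const uint8_t op0[64],
                  const uint8_t op1[64], const uint8_t sel[64])
{
  uint16_t src[64];   /* op0:op1 as the 64 words vpermi2w indexes.  */
  for (int w = 0; w < 64; ++w)
    {
      const uint8_t *p = w < 32 ? op0 + 2 * w : op1 + 2 * (w - 32);
      src[w] = (uint16_t) (p[0] | (p[1] << 8));
    }
  for (int i = 0; i < 32; ++i)
    {
      uint16_t m = (uint16_t) (sel[2 * i] | (sel[2 * i + 1] << 8));
      /* perm_lo = (m << 8) >> 9 and perm_hi = m >> 9, as in the patch;
         vpermi2w only uses the low 6 bits of each word index.  */
      unsigned lo = ((int16_t) (uint16_t) (m << 8) >> 9) & 63;
      unsigned hi = ((int16_t) m >> 9) & 63;
      uint16_t res_lo = src[lo];
      uint16_t res_hi = (uint16_t) (src[hi] << 8);  /* low byte -> high.  */
      dst[2 * i] = res_lo & 0xff;     /* blend: even bytes from res_lo, */
      dst[2 * i + 1] = res_hi >> 8;   /* odd bytes from res_hi.  */
    }
}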
gcc/
* config/i386/i386.c
(ix86_expand_vec_perm_vpermi2): Handle V64QImode, V8HImode, V16HImode,
V32HImode, V4SImode, V8SImode, V4SFmode, V8SFmode, V2DImode, V4DImode,
V2DFmode, V4DFmode.
(ix86_expand_sse_unpack): Handle V64QImode.
(expand_vec_perm_blend): Update ISA target conditions, handle
V8DFmode, V16SFmode, V32HImode, V64QImode, V16SImode, V8DImode.
(expand_vec_perm_pshufb): Handle V64QImode.
(expand_vec_perm_1): Handle V64QImode, V32HImode, V16SImode, V16SFmode,
V8DFmode, V8DImode, V4DFmode, V2DFmode, V8SFmode, V4SFmode.
(ix86_expand_vec_perm_const_1): Call ix86_expand_vec_perm_vpermi2.
(ix86_vectorize_vec_perm_const_ok): Handle V32HImode, V64QImode.
(ix86_expand_vecop_qihi): Handle V64QImode.
* config/i386/sse.md
(define_mode_iterator VI1_AVX2): Add V64QI mode.
(define_mode_iterator VEC_PERM_AVX2): Add V32HI mode.
(define_mode_iterator VEC_PERM_CONST): Add V64QI and V32HI modes.
(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"): Add masking.
--
Thanks, K
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 352ab81..d759a45 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
enum machine_mode mode = GET_MODE (op0);
switch (mode)
{
+ /* There is no byte version of vpermi2, so it is emulated
+ with vpermi2w.  */
+ case V64QImode:
+ if (!TARGET_AVX512BW)
+ return false;
+ rtx mask_lowpart, op0_lowpart, op1_lowpart;
+ rtx perm_lo, perm_hi, tmp, res_lo, tmp2, res_hi;
+
+ mask_lowpart = gen_lowpart (V32HImode, force_reg (V64QImode, mask));
+ op0_lowpart = gen_lowpart (V32HImode, op0);
+ op1_lowpart = gen_lowpart (V32HImode, op1);
+ tmp = gen_reg_rtx (V32HImode);
+ tmp2 = gen_reg_rtx (V32HImode);
+ perm_lo = gen_reg_rtx (V32HImode);
+ perm_hi = gen_reg_rtx (V32HImode);
+ res_lo = gen_reg_rtx (V32HImode);
+ res_hi = gen_reg_rtx (V32HImode);
+
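+ /* Build the vpermi2w selectors: (mask_lowpart << 8) >> 9 extracts the
+ even-position byte index from each selector word and halves it into a
+ word index; mask_lowpart >> 9 does the same for the odd-position byte
+ index.  */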
+ emit_insn (gen_ashlv32hi3 (tmp, mask_lowpart, GEN_INT (8)));
+ emit_insn (gen_ashrv32hi3 (perm_lo, tmp, GEN_INT (9)));
+ emit_insn (gen_ashrv32hi3 (perm_hi, mask_lowpart, GEN_INT (9)));
+ emit_insn (gen_avx512bw_vpermi2varv32hi3 (res_lo, op0_lowpart,
+ perm_lo, op1_lowpart));
+ emit_insn (gen_avx512bw_vpermi2varv32hi3 (tmp2, op0_lowpart,
+ perm_hi, op1_lowpart));
+ emit_insn (gen_ashlv32hi3 (res_hi, tmp2, GEN_INT (8)));
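+ /* res_lo now carries the even result bytes in the low byte of each
+ word and res_hi the odd result bytes in the high byte; a byte blend
+ under mask 0xAA...AA interleaves them.  */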
+ emit_insn (gen_avx512bw_blendmv64qi (target, gen_lowpart (V64QImode, res_lo),
+ gen_lowpart (V64QImode, res_hi),
+ force_reg (DImode, GEN_INT (0xAAAAAAAAAAAAAAAALL))));
+ return true;
+ case V8HImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0,
+ force_reg (V8HImode, mask), op1));
+ return true;
+ case V16HImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0,
+ force_reg (V16HImode, mask), op1));
+ return true;
+ case V32HImode:
+ emit_insn (gen_avx512bw_vpermi2varv32hi3 (target, op0,
+ force_reg (V32HImode, mask), op1));
+ return true;
+ case V4SImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv4si3 (target, op0,
+ force_reg (V4SImode, mask), op1));
+ return true;
+ case V8SImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv8si3 (target, op0,
+ force_reg (V8SImode, mask), op1));
+ return true;
case V16SImode:
emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
force_reg (V16SImode, mask),
op1));
return true;
+ case V4SFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv4sf3 (target, op0,
+ force_reg (V4SImode, mask), op1));
+ return true;
+ case V8SFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv8sf3 (target, op0,
+ force_reg (V8SImode, mask), op1));
+ return true;
case V16SFmode:
emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
force_reg (V16SImode, mask),
op1));
return true;
+ case V2DImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv2di3 (target, op0,
+ force_reg (V2DImode, mask), op1));
+ return true;
+ case V4DImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv4di3 (target, op0,
+ force_reg (V4DImode, mask), op1));
+ return true;
case V8DImode:
emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
force_reg (V8DImode, mask), op1));
return true;
+ case V2DFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv2df3 (target, op0,
+ force_reg (V2DImode, mask), op1));
+ return true;
+ case V4DFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv4df3 (target, op0,
+ force_reg (V4DImode, mask), op1));
+ return true;
case V8DFmode:
emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0,
force_reg (V8DImode, mask), op1));
@@ -21779,6 +21872,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
switch (imode)
{
+ case V64QImode:
+ if (unsigned_p)
+ unpack = gen_avx512bw_zero_extendv32qiv32hi2;
+ else
+ unpack = gen_avx512bw_sign_extendv32qiv32hi2;
+ halfmode = V32QImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
+ break;
case V32QImode:
if (unsigned_p)
unpack = gen_avx2_zero_extendv16qiv16hi2;
@@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
if (d->one_operand_p)
return false;
- if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+ if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
+ && GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4)
+ ;
+ else if (TARGET_AVX512VL)
+ ;
+ else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
@@ -42693,12 +42800,18 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
switch (vmode)
{
+ case V8DFmode:
+ case V16SFmode:
case V4DFmode:
case V8SFmode:
case V2DFmode:
case V4SFmode:
case V8HImode:
case V8SImode:
+ case V32HImode:
+ case V64QImode:
+ case V16SImode:
+ case V8DImode:
for (i = 0; i < nelt; ++i)
mask |= (d->perm[i] >= nelt) << i;
break;
@@ -42921,9 +43034,9 @@ static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
unsigned i, nelt, eltsz, mask;
- unsigned char perm[32];
+ unsigned char perm[64];
enum machine_mode vmode = V16QImode;
- rtx rperm[32], vperm, target, op0, op1;
+ rtx rperm[64], vperm, target, op0, op1;
nelt = d->nelt;
@@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
return false;
}
}
+ else if (GET_MODE_SIZE (d->vmode) == 64)
+ {
+ if (!TARGET_AVX512BW)
+ return false;
+ if (vmode == V64QImode)
+ {
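+ /* vpshufb only shuffles bytes within each 128-bit lane; reject
+ selectors that move a byte across lanes.  */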
+ for (i = 0; i < nelt; ++i)
+ if ((d->perm[i] ^ i) & (3 * nelt / 4))
+ return false;
+ }
+ }
else
return false;
}
@@ -43029,6 +43153,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
mask = 2 * nelt - 1;
else if (vmode == V16QImode)
mask = nelt - 1;
+ else if (vmode == V64QImode)
+ mask = nelt / 4 - 1;
else
mask = nelt / 2 - 1;
@@ -43054,6 +43180,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
else if (vmode == V32QImode)
emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+ else if (vmode == V64QImode)
+ emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
else if (vmode == V8SFmode)
emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
else
@@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
rtx (*gen) (rtx, rtx) = NULL;
switch (d->vmode)
{
+ case V64QImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_vec_dupv64qi;
+ break;
case V32QImode:
gen = gen_avx2_pbroadcastv32qi_1;
break;
+ case V32HImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_vec_dupv32hi;
+ break;
case V16HImode:
gen = gen_avx2_pbroadcastv16hi_1;
break;
+ case V16SImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16si;
+ break;
case V8SImode:
gen = gen_avx2_pbroadcastv8si_1;
break;
@@ -43124,9 +43264,21 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
case V8HImode:
gen = gen_avx2_pbroadcastv8hi;
break;
+ case V16SFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16sf;
+ break;
case V8SFmode:
gen = gen_avx2_vec_dupv8sf_1;
break;
+ case V8DFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8df;
+ break;
+ case V8DImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8di;
+ break;
/* For other modes prefer other shuffles this function creates. */
default: break;
}
@@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
mode = V8DImode;
else if (mode == V16SFmode)
mode = V16SImode;
+ else if (mode == V4DFmode)
+ mode = V4DImode;
+ else if (mode == V2DFmode)
+ mode = V2DImode;
+ else if (mode == V8SFmode)
+ mode = V8SImode;
+ else if (mode == V4SFmode)
+ mode = V4SImode;
for (i = 0; i < nelt; ++i)
vec[i] = GEN_INT (d->perm[i]);
rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
@@ -44759,6 +44919,16 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true;
/* Try sequences of two instructions. */
+ /* ix86_expand_vec_perm_vpermi2 is also called from
+ ix86_expand_vec_perm, so it does not take D as a parameter;
+ construct the constant selector it needs here.  */
+ rtx vec[64];
+ int i;
+ for (i = 0; i < d->nelt; ++i)
+ vec[i] = GEN_INT (d->perm[i]);
+ rtx sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, vec));
+ if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, sel, d->op1))
+ return true;
if (expand_vec_perm_pshuflw_pshufhw (d))
return true;
@@ -44933,7 +45103,8 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
/* Given sufficient ISA support we can just return true here
for selected vector modes. */
if (d.vmode == V16SImode || d.vmode == V16SFmode
- || d.vmode == V8DFmode || d.vmode == V8DImode)
+ || d.vmode == V8DFmode || d.vmode == V8DImode
+ || d.vmode == V32HImode || d.vmode == V64QImode)
/* All implementable with a single vpermi2 insn. */
return true;
if (GET_MODE_SIZE (d.vmode) == 16)
@@ -45066,6 +45237,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
gen_il = gen_avx2_interleave_lowv32qi;
gen_ih = gen_avx2_interleave_highv32qi;
break;
+ case V64QImode:
+ himode = V32HImode;
+ gen_il = gen_avx512bw_interleave_lowv64qi;
+ gen_ih = gen_avx512bw_interleave_highv64qi;
+ break;
default:
gcc_unreachable ();
}
@@ -45126,7 +45302,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
/* For SSE2, we used a full interleave, so the desired
results are in the even elements. */
- for (i = 0; i < 32; ++i)
+ for (i = 0; i < 64; ++i)
d.perm[i] = i * 2;
}
else
@@ -45134,7 +45310,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
/* For AVX, the interleave used above was not cross-lane. So the
extraction is evens but with the second and third quarter swapped.
Happily, that is even one insn shorter than even extraction. */
- for (i = 0; i < 32; ++i)
+ for (i = 0; i < 64; ++i)
d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
}
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bb6372a..d3e9635 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -298,7 +298,7 @@
[V8DI (V4DI "TARGET_AVX512VL")])
(define_mode_iterator VI1_AVX2
- [(V32QI "TARGET_AVX2") V16QI])
+ [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI])
(define_mode_iterator VI2_AVX2
[(V16HI "TARGET_AVX2") V8HI])
@@ -10621,7 +10621,8 @@
(V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
(V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")
(V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
- (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")])
+ (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
+ (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW")])
(define_expand "vec_perm<mode>"
[(match_operand:VEC_PERM_AVX2 0 "register_operand")
@@ -10642,7 +10643,8 @@
(V8SI "TARGET_AVX") (V4DI "TARGET_AVX")
(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
- (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
+ (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
+ (V32HI "TARGET_AVX512BW")])
(define_expand "vec_perm_const<mode>"
[(match_operand:VEC_PERM_CONST 0 "register_operand")
@@ -13559,21 +13561,21 @@
(set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
(set_attr "mode" "DI")])
-(define_insn "<ssse3_avx2>_pshufb<mode>3"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
+(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"
+ [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,v")
(unspec:VI1_AVX2
- [(match_operand:VI1_AVX2 1 "register_operand" "0,x")
- (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")]
+ [(match_operand:VI1_AVX2 1 "register_operand" "0,v")
+ (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,vm")]
UNSPEC_PSHUFB))]
- "TARGET_SSSE3"
+ "TARGET_SSSE3 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
"@
pshufb\t{%2, %0|%0, %2}
- vpshufb\t{%2, %1, %0|%0, %1, %2}"
+ vpshufb\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
[(set_attr "isa" "noavx,avx")
(set_attr "type" "sselog1")
(set_attr "prefix_data16" "1,*")
(set_attr "prefix_extra" "1")
- (set_attr "prefix" "orig,vex")
+ (set_attr "prefix" "orig,maybe_evex")
(set_attr "btver2_decode" "vector,vector")
(set_attr "mode" "<sseinsnmode>")])