Discussion:
[PATCH i386 AVX512] [63/n] Add vpshufb, perm autogen.
Kirill Yukhin
2014-10-06 12:55:28 UTC
Hello,
This patch extends permutations for AVX-512*.
Comments are welcome!

Bootstrapped.
AVX-512* tests on top of the patch set all pass
under the simulator.

Is it ok for trunk?
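
As a source-level illustration (not part of the patch itself), this is the
kind of 64-byte shuffle the backend can now expand, using GCC's generic
vector extensions:

  typedef char v64qi __attribute__ ((vector_size (64)));

  /* Variable two-operand byte permutation; with -mavx512bw the backend
     can now expand this, emulating the missing byte vpermi2 with
     vpermi2w plus shifts and a byte blend.  */
  v64qi
  permute_bytes (v64qi a, v64qi b, v64qi idx)
  {
    return __builtin_shuffle (a, b, idx);
  }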

gcc/
* config/i386/i386.c
(ix86_expand_vec_perm_vpermi2): Handle V64QImode, V8HImode, V16HImode,
V32HImode, V4SImode, V8SImode, V4SFmode, V8SFmode, V2DImode, V4DImode,
V2DFmode, V4DFmode.
(ix86_expand_sse_unpack): Handle V64QImode.
(expand_vec_perm_blend): Update conditions for TARGET, handle
V8DFmode, V16SFmode, V32HImode, V64QImode, V16SImode, V8DImode.
(expand_vec_perm_pshufb): Handle V64QImode.
(expand_vec_perm_1): Handle V64QImode, V32HImode, V16SImode, V16SFmode,
V8DFmode, V8DImode, V4DFmode, V2DFmode, V8SFmode, V4SFmode.
(ix86_expand_vec_perm_const_1): Call ix86_expand_vec_perm_vpermi2.
(ix86_vectorize_vec_perm_const_ok): Handle V32HImode, V64QImode.
(ix86_expand_vecop_qihi): Handle V64QImode.
* config/i386/sse.md
(define_mode_iterator VI1_AVX2): Add V64QI mode.
(define_mode_iterator VEC_PERM_AVX2): Add V32HI mode.
(define_mode_iterator VEC_PERM_CONST): Add V64QI and V32HI mode.
(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"): Add masking.

--
Thanks, K

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 352ab81..d759a45 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
enum machine_mode mode = GET_MODE (op0);
switch (mode)
{
+ /* There is no byte version of vpermi2. So we use vpermi2w. */
+ case V64QImode:
+ if (!TARGET_AVX512BW)
+ return false;
+ rtx mask_lowpart, op0_lowpart, op1_lowpart;
+ rtx perm_lo, perm_hi, tmp, res_lo, tmp2, res_hi;
+
+ mask_lowpart = gen_lowpart (V32HImode, force_reg (V64QImode, mask));
+ op0_lowpart = gen_lowpart (V32HImode, op0);
+ op1_lowpart = gen_lowpart (V32HImode, op1);
+ tmp = gen_reg_rtx (V32HImode);
+ tmp2 = gen_reg_rtx (V32HImode);
+ perm_lo = gen_reg_rtx (V32HImode);
+ perm_hi = gen_reg_rtx (V32HImode);
+ res_lo = gen_reg_rtx (V32HImode);
+ res_hi = gen_reg_rtx (V32HImode);
+
+ emit_insn (gen_ashlv32hi3 (tmp, mask_lowpart, GEN_INT (8)));
+ emit_insn (gen_ashrv32hi3 (perm_lo, tmp, GEN_INT (9)));
+ emit_insn (gen_ashrv32hi3 (perm_hi, mask_lowpart, GEN_INT (9)));
+ emit_insn (gen_avx512bw_vpermi2varv32hi3 (res_lo, op0_lowpart,
+ perm_lo, op1_lowpart));
+ emit_insn (gen_avx512bw_vpermi2varv32hi3 (tmp2, op0_lowpart,
+ perm_hi, op1_lowpart));
+ emit_insn (gen_ashlv32hi3 (res_hi, tmp2, GEN_INT (8)));
+ emit_insn (gen_avx512bw_blendmv64qi (target, gen_lowpart (V64QImode, res_lo),
+ gen_lowpart (V64QImode, res_hi),
+ force_reg (DImode, GEN_INT (0xAAAAAAAAAAAAAAAALL))));
+ return true;
+ case V8HImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0,
+ force_reg (V8HImode, mask), op1));
+ return true;
+ case V16HImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0,
+ force_reg (V16HImode, mask), op1));
+ return true;
+ case V32HImode:
+ emit_insn (gen_avx512bw_vpermi2varv32hi3 (target, op0,
+ force_reg (V32HImode, mask), op1));
+ return true;
+ case V4SImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv4si3 (target, op0,
+ force_reg (V4SImode, mask), op1));
+ return true;
+ case V8SImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv8si3 (target, op0,
+ force_reg (V8SImode, mask), op1));
+ return true;
case V16SImode:
emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
force_reg (V16SImode, mask),
op1));
return true;
+ case V4SFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv4sf3 (target, op0,
+ force_reg (V4SImode, mask), op1));
+ return true;
+ case V8SFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv8sf3 (target, op0,
+ force_reg (V8SImode, mask), op1));
+ return true;
case V16SFmode:
emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
force_reg (V16SImode, mask),
op1));
return true;
+ case V2DImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv2di3 (target, op0,
+ force_reg (V2DImode, mask), op1));
+ return true;
+ case V4DImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv4di3 (target, op0,
+ force_reg (V4DImode, mask), op1));
+ return true;
case V8DImode:
emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
force_reg (V8DImode, mask), op1));
return true;
+ case V2DFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv2df3 (target, op0,
+ force_reg (V2DImode, mask), op1));
+ return true;
+ case V4DFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv4df3 (target, op0,
+ force_reg (V4DImode, mask), op1));
+ return true;
case V8DFmode:
emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0,
force_reg (V8DImode, mask), op1));
@@ -21779,6 +21872,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)

switch (imode)
{
+ case V64QImode:
+ if (unsigned_p)
+ unpack = gen_avx512bw_zero_extendv32qiv32hi2;
+ else
+ unpack = gen_avx512bw_sign_extendv32qiv32hi2;
+ halfmode = V32QImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
+ break;
case V32QImode:
if (unsigned_p)
unpack = gen_avx2_zero_extendv16qiv16hi2;
@@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)

if (d->one_operand_p)
return false;
- if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+ if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 &&
+ GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4)
+ ;
+ else if (TARGET_AVX512VL)
+ ;
+ else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
@@ -42693,12 +42800,18 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)

switch (vmode)
{
+ case V8DFmode:
+ case V16SFmode:
case V4DFmode:
case V8SFmode:
case V2DFmode:
case V4SFmode:
case V8HImode:
case V8SImode:
+ case V32HImode:
+ case V64QImode:
+ case V16SImode:
+ case V8DImode:
for (i = 0; i < nelt; ++i)
mask |= (d->perm[i] >= nelt) << i;
break;
@@ -42921,9 +43034,9 @@ static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
unsigned i, nelt, eltsz, mask;
- unsigned char perm[32];
+ unsigned char perm[64];
enum machine_mode vmode = V16QImode;
- rtx rperm[32], vperm, target, op0, op1;
+ rtx rperm[64], vperm, target, op0, op1;

nelt = d->nelt;

@@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
return false;
}
}
+ else if (GET_MODE_SIZE (d->vmode) == 64)
+ {
+ if (!TARGET_AVX512BW)
+ return false;
+ if (vmode == V64QImode)
+ {
+ for (i = 0; i < nelt; ++i)
+ if ((d->perm[i] ^ i) & (nelt / 4))
+ return false;
+ }
+ }
else
return false;
}
@@ -43029,6 +43153,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
mask = 2 * nelt - 1;
else if (vmode == V16QImode)
mask = nelt - 1;
+ else if (vmode == V64QImode)
+ mask = nelt / 4 - 1;
else
mask = nelt / 2 - 1;

@@ -43054,6 +43180,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
else if (vmode == V32QImode)
emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+ else if (vmode == V64QImode)
+ emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
else if (vmode == V8SFmode)
emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
else
@@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
rtx (*gen) (rtx, rtx) = NULL;
switch (d->vmode)
{
+ case V64QImode:
+ if (TARGET_AVX512VL)
+ gen = gen_avx512bw_vec_dupv64qi;
+ break;
case V32QImode:
gen = gen_avx2_pbroadcastv32qi_1;
break;
+ case V32HImode:
+ if (TARGET_AVX512VL)
+ gen = gen_avx512bw_vec_dupv32hi;
+ break;
case V16HImode:
gen = gen_avx2_pbroadcastv16hi_1;
break;
+ case V16SImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16si;
+ break;
case V8SImode:
gen = gen_avx2_pbroadcastv8si_1;
break;
@@ -43124,9 +43264,21 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
case V8HImode:
gen = gen_avx2_pbroadcastv8hi;
break;
+ case V16SFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16sf;
+ break;
case V8SFmode:
gen = gen_avx2_vec_dupv8sf_1;
break;
+ case V8DFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8df;
+ break;
+ case V8DImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8di;
+ break;
/* For other modes prefer other shuffles this function creates. */
default: break;
}
@@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
mode = V8DImode;
else if (mode == V16SFmode)
mode = V16SImode;
+ else if (mode == V4DFmode)
+ mode = V4DImode;
+ else if (mode == V2DFmode)
+ mode = V2DImode;
+ else if (mode == V8SFmode)
+ mode = V8SImode;
+ else if (mode == V4SFmode)
+ mode = V4SImode;
for (i = 0; i < nelt; ++i)
vec[i] = GEN_INT (d->perm[i]);
rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
@@ -44759,6 +44919,16 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true;

/* Try sequences of two instructions. */
+ /* ix86_expand_vec_perm_vpermi2 is also called from
+ * ix86_expand_vec_perm. So it doesn't take d as parameter.
+ * Construct needed params. */
+ rtx vec[64];
+ int i;
+ for (i = 0; i < d->nelt; ++i)
+ vec[i] = GEN_INT (d->perm[i]);
+ rtx sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, vec));
+ if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, sel, d->op1))
+ return true;

if (expand_vec_perm_pshuflw_pshufhw (d))
return true;
@@ -44933,7 +45103,8 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
/* Given sufficient ISA support we can just return true here
for selected vector modes. */
if (d.vmode == V16SImode || d.vmode == V16SFmode
- || d.vmode == V8DFmode || d.vmode == V8DImode)
+ || d.vmode == V8DFmode || d.vmode == V8DImode
+ || d.vmode == V32HImode || d.vmode == V64QImode)
/* All implementable with a single vpermi2 insn. */
return true;
if (GET_MODE_SIZE (d.vmode) == 16)
@@ -45066,6 +45237,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
gen_il = gen_avx2_interleave_lowv32qi;
gen_ih = gen_avx2_interleave_highv32qi;
break;
+ case V64QImode:
+ himode = V32HImode;
+ gen_il = gen_avx512bw_interleave_lowv64qi;
+ gen_ih = gen_avx512bw_interleave_highv64qi;
+ break;
default:
gcc_unreachable ();
}
@@ -45126,7 +45302,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
/* For SSE2, we used an full interleave, so the desired
results are in the even elements. */
- for (i = 0; i < 32; ++i)
+ for (i = 0; i < 64; ++i)
d.perm[i] = i * 2;
}
else
@@ -45134,7 +45310,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
/* For AVX, the interleave used above was not cross-lane. So the
extraction is evens but with the second and third quarter swapped.
Happily, that is even one insn shorter than even extraction. */
- for (i = 0; i < 32; ++i)
+ for (i = 0; i < 64; ++i)
d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
}

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bb6372a..d3e9635 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -298,7 +298,7 @@
[V8DI (V4DI "TARGET_AVX512VL")])

(define_mode_iterator VI1_AVX2
- [(V32QI "TARGET_AVX2") V16QI])
+ [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI])

(define_mode_iterator VI2_AVX2
[(V16HI "TARGET_AVX2") V8HI])
@@ -10621,7 +10621,8 @@
(V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
(V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")
(V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
- (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")])
+ (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
+ (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW")])

(define_expand "vec_perm<mode>"
[(match_operand:VEC_PERM_AVX2 0 "register_operand")
@@ -10642,7 +10643,8 @@
(V8SI "TARGET_AVX") (V4DI "TARGET_AVX")
(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
- (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
+ (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
+ (V32HI "TARGET_AVX512BW")])

(define_expand "vec_perm_const<mode>"
[(match_operand:VEC_PERM_CONST 0 "register_operand")
@@ -13559,21 +13561,21 @@
(set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
(set_attr "mode" "DI")])

-(define_insn "<ssse3_avx2>_pshufb<mode>3"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
+(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"
+ [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,v")
(unspec:VI1_AVX2
- [(match_operand:VI1_AVX2 1 "register_operand" "0,x")
- (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")]
+ [(match_operand:VI1_AVX2 1 "register_operand" "0,v")
+ (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,vm")]
UNSPEC_PSHUFB))]
- "TARGET_SSSE3"
+ "TARGET_SSSE3 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
"@
pshufb\t{%2, %0|%0, %2}
- vpshufb\t{%2, %1, %0|%0, %1, %2}"
+ vpshufb\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
[(set_attr "isa" "noavx,avx")
(set_attr "type" "sselog1")
(set_attr "prefix_data16" "1,*")
(set_attr "prefix_extra" "1")
- (set_attr "prefix" "orig,vex")
+ (set_attr "prefix" "orig,maybe_evex")
(set_attr "btver2_decode" "vector,vector")
(set_attr "mode" "<sseinsnmode>")])
Jakub Jelinek
2014-10-06 14:10:35 UTC
Post by Kirill Yukhin
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
enum machine_mode mode = GET_MODE (op0);
switch (mode)
{
+ /* There is no byte version of vpermi2. So we use vpermi2w. */
+ case V64QImode:
+ if (!TARGET_AVX512BW)
+ return false;
+ rtx mask_lowpart, op0_lowpart, op1_lowpart;
+ rtx perm_lo, perm_hi, tmp, res_lo, tmp2, res_hi;
+
+ mask_lowpart = gen_lowpart (V32HImode, force_reg (V64QImode, mask));
+ op0_lowpart = gen_lowpart (V32HImode, op0);
+ op1_lowpart = gen_lowpart (V32HImode, op1);
+ tmp = gen_reg_rtx (V32HImode);
+ tmp2 = gen_reg_rtx (V32HImode);
+ perm_lo = gen_reg_rtx (V32HImode);
+ perm_hi = gen_reg_rtx (V32HImode);
+ res_lo = gen_reg_rtx (V32HImode);
+ res_hi = gen_reg_rtx (V32HImode);
+
+ emit_insn (gen_ashlv32hi3 (tmp, mask_lowpart, GEN_INT (8)));
+ emit_insn (gen_ashrv32hi3 (perm_lo, tmp, GEN_INT (9)));
+ emit_insn (gen_ashrv32hi3 (perm_hi, mask_lowpart, GEN_INT (9)));
+ emit_insn (gen_avx512bw_vpermi2varv32hi3 (res_lo, op0_lowpart,
+ perm_lo, op1_lowpart));
+ emit_insn (gen_avx512bw_vpermi2varv32hi3 (tmp2, op0_lowpart,
+ perm_hi, op1_lowpart));
+ emit_insn (gen_ashlv32hi3 (res_hi, tmp2, GEN_INT (8)));
+ emit_insn (gen_avx512bw_blendmv64qi (target, gen_lowpart (V64QImode, res_lo),
+ gen_lowpart (V64QImode, res_hi),
+ force_reg (DImode, GEN_INT (0xAAAAAAAAAAAAAAAALL))));
+ return true;
I believe this case doesn't belong in this function: apart from this
case, ix86_expand_vec_perm_vpermi2 always emits just a single insn, and
it should keep doing that. There should be a separate function that
expands the worst case of a full two-operand V64QImode permutation.
See my previous mail, IMHO it is doable with 5 instructions rather than 7.
And IMHO we should have a separate function which emits that, supposedly
one for the constant permutations, one for the variable case (perhaps
then your 7 insn sequence is best?).
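
To see concretely what the two shift pairs in the quoted sequence compute,
here is a small standalone C check (illustrative only, not from the patch):
shifting arithmetically by 9 rather than 8 both isolates one byte index per
16-bit word and halves it, turning a byte index into the index of the
16-bit word that contains that byte.

  #include <assert.h>
  #include <stdint.h>

  int
  main (void)
  {
    for (int even = 0; even < 128; ++even)  /* index in the low byte */
      for (int odd = 0; odd < 128; ++odd)   /* index in the high byte */
        {
          uint16_t m = (uint16_t) ((odd << 8) | even); /* one V32HI element */
          int16_t perm_lo = (int16_t) (uint16_t) (m << 8) >> 9; /* ashl 8, ashr 9 */
          int16_t perm_hi = (int16_t) m >> 9;                   /* ashr 9 */
          assert (perm_lo == even / 2); /* word holding the even dest byte */
          assert (perm_hi == odd / 2);  /* word holding the odd dest byte */
        }
    return 0;
  }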

Also, IMHO rather than building a CONST_VECTOR ahead of time in each of the
callers, ix86_expand_vec_perm_vpermi2 could take the arguments it takes
right now plus D; either D would be NULL (then it would behave as now), or
SEL would be NULL, in which case it would create the CONST_VECTOR from D on
the fly if needed.
I.e. the function would start with a switch that would just contain the
if (...)
return false;
hunks plus break; for the success case, then code to generate CONST_VECTOR
if sel is NULL_RTX from d, and finally another switch with just the emit
cases. Or, the first switch could just set a function pointer before
break, and just use one common
emit_insn (gen (target, op0, force_reg (vmode, mask), op1));
Post by Kirill Yukhin
+ case V8HImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0,
+ force_reg (V8HImode, mask), op1));
+ return true;
+ case V16HImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0,
+ force_reg (V16HImode, mask), op1));
+ return true;
Aren't these two insns there only if both TARGET_AVX512VL && TARGET_AVX512BW?
I mean, the ISA pdf mentions both of the CPUID flags simultaneously, and I
think neither of these depends on the other one in GCC. That's unlike insns
where CPUID AVX512VL and AVX512F are mentioned together, because in GCC
AVX512VL depends on AVX512F.
Post by Kirill Yukhin
@@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
if (d->one_operand_p)
return false;
- if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+ if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 &&
+ GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4)
Formatting, && belongs on the second line.
Post by Kirill Yukhin
+ ;
+ else if (TARGET_AVX512VL)
I'd add && GET_MODE_SIZE (GET_MODE_INNER (vmode)) == 64 here.
AVX512VL is not going to handle 64-bit vectors, or 1024-bit ones,
and the == 32 and == 16 cases are handled because AVX512VL implies
TARGET_AVX2 and TARGET_SSE4_1, doesn't it?
Post by Kirill Yukhin
@@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
return false;
}
}
+ else if (GET_MODE_SIZE (d->vmode) == 64)
+ {
+ if (!TARGET_AVX512BW)
+ return false;
+ if (vmode == V64QImode)
+ {
+ for (i = 0; i < nelt; ++i)
+ if ((d->perm[i] ^ i) & (nelt / 4))
+ return false;
Missing comment, I'd duplicate the
/* vpshufb only works intra lanes, it is not
possible to shuffle bytes in between the lanes. */
comment there.
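
The restriction itself is easy to state in scalar form; a minimal sketch
(illustrative only, not the patch's exact bit test) of the property the
quoted loop is checking:

  #include <stdbool.h>

  /* vpshufb moves bytes only within their own 128-bit (16-byte) lane,
     so a one-operand byte permutation is expandable iff every element's
     source and destination indices fall in the same lane.  */
  static bool
  intra_lane_perm_p (const unsigned char *perm, unsigned nelt)
  {
    for (unsigned i = 0; i < nelt; ++i)
      if (perm[i] / 16 != i / 16)
        return false;
    return true;
  }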
Post by Kirill Yukhin
@@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
rtx (*gen) (rtx, rtx) = NULL;
switch (d->vmode)
{
+ case V64QImode:
+ if (TARGET_AVX512VL)
VL? Isn't that BW?
Post by Kirill Yukhin
+ gen = gen_avx512bw_vec_dupv64qi;
+ break;
gen = gen_avx2_pbroadcastv32qi_1;
break;
+ case V32HImode:
+ if (TARGET_AVX512VL)
Ditto.
Post by Kirill Yukhin
@@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
mode = V8DImode;
else if (mode == V16SFmode)
mode = V16SImode;
+ else if (mode == V4DFmode)
+ mode = V4DImode;
+ else if (mode == V2DFmode)
+ mode = V2DImode;
+ else if (mode == V8SFmode)
+ mode = V8SImode;
+ else if (mode == V4SFmode)
+ mode = V4SImode;
for (i = 0; i < nelt; ++i)
vec[i] = GEN_INT (d->perm[i]);
rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
See above comment about CONST_VECTOR.
Post by Kirill Yukhin
@@ -44759,6 +44919,16 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true;
/* Try sequences of two instructions. */
+ /* ix86_expand_vec_perm_vpermi2 is also called from
+ * ix86_expand_vec_perm. So it doesn't take d as parameter.
+ * Construct needed params. */
+ rtx vec[64];
+ int i;
+ for (i = 0; i < d->nelt; ++i)
+ vec[i] = GEN_INT (d->perm[i]);
+ rtx sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, vec));
+ if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, sel, d->op1))
+ return true;
if (expand_vec_perm_pshuflw_pshufhw (d))
return true;
I don't understand this. Doesn't ix86_expand_vec_perm_vpermi2 generate
(except for the V64QI case discussed above) a single insn? Then
expand_vec_perm_1 should have handled that already, so this is just a waste
of resources here.
Post by Kirill Yukhin
@@ -44933,7 +45103,8 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
/* Given sufficient ISA support we can just return true here
for selected vector modes. */
if (d.vmode == V16SImode || d.vmode == V16SFmode
- || d.vmode == V8DFmode || d.vmode == V8DImode)
+ || d.vmode == V8DFmode || d.vmode == V8DImode
+ || d.vmode == V32HImode || d.vmode == V64QImode)
/* All implementable with a single vpermi2 insn. */
return true;
1) Shouldn't this be guarded with a TARGET_AVX512F check,
and in the V32HImode/V64QImode cases also with TARGET_AVX512BW?
The comment is not correct for V64QImode.

2) For TARGET_AVX512VL, vpermi2 can handle also smaller mode sizes.
Perhaps it would be best to turn this into
switch (d.vmode)
{
case V16SImode:
case V16SFmode:
case V8DFmode:
case V8DImode:
if (TARGET_AVX512F)
/* All implementable with a single vpermi2 insn. */
return true;
break;
case V32HImode:
if (TARGET_AVX512BW)
/* Implementable with a single vpermi2 insn. */
return true;
break;
case V64QImode:
if (TARGET_AVX512BW)
/* Implementable with 2 vpermi2w, 2 vpshufb and one vpor insns. */
return true;
break;
case V8SImode:
case V8SFmode:
case V4DFmode:
case V4DImode:
if (TARGET_AVX512VL)
/* Implementable with a single vpermi2 insn. */
return true;
break;
case V16HImode:
if (TARGET_AVX512VL && TARGET_AVX512BW)
/* Implementable with a single vpermi2 insn. */
return true;
if (TARGET_AVX2)
/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
return true;
break;
case V32QImode:
if (TARGET_AVX2)
/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
return true;
break;
case V4SImode:
case V4SFmode:
case V8HImode:
case V16QImode:
/* All implementable with a single vpperm insn. */
if (TARGET_XOP)
return true;
/* All implementable with 2 pshufb + 1 ior. */
if (TARGET_SSSE3)
return true;
break;
case V2DImode:
case V2DFmode:
/* All implementable with shufpd or unpck[lh]pd. */
return true;
}

Now, for V8SI/V8SF/V4DI/V4DF, I wonder if we have (for either AVX or AVX2)
any expanders that guarantee we generate some sequence for all possible
2 operand constant permutations. I think ix86_expand_vec_perm is able
to emit the non-constant permutations for all of these, so in theory
we should have an upper bound for all these.

Jakub
Ilya Tocar
2014-10-09 12:15:23 UTC
Hi,

I think this patch should be split into two parts:
V64QI-related and non-V64QI-related.
This part contains the non-V64QI-related changes.
I've also noticed that not all patterns using VI1_AVX2
actually have AVX512 versions, so I fixed the bogus patterns.
Post by Jakub Jelinek
Post by Kirill Yukhin
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
enum machine_mode mode = GET_MODE (op0);
switch (mode)
{
+ /* There is no byte version of vpermi2. So we use vpermi2w. */
...
Post by Jakub Jelinek
I believe this case doesn't belong to this function, other than this
case ix86_expand_vec_perm_vpermi2 emits always just a single insn, and
so it should always do that, and there should be a separate function
that expands the worst case of V64QImode full 2 operand permutation.
See my previous mail, IMHO it is doable with 5 instructions rather than 7.
And IMHO we should have a separate function which emits that, supposedly
one for the constant permutations, one for the variable case (perhaps
then your 7 insn sequence is best?).
This will be done in following patch.
Post by Jakub Jelinek
Also, IMHO rather than building a CONST_VECTOR ahead in each of the callers,
supposedly ix86_expand_vec_perm_vpermi2 could take the arguments it takes
right now plus D, either D would be NULL (then it would behave as now), or
SEL would be NULL, then it would create a CONST_VECTOR on the fly if needed.
I.e. the function would start with a switch that would just contain the
if (...)
return false;
hunks plus break; for the success case, then code to generate CONST_VECTOR
if sel is NULL_RTX from d, and finally another switch with just the emit
cases.
Done.
Post by Jakub Jelinek
Post by Kirill Yukhin
+ case V8HImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0,
+ force_reg (V8HImode, mask), op1));
+ return true;
+ case V16HImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0,
+ force_reg (V16HImode, mask), op1));
+ return true;
Aren't these two insns there only if both TARGET_AVX512VL && TARGET_AVX512BW?
I mean, the ISA pdf mentions both of the CPUID flags simultaneously, and I
think neither of these depends on the other one in GCC. That's unlike insns
where CPUID AVX512VL and AVX512F are mentioned together, because in GCC
AVX512VL depends on AVX512F.
Good catch!
Post by Jakub Jelinek
Post by Kirill Yukhin
@@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
if (d->one_operand_p)
return false;
- if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+ if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 &&
+ GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4)
Formatting, && belongs on the second line.
Fixed.
Post by Jakub Jelinek
Post by Kirill Yukhin
+ ;
+ else if (TARGET_AVX512VL)
I'd add && GET_MODE_SIZE (GET_MODE_INNER (vmode) == 64 here.
AVX512VL is not going to handle 64-bit vectors, or 1024-bit ones,
and the == 32 and == 16 cases are handled because AVX512VL implies
TARGET_AVX2 and TARGET_SSE4_1, doesn't it?
As TARGET_AVX512VL always implies TARGET_AVX2 and TARGET_SSE4_1 and
only applies to 32/16-byte modes, this case is redundant, so I've
removed it.
Post by Jakub Jelinek
Post by Kirill Yukhin
@@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
return false;
}
}
+ else if (GET_MODE_SIZE (d->vmode) == 64)
+ {
+ if (!TARGET_AVX512BW)
+ return false;
+ if (vmode == V64QImode)
+ {
+ for (i = 0; i < nelt; ++i)
+ if ((d->perm[i] ^ i) & (nelt / 4))
+ return false;
Missing comment, I'd duplicate the
/* vpshufb only works intra lanes, it is not
possible to shuffle bytes in between the lanes. */
comment there.
Done.
Post by Jakub Jelinek
Post by Kirill Yukhin
@@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
rtx (*gen) (rtx, rtx) = NULL;
switch (d->vmode)
{
+ case V64QImode:
+ if (TARGET_AVX512VL)
VL? Isn't that BW?
Post by Kirill Yukhin
+ gen = gen_avx512bw_vec_dupv64qi;
+ break;
gen = gen_avx2_pbroadcastv32qi_1;
break;
+ case V32HImode:
+ if (TARGET_AVX512VL)
Ditto.
Fixed.
Post by Jakub Jelinek
Post by Kirill Yukhin
@@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
mode = V8DImode;
else if (mode == V16SFmode)
mode = V16SImode;
+ else if (mode == V4DFmode)
+ mode = V4DImode;
+ else if (mode == V2DFmode)
+ mode = V2DImode;
+ else if (mode == V8SFmode)
+ mode = V8SImode;
+ else if (mode == V4SFmode)
+ mode = V4SImode;
for (i = 0; i < nelt; ++i)
vec[i] = GEN_INT (d->perm[i]);
rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
See above comment about CONST_VECTOR.
Done.
Post by Jakub Jelinek
Post by Kirill Yukhin
@@ -44759,6 +44919,16 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true;
/* Try sequences of two instructions. */
+ /* ix86_expand_vec_perm_vpermi2 is also called from
+ * ix86_expand_vec_perm. So it doesn't take d as parameter.
+ * Construct needed params. */
+ rtx vec[64];
+ int i;
+ for (i = 0; i < d->nelt; ++i)
+ vec[i] = GEN_INT (d->perm[i]);
+ rtx sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, vec));
+ if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, sel, d->op1))
+ return true;
if (expand_vec_perm_pshuflw_pshufhw (d))
return true;
I don't understand this. Doesn't ix86_expand_vec_perm_vpermi2 generate
(except for the V64QI case discussed above) a single insn? Then
expand_vec_perm_1 should have handled that already, so this is just a waste
of resources here.
Removed.
Post by Jakub Jelinek
Post by Kirill Yukhin
@@ -44933,7 +45103,8 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
/* Given sufficient ISA support we can just return true here
for selected vector modes. */
if (d.vmode == V16SImode || d.vmode == V16SFmode
- || d.vmode == V8DFmode || d.vmode == V8DImode)
+ || d.vmode == V8DFmode || d.vmode == V8DImode
+ || d.vmode == V32HImode || d.vmode == V64QImode)
/* All implementable with a single vpermi2 insn. */
return true;
1) Shouldn't this be guarded with TARGET_AVX512F &&
and in the V32HImode/V64QImode also with TARGET_AVX512BW?
The comment is not correct for V64QImode.
There are probably no 512-bit modes without AVX512F, but I've
refactored it as per your suggestion below.
Post by Jakub Jelinek
2) For TARGET_AVX512VL, vpermi2 can handle also smaller mode sizes.
Perhaps it would be best to turn this into
switch (d.vmode)
{
if (TARGET_AVX512F)
/* All implementable with a single vpermi2 insn. */
break;
...
Post by Jakub Jelinek
Now, for V8SI/V8SF/V4DI/V4DF, I wonder if we have (for either AVX or AVX2)
any expanders that guarantee we generate some sequence for all possible
2 operand constant permutations. I think ix86_expand_vec_perm is able
to emit the non-constant permutations for all of these, so in theory
we should have an upper bound for all these.
I'm not sure about it, so for now I've left V8SI/V8SF/V4DI/V4DF out.

Updated patch below:

gcc/
* config/i386/i386.c
(MAX_VECT_LEN): Move above ix86_expand_vec_perm_vpermi2.
(struct expand_vec_perm_d): Ditto.
(ix86_expand_vec_perm_vpermi2): Handle V8HImode, V16HImode, V2DFmode,
V32HImode, V4SImode, V8SImode, V4SFmode, V8SFmode, V2DImode, V4DImode,
V4DFmode.
(ix86_expand_vec_perm): Update ix86_expand_vec_perm_vpermi2 signature.
(ix86_expand_sse_unpack): Handle V64QImode.
(expand_vec_perm_blend): Update conditions for TARGET, handle
V8DFmode, V16SFmode, V32HImode, V64QImode, V16SImode, V8DImode.
(expand_vec_perm_pshufb): Handle V64QImode.
(expand_vec_perm_1): Handle V64QImode, V32HImode, V16SImode, V16SFmode,
V8DFmode, V8DImode, V4DFmode, V2DFmode, V8SFmode, V4SFmode.
(ix86_expand_vec_perm_const_1): Call ix86_expand_vec_perm_vpermi2.
(ix86_vectorize_vec_perm_const_ok): Handle V32HImode, V64QImode.
(ix86_expand_vecop_qihi): Handle V64QImode.
* config/i386/sse.md
(define_mode_iterator VI1_AVX512): New.
(define_mode_iterator VEC_PERM_AVX2): Add V32HI mode.
(define_mode_iterator VEC_PERM_CONST): Add V32HI mode.
(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"): Add masking.
(mul<mode>3): Use VI1_AVX512.
(<sse2_avx2>_packsswb): Ditto.
(<sse2_avx2>_packuswb): Ditto.
(<ssse3_avx2>_pshufb<mode>3): Ditto.
(<shift_insn><mode>3): Ditto.
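
To illustrate the masking added to the pshufb pattern (a user-level sketch
assuming the usual AVX512BW intrinsic, not code from the patch): the
pattern can now match vpshufb under a merge mask, e.g.:

  #include <immintrin.h>

  /* Merge-masked byte shuffle: result bytes whose k bit is clear are
     taken from src.  Requires -mavx512bw.  */
  __m512i
  masked_pshufb (__m512i src, __mmask64 k, __m512i a, __m512i idx)
  {
    return _mm512_mask_shuffle_epi8 (src, k, a, idx);
  }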

---
gcc/config/i386/i386.c | 293 ++++++++++++++++++++++++++++++++++++++++++-------
gcc/config/i386/sse.md | 45 ++++----
2 files changed, 278 insertions(+), 60 deletions(-)

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 352ab81..426ea9e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21358,32 +21358,169 @@ ix86_expand_int_vcond (rtx operands[])
return true;
}

+/* AVX512F does support 64-byte integer vector operations,
+ thus the longest vector we are faced with is V64QImode. */
+#define MAX_VECT_LEN 64
+
+struct expand_vec_perm_d
+{
+ rtx target, op0, op1;
+ unsigned char perm[MAX_VECT_LEN];
+ enum machine_mode vmode;
+ unsigned char nelt;
+ bool one_operand_p;
+ bool testing_p;
+};
+
static bool
-ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
+ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, struct expand_vec_perm_d *d)
{
- enum machine_mode mode = GET_MODE (op0);
+ enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
+
switch (mode)
{
+ case V8HImode:
+ if (!TARGET_AVX512VL || !TARGET_AVX512BW)
+ return false;
+ break;
+ case V16HImode:
+ if (!TARGET_AVX512VL || !TARGET_AVX512BW)
+ return false;
+ case V32HImode:
+ if (!TARGET_AVX512BW)
+ return false;
+ break;
+ case V4SImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V8SImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V16SImode:
+ if (!TARGET_AVX512F)
+ return false;
+ break;
+ case V4SFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V8SFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V16SFmode:
+ if (!TARGET_AVX512F)
+ return false;
+ break;
+ case V2DImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V4DImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V8DImode:
+ if (!TARGET_AVX512F)
+ return false;
+ break;
+ case V2DFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V4DFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V8DFmode:
+ if (!TARGET_AVX512F)
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const expander,
+ so args are either in d, or in op0, op1 etc. */
+ if (d)
+ {
+ rtx vec[64];
+ target = d->target;
+ op0 = d->op0;
+ op1 = d->op1;
+ for (int i = 0; i < d->nelt; ++i)
+ vec[i] = GEN_INT (d->perm[i]);
+ mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (d->nelt, vec));
+ }
+
+ switch (mode)
+ {
+ case V8HImode:
+ emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0,
+ force_reg (V8HImode, mask), op1));
+ return true;
+ case V16HImode:
+ emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0,
+ force_reg (V16HImode, mask), op1));
+ return true;
+ case V32HImode:
+ emit_insn (gen_avx512bw_vpermi2varv32hi3 (target, op0,
+ force_reg (V32HImode, mask), op1));
+ return true;
+ case V4SImode:
+ emit_insn (gen_avx512vl_vpermi2varv4si3 (target, op0,
+ force_reg (V4SImode, mask), op1));
+ return true;
+ case V8SImode:
+ emit_insn (gen_avx512vl_vpermi2varv8si3 (target, op0,
+ force_reg (V8SImode, mask), op1));
+ return true;
case V16SImode:
emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
force_reg (V16SImode, mask),
op1));
return true;
+ case V4SFmode:
+ emit_insn (gen_avx512vl_vpermi2varv4sf3 (target, op0,
+ force_reg (V4SImode, mask), op1));
+ return true;
+ case V8SFmode:
+ emit_insn (gen_avx512vl_vpermi2varv8sf3 (target, op0,
+ force_reg (V8SImode, mask), op1));
+ return true;
case V16SFmode:
emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
force_reg (V16SImode, mask),
op1));
return true;
+ case V2DImode:
+ emit_insn (gen_avx512vl_vpermi2varv2di3 (target, op0,
+ force_reg (V2DImode, mask), op1));
+ return true;
+ case V4DImode:
+ emit_insn (gen_avx512vl_vpermi2varv4di3 (target, op0,
+ force_reg (V4DImode, mask), op1));
+ return true;
case V8DImode:
emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
force_reg (V8DImode, mask), op1));
return true;
+ case V2DFmode:
+ emit_insn (gen_avx512vl_vpermi2varv2df3 (target, op0,
+ force_reg (V2DImode, mask), op1));
+ return true;
+ case V4DFmode:
+ emit_insn (gen_avx512vl_vpermi2varv4df3 (target, op0,
+ force_reg (V4DImode, mask), op1));
+ return true;
case V8DFmode:
emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0,
force_reg (V8DImode, mask), op1));
return true;
default:
- return false;
+ gcc_unreachable ();
}
}

@@ -21407,7 +21544,7 @@ ix86_expand_vec_perm (rtx operands[])
e = GET_MODE_UNIT_SIZE (mode);
gcc_assert (w <= 64);

- if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1))
+ if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
return;

if (TARGET_AVX2)
@@ -21779,6 +21916,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)

switch (imode)
{
+ case V64QImode:
+ if (unsigned_p)
+ unpack = gen_avx512bw_zero_extendv32qiv32hi2;
+ else
+ unpack = gen_avx512bw_sign_extendv32qiv32hi2;
+ halfmode = V32QImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
+ break;
case V32QImode:
if (unsigned_p)
unpack = gen_avx2_zero_extendv16qiv16hi2;
@@ -39603,20 +39749,6 @@ x86_emit_floatuns (rtx operands[2])
emit_label (donelab);
}

-/* AVX512F does support 64-byte integer vector operations,
- thus the longest vector we are faced with is V64QImode. */
-#define MAX_VECT_LEN 64
-
-struct expand_vec_perm_d
-{
- rtx target, op0, op1;
- unsigned char perm[MAX_VECT_LEN];
- enum machine_mode vmode;
- unsigned char nelt;
- bool one_operand_p;
- bool testing_p;
-};
-
static bool canonicalize_perm (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
@@ -42662,7 +42794,10 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)

if (d->one_operand_p)
return false;
- if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+ if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
+ && GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4)
+ ;
+ else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
@@ -42693,12 +42828,18 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)

switch (vmode)
{
+ case V8DFmode:
+ case V16SFmode:
case V4DFmode:
case V8SFmode:
case V2DFmode:
case V4SFmode:
case V8HImode:
case V8SImode:
+ case V32HImode:
+ case V64QImode:
+ case V16SImode:
+ case V8DImode:
for (i = 0; i < nelt; ++i)
mask |= (d->perm[i] >= nelt) << i;
break;
@@ -42921,9 +43062,9 @@ static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
unsigned i, nelt, eltsz, mask;
- unsigned char perm[32];
+ unsigned char perm[64];
enum machine_mode vmode = V16QImode;
- rtx rperm[32], vperm, target, op0, op1;
+ rtx rperm[64], vperm, target, op0, op1;

nelt = d->nelt;

@@ -43012,6 +43153,19 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
return false;
}
}
+ else if (GET_MODE_SIZE (d->vmode) == 64)
+ {
+ if (!TARGET_AVX512BW)
+ return false;
+ if (vmode == V64QImode)
+ {
+ /* vpshufb only works intra lanes, it is not
+ possible to shuffle bytes in between the lanes. */
+ for (i = 0; i < nelt; ++i)
+ if ((d->perm[i] ^ i) & (nelt / 4))
+ return false;
+ }
+ }
else
return false;
}
@@ -43029,6 +43183,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
mask = 2 * nelt - 1;
else if (vmode == V16QImode)
mask = nelt - 1;
+ else if (vmode == V64QImode)
+ mask = nelt / 4 - 1;
else
mask = nelt / 2 - 1;

@@ -43054,6 +43210,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
else if (vmode == V32QImode)
emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+ else if (vmode == V64QImode)
+ emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
else if (vmode == V8SFmode)
emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
else
@@ -43109,12 +43267,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
rtx (*gen) (rtx, rtx) = NULL;
switch (d->vmode)
{
+ case V64QImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_vec_dupv64qi;
+ break;
case V32QImode:
gen = gen_avx2_pbroadcastv32qi_1;
break;
+ case V32HImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_vec_dupv32hi;
+ break;
case V16HImode:
gen = gen_avx2_pbroadcastv16hi_1;
break;
+ case V16SImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16si;
+ break;
case V8SImode:
gen = gen_avx2_pbroadcastv8si_1;
break;
@@ -43124,9 +43294,21 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
case V8HImode:
gen = gen_avx2_pbroadcastv8hi;
break;
+ case V16SFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16sf;
+ break;
case V8SFmode:
gen = gen_avx2_vec_dupv8sf_1;
break;
+ case V8DFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8df;
+ break;
+ case V8DImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8di;
+ break;
/* For other modes prefer other shuffles this function creates. */
default: break;
}
@@ -43210,16 +43392,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
return true;

/* Try the AVX512F vpermi2 instructions. */
- rtx vec[64];
- enum machine_mode mode = d->vmode;
- if (mode == V8DFmode)
- mode = V8DImode;
- else if (mode == V16SFmode)
- mode = V16SImode;
- for (i = 0; i < nelt; ++i)
- vec[i] = GEN_INT (d->perm[i]);
- rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
- if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, mask, d->op1))
+ if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
return true;

return false;
@@ -44932,21 +45105,56 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,

/* Given sufficient ISA support we can just return true here
for selected vector modes. */
- if (d.vmode == V16SImode || d.vmode == V16SFmode
- || d.vmode == V8DFmode || d.vmode == V8DImode)
- /* All implementable with a single vpermi2 insn. */
- return true;
- if (GET_MODE_SIZE (d.vmode) == 16)
+ switch (d.vmode)
{
+ case V16SFmode:
+ case V16SImode:
+ case V8DImode:
+ case V8DFmode:
+ if (TARGET_AVX512F)
+ /* All implementable with a single vpermi2 insn. */
+ return true;
+ break;
+ case V32HImode:
+ if (TARGET_AVX512BW)
+ /* All implementable with a single vpermi2 insn. */
+ return true;
+ break;
+ case V8SImode:
+ case V8SFmode:
+ case V4DFmode:
+ case V4DImode:
+ if (TARGET_AVX512VL)
+ /* All implementable with a single vpermi2 insn. */
+ return true;
+ break;
+ case V16HImode:
+ if (TARGET_AVX2)
+ /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
+ return true;
+ break;
+ case V32QImode:
+ if (TARGET_AVX2)
+ /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
+ return true;
+ break;
+ case V4SImode:
+ case V4SFmode:
+ case V8HImode:
+ case V16QImode:
/* All implementable with a single vpperm insn. */
if (TARGET_XOP)
return true;
/* All implementable with 2 pshufb + 1 ior. */
if (TARGET_SSSE3)
return true;
+ break;
+ case V2DImode:
+ case V2DFmode:
/* All implementable with shufpd or unpck[lh]pd. */
- if (d.nelt == 2)
- return true;
+ return true;
+ default:
+ return false;
}

/* Extract the values from the vector CST into the permutation
@@ -45066,6 +45274,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
gen_il = gen_avx2_interleave_lowv32qi;
gen_ih = gen_avx2_interleave_highv32qi;
break;
+ case V64QImode:
+ himode = V32HImode;
+ gen_il = gen_avx512bw_interleave_lowv64qi;
+ gen_ih = gen_avx512bw_interleave_highv64qi;
+ break;
default:
gcc_unreachable ();
}
@@ -45126,7 +45339,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
/* For SSE2, we used an full interleave, so the desired
results are in the even elements. */
- for (i = 0; i < 32; ++i)
+ for (i = 0; i < 64; ++i)
d.perm[i] = i * 2;
}
else
@@ -45134,7 +45347,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
/* For AVX, the interleave used above was not cross-lane. So the
extraction is evens but with the second and third quarter swapped.
Happily, that is even one insn shorter than even extraction. */
- for (i = 0; i < 32; ++i)
+ for (i = 0; i < 64; ++i)
d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
}

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bb6372a..460cbff 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -300,6 +300,9 @@
(define_mode_iterator VI1_AVX2
[(V32QI "TARGET_AVX2") V16QI])

+(define_mode_iterator VI1_AVX512
+ [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI])
+
(define_mode_iterator VI2_AVX2
[(V16HI "TARGET_AVX2") V8HI])

@@ -9239,9 +9242,9 @@
(set_attr "mode" "TI")])

(define_expand "mul<mode>3"
- [(set (match_operand:VI1_AVX2 0 "register_operand")
- (mult:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand")
- (match_operand:VI1_AVX2 2 "register_operand")))]
+ [(set (match_operand:VI1_AVX512 0 "register_operand")
+ (mult:VI1_AVX512 (match_operand:VI1_AVX512 1 "register_operand")
+ (match_operand:VI1_AVX512 2 "register_operand")))]
"TARGET_SSE2"
{
ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
@@ -10621,7 +10624,8 @@
(V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
(V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")
(V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
- (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")])
+ (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
+ (V32HI "TARGET_AVX512BW")])

(define_expand "vec_perm<mode>"
[(match_operand:VEC_PERM_AVX2 0 "register_operand")
@@ -10642,7 +10646,8 @@
(V8SI "TARGET_AVX") (V4DI "TARGET_AVX")
(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
- (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
+ (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
+ (V32HI "TARGET_AVX512BW")])

(define_expand "vec_perm_const<mode>"
[(match_operand:VEC_PERM_CONST 0 "register_operand")
@@ -11006,8 +11011,8 @@
})

(define_insn "<sse2_avx2>_packsswb"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
- (vec_concat:VI1_AVX2
+ [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x")
+ (vec_concat:VI1_AVX512
(ss_truncate:<ssehalfvecmode>
(match_operand:<sseunpackmode> 1 "register_operand" "0,x"))
(ss_truncate:<ssehalfvecmode>
@@ -11040,8 +11045,8 @@
(set_attr "mode" "<sseinsnmode>")])

(define_insn "<sse2_avx2>_packuswb"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
- (vec_concat:VI1_AVX2
+ [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x")
+ (vec_concat:VI1_AVX512
(us_truncate:<ssehalfvecmode>
(match_operand:<sseunpackmode> 1 "register_operand" "0,x"))
(us_truncate:<ssehalfvecmode>
@@ -13559,21 +13564,21 @@
(set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
(set_attr "mode" "DI")])

-(define_insn "<ssse3_avx2>_pshufb<mode>3"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
- (unspec:VI1_AVX2
- [(match_operand:VI1_AVX2 1 "register_operand" "0,x")
- (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")]
+(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"
+ [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,v")
+ (unspec:VI1_AVX512
+ [(match_operand:VI1_AVX512 1 "register_operand" "0,v")
+ (match_operand:VI1_AVX512 2 "nonimmediate_operand" "xm,vm")]
UNSPEC_PSHUFB))]
- "TARGET_SSSE3"
+ "TARGET_SSSE3 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
"@
pshufb\t{%2, %0|%0, %2}
- vpshufb\t{%2, %1, %0|%0, %1, %2}"
+ vpshufb\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
[(set_attr "isa" "noavx,avx")
(set_attr "type" "sselog1")
(set_attr "prefix_data16" "1,*")
(set_attr "prefix_extra" "1")
- (set_attr "prefix" "orig,vex")
+ (set_attr "prefix" "orig,maybe_evex")
(set_attr "btver2_decode" "vector,vector")
(set_attr "mode" "<sseinsnmode>")])

@@ -15948,9 +15953,9 @@
(set_attr "mode" "TI")])

(define_expand "<shift_insn><mode>3"
- [(set (match_operand:VI1_AVX2 0 "register_operand")
- (any_shift:VI1_AVX2
- (match_operand:VI1_AVX2 1 "register_operand")
+ [(set (match_operand:VI1_AVX512 0 "register_operand")
+ (any_shift:VI1_AVX512
+ (match_operand:VI1_AVX512 1 "register_operand")
(match_operand:SI 2 "nonmemory_operand")))]
"TARGET_SSE2"
{
--
1.8.3.1