core/stdarch/crates/core_arch/src/x86/avx2.rs

//! Advanced Vector Extensions 2 (AVX2)
2//!
3//! AVX2 expands most AVX commands to 256-bit wide vector registers and
4//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
5//!
6//! The references are:
7//!
8//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
9//!   Instruction Set Reference, A-Z][intel64_ref].
10//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
11//!   System Instructions][amd64_ref].
12//!
13//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
14//! overview of the instructions available.
15//!
16//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
17//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
18//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
19//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
20
21use core::hint::unreachable_unchecked;
22
23use crate::core_arch::{simd::*, x86::*};
24use crate::intrinsics::simd::*;
25
26#[cfg(test)]
27use stdarch_test::assert_instr;
28
29/// Computes the absolute values of packed 32-bit integers in `a`.
30///
31/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32)
32#[inline]
33#[target_feature(enable = "avx2")]
34#[cfg_attr(test, assert_instr(vpabsd))]
35#[stable(feature = "simd_x86", since = "1.27.0")]
36pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i {
37    let a = a.as_i32x8();
38    let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
39    transmute(r)
40}
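
// Illustrative sketch, not part of the original source: exercises
// `_mm256_abs_epi32` on a mixed-sign vector. The helper name is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn abs_epi32_example() -> __m256i {
    let a = _mm256_setr_epi32(-1, 2, -3, 4, -5, 6, -7, 8);
    // Every lane becomes its absolute value: 1, 2, 3, 4, 5, 6, 7, 8.
    _mm256_abs_epi32(a)
}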
41
42/// Computes the absolute values of packed 16-bit integers in `a`.
43///
44/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16)
45#[inline]
46#[target_feature(enable = "avx2")]
47#[cfg_attr(test, assert_instr(vpabsw))]
48#[stable(feature = "simd_x86", since = "1.27.0")]
49pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i {
50    let a = a.as_i16x16();
51    let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
52    transmute(r)
53}
54
55/// Computes the absolute values of packed 8-bit integers in `a`.
56///
57/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8)
58#[inline]
59#[target_feature(enable = "avx2")]
60#[cfg_attr(test, assert_instr(vpabsb))]
61#[stable(feature = "simd_x86", since = "1.27.0")]
62pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i {
63    let a = a.as_i8x32();
64    let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
65    transmute(r)
66}
67
68/// Adds packed 64-bit integers in `a` and `b`.
69///
70/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64)
71#[inline]
72#[target_feature(enable = "avx2")]
73#[cfg_attr(test, assert_instr(vpaddq))]
74#[stable(feature = "simd_x86", since = "1.27.0")]
75pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
76    transmute(simd_add(a.as_i64x4(), b.as_i64x4()))
77}
78
79/// Adds packed 32-bit integers in `a` and `b`.
80///
81/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32)
82#[inline]
83#[target_feature(enable = "avx2")]
84#[cfg_attr(test, assert_instr(vpaddd))]
85#[stable(feature = "simd_x86", since = "1.27.0")]
86pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
87    transmute(simd_add(a.as_i32x8(), b.as_i32x8()))
88}
89
90/// Adds packed 16-bit integers in `a` and `b`.
91///
92/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16)
93#[inline]
94#[target_feature(enable = "avx2")]
95#[cfg_attr(test, assert_instr(vpaddw))]
96#[stable(feature = "simd_x86", since = "1.27.0")]
97pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
98    transmute(simd_add(a.as_i16x16(), b.as_i16x16()))
99}
100
101/// Adds packed 8-bit integers in `a` and `b`.
102///
103/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8)
104#[inline]
105#[target_feature(enable = "avx2")]
106#[cfg_attr(test, assert_instr(vpaddb))]
107#[stable(feature = "simd_x86", since = "1.27.0")]
108pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
109    transmute(simd_add(a.as_i8x32(), b.as_i8x32()))
110}
111
112/// Adds packed 8-bit integers in `a` and `b` using saturation.
113///
114/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8)
115#[inline]
116#[target_feature(enable = "avx2")]
117#[cfg_attr(test, assert_instr(vpaddsb))]
118#[stable(feature = "simd_x86", since = "1.27.0")]
119pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
120    transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32()))
121}
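
// Illustrative sketch, not part of the original source: contrasts the wrapping
// `_mm256_add_epi8` with the saturating `_mm256_adds_epi8`. The helper name is
// hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn adds_epi8_example() -> (__m256i, __m256i) {
    let a = _mm256_set1_epi8(i8::MAX);
    let b = _mm256_set1_epi8(1);
    // The wrapping add overflows to i8::MIN; the saturating add clamps at i8::MAX.
    (_mm256_add_epi8(a, b), _mm256_adds_epi8(a, b))
}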
122
123/// Adds packed 16-bit integers in `a` and `b` using saturation.
124///
125/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16)
126#[inline]
127#[target_feature(enable = "avx2")]
128#[cfg_attr(test, assert_instr(vpaddsw))]
129#[stable(feature = "simd_x86", since = "1.27.0")]
130pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
131    transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16()))
132}
133
134/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
135///
136/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8)
137#[inline]
138#[target_feature(enable = "avx2")]
139#[cfg_attr(test, assert_instr(vpaddusb))]
140#[stable(feature = "simd_x86", since = "1.27.0")]
141pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
142    transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32()))
143}
144
145/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
146///
147/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16)
148#[inline]
149#[target_feature(enable = "avx2")]
150#[cfg_attr(test, assert_instr(vpaddusw))]
151#[stable(feature = "simd_x86", since = "1.27.0")]
152pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
153    transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16()))
154}
155
/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
/// result, shifts the result right by `IMM8` bytes, and returns the low 16 bytes.
158///
159/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8)
160#[inline]
161#[target_feature(enable = "avx2")]
162#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))]
163#[rustc_legacy_const_generics(2)]
164#[stable(feature = "simd_x86", since = "1.27.0")]
165pub unsafe fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
166    static_assert_uimm_bits!(IMM8, 8);
167    // If palignr is shifting the pair of vectors more than the size of two
168    // lanes, emit zero.
169    if IMM8 >= 32 {
170        return _mm256_setzero_si256();
171    }
172    // If palignr is shifting the pair of input vectors more than one lane,
173    // but less than two lanes, convert to shifting in zeroes.
174    let (a, b) = if IMM8 > 16 {
175        (_mm256_setzero_si256(), a)
176    } else {
177        (a, b)
178    };
179
180    let a = a.as_i8x32();
181    let b = b.as_i8x32();
182
183    if IMM8 == 16 {
184        return transmute(a);
185    }
186
187    let r: i8x32 = match IMM8 % 16 {
188        0 => simd_shuffle!(
189            b,
190            a,
191            [
192                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
193                23, 24, 25, 26, 27, 28, 29, 30, 31,
194            ],
195        ),
196        1 => simd_shuffle!(
197            b,
198            a,
199            [
200                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
201                24, 25, 26, 27, 28, 29, 30, 31, 48,
202            ],
203        ),
204        2 => simd_shuffle!(
205            b,
206            a,
207            [
208                2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 18, 19, 20, 21, 22, 23, 24,
209                25, 26, 27, 28, 29, 30, 31, 48, 49,
210            ],
211        ),
212        3 => simd_shuffle!(
213            b,
214            a,
215            [
216                3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 19, 20, 21, 22, 23, 24,
217                25, 26, 27, 28, 29, 30, 31, 48, 49, 50,
218            ],
219        ),
220        4 => simd_shuffle!(
221            b,
222            a,
223            [
224                4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 20, 21, 22, 23, 24, 25,
225                26, 27, 28, 29, 30, 31, 48, 49, 50, 51,
226            ],
227        ),
228        5 => simd_shuffle!(
229            b,
230            a,
231            [
232                5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 21, 22, 23, 24, 25, 26,
233                27, 28, 29, 30, 31, 48, 49, 50, 51, 52,
234            ],
235        ),
236        6 => simd_shuffle!(
237            b,
238            a,
239            [
240                6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 22, 23, 24, 25, 26, 27,
241                28, 29, 30, 31, 48, 49, 50, 51, 52, 53,
242            ],
243        ),
244        7 => simd_shuffle!(
245            b,
246            a,
247            [
248                7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 23, 24, 25, 26, 27,
249                28, 29, 30, 31, 48, 49, 50, 51, 52, 53, 54,
250            ],
251        ),
252        8 => simd_shuffle!(
253            b,
254            a,
255            [
256                8, 9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 24, 25, 26, 27, 28,
257                29, 30, 31, 48, 49, 50, 51, 52, 53, 54, 55,
258            ],
259        ),
260        9 => simd_shuffle!(
261            b,
262            a,
263            [
264                9, 10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 25, 26, 27, 28, 29,
265                30, 31, 48, 49, 50, 51, 52, 53, 54, 55, 56,
266            ],
267        ),
268        10 => simd_shuffle!(
269            b,
270            a,
271            [
272                10, 11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 26, 27, 28, 29, 30,
273                31, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
274            ],
275        ),
276        11 => simd_shuffle!(
277            b,
278            a,
279            [
280                11, 12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 27, 28, 29, 30, 31,
281                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
282            ],
283        ),
284        12 => simd_shuffle!(
285            b,
286            a,
287            [
288                12, 13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 28, 29, 30, 31, 48,
289                49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
290            ],
291        ),
292        13 => simd_shuffle!(
293            b,
294            a,
295            [
296                13, 14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 29, 30, 31, 48, 49,
297                50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
298            ],
299        ),
300        14 => simd_shuffle!(
301            b,
302            a,
303            [
304                14, 15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 30, 31, 48, 49, 50,
305                51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
306            ],
307        ),
308        15 => simd_shuffle!(
309            b,
310            a,
311            [
312                15, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 31, 48, 49, 50, 51,
313                52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
314            ],
315        ),
316        _ => unreachable_unchecked(),
317    };
318    transmute(r)
319}
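
// Illustrative sketch, not part of the original source: each 128-bit lane of the
// concatenation `a:b` is shifted right by `IMM8` bytes, so the low bytes of each
// result lane come from `b` and the high bytes from `a`. The helper name is
// hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn alignr_epi8_example() -> __m256i {
    let a = _mm256_set1_epi8(0x11);
    let b = _mm256_set1_epi8(0x22);
    // Per lane: twelve 0x22 bytes from `b` followed by four 0x11 bytes from `a`.
    _mm256_alignr_epi8::<4>(a, b)
}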
320
321/// Computes the bitwise AND of 256 bits (representing integer data)
322/// in `a` and `b`.
323///
324/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256)
325#[inline]
326#[target_feature(enable = "avx2")]
327#[cfg_attr(test, assert_instr(vandps))]
328#[stable(feature = "simd_x86", since = "1.27.0")]
329pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
330    transmute(simd_and(a.as_i64x4(), b.as_i64x4()))
331}
332
333/// Computes the bitwise NOT of 256 bits (representing integer data)
334/// in `a` and then AND with `b`.
335///
336/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256)
337#[inline]
338#[target_feature(enable = "avx2")]
339#[cfg_attr(test, assert_instr(vandnps))]
340#[stable(feature = "simd_x86", since = "1.27.0")]
341pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
342    let all_ones = _mm256_set1_epi8(-1);
343    transmute(simd_and(
344        simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
345        b.as_i64x4(),
346    ))
347}
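
// Illustrative sketch, not part of the original source: `(!a) & b` is handy for
// clearing selected bits. The helper name and mask value are hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn andnot_example(value: __m256i) -> __m256i {
    let mask = _mm256_set1_epi32(0xFF);
    // Clears the low eight bits of every 32-bit element of `value`.
    _mm256_andnot_si256(mask, value)
}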
348
349/// Averages packed unsigned 16-bit integers in `a` and `b`.
350///
351/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16)
352#[inline]
353#[target_feature(enable = "avx2")]
354#[cfg_attr(test, assert_instr(vpavgw))]
355#[stable(feature = "simd_x86", since = "1.27.0")]
356pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
357    let a = simd_cast::<_, u32x16>(a.as_u16x16());
358    let b = simd_cast::<_, u32x16>(b.as_u16x16());
359    let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
360    transmute(simd_cast::<_, u16x16>(r))
361}
362
363/// Averages packed unsigned 8-bit integers in `a` and `b`.
364///
365/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8)
366#[inline]
367#[target_feature(enable = "avx2")]
368#[cfg_attr(test, assert_instr(vpavgb))]
369#[stable(feature = "simd_x86", since = "1.27.0")]
370pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
371    let a = simd_cast::<_, u16x32>(a.as_u8x32());
372    let b = simd_cast::<_, u16x32>(b.as_u8x32());
373    let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
374    transmute(simd_cast::<_, u8x32>(r))
375}
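
// Illustrative sketch, not part of the original source: the average rounds up,
// i.e. `(a + b + 1) >> 1` per byte, as in the implementation above. The helper
// name is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn avg_epu8_example() -> __m256i {
    let a = _mm256_set1_epi8(1);
    let b = _mm256_set1_epi8(2);
    // (1 + 2 + 1) >> 1 == 2 in every byte.
    _mm256_avg_epu8(a, b)
}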
376
377/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
378///
379/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32)
380#[inline]
381#[target_feature(enable = "avx2")]
382#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
383#[rustc_legacy_const_generics(2)]
384#[stable(feature = "simd_x86", since = "1.27.0")]
385pub unsafe fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
386    static_assert_uimm_bits!(IMM4, 4);
387    let a = a.as_i32x4();
388    let b = b.as_i32x4();
389    let r: i32x4 = simd_shuffle!(
390        a,
391        b,
392        [
393            [0, 4, 0, 4][IMM4 as usize & 0b11],
394            [1, 1, 5, 5][IMM4 as usize & 0b11],
395            [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
396            [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
397        ],
398    );
399    transmute(r)
400}
401
402/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
403///
404/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32)
405#[inline]
406#[target_feature(enable = "avx2")]
407#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
408#[rustc_legacy_const_generics(2)]
409#[stable(feature = "simd_x86", since = "1.27.0")]
410pub unsafe fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
411    static_assert_uimm_bits!(IMM8, 8);
412    let a = a.as_i32x8();
413    let b = b.as_i32x8();
414    let r: i32x8 = simd_shuffle!(
415        a,
416        b,
417        [
418            [0, 8, 0, 8][IMM8 as usize & 0b11],
419            [1, 1, 9, 9][IMM8 as usize & 0b11],
420            [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
421            [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
422            [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
423            [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
424            [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
425            [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
426        ],
427    );
428    transmute(r)
429}
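
// Illustrative sketch, not part of the original source: bit `i` of `IMM8` picks
// element `i` from `b` (1) or from `a` (0). The helper name is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn blend_epi32_example(a: __m256i, b: __m256i) -> __m256i {
    // 0b0000_1111: elements 0..=3 come from `b`, elements 4..=7 from `a`.
    _mm256_blend_epi32::<0b0000_1111>(a, b)
}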
430
431/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
432///
433/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16)
434#[inline]
435#[target_feature(enable = "avx2")]
436#[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))]
437#[rustc_legacy_const_generics(2)]
438#[stable(feature = "simd_x86", since = "1.27.0")]
439pub unsafe fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
440    static_assert_uimm_bits!(IMM8, 8);
441    let a = a.as_i16x16();
442    let b = b.as_i16x16();
443
444    let r: i16x16 = simd_shuffle!(
445        a,
446        b,
447        [
448            [0, 16, 0, 16][IMM8 as usize & 0b11],
449            [1, 1, 17, 17][IMM8 as usize & 0b11],
450            [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
451            [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
452            [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
453            [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
454            [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
455            [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
456            [8, 24, 8, 24][IMM8 as usize & 0b11],
457            [9, 9, 25, 25][IMM8 as usize & 0b11],
458            [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
459            [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
460            [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
461            [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
462            [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
463            [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
464        ],
465    );
466    transmute(r)
467}
468
469/// Blends packed 8-bit integers from `a` and `b` using `mask`.
470///
471/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8)
472#[inline]
473#[target_feature(enable = "avx2")]
474#[cfg_attr(test, assert_instr(vpblendvb))]
475#[stable(feature = "simd_x86", since = "1.27.0")]
476pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
477    let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
478    transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
479}
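
// Illustrative sketch, not part of the original source: a byte of `b` is taken
// wherever the corresponding mask byte has its highest bit set, otherwise the
// byte of `a` is kept. The helper name is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn blendv_epi8_example(a: __m256i, b: __m256i) -> __m256i {
    // An all-negative mask selects every byte from `b`.
    let mask = _mm256_set1_epi8(-1);
    _mm256_blendv_epi8(a, b, mask)
}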
480
481/// Broadcasts the low packed 8-bit integer from `a` to all elements of
482/// the 128-bit returned value.
483///
484/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8)
485#[inline]
486#[target_feature(enable = "avx2")]
487#[cfg_attr(test, assert_instr(vpbroadcastb))]
488#[stable(feature = "simd_x86", since = "1.27.0")]
489pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
490    let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
491    transmute::<i8x16, _>(ret)
492}
493
494/// Broadcasts the low packed 8-bit integer from `a` to all elements of
495/// the 256-bit returned value.
496///
497/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
498#[inline]
499#[target_feature(enable = "avx2")]
500#[cfg_attr(test, assert_instr(vpbroadcastb))]
501#[stable(feature = "simd_x86", since = "1.27.0")]
502pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
503    let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
504    transmute::<i8x32, _>(ret)
505}
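
// Illustrative sketch, not part of the original source: only the lowest byte of
// the 128-bit source is replicated. The helper name is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn broadcastb_epi8_example() -> __m256i {
    let a = _mm_setr_epi8(7, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    // All 32 bytes of the result are 7.
    _mm256_broadcastb_epi8(a)
}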
506
// N.B., `simd_shuffle!` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
509/// Broadcasts the low packed 32-bit integer from `a` to all elements of
510/// the 128-bit returned value.
511///
512/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
513#[inline]
514#[target_feature(enable = "avx2")]
515#[cfg_attr(test, assert_instr(vbroadcastss))]
516#[stable(feature = "simd_x86", since = "1.27.0")]
517pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
518    let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
519    transmute::<i32x4, _>(ret)
520}
521
// N.B., `simd_shuffle!` with integer data types for `a` and `b` is
// often compiled to `vbroadcastss`.
524/// Broadcasts the low packed 32-bit integer from `a` to all elements of
525/// the 256-bit returned value.
526///
527/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
528#[inline]
529#[target_feature(enable = "avx2")]
530#[cfg_attr(test, assert_instr(vbroadcastss))]
531#[stable(feature = "simd_x86", since = "1.27.0")]
532pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
533    let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
534    transmute::<i32x8, _>(ret)
535}
536
537/// Broadcasts the low packed 64-bit integer from `a` to all elements of
538/// the 128-bit returned value.
539///
540/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64)
541#[inline]
542#[target_feature(enable = "avx2")]
543// Emits `vmovddup` instead of `vpbroadcastq`
544// See https://github.com/rust-lang/stdarch/issues/791
545#[cfg_attr(test, assert_instr(vmovddup))]
546#[stable(feature = "simd_x86", since = "1.27.0")]
547pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
548    let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
549    transmute::<i64x2, _>(ret)
550}
551
552/// Broadcasts the low packed 64-bit integer from `a` to all elements of
553/// the 256-bit returned value.
554///
555/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64)
556#[inline]
557#[target_feature(enable = "avx2")]
558#[cfg_attr(test, assert_instr(vbroadcastsd))]
559#[stable(feature = "simd_x86", since = "1.27.0")]
560pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
561    let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
562    transmute::<i64x4, _>(ret)
563}
564
565/// Broadcasts the low double-precision (64-bit) floating-point element
566/// from `a` to all elements of the 128-bit returned value.
567///
568/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
569#[inline]
570#[target_feature(enable = "avx2")]
571#[cfg_attr(test, assert_instr(vmovddup))]
572#[stable(feature = "simd_x86", since = "1.27.0")]
573pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
574    simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 2])
575}
576
577/// Broadcasts the low double-precision (64-bit) floating-point element
578/// from `a` to all elements of the 256-bit returned value.
579///
580/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
581#[inline]
582#[target_feature(enable = "avx2")]
583#[cfg_attr(test, assert_instr(vbroadcastsd))]
584#[stable(feature = "simd_x86", since = "1.27.0")]
585pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
586    simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4])
587}
588
/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
/// the 256-bit returned value.
591///
592/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
593#[inline]
594#[target_feature(enable = "avx2")]
595#[stable(feature = "simd_x86_updates", since = "1.82.0")]
596pub unsafe fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
597    let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
598    transmute::<i64x4, _>(ret)
599}
600
601// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
602// `vbroadcastf128`.
/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
/// the 256-bit returned value.
605///
606/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256)
607#[inline]
608#[target_feature(enable = "avx2")]
609#[stable(feature = "simd_x86", since = "1.27.0")]
610pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
611    let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
612    transmute::<i64x4, _>(ret)
613}
614
615/// Broadcasts the low single-precision (32-bit) floating-point element
616/// from `a` to all elements of the 128-bit returned value.
617///
618/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
619#[inline]
620#[target_feature(enable = "avx2")]
621#[cfg_attr(test, assert_instr(vbroadcastss))]
622#[stable(feature = "simd_x86", since = "1.27.0")]
623pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 {
624    simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 4])
625}
626
627/// Broadcasts the low single-precision (32-bit) floating-point element
628/// from `a` to all elements of the 256-bit returned value.
629///
630/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
631#[inline]
632#[target_feature(enable = "avx2")]
633#[cfg_attr(test, assert_instr(vbroadcastss))]
634#[stable(feature = "simd_x86", since = "1.27.0")]
635pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
636    simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 8])
637}
638
/// Broadcasts the low packed 16-bit integer from `a` to all elements of
/// the 128-bit returned value.
641///
642/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16)
643#[inline]
644#[target_feature(enable = "avx2")]
645#[cfg_attr(test, assert_instr(vpbroadcastw))]
646#[stable(feature = "simd_x86", since = "1.27.0")]
647pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
648    let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
649    transmute::<i16x8, _>(ret)
650}
651
/// Broadcasts the low packed 16-bit integer from `a` to all elements of
/// the 256-bit returned value.
654///
655/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16)
656#[inline]
657#[target_feature(enable = "avx2")]
658#[cfg_attr(test, assert_instr(vpbroadcastw))]
659#[stable(feature = "simd_x86", since = "1.27.0")]
660pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
661    let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
662    transmute::<i16x16, _>(ret)
663}
664
665/// Compares packed 64-bit integers in `a` and `b` for equality.
666///
667/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64)
668#[inline]
669#[target_feature(enable = "avx2")]
670#[cfg_attr(test, assert_instr(vpcmpeqq))]
671#[stable(feature = "simd_x86", since = "1.27.0")]
672pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
673    transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4()))
674}
675
676/// Compares packed 32-bit integers in `a` and `b` for equality.
677///
678/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32)
679#[inline]
680#[target_feature(enable = "avx2")]
681#[cfg_attr(test, assert_instr(vpcmpeqd))]
682#[stable(feature = "simd_x86", since = "1.27.0")]
683pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
684    transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8()))
685}
686
687/// Compares packed 16-bit integers in `a` and `b` for equality.
688///
689/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16)
690#[inline]
691#[target_feature(enable = "avx2")]
692#[cfg_attr(test, assert_instr(vpcmpeqw))]
693#[stable(feature = "simd_x86", since = "1.27.0")]
694pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
695    transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16()))
696}
697
698/// Compares packed 8-bit integers in `a` and `b` for equality.
699///
700/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8)
701#[inline]
702#[target_feature(enable = "avx2")]
703#[cfg_attr(test, assert_instr(vpcmpeqb))]
704#[stable(feature = "simd_x86", since = "1.27.0")]
705pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
706    transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32()))
707}
708
709/// Compares packed 64-bit integers in `a` and `b` for greater-than.
710///
711/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64)
712#[inline]
713#[target_feature(enable = "avx2")]
714#[cfg_attr(test, assert_instr(vpcmpgtq))]
715#[stable(feature = "simd_x86", since = "1.27.0")]
716pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
717    transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4()))
718}
719
720/// Compares packed 32-bit integers in `a` and `b` for greater-than.
721///
722/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32)
723#[inline]
724#[target_feature(enable = "avx2")]
725#[cfg_attr(test, assert_instr(vpcmpgtd))]
726#[stable(feature = "simd_x86", since = "1.27.0")]
727pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
728    transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8()))
729}
730
731/// Compares packed 16-bit integers in `a` and `b` for greater-than.
732///
733/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16)
734#[inline]
735#[target_feature(enable = "avx2")]
736#[cfg_attr(test, assert_instr(vpcmpgtw))]
737#[stable(feature = "simd_x86", since = "1.27.0")]
738pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
739    transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16()))
740}
741
742/// Compares packed 8-bit integers in `a` and `b` for greater-than.
743///
744/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8)
745#[inline]
746#[target_feature(enable = "avx2")]
747#[cfg_attr(test, assert_instr(vpcmpgtb))]
748#[stable(feature = "simd_x86", since = "1.27.0")]
749pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
750    transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32()))
751}
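
// Illustrative sketch, not part of the original source: comparison results are
// all-ones or all-zero lanes, so they can be reduced to a bitmask with
// `_mm256_movemask_epi8` (defined elsewhere in this module). The helper name is
// hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn cmpgt_epi8_example(a: __m256i, b: __m256i) -> i32 {
    let gt = _mm256_cmpgt_epi8(a, b);
    // Bit i of the result is set when byte i of `a` is greater than byte i of `b`.
    _mm256_movemask_epi8(gt)
}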
752
753/// Sign-extend 16-bit integers to 32-bit integers.
754///
755/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32)
756#[inline]
757#[target_feature(enable = "avx2")]
758#[cfg_attr(test, assert_instr(vpmovsxwd))]
759#[stable(feature = "simd_x86", since = "1.27.0")]
760pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
761    transmute::<i32x8, _>(simd_cast(a.as_i16x8()))
762}
763
764/// Sign-extend 16-bit integers to 64-bit integers.
765///
766/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64)
767#[inline]
768#[target_feature(enable = "avx2")]
769#[cfg_attr(test, assert_instr(vpmovsxwq))]
770#[stable(feature = "simd_x86", since = "1.27.0")]
771pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
772    let a = a.as_i16x8();
773    let v64: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
774    transmute::<i64x4, _>(simd_cast(v64))
775}
776
777/// Sign-extend 32-bit integers to 64-bit integers.
778///
779/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64)
780#[inline]
781#[target_feature(enable = "avx2")]
782#[cfg_attr(test, assert_instr(vpmovsxdq))]
783#[stable(feature = "simd_x86", since = "1.27.0")]
784pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
785    transmute::<i64x4, _>(simd_cast(a.as_i32x4()))
786}
787
788/// Sign-extend 8-bit integers to 16-bit integers.
789///
790/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16)
791#[inline]
792#[target_feature(enable = "avx2")]
793#[cfg_attr(test, assert_instr(vpmovsxbw))]
794#[stable(feature = "simd_x86", since = "1.27.0")]
795pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
796    transmute::<i16x16, _>(simd_cast(a.as_i8x16()))
797}
798
799/// Sign-extend 8-bit integers to 32-bit integers.
800///
801/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32)
802#[inline]
803#[target_feature(enable = "avx2")]
804#[cfg_attr(test, assert_instr(vpmovsxbd))]
805#[stable(feature = "simd_x86", since = "1.27.0")]
806pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
807    let a = a.as_i8x16();
808    let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
809    transmute::<i32x8, _>(simd_cast(v64))
810}
811
812/// Sign-extend 8-bit integers to 64-bit integers.
813///
814/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64)
815#[inline]
816#[target_feature(enable = "avx2")]
817#[cfg_attr(test, assert_instr(vpmovsxbq))]
818#[stable(feature = "simd_x86", since = "1.27.0")]
819pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
820    let a = a.as_i8x16();
821    let v32: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
822    transmute::<i64x4, _>(simd_cast(v32))
823}
824
/// Zero-extend packed unsigned 16-bit integers in `a` to packed 32-bit
/// integers.
827///
828/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32)
829#[inline]
830#[target_feature(enable = "avx2")]
831#[cfg_attr(test, assert_instr(vpmovzxwd))]
832#[stable(feature = "simd_x86", since = "1.27.0")]
833pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
834    transmute::<i32x8, _>(simd_cast(a.as_u16x8()))
835}
836
837/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
838/// integers. The upper four elements of `a` are unused.
839///
840/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64)
841#[inline]
842#[target_feature(enable = "avx2")]
843#[cfg_attr(test, assert_instr(vpmovzxwq))]
844#[stable(feature = "simd_x86", since = "1.27.0")]
845pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
846    let a = a.as_u16x8();
847    let v64: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
848    transmute::<i64x4, _>(simd_cast(v64))
849}
850
851/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
852///
853/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64)
854#[inline]
855#[target_feature(enable = "avx2")]
856#[cfg_attr(test, assert_instr(vpmovzxdq))]
857#[stable(feature = "simd_x86", since = "1.27.0")]
858pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
859    transmute::<i64x4, _>(simd_cast(a.as_u32x4()))
860}
861
862/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
863///
864/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16)
865#[inline]
866#[target_feature(enable = "avx2")]
867#[cfg_attr(test, assert_instr(vpmovzxbw))]
868#[stable(feature = "simd_x86", since = "1.27.0")]
869pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
870    transmute::<i16x16, _>(simd_cast(a.as_u8x16()))
871}
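
// Illustrative sketch, not part of the original source: contrasts sign extension
// with zero extension of the same byte pattern. The helper name is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn extend_epi8_example() -> (__m256i, __m256i) {
    let a = _mm_set1_epi8(-1);
    // Sign extension keeps each -1 as -1 (0xFFFF); zero extension turns the same
    // 0xFF byte into 255 (0x00FF).
    (_mm256_cvtepi8_epi16(a), _mm256_cvtepu8_epi16(a))
}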
872
873/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
874/// integers. The upper eight elements of `a` are unused.
875///
876/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32)
877#[inline]
878#[target_feature(enable = "avx2")]
879#[cfg_attr(test, assert_instr(vpmovzxbd))]
880#[stable(feature = "simd_x86", since = "1.27.0")]
881pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
882    let a = a.as_u8x16();
883    let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
884    transmute::<i32x8, _>(simd_cast(v64))
885}
886
887/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
888/// integers. The upper twelve elements of `a` are unused.
889///
890/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64)
891#[inline]
892#[target_feature(enable = "avx2")]
893#[cfg_attr(test, assert_instr(vpmovzxbq))]
894#[stable(feature = "simd_x86", since = "1.27.0")]
895pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
896    let a = a.as_u8x16();
897    let v32: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
898    transmute::<i64x4, _>(simd_cast(v32))
899}
900
901/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`.
902///
903/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256)
904#[inline]
905#[target_feature(enable = "avx2")]
906#[cfg_attr(
907    all(test, not(target_env = "msvc")),
908    assert_instr(vextractf128, IMM1 = 1)
909)]
910#[rustc_legacy_const_generics(1)]
911#[stable(feature = "simd_x86", since = "1.27.0")]
912pub unsafe fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
913    static_assert_uimm_bits!(IMM1, 1);
914    let a = a.as_i64x4();
915    let b = i64x4::ZERO;
916    let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
917    transmute(dst)
918}
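
// Illustrative sketch, not part of the original source: `IMM1 = 0` selects the
// low 128 bits and `IMM1 = 1` the high 128 bits. The helper name is hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn extracti128_example(v: __m256i) -> (__m128i, __m128i) {
    (
        _mm256_extracti128_si256::<0>(v),
        _mm256_extracti128_si256::<1>(v),
    )
}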
919
920/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
921///
922/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16)
923#[inline]
924#[target_feature(enable = "avx2")]
925#[cfg_attr(test, assert_instr(vphaddw))]
926#[stable(feature = "simd_x86", since = "1.27.0")]
927pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
928    transmute(phaddw(a.as_i16x16(), b.as_i16x16()))
929}
930
931/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
932///
933/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32)
934#[inline]
935#[target_feature(enable = "avx2")]
936#[cfg_attr(test, assert_instr(vphaddd))]
937#[stable(feature = "simd_x86", since = "1.27.0")]
938pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
939    transmute(phaddd(a.as_i32x8(), b.as_i32x8()))
940}
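
// Illustrative sketch, not part of the original source: pairwise sums from `a`
// and `b` are interleaved within each 128-bit lane. The helper name is
// hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn hadd_epi32_example() -> __m256i {
    let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
    let b = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
    // Low lane: a0+a1, a2+a3, b0+b1, b2+b3; high lane: a4+a5, a6+a7, b4+b5, b6+b7.
    _mm256_hadd_epi32(a, b)
}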
941
942/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
943/// using saturation.
944///
945/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16)
946#[inline]
947#[target_feature(enable = "avx2")]
948#[cfg_attr(test, assert_instr(vphaddsw))]
949#[stable(feature = "simd_x86", since = "1.27.0")]
950pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
951    transmute(phaddsw(a.as_i16x16(), b.as_i16x16()))
952}
953
/// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`.
955///
956/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16)
957#[inline]
958#[target_feature(enable = "avx2")]
959#[cfg_attr(test, assert_instr(vphsubw))]
960#[stable(feature = "simd_x86", since = "1.27.0")]
961pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
962    transmute(phsubw(a.as_i16x16(), b.as_i16x16()))
963}
964
/// Horizontally subtracts adjacent pairs of 32-bit integers in `a` and `b`.
966///
967/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32)
968#[inline]
969#[target_feature(enable = "avx2")]
970#[cfg_attr(test, assert_instr(vphsubd))]
971#[stable(feature = "simd_x86", since = "1.27.0")]
972pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
973    transmute(phsubd(a.as_i32x8(), b.as_i32x8()))
974}
975
/// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`
/// using saturation.
978///
979/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16)
980#[inline]
981#[target_feature(enable = "avx2")]
982#[cfg_attr(test, assert_instr(vphsubsw))]
983#[stable(feature = "simd_x86", since = "1.27.0")]
984pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
985    transmute(phsubsw(a.as_i16x16(), b.as_i16x16()))
986}
987
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
991///
992/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi32)
993#[inline]
994#[target_feature(enable = "avx2")]
995#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
996#[rustc_legacy_const_generics(2)]
997#[stable(feature = "simd_x86", since = "1.27.0")]
998pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
999    slice: *const i32,
1000    offsets: __m128i,
1001) -> __m128i {
1002    static_assert_imm8_scale!(SCALE);
1003    let zero = i32x4::ZERO;
1004    let neg_one = _mm_set1_epi32(-1).as_i32x4();
1005    let offsets = offsets.as_i32x4();
1006    let slice = slice as *const i8;
1007    let r = pgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
1008    transmute(r)
1009}
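
// Illustrative sketch, not part of the original source: gathers every other
// element of a small table. The helper name and table contents are hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn i32gather_epi32_example() -> __m128i {
    let table: [i32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
    let offsets = _mm_setr_epi32(0, 2, 4, 6);
    // SCALE = 4 because the gathered elements are 4-byte `i32`s; the result is
    // [10, 12, 14, 16].
    _mm_i32gather_epi32::<4>(table.as_ptr(), offsets)
}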
1010
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used for that position
/// instead.
1015///
1016/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi32)
1017#[inline]
1018#[target_feature(enable = "avx2")]
1019#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1020#[rustc_legacy_const_generics(4)]
1021#[stable(feature = "simd_x86", since = "1.27.0")]
1022pub unsafe fn _mm_mask_i32gather_epi32<const SCALE: i32>(
1023    src: __m128i,
1024    slice: *const i32,
1025    offsets: __m128i,
1026    mask: __m128i,
1027) -> __m128i {
1028    static_assert_imm8_scale!(SCALE);
1029    let src = src.as_i32x4();
1030    let mask = mask.as_i32x4();
1031    let offsets = offsets.as_i32x4();
1032    let slice = slice as *const i8;
1033    let r = pgatherdd(src, slice, offsets, mask, SCALE as i8);
1034    transmute(r)
1035}
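
// Illustrative sketch, not part of the original source: lanes whose mask element
// has its highest bit set are gathered from memory, the remaining lanes keep the
// value from `src`. The helper name and data are hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx2")]
unsafe fn mask_i32gather_epi32_example() -> __m128i {
    let table: [i32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
    let src = _mm_set1_epi32(-1);
    let offsets = _mm_setr_epi32(0, 1, 2, 3);
    let mask = _mm_setr_epi32(-1, 0, -1, 0);
    // Gathers lanes 0 and 2 (10 and 12); lanes 1 and 3 are taken from `src` (-1).
    _mm_mask_i32gather_epi32::<4>(src, table.as_ptr(), offsets, mask)
}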
1036
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1040///
1041/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi32)
1042#[inline]
1043#[target_feature(enable = "avx2")]
1044#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1045#[rustc_legacy_const_generics(2)]
1046#[stable(feature = "simd_x86", since = "1.27.0")]
1047pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
1048    slice: *const i32,
1049    offsets: __m256i,
1050) -> __m256i {
1051    static_assert_imm8_scale!(SCALE);
1052    let zero = i32x8::ZERO;
1053    let neg_one = _mm256_set1_epi32(-1).as_i32x8();
1054    let offsets = offsets.as_i32x8();
1055    let slice = slice as *const i8;
1056    let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
1057    transmute(r)
1058}
1059
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used for that position
/// instead.
1064///
1065/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi32)
1066#[inline]
1067#[target_feature(enable = "avx2")]
1068#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1069#[rustc_legacy_const_generics(4)]
1070#[stable(feature = "simd_x86", since = "1.27.0")]
1071pub unsafe fn _mm256_mask_i32gather_epi32<const SCALE: i32>(
1072    src: __m256i,
1073    slice: *const i32,
1074    offsets: __m256i,
1075    mask: __m256i,
1076) -> __m256i {
1077    static_assert_imm8_scale!(SCALE);
1078    let src = src.as_i32x8();
1079    let mask = mask.as_i32x8();
1080    let offsets = offsets.as_i32x8();
1081    let slice = slice as *const i8;
1082    let r = vpgatherdd(src, slice, offsets, mask, SCALE as i8);
1083    transmute(r)
1084}
1085
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1089///
1090/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_ps)
1091#[inline]
1092#[target_feature(enable = "avx2")]
1093#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1094#[rustc_legacy_const_generics(2)]
1095#[stable(feature = "simd_x86", since = "1.27.0")]
1096pub unsafe fn _mm_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
1097    static_assert_imm8_scale!(SCALE);
1098    let zero = _mm_setzero_ps();
1099    let neg_one = _mm_set1_ps(-1.0);
1100    let offsets = offsets.as_i32x4();
1101    let slice = slice as *const i8;
1102    pgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
1103}
1104
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used for that position
/// instead.
1109///
1110/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_ps)
1111#[inline]
1112#[target_feature(enable = "avx2")]
1113#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1114#[rustc_legacy_const_generics(4)]
1115#[stable(feature = "simd_x86", since = "1.27.0")]
1116pub unsafe fn _mm_mask_i32gather_ps<const SCALE: i32>(
1117    src: __m128,
1118    slice: *const f32,
1119    offsets: __m128i,
1120    mask: __m128,
1121) -> __m128 {
1122    static_assert_imm8_scale!(SCALE);
1123    let offsets = offsets.as_i32x4();
1124    let slice = slice as *const i8;
1125    pgatherdps(src, slice, offsets, mask, SCALE as i8)
1126}
1127
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1131///
1132/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_ps)
1133#[inline]
1134#[target_feature(enable = "avx2")]
1135#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1136#[rustc_legacy_const_generics(2)]
1137#[stable(feature = "simd_x86", since = "1.27.0")]
1138pub unsafe fn _mm256_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m256 {
1139    static_assert_imm8_scale!(SCALE);
1140    let zero = _mm256_setzero_ps();
1141    let neg_one = _mm256_set1_ps(-1.0);
1142    let offsets = offsets.as_i32x8();
1143    let slice = slice as *const i8;
1144    vpgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
1145}
1146
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used for that position
/// instead.
1151///
1152/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_ps)
1153#[inline]
1154#[target_feature(enable = "avx2")]
1155#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1156#[rustc_legacy_const_generics(4)]
1157#[stable(feature = "simd_x86", since = "1.27.0")]
1158pub unsafe fn _mm256_mask_i32gather_ps<const SCALE: i32>(
1159    src: __m256,
1160    slice: *const f32,
1161    offsets: __m256i,
1162    mask: __m256,
1163) -> __m256 {
1164    static_assert_imm8_scale!(SCALE);
1165    let offsets = offsets.as_i32x8();
1166    let slice = slice as *const i8;
1167    vpgatherdps(src, slice, offsets, mask, SCALE as i8)
1168}
1169
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1173///
1174/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi64)
1175#[inline]
1176#[target_feature(enable = "avx2")]
1177#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1178#[rustc_legacy_const_generics(2)]
1179#[stable(feature = "simd_x86", since = "1.27.0")]
1180pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
1181    slice: *const i64,
1182    offsets: __m128i,
1183) -> __m128i {
1184    static_assert_imm8_scale!(SCALE);
1185    let zero = i64x2::ZERO;
1186    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1187    let offsets = offsets.as_i32x4();
1188    let slice = slice as *const i8;
1189    let r = pgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
1190    transmute(r)
1191}
1192
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used for that position
/// instead.
1197///
1198/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi64)
1199#[inline]
1200#[target_feature(enable = "avx2")]
1201#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1202#[rustc_legacy_const_generics(4)]
1203#[stable(feature = "simd_x86", since = "1.27.0")]
1204pub unsafe fn _mm_mask_i32gather_epi64<const SCALE: i32>(
1205    src: __m128i,
1206    slice: *const i64,
1207    offsets: __m128i,
1208    mask: __m128i,
1209) -> __m128i {
1210    static_assert_imm8_scale!(SCALE);
1211    let src = src.as_i64x2();
1212    let mask = mask.as_i64x2();
1213    let offsets = offsets.as_i32x4();
1214    let slice = slice as *const i8;
1215    let r = pgatherdq(src, slice, offsets, mask, SCALE as i8);
1216    transmute(r)
1217}
1218
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1222///
1223/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi64)
1224#[inline]
1225#[target_feature(enable = "avx2")]
1226#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1227#[rustc_legacy_const_generics(2)]
1228#[stable(feature = "simd_x86", since = "1.27.0")]
1229pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
1230    slice: *const i64,
1231    offsets: __m128i,
1232) -> __m256i {
1233    static_assert_imm8_scale!(SCALE);
1234    let zero = i64x4::ZERO;
1235    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1236    let offsets = offsets.as_i32x4();
1237    let slice = slice as *const i8;
1238    let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
1239    transmute(r)
1240}
1241
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used for that position
/// instead.
1246///
1247/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi64)
1248#[inline]
1249#[target_feature(enable = "avx2")]
1250#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1251#[rustc_legacy_const_generics(4)]
1252#[stable(feature = "simd_x86", since = "1.27.0")]
1253pub unsafe fn _mm256_mask_i32gather_epi64<const SCALE: i32>(
1254    src: __m256i,
1255    slice: *const i64,
1256    offsets: __m128i,
1257    mask: __m256i,
1258) -> __m256i {
1259    static_assert_imm8_scale!(SCALE);
1260    let src = src.as_i64x4();
1261    let mask = mask.as_i64x4();
1262    let offsets = offsets.as_i32x4();
1263    let slice = slice as *const i8;
1264    let r = vpgatherdq(src, slice, offsets, mask, SCALE as i8);
1265    transmute(r)
1266}
1267
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1271///
1272/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_pd)
1273#[inline]
1274#[target_feature(enable = "avx2")]
1275#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1276#[rustc_legacy_const_generics(2)]
1277#[stable(feature = "simd_x86", since = "1.27.0")]
1278pub unsafe fn _mm_i32gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
1279    static_assert_imm8_scale!(SCALE);
1280    let zero = _mm_setzero_pd();
1281    let neg_one = _mm_set1_pd(-1.0);
1282    let offsets = offsets.as_i32x4();
1283    let slice = slice as *const i8;
1284    pgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
1285}
1286
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used for that position
/// instead.
1291///
1292/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_pd)
1293#[inline]
1294#[target_feature(enable = "avx2")]
1295#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1296#[rustc_legacy_const_generics(4)]
1297#[stable(feature = "simd_x86", since = "1.27.0")]
1298pub unsafe fn _mm_mask_i32gather_pd<const SCALE: i32>(
1299    src: __m128d,
1300    slice: *const f64,
1301    offsets: __m128i,
1302    mask: __m128d,
1303) -> __m128d {
1304    static_assert_imm8_scale!(SCALE);
1305    let offsets = offsets.as_i32x4();
1306    let slice = slice as *const i8;
1307    pgatherdpd(src, slice, offsets, mask, SCALE as i8)
1308}
1309
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1313///
1314/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_pd)
1315#[inline]
1316#[target_feature(enable = "avx2")]
1317#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1318#[rustc_legacy_const_generics(2)]
1319#[stable(feature = "simd_x86", since = "1.27.0")]
1320pub unsafe fn _mm256_i32gather_pd<const SCALE: i32>(
1321    slice: *const f64,
1322    offsets: __m128i,
1323) -> __m256d {
1324    static_assert_imm8_scale!(SCALE);
1325    let zero = _mm256_setzero_pd();
1326    let neg_one = _mm256_set1_pd(-1.0);
1327    let offsets = offsets.as_i32x4();
1328    let slice = slice as *const i8;
1329    vpgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
1330}
1331
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used for that position
/// instead.
1336///
1337/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_pd)
1338#[inline]
1339#[target_feature(enable = "avx2")]
1340#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1341#[rustc_legacy_const_generics(4)]
1342#[stable(feature = "simd_x86", since = "1.27.0")]
1343pub unsafe fn _mm256_mask_i32gather_pd<const SCALE: i32>(
1344    src: __m256d,
1345    slice: *const f64,
1346    offsets: __m128i,
1347    mask: __m256d,
1348) -> __m256d {
1349    static_assert_imm8_scale!(SCALE);
1350    let offsets = offsets.as_i32x4();
1351    let slice = slice as *const i8;
1352    vpgatherdpd(src, slice, offsets, mask, SCALE as i8)
1353}
1354
1355/// Returns values from `slice` at offsets determined by `offsets * scale`,
1356/// where `scale` should be 1, 2, 4 or 8.
1358///
1359/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi32)
1360#[inline]
1361#[target_feature(enable = "avx2")]
1362#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1363#[rustc_legacy_const_generics(2)]
1364#[stable(feature = "simd_x86", since = "1.27.0")]
1365pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
1366    slice: *const i32,
1367    offsets: __m128i,
1368) -> __m128i {
1369    static_assert_imm8_scale!(SCALE);
1370    let zero = i32x4::ZERO;
1371    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1372    let offsets = offsets.as_i64x2();
1373    let slice = slice as *const i8;
1374    let r = pgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1375    transmute(r)
1376}
1377
1378/// Returns values from `slice` at offsets determined by `offsets * scale`,
1379/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1380/// element does not have its highest bit set are copied from `src` instead of
1381/// being gathered from memory.
1382///
1383/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi32)
1384#[inline]
1385#[target_feature(enable = "avx2")]
1386#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1387#[rustc_legacy_const_generics(4)]
1388#[stable(feature = "simd_x86", since = "1.27.0")]
1389pub unsafe fn _mm_mask_i64gather_epi32<const SCALE: i32>(
1390    src: __m128i,
1391    slice: *const i32,
1392    offsets: __m128i,
1393    mask: __m128i,
1394) -> __m128i {
1395    static_assert_imm8_scale!(SCALE);
1396    let src = src.as_i32x4();
1397    let mask = mask.as_i32x4();
1398    let offsets = offsets.as_i64x2();
1399    let slice = slice as *const i8;
1400    let r = pgatherqd(src, slice, offsets, mask, SCALE as i8);
1401    transmute(r)
1402}
1403
1404/// Returns values from `slice` at offsets determined by `offsets * scale`,
1405/// where `scale` should be 1, 2, 4 or 8.
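///
/// A minimal sketch (illustrative only, not from Intel's documentation),
/// assuming AVX2 support is detected at runtime; four 64-bit offsets gather
/// four 32-bit values into a 128-bit result, with `SCALE = 4` because each
/// `i32` is 4 bytes wide:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let data: [i32; 8] = [0, 10, 20, 30, 40, 50, 60, 70];
/// let offsets = _mm256_setr_epi64x(7, 5, 3, 1);
/// let r = _mm256_i64gather_epi32::<4>(data.as_ptr(), offsets);
/// assert_eq!(_mm_extract_epi32::<0>(r), 70);
/// assert_eq!(_mm_extract_epi32::<3>(r), 10);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```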
1407///
1408/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi32)
1409#[inline]
1410#[target_feature(enable = "avx2")]
1411#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1412#[rustc_legacy_const_generics(2)]
1413#[stable(feature = "simd_x86", since = "1.27.0")]
1414pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
1415    slice: *const i32,
1416    offsets: __m256i,
1417) -> __m128i {
1418    static_assert_imm8_scale!(SCALE);
1419    let zero = i32x4::ZERO;
1420    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1421    let offsets = offsets.as_i64x4();
1422    let slice = slice as *const i8;
1423    let r = vpgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1424    transmute(r)
1425}
1426
1427/// Returns values from `slice` at offsets determined by `offsets * scale`,
1428/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1429/// element does not have its highest bit set are copied from `src` instead of
1430/// being gathered from memory.
1431///
1432/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi32)
1433#[inline]
1434#[target_feature(enable = "avx2")]
1435#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1436#[rustc_legacy_const_generics(4)]
1437#[stable(feature = "simd_x86", since = "1.27.0")]
1438pub unsafe fn _mm256_mask_i64gather_epi32<const SCALE: i32>(
1439    src: __m128i,
1440    slice: *const i32,
1441    offsets: __m256i,
1442    mask: __m128i,
1443) -> __m128i {
1444    static_assert_imm8_scale!(SCALE);
1445    let src = src.as_i32x4();
1446    let mask = mask.as_i32x4();
1447    let offsets = offsets.as_i64x4();
1448    let slice = slice as *const i8;
1449    let r = vpgatherqd(src, slice, offsets, mask, SCALE as i8);
1450    transmute(r)
1451}
1452
1453/// Returns values from `slice` at offsets determined by `offsets * scale`,
1454/// where `scale` should be 1, 2, 4 or 8.
1456///
1457/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_ps)
1458#[inline]
1459#[target_feature(enable = "avx2")]
1460#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1461#[rustc_legacy_const_generics(2)]
1462#[stable(feature = "simd_x86", since = "1.27.0")]
1463pub unsafe fn _mm_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
1464    static_assert_imm8_scale!(SCALE);
1465    let zero = _mm_setzero_ps();
1466    let neg_one = _mm_set1_ps(-1.0);
1467    let offsets = offsets.as_i64x2();
1468    let slice = slice as *const i8;
1469    pgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1470}
1471
1472/// Returns values from `slice` at offsets determined by `offsets * scale`,
1473/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1474/// element does not have its highest bit set are copied from `src` instead of
1475/// being gathered from memory.
1476///
1477/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_ps)
1478#[inline]
1479#[target_feature(enable = "avx2")]
1480#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1481#[rustc_legacy_const_generics(4)]
1482#[stable(feature = "simd_x86", since = "1.27.0")]
1483pub unsafe fn _mm_mask_i64gather_ps<const SCALE: i32>(
1484    src: __m128,
1485    slice: *const f32,
1486    offsets: __m128i,
1487    mask: __m128,
1488) -> __m128 {
1489    static_assert_imm8_scale!(SCALE);
1490    let offsets = offsets.as_i64x2();
1491    let slice = slice as *const i8;
1492    pgatherqps(src, slice, offsets, mask, SCALE as i8)
1493}
1494
1495/// Returns values from `slice` at offsets determined by `offsets * scale`,
1496/// where `scale` should be 1, 2, 4 or 8.
1498///
1499/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_ps)
1500#[inline]
1501#[target_feature(enable = "avx2")]
1502#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1503#[rustc_legacy_const_generics(2)]
1504#[stable(feature = "simd_x86", since = "1.27.0")]
1505pub unsafe fn _mm256_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m128 {
1506    static_assert_imm8_scale!(SCALE);
1507    let zero = _mm_setzero_ps();
1508    let neg_one = _mm_set1_ps(-1.0);
1509    let offsets = offsets.as_i64x4();
1510    let slice = slice as *const i8;
1511    vpgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1512}
1513
1514/// Returns values from `slice` at offsets determined by `offsets * scale`,
1515/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1516/// element does not have its highest bit set are copied from `src` instead of
1517/// being gathered from memory.
1518///
1519/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_ps)
1520#[inline]
1521#[target_feature(enable = "avx2")]
1522#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1523#[rustc_legacy_const_generics(4)]
1524#[stable(feature = "simd_x86", since = "1.27.0")]
1525pub unsafe fn _mm256_mask_i64gather_ps<const SCALE: i32>(
1526    src: __m128,
1527    slice: *const f32,
1528    offsets: __m256i,
1529    mask: __m128,
1530) -> __m128 {
1531    static_assert_imm8_scale!(SCALE);
1532    let offsets = offsets.as_i64x4();
1533    let slice = slice as *const i8;
1534    vpgatherqps(src, slice, offsets, mask, SCALE as i8)
1535}
1536
1537/// Returns values from `slice` at offsets determined by `offsets * scale`,
1538/// where `scale` should be 1, 2, 4 or 8.
1540///
1541/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi64)
1542#[inline]
1543#[target_feature(enable = "avx2")]
1544#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1545#[rustc_legacy_const_generics(2)]
1546#[stable(feature = "simd_x86", since = "1.27.0")]
1547pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
1548    slice: *const i64,
1549    offsets: __m128i,
1550) -> __m128i {
1551    static_assert_imm8_scale!(SCALE);
1552    let zero = i64x2::ZERO;
1553    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1554    let slice = slice as *const i8;
1555    let offsets = offsets.as_i64x2();
1556    let r = pgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1557    transmute(r)
1558}
1559
1560/// Returns values from `slice` at offsets determined by `offsets * scale`,
1561/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1562/// element does not have its highest bit set are copied from `src` instead of
1563/// being gathered from memory.
1564///
1565/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi64)
1566#[inline]
1567#[target_feature(enable = "avx2")]
1568#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1569#[rustc_legacy_const_generics(4)]
1570#[stable(feature = "simd_x86", since = "1.27.0")]
1571pub unsafe fn _mm_mask_i64gather_epi64<const SCALE: i32>(
1572    src: __m128i,
1573    slice: *const i64,
1574    offsets: __m128i,
1575    mask: __m128i,
1576) -> __m128i {
1577    static_assert_imm8_scale!(SCALE);
1578    let src = src.as_i64x2();
1579    let mask = mask.as_i64x2();
1580    let offsets = offsets.as_i64x2();
1581    let slice = slice as *const i8;
1582    let r = pgatherqq(src, slice, offsets, mask, SCALE as i8);
1583    transmute(r)
1584}
1585
1586/// Returns values from `slice` at offsets determined by `offsets * scale`,
1587/// where `scale` should be 1, 2, 4 or 8.
1589///
1590/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi64)
1591#[inline]
1592#[target_feature(enable = "avx2")]
1593#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1594#[rustc_legacy_const_generics(2)]
1595#[stable(feature = "simd_x86", since = "1.27.0")]
1596pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
1597    slice: *const i64,
1598    offsets: __m256i,
1599) -> __m256i {
1600    static_assert_imm8_scale!(SCALE);
1601    let zero = i64x4::ZERO;
1602    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1603    let slice = slice as *const i8;
1604    let offsets = offsets.as_i64x4();
1605    let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1606    transmute(r)
1607}
1608
1609/// Returns values from `slice` at offsets determined by `offsets * scale`,
1610/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1611/// element does not have its highest bit set are copied from `src` instead of
1612/// being gathered from memory.
1613///
1614/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi64)
1615#[inline]
1616#[target_feature(enable = "avx2")]
1617#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1618#[rustc_legacy_const_generics(4)]
1619#[stable(feature = "simd_x86", since = "1.27.0")]
1620pub unsafe fn _mm256_mask_i64gather_epi64<const SCALE: i32>(
1621    src: __m256i,
1622    slice: *const i64,
1623    offsets: __m256i,
1624    mask: __m256i,
1625) -> __m256i {
1626    static_assert_imm8_scale!(SCALE);
1627    let src = src.as_i64x4();
1628    let mask = mask.as_i64x4();
1629    let offsets = offsets.as_i64x4();
1630    let slice = slice as *const i8;
1631    let r = vpgatherqq(src, slice, offsets, mask, SCALE as i8);
1632    transmute(r)
1633}
1634
1635/// Returns values from `slice` at offsets determined by `offsets * scale`,
1636/// where `scale` should be 1, 2, 4 or 8.
1638///
1639/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd)
1640#[inline]
1641#[target_feature(enable = "avx2")]
1642#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1643#[rustc_legacy_const_generics(2)]
1644#[stable(feature = "simd_x86", since = "1.27.0")]
1645pub unsafe fn _mm_i64gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
1646    static_assert_imm8_scale!(SCALE);
1647    let zero = _mm_setzero_pd();
1648    let neg_one = _mm_set1_pd(-1.0);
1649    let slice = slice as *const i8;
1650    let offsets = offsets.as_i64x2();
1651    pgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1652}
1653
1654/// Returns values from `slice` at offsets determined by `offsets * scale`,
1655/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1656/// element does not have its highest bit set are copied from `src` instead of
1657/// being gathered from memory.
1658///
1659/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd)
1660#[inline]
1661#[target_feature(enable = "avx2")]
1662#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1663#[rustc_legacy_const_generics(4)]
1664#[stable(feature = "simd_x86", since = "1.27.0")]
1665pub unsafe fn _mm_mask_i64gather_pd<const SCALE: i32>(
1666    src: __m128d,
1667    slice: *const f64,
1668    offsets: __m128i,
1669    mask: __m128d,
1670) -> __m128d {
1671    static_assert_imm8_scale!(SCALE);
1672    let slice = slice as *const i8;
1673    let offsets = offsets.as_i64x2();
1674    pgatherqpd(src, slice, offsets, mask, SCALE as i8)
1675}
1676
1677/// Returns values from `slice` at offsets determined by `offsets * scale`,
1678/// where `scale` should be 1, 2, 4 or 8.
1680///
1681/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd)
1682#[inline]
1683#[target_feature(enable = "avx2")]
1684#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1685#[rustc_legacy_const_generics(2)]
1686#[stable(feature = "simd_x86", since = "1.27.0")]
1687pub unsafe fn _mm256_i64gather_pd<const SCALE: i32>(
1688    slice: *const f64,
1689    offsets: __m256i,
1690) -> __m256d {
1691    static_assert_imm8_scale!(SCALE);
1692    let zero = _mm256_setzero_pd();
1693    let neg_one = _mm256_set1_pd(-1.0);
1694    let slice = slice as *const i8;
1695    let offsets = offsets.as_i64x4();
1696    vpgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1697}
1698
1699/// Returns values from `slice` at offsets determined by `offsets * scale`,
1700/// where `scale` should be 1, 2, 4 or 8. Elements whose corresponding `mask`
1701/// element does not have its highest bit set are copied from `src` instead of
1702/// being gathered from memory.
1703///
1704/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd)
1705#[inline]
1706#[target_feature(enable = "avx2")]
1707#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1708#[rustc_legacy_const_generics(4)]
1709#[stable(feature = "simd_x86", since = "1.27.0")]
1710pub unsafe fn _mm256_mask_i64gather_pd<const SCALE: i32>(
1711    src: __m256d,
1712    slice: *const f64,
1713    offsets: __m256i,
1714    mask: __m256d,
1715) -> __m256d {
1716    static_assert_imm8_scale!(SCALE);
1717    let slice = slice as *const i8;
1718    let offsets = offsets.as_i64x4();
1719    vpgatherqpd(src, slice, offsets, mask, SCALE as i8)
1720}
1721
1722/// Copies `a` to `dst`, then inserts 128 bits (of integer data) from `b` at the
1723/// location specified by `IMM1`.
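///
/// A minimal sketch (illustrative only, not from Intel's documentation),
/// assuming AVX2 support is detected at runtime; `IMM1 = 1` replaces the
/// upper 128-bit half:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(1);
/// let b = _mm_set1_epi32(2);
/// let r = _mm256_inserti128_si256::<1>(a, b);
/// let expected = _mm256_setr_epi32(1, 1, 1, 1, 2, 2, 2, 2);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```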
1724///
1725/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256)
1726#[inline]
1727#[target_feature(enable = "avx2")]
1728#[cfg_attr(
1729    all(test, not(target_env = "msvc")),
1730    assert_instr(vinsertf128, IMM1 = 1)
1731)]
1732#[rustc_legacy_const_generics(2)]
1733#[stable(feature = "simd_x86", since = "1.27.0")]
1734pub unsafe fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1735    static_assert_uimm_bits!(IMM1, 1);
1736    let a = a.as_i64x4();
1737    let b = _mm256_castsi128_si256(b).as_i64x4();
1738    let dst: i64x4 = simd_shuffle!(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
1739    transmute(dst)
1740}
1741
1742/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
1743/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs
1744/// of intermediate 32-bit integers.
1745///
1746/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16)
1747#[inline]
1748#[target_feature(enable = "avx2")]
1749#[cfg_attr(test, assert_instr(vpmaddwd))]
1750#[stable(feature = "simd_x86", since = "1.27.0")]
1751pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
1752    transmute(pmaddwd(a.as_i16x16(), b.as_i16x16()))
1753}
1754
1755/// Vertically multiplies each unsigned 8-bit integer from `a` with the
1756/// corresponding signed 8-bit integer from `b`, producing intermediate
1757/// signed 16-bit integers. Horizontally adds adjacent pairs of intermediate
1758/// signed 16-bit integers.
1759///
1760/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16)
1761#[inline]
1762#[target_feature(enable = "avx2")]
1763#[cfg_attr(test, assert_instr(vpmaddubsw))]
1764#[stable(feature = "simd_x86", since = "1.27.0")]
1765pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1766    transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32()))
1767}
1768
1769/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1770/// (elements are zeroed out when the highest bit is not set in the
1771/// corresponding element).
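///
/// A minimal sketch (illustrative only, not from Intel's documentation),
/// assuming AVX2 support is detected at runtime; only lanes whose mask
/// element has its highest bit set are loaded, the rest read as zero:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let data: [i32; 4] = [10, 20, 30, 40];
/// let mask = _mm_setr_epi32(-1, 0, -1, 0);
/// let r = _mm_maskload_epi32(data.as_ptr(), mask);
/// let expected = _mm_setr_epi32(10, 0, 30, 0);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi8(r, expected)), 0xffff);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```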
1772///
1773/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi32)
1774#[inline]
1775#[target_feature(enable = "avx2")]
1776#[cfg_attr(test, assert_instr(vpmaskmovd))]
1777#[stable(feature = "simd_x86", since = "1.27.0")]
1778pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
1779    transmute(maskloadd(mem_addr as *const i8, mask.as_i32x4()))
1780}
1781
1782/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1783/// (elements are zeroed out when the highest bit is not set in the
1784/// corresponding element).
1785///
1786/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi32)
1787#[inline]
1788#[target_feature(enable = "avx2")]
1789#[cfg_attr(test, assert_instr(vpmaskmovd))]
1790#[stable(feature = "simd_x86", since = "1.27.0")]
1791pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
1792    transmute(maskloadd256(mem_addr as *const i8, mask.as_i32x8()))
1793}
1794
1795/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
1796/// (elements are zeroed out when the highest bit is not set in the
1797/// corresponding element).
1798///
1799/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi64)
1800#[inline]
1801#[target_feature(enable = "avx2")]
1802#[cfg_attr(test, assert_instr(vpmaskmovq))]
1803#[stable(feature = "simd_x86", since = "1.27.0")]
1804pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
1805    transmute(maskloadq(mem_addr as *const i8, mask.as_i64x2()))
1806}
1807
1808/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
1809/// (elements are zeroed out when the highest bit is not set in the
1810/// corresponding element).
1811///
1812/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi64)
1813#[inline]
1814#[target_feature(enable = "avx2")]
1815#[cfg_attr(test, assert_instr(vpmaskmovq))]
1816#[stable(feature = "simd_x86", since = "1.27.0")]
1817pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
1818    transmute(maskloadq256(mem_addr as *const i8, mask.as_i64x4()))
1819}
1820
1821/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
1822/// using `mask` (elements are not stored when the highest bit is not set
1823/// in the corresponding element).
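///
/// A minimal sketch (illustrative only, not from Intel's documentation),
/// assuming AVX2 support is detected at runtime; slots whose mask bit is
/// clear are left untouched in memory:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let mut out = [0_i32; 4];
/// let a = _mm_setr_epi32(10, 20, 30, 40);
/// let mask = _mm_setr_epi32(0, -1, 0, -1);
/// _mm_maskstore_epi32(out.as_mut_ptr(), mask, a);
/// assert_eq!(out, [0, 20, 0, 40]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```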
1824///
1825/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi32)
1826#[inline]
1827#[target_feature(enable = "avx2")]
1828#[cfg_attr(test, assert_instr(vpmaskmovd))]
1829#[stable(feature = "simd_x86", since = "1.27.0")]
1830pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
1831    maskstored(mem_addr as *mut i8, mask.as_i32x4(), a.as_i32x4())
1832}
1833
1834/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
1835/// using `mask` (elements are not stored when the highest bit is not set
1836/// in the corresponding element).
1837///
1838/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi32)
1839#[inline]
1840#[target_feature(enable = "avx2")]
1841#[cfg_attr(test, assert_instr(vpmaskmovd))]
1842#[stable(feature = "simd_x86", since = "1.27.0")]
1843pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
1844    maskstored256(mem_addr as *mut i8, mask.as_i32x8(), a.as_i32x8())
1845}
1846
1847/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
1848/// using `mask` (elements are not stored when the highest bit is not set
1849/// in the corresponding element).
1850///
1851/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi64)
1852#[inline]
1853#[target_feature(enable = "avx2")]
1854#[cfg_attr(test, assert_instr(vpmaskmovq))]
1855#[stable(feature = "simd_x86", since = "1.27.0")]
1856pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
1857    maskstoreq(mem_addr as *mut i8, mask.as_i64x2(), a.as_i64x2())
1858}
1859
1860/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
1861/// using `mask` (elements are not stored when the highest bit is not set
1862/// in the corresponding element).
1863///
1864/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi64)
1865#[inline]
1866#[target_feature(enable = "avx2")]
1867#[cfg_attr(test, assert_instr(vpmaskmovq))]
1868#[stable(feature = "simd_x86", since = "1.27.0")]
1869pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
1870    maskstoreq256(mem_addr as *mut i8, mask.as_i64x4(), a.as_i64x4())
1871}
1872
1873/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
1874/// maximum values.
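///
/// A minimal usage sketch (illustrative only, not from Intel's documentation),
/// assuming AVX2 support is detected at runtime:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(-3);
/// let b = _mm256_set1_epi16(2);
/// let r = _mm256_max_epi16(a, b);
/// let expected = _mm256_set1_epi16(2);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi16(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```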
1875///
1876/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16)
1877#[inline]
1878#[target_feature(enable = "avx2")]
1879#[cfg_attr(test, assert_instr(vpmaxsw))]
1880#[stable(feature = "simd_x86", since = "1.27.0")]
1881pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
1882    let a = a.as_i16x16();
1883    let b = b.as_i16x16();
1884    transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
1885}
1886
1887/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
1888/// maximum values.
1889///
1890/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32)
1891#[inline]
1892#[target_feature(enable = "avx2")]
1893#[cfg_attr(test, assert_instr(vpmaxsd))]
1894#[stable(feature = "simd_x86", since = "1.27.0")]
1895pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
1896    let a = a.as_i32x8();
1897    let b = b.as_i32x8();
1898    transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
1899}
1900
1901/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
1902/// maximum values.
1903///
1904/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8)
1905#[inline]
1906#[target_feature(enable = "avx2")]
1907#[cfg_attr(test, assert_instr(vpmaxsb))]
1908#[stable(feature = "simd_x86", since = "1.27.0")]
1909pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
1910    let a = a.as_i8x32();
1911    let b = b.as_i8x32();
1912    transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
1913}
1914
1915/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
1916/// the packed maximum values.
1917///
1918/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16)
1919#[inline]
1920#[target_feature(enable = "avx2")]
1921#[cfg_attr(test, assert_instr(vpmaxuw))]
1922#[stable(feature = "simd_x86", since = "1.27.0")]
1923pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
1924    let a = a.as_u16x16();
1925    let b = b.as_u16x16();
1926    transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
1927}
1928
1929/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
1930/// the packed maximum values.
1931///
1932/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32)
1933#[inline]
1934#[target_feature(enable = "avx2")]
1935#[cfg_attr(test, assert_instr(vpmaxud))]
1936#[stable(feature = "simd_x86", since = "1.27.0")]
1937pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
1938    let a = a.as_u32x8();
1939    let b = b.as_u32x8();
1940    transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
1941}
1942
1943/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
1944/// the packed maximum values.
1945///
1946/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8)
1947#[inline]
1948#[target_feature(enable = "avx2")]
1949#[cfg_attr(test, assert_instr(vpmaxub))]
1950#[stable(feature = "simd_x86", since = "1.27.0")]
1951pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
1952    let a = a.as_u8x32();
1953    let b = b.as_u8x32();
1954    transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
1955}
1956
1957/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
1958/// minimum values.
1959///
1960/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16)
1961#[inline]
1962#[target_feature(enable = "avx2")]
1963#[cfg_attr(test, assert_instr(vpminsw))]
1964#[stable(feature = "simd_x86", since = "1.27.0")]
1965pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
1966    let a = a.as_i16x16();
1967    let b = b.as_i16x16();
1968    transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
1969}
1970
1971/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
1972/// minimum values.
1973///
1974/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32)
1975#[inline]
1976#[target_feature(enable = "avx2")]
1977#[cfg_attr(test, assert_instr(vpminsd))]
1978#[stable(feature = "simd_x86", since = "1.27.0")]
1979pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
1980    let a = a.as_i32x8();
1981    let b = b.as_i32x8();
1982    transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
1983}
1984
1985/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
1986/// minimum values.
1987///
1988/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8)
1989#[inline]
1990#[target_feature(enable = "avx2")]
1991#[cfg_attr(test, assert_instr(vpminsb))]
1992#[stable(feature = "simd_x86", since = "1.27.0")]
1993pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
1994    let a = a.as_i8x32();
1995    let b = b.as_i8x32();
1996    transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
1997}
1998
1999/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2000/// the packed minimum values.
2001///
2002/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16)
2003#[inline]
2004#[target_feature(enable = "avx2")]
2005#[cfg_attr(test, assert_instr(vpminuw))]
2006#[stable(feature = "simd_x86", since = "1.27.0")]
2007pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2008    let a = a.as_u16x16();
2009    let b = b.as_u16x16();
2010    transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
2011}
2012
2013/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2014/// the packed minimum values.
2015///
2016/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32)
2017#[inline]
2018#[target_feature(enable = "avx2")]
2019#[cfg_attr(test, assert_instr(vpminud))]
2020#[stable(feature = "simd_x86", since = "1.27.0")]
2021pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2022    let a = a.as_u32x8();
2023    let b = b.as_u32x8();
2024    transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
2025}
2026
2027/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2028/// the packed minimum values.
2029///
2030/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8)
2031#[inline]
2032#[target_feature(enable = "avx2")]
2033#[cfg_attr(test, assert_instr(vpminub))]
2034#[stable(feature = "simd_x86", since = "1.27.0")]
2035pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2036    let a = a.as_u8x32();
2037    let b = b.as_u8x32();
2038    transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
2039}
2040
2041/// Creates a mask from the most significant bit of each 8-bit element in `a`,
2042/// and returns the result.
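///
/// A minimal sketch (illustrative only, not from Intel's documentation),
/// assuming AVX2 support is detected at runtime; only byte 0 has its most
/// significant bit set, so only bit 0 of the mask is set:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
///     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/// );
/// assert_eq!(_mm256_movemask_epi8(a), 1);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```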
2043///
2044/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8)
2045#[inline]
2046#[target_feature(enable = "avx2")]
2047#[cfg_attr(test, assert_instr(vpmovmskb))]
2048#[stable(feature = "simd_x86", since = "1.27.0")]
2049pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2050    let z = i8x32::ZERO;
2051    let m: i8x32 = simd_lt(a.as_i8x32(), z);
2052    simd_bitmask::<_, u32>(m) as i32
2053}
2054
2055/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
2056/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
2057/// results in dst. Eight SADs are performed for each 128-bit lane using one
2058/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
2059/// selected from `b` starting at the offset specified in `imm8`. Eight
2060/// quadruplets are formed from sequential 8-bit integers selected from `a`
2061/// starting at the offset specified in `imm8`.
2062///
2063/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8)
2064#[inline]
2065#[target_feature(enable = "avx2")]
2066#[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
2067#[rustc_legacy_const_generics(2)]
2068#[stable(feature = "simd_x86", since = "1.27.0")]
2069pub unsafe fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2070    static_assert_uimm_bits!(IMM8, 8);
2071    transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8))
2072}
2073
2074/// Multiplies the low 32-bit integers from each packed 64-bit element in
2075/// `a` and `b`
2076///
2077/// Returns the 64-bit results.
2078///
2079/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32)
2080#[inline]
2081#[target_feature(enable = "avx2")]
2082#[cfg_attr(test, assert_instr(vpmuldq))]
2083#[stable(feature = "simd_x86", since = "1.27.0")]
2084pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
2085    let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
2086    let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
2087    transmute(simd_mul(a, b))
2088}
2089
2090/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
2091/// element in `a` and `b`
2092///
2093/// Returns the unsigned 64-bit results.
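///
/// A minimal sketch (illustrative only, not from Intel's documentation),
/// assuming AVX2 support is detected at runtime; the upper 32 bits of each
/// 64-bit lane are ignored by the multiply:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi64x((5_i64 << 32) | 7);
/// let b = _mm256_set1_epi64x((9_i64 << 32) | 3);
/// let r = _mm256_mul_epu32(a, b); // each 64-bit lane is 7 * 3
/// let mut out = [0_i64; 4];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// assert_eq!(out, [21, 21, 21, 21]);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```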
2094///
2095/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32)
2096#[inline]
2097#[target_feature(enable = "avx2")]
2098#[cfg_attr(test, assert_instr(vpmuludq))]
2099#[stable(feature = "simd_x86", since = "1.27.0")]
2100pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
2101    let a = a.as_u64x4();
2102    let b = b.as_u64x4();
2103    let mask = u64x4::splat(u32::MAX.into());
2104    transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
2105}
2106
2107/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2108/// intermediate 32-bit integers and returning the high 16 bits of the
2109/// intermediate integers.
2110///
2111/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16)
2112#[inline]
2113#[target_feature(enable = "avx2")]
2114#[cfg_attr(test, assert_instr(vpmulhw))]
2115#[stable(feature = "simd_x86", since = "1.27.0")]
2116pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
2117    let a = simd_cast::<_, i32x16>(a.as_i16x16());
2118    let b = simd_cast::<_, i32x16>(b.as_i16x16());
2119    let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
2120    transmute(simd_cast::<i32x16, i16x16>(r))
2121}
2122
2123/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
2124/// intermediate 32-bit integers and returning the high 16 bits of the
2125/// intermediate integers.
2126///
2127/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16)
2128#[inline]
2129#[target_feature(enable = "avx2")]
2130#[cfg_attr(test, assert_instr(vpmulhuw))]
2131#[stable(feature = "simd_x86", since = "1.27.0")]
2132pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
2133    let a = simd_cast::<_, u32x16>(a.as_u16x16());
2134    let b = simd_cast::<_, u32x16>(b.as_u16x16());
2135    let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
2136    transmute(simd_cast::<u32x16, u16x16>(r))
2137}
2138
2139/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2140/// intermediate 32-bit integers, and returns the low 16 bits of the
2141/// intermediate integers
2142///
2143/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16)
2144#[inline]
2145#[target_feature(enable = "avx2")]
2146#[cfg_attr(test, assert_instr(vpmullw))]
2147#[stable(feature = "simd_x86", since = "1.27.0")]
2148pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
2149    transmute(simd_mul(a.as_i16x16(), b.as_i16x16()))
2150}
2151
2152/// Multiplies the packed 32-bit integers in `a` and `b`, producing
2153/// intermediate 64-bit integers, and returns the low 32 bits of the
2154/// intermediate integers
2155///
2156/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32)
2157#[inline]
2158#[target_feature(enable = "avx2")]
2159#[cfg_attr(test, assert_instr(vpmulld))]
2160#[stable(feature = "simd_x86", since = "1.27.0")]
2161pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
2162    transmute(simd_mul(a.as_i32x8(), b.as_i32x8()))
2163}
2164
2165/// Multiplies packed 16-bit integers in `a` and `b`, producing
2166/// intermediate signed 32-bit integers. Truncates each intermediate
2167/// integer to the 18 most significant bits, rounds by adding 1, and
2168/// returns bits `[16:1]`.
2169///
2170/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16)
2171#[inline]
2172#[target_feature(enable = "avx2")]
2173#[cfg_attr(test, assert_instr(vpmulhrsw))]
2174#[stable(feature = "simd_x86", since = "1.27.0")]
2175pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
2176    transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16()))
2177}
2178
2179/// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2180/// and `b`
2181///
2182/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256)
2183#[inline]
2184#[target_feature(enable = "avx2")]
2185#[cfg_attr(test, assert_instr(vorps))]
2186#[stable(feature = "simd_x86", since = "1.27.0")]
2187pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
2188    transmute(simd_or(a.as_i32x8(), b.as_i32x8()))
2189}
2190
2191/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2192/// using signed saturation
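///
/// A minimal sketch (illustrative only, not from Intel's documentation),
/// assuming AVX2 support is detected at runtime; values outside the `i8`
/// range saturate to `i8::MAX` or `i8::MIN`:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(300);
/// let b = _mm256_set1_epi16(-300);
/// let r = _mm256_packs_epi16(a, b);
/// let mut out = [0_i8; 32];
/// _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
/// // Within each 128-bit lane: 8 bytes come from `a`, then 8 bytes from `b`.
/// assert_eq!(out[0], 127);
/// assert_eq!(out[8], -128);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```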
2193///
2194/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
2195#[inline]
2196#[target_feature(enable = "avx2")]
2197#[cfg_attr(test, assert_instr(vpacksswb))]
2198#[stable(feature = "simd_x86", since = "1.27.0")]
2199pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
2200    transmute(packsswb(a.as_i16x16(), b.as_i16x16()))
2201}
2202
2203/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2204/// using signed saturation
2205///
2206/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
2207#[inline]
2208#[target_feature(enable = "avx2")]
2209#[cfg_attr(test, assert_instr(vpackssdw))]
2210#[stable(feature = "simd_x86", since = "1.27.0")]
2211pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
2212    transmute(packssdw(a.as_i32x8(), b.as_i32x8()))
2213}
2214
2215/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2216/// using unsigned saturation
2217///
2218/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
2219#[inline]
2220#[target_feature(enable = "avx2")]
2221#[cfg_attr(test, assert_instr(vpackuswb))]
2222#[stable(feature = "simd_x86", since = "1.27.0")]
2223pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
2224    transmute(packuswb(a.as_i16x16(), b.as_i16x16()))
2225}
2226
2227/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2228/// using unsigned saturation
2229///
2230/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
2231#[inline]
2232#[target_feature(enable = "avx2")]
2233#[cfg_attr(test, assert_instr(vpackusdw))]
2234#[stable(feature = "simd_x86", since = "1.27.0")]
2235pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
2236    transmute(packusdw(a.as_i32x8(), b.as_i32x8()))
2237}
2238
2239/// Permutes packed 32-bit integers from `a` according to the content of `b`.
2240///
2241/// The lowest 3 bits of each integer of `b` are used as addresses into the 8
2242/// integers of `a`.
2243///
2244/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32)
2245#[inline]
2246#[target_feature(enable = "avx2")]
2247#[cfg_attr(test, assert_instr(vpermps))]
2248#[stable(feature = "simd_x86", since = "1.27.0")]
2249pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
2250    transmute(permd(a.as_u32x8(), b.as_u32x8()))
2251}
2252
2253/// Permutes 64-bit integers from `a` using control mask `imm8`.
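///
/// A minimal sketch (illustrative only, not from Intel's documentation),
/// assuming AVX2 support is detected at runtime; each 2-bit field of `IMM8`
/// selects the source lane for the corresponding destination lane:
///
/// ```
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi64x(10, 20, 30, 40);
/// // Fields (low to high): 0b11, 0b10, 0b01, 0b00 -> reverse the four lanes.
/// let r = _mm256_permute4x64_epi64::<0b00_01_10_11>(a);
/// let expected = _mm256_setr_epi64x(40, 30, 20, 10);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```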
2254///
2255/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64)
2256#[inline]
2257#[target_feature(enable = "avx2")]
2258#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))]
2259#[rustc_legacy_const_generics(1)]
2260#[stable(feature = "simd_x86", since = "1.27.0")]
2261pub unsafe fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2262    static_assert_uimm_bits!(IMM8, 8);
2263    let zero = i64x4::ZERO;
2264    let r: i64x4 = simd_shuffle!(
2265        a.as_i64x4(),
2266        zero,
2267        [
2268            IMM8 as u32 & 0b11,
2269            (IMM8 as u32 >> 2) & 0b11,
2270            (IMM8 as u32 >> 4) & 0b11,
2271            (IMM8 as u32 >> 6) & 0b11,
2272        ],
2273    );
2274    transmute(r)
2275}
2276
2277/// Shuffles 128-bit blocks of integer data selected by `imm8` from `a` and `b`.
2278///
2279/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256)
2280#[inline]
2281#[target_feature(enable = "avx2")]
2282#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))]
2283#[rustc_legacy_const_generics(2)]
2284#[stable(feature = "simd_x86", since = "1.27.0")]
2285pub unsafe fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2286    static_assert_uimm_bits!(IMM8, 8);
2287    transmute(vperm2i128(a.as_i64x4(), b.as_i64x4(), IMM8 as i8))
2288}
2289
2290/// Shuffles 64-bit floating-point elements in `a` across lanes using the
2291/// control in `imm8`.
2292///
2293/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd)
2294#[inline]
2295#[target_feature(enable = "avx2")]
2296#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))]
2297#[rustc_legacy_const_generics(1)]
2298#[stable(feature = "simd_x86", since = "1.27.0")]
2299pub unsafe fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
2300    static_assert_uimm_bits!(IMM8, 8);
2301    simd_shuffle!(
2302        a,
2303        _mm256_undefined_pd(),
2304        [
2305            IMM8 as u32 & 0b11,
2306            (IMM8 as u32 >> 2) & 0b11,
2307            (IMM8 as u32 >> 4) & 0b11,
2308            (IMM8 as u32 >> 6) & 0b11,
2309        ],
2310    )
2311}
2312
2313/// Shuffles eight 32-bit floating-point elements in `a` across lanes using
2314/// the corresponding 32-bit integer index in `idx`.
2315///
2316/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps)
2317#[inline]
2318#[target_feature(enable = "avx2")]
2319#[cfg_attr(test, assert_instr(vpermps))]
2320#[stable(feature = "simd_x86", since = "1.27.0")]
2321pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
2322    permps(a, idx.as_i32x8())
2323}
2324
2325/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
2326/// and `b`, then horizontally sums each consecutive set of 8 differences to
2327/// produce four unsigned 16-bit integers, and packs these unsigned 16-bit
2328/// integers into the low 16 bits of each 64-bit element of the result.
2329///
2330/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8)
2331#[inline]
2332#[target_feature(enable = "avx2")]
2333#[cfg_attr(test, assert_instr(vpsadbw))]
2334#[stable(feature = "simd_x86", since = "1.27.0")]
2335pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
2336    transmute(psadbw(a.as_u8x32(), b.as_u8x32()))
2337}
2338
2339/// Shuffles bytes from `a` according to the content of `b`.
2340///
2341/// For each of the 128-bit low and high halves of the vectors, the low
2342/// 4 bits of each byte of `b` are used as addresses into the respective
2343/// low or high 16 bytes of `a`. That is, the halves are shuffled separately.
2344///
2345/// In addition, if the most significant bit of a byte of `b` is set, the
2346/// respective destination byte is set to 0.
2347///
2348/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
2349/// equivalent to:
2350///
2351/// ```
2352/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
2353///     let mut r = [0; 32];
2354///     for i in 0..16 {
2355///         // if the most significant bit of b is set,
2356///         // then the destination byte is set to 0.
2357///         if b[i] & 0x80 == 0u8 {
2358///             r[i] = a[(b[i] % 16) as usize];
2359///         }
2360///         if b[i + 16] & 0x80 == 0u8 {
2361///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
2362///         }
2363///     }
2364///     r
2365/// }
2366/// ```
2367///
2368/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8)
2369#[inline]
2370#[target_feature(enable = "avx2")]
2371#[cfg_attr(test, assert_instr(vpshufb))]
2372#[stable(feature = "simd_x86", since = "1.27.0")]
2373pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
2374    transmute(pshufb(a.as_u8x32(), b.as_u8x32()))
2375}
2376
2377/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
2378/// `imm8`.
2379///
2380/// ```rust
2381/// #[cfg(target_arch = "x86")]
2382/// use std::arch::x86::*;
2383/// #[cfg(target_arch = "x86_64")]
2384/// use std::arch::x86_64::*;
2385///
2386/// # fn main() {
2387/// #     if is_x86_feature_detected!("avx2") {
2388/// #         #[target_feature(enable = "avx2")]
2389/// #         unsafe fn worker() {
2390/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2391///
2392/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
2393/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
2394///
2395/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
2396/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
2397///
2398/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
2399/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
2400/// #         }
2401/// #         unsafe { worker(); }
2402/// #     }
2403/// # }
2404/// ```
2405///
2406/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32)
2407#[inline]
2408#[target_feature(enable = "avx2")]
2409#[cfg_attr(test, assert_instr(vshufps, MASK = 9))]
2410#[rustc_legacy_const_generics(1)]
2411#[stable(feature = "simd_x86", since = "1.27.0")]
2412pub unsafe fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
2413    static_assert_uimm_bits!(MASK, 8);
2414    let r: i32x8 = simd_shuffle!(
2415        a.as_i32x8(),
2416        a.as_i32x8(),
2417        [
2418            MASK as u32 & 0b11,
2419            (MASK as u32 >> 2) & 0b11,
2420            (MASK as u32 >> 4) & 0b11,
2421            (MASK as u32 >> 6) & 0b11,
2422            (MASK as u32 & 0b11) + 4,
2423            ((MASK as u32 >> 2) & 0b11) + 4,
2424            ((MASK as u32 >> 4) & 0b11) + 4,
2425            ((MASK as u32 >> 6) & 0b11) + 4,
2426        ],
2427    );
2428    transmute(r)
2429}
2430
2431/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
2432/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
2433/// to the output.
2434///
2435/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16)
2436#[inline]
2437#[target_feature(enable = "avx2")]
2438#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))]
2439#[rustc_legacy_const_generics(1)]
2440#[stable(feature = "simd_x86", since = "1.27.0")]
2441pub unsafe fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2442    static_assert_uimm_bits!(IMM8, 8);
2443    let a = a.as_i16x16();
2444    let r: i16x16 = simd_shuffle!(
2445        a,
2446        a,
2447        [
2448            0,
2449            1,
2450            2,
2451            3,
2452            4 + (IMM8 as u32 & 0b11),
2453            4 + ((IMM8 as u32 >> 2) & 0b11),
2454            4 + ((IMM8 as u32 >> 4) & 0b11),
2455            4 + ((IMM8 as u32 >> 6) & 0b11),
2456            8,
2457            9,
2458            10,
2459            11,
2460            12 + (IMM8 as u32 & 0b11),
2461            12 + ((IMM8 as u32 >> 2) & 0b11),
2462            12 + ((IMM8 as u32 >> 4) & 0b11),
2463            12 + ((IMM8 as u32 >> 6) & 0b11),
2464        ],
2465    );
2466    transmute(r)
2467}
2468
2469/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
2470/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
2471/// to the output.
2472///
2473/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16)
2474#[inline]
2475#[target_feature(enable = "avx2")]
2476#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))]
2477#[rustc_legacy_const_generics(1)]
2478#[stable(feature = "simd_x86", since = "1.27.0")]
2479pub unsafe fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2480    static_assert_uimm_bits!(IMM8, 8);
2481    let a = a.as_i16x16();
2482    let r: i16x16 = simd_shuffle!(
2483        a,
2484        a,
2485        [
2486            0 + (IMM8 as u32 & 0b11),
2487            0 + ((IMM8 as u32 >> 2) & 0b11),
2488            0 + ((IMM8 as u32 >> 4) & 0b11),
2489            0 + ((IMM8 as u32 >> 6) & 0b11),
2490            4,
2491            5,
2492            6,
2493            7,
2494            8 + (IMM8 as u32 & 0b11),
2495            8 + ((IMM8 as u32 >> 2) & 0b11),
2496            8 + ((IMM8 as u32 >> 4) & 0b11),
2497            8 + ((IMM8 as u32 >> 6) & 0b11),
2498            12,
2499            13,
2500            14,
2501            15,
2502        ],
2503    );
2504    transmute(r)
2505}
2506
2507/// Negates packed 16-bit integers in `a` when the corresponding signed
2508/// 16-bit integer in `b` is negative, and returns the results.
2509/// Results are zeroed out when the corresponding element in `b` is zero.
2510///
2511/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16)
2512#[inline]
2513#[target_feature(enable = "avx2")]
2514#[cfg_attr(test, assert_instr(vpsignw))]
2515#[stable(feature = "simd_x86", since = "1.27.0")]
2516pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
2517    transmute(psignw(a.as_i16x16(), b.as_i16x16()))
2518}
2519
2520/// Negates packed 32-bit integers in `a` when the corresponding signed
2521/// 32-bit integer in `b` is negative, and returns the results.
2522/// Results are zeroed out when the corresponding element in `b` is zero.
2523///
2524/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32)
2525#[inline]
2526#[target_feature(enable = "avx2")]
2527#[cfg_attr(test, assert_instr(vpsignd))]
2528#[stable(feature = "simd_x86", since = "1.27.0")]
2529pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
2530    transmute(psignd(a.as_i32x8(), b.as_i32x8()))
2531}
2532
2533/// Negates packed 8-bit integers in `a` when the corresponding signed
2534/// 8-bit integer in `b` is negative, and returns the results.
2535/// Results are zeroed out when the corresponding element in `b` is zero.
2536///
2537/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8)
2538#[inline]
2539#[target_feature(enable = "avx2")]
2540#[cfg_attr(test, assert_instr(vpsignb))]
2541#[stable(feature = "simd_x86", since = "1.27.0")]
2542pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
2543    transmute(psignb(a.as_i8x32(), b.as_i8x32()))
2544}
2545
2546/// Shifts packed 16-bit integers in `a` left by `count` while
2547/// shifting in zeros, and returns the result
2548///
2549/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16)
2550#[inline]
2551#[target_feature(enable = "avx2")]
2552#[cfg_attr(test, assert_instr(vpsllw))]
2553#[stable(feature = "simd_x86", since = "1.27.0")]
2554pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
2555    transmute(psllw(a.as_i16x16(), count.as_i16x8()))
2556}
2557
2558/// Shifts packed 32-bit integers in `a` left by `count` while
2559/// shifting in zeros, and returns the result
2560///
2561/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32)
2562#[inline]
2563#[target_feature(enable = "avx2")]
2564#[cfg_attr(test, assert_instr(vpslld))]
2565#[stable(feature = "simd_x86", since = "1.27.0")]
2566pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
2567    transmute(pslld(a.as_i32x8(), count.as_i32x4()))
2568}
2569
2570/// Shifts packed 64-bit integers in `a` left by `count` while
2571/// shifting in zeros, and returns the result
2572///
2573/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64)
2574#[inline]
2575#[target_feature(enable = "avx2")]
2576#[cfg_attr(test, assert_instr(vpsllq))]
2577#[stable(feature = "simd_x86", since = "1.27.0")]
2578pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
2579    transmute(psllq(a.as_i64x4(), count.as_i64x2()))
2580}
2581
2582/// Shifts packed 16-bit integers in `a` left by `IMM8` while
2583/// shifting in zeros, and returns the results.
2584///
2585/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16)
2586#[inline]
2587#[target_feature(enable = "avx2")]
2588#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))]
2589#[rustc_legacy_const_generics(1)]
2590#[stable(feature = "simd_x86", since = "1.27.0")]
2591pub unsafe fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2592    static_assert_uimm_bits!(IMM8, 8);
2593    if IMM8 >= 16 {
2594        _mm256_setzero_si256()
2595    } else {
2596        transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
2597    }
2598}
2599
2600/// Shifts packed 32-bit integers in `a` left by `IMM8` while
2601/// shifting in zeros, and returns the results.
2602///
2603/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32)
2604#[inline]
2605#[target_feature(enable = "avx2")]
2606#[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))]
2607#[rustc_legacy_const_generics(1)]
2608#[stable(feature = "simd_x86", since = "1.27.0")]
2609pub unsafe fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2610    static_assert_uimm_bits!(IMM8, 8);
2611    if IMM8 >= 32 {
2612        _mm256_setzero_si256()
2613    } else {
2614        transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
2615    }
2616}
2617
2618/// Shifts packed 64-bit integers in `a` left by `IMM8` while
2619/// shifting in zeros, and returns the results.
2620///
2621/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64)
2622#[inline]
2623#[target_feature(enable = "avx2")]
2624#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 7))]
2625#[rustc_legacy_const_generics(1)]
2626#[stable(feature = "simd_x86", since = "1.27.0")]
2627pub unsafe fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2628    static_assert_uimm_bits!(IMM8, 8);
2629    if IMM8 >= 64 {
2630        _mm256_setzero_si256()
2631    } else {
2632        transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
2633    }
2634}
2635
2636/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2637///
2638/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256)
2639#[inline]
2640#[target_feature(enable = "avx2")]
2641#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2642#[rustc_legacy_const_generics(1)]
2643#[stable(feature = "simd_x86", since = "1.27.0")]
2644pub unsafe fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
2645    static_assert_uimm_bits!(IMM8, 8);
2646    _mm256_bslli_epi128::<IMM8>(a)
2647}
2648
2649/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2650///
2651/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128)
2652#[inline]
2653#[target_feature(enable = "avx2")]
2654#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2655#[rustc_legacy_const_generics(1)]
2656#[stable(feature = "simd_x86", since = "1.27.0")]
2657pub unsafe fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
2658    static_assert_uimm_bits!(IMM8, 8);
2659    const fn mask(shift: i32, i: u32) -> u32 {
2660        let shift = shift as u32 & 0xff;
2661        if shift > 15 || i % 16 < shift {
2662            0
2663        } else {
2664            32 + (i - shift)
2665        }
2666    }
2667    let a = a.as_i8x32();
2668    let r: i8x32 = simd_shuffle!(
2669        i8x32::ZERO,
2670        a,
2671        [
2672            mask(IMM8, 0),
2673            mask(IMM8, 1),
2674            mask(IMM8, 2),
2675            mask(IMM8, 3),
2676            mask(IMM8, 4),
2677            mask(IMM8, 5),
2678            mask(IMM8, 6),
2679            mask(IMM8, 7),
2680            mask(IMM8, 8),
2681            mask(IMM8, 9),
2682            mask(IMM8, 10),
2683            mask(IMM8, 11),
2684            mask(IMM8, 12),
2685            mask(IMM8, 13),
2686            mask(IMM8, 14),
2687            mask(IMM8, 15),
2688            mask(IMM8, 16),
2689            mask(IMM8, 17),
2690            mask(IMM8, 18),
2691            mask(IMM8, 19),
2692            mask(IMM8, 20),
2693            mask(IMM8, 21),
2694            mask(IMM8, 22),
2695            mask(IMM8, 23),
2696            mask(IMM8, 24),
2697            mask(IMM8, 25),
2698            mask(IMM8, 26),
2699            mask(IMM8, 27),
2700            mask(IMM8, 28),
2701            mask(IMM8, 29),
2702            mask(IMM8, 30),
2703            mask(IMM8, 31),
2704        ],
2705    );
2706    transmute(r)
2707}
2708
2709/// Shifts packed 32-bit integers in `a` left by the amount
2710/// specified by the corresponding element in `count` while
2711/// shifting in zeros, and returns the result.
2712///
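/// A minimal usage sketch (counts chosen only to show the per-element shifts):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm_set1_epi32(1);
/// let counts = _mm_setr_epi32(0, 1, 2, 3);
/// // every element uses its own shift amount: 1 << 0, 1 << 1, 1 << 2, 1 << 3
/// let r = _mm_sllv_epi32(a, counts);
/// let expected = _mm_setr_epi32(1, 2, 4, 8);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi8(r, expected)), 0xFFFF);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///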
2713/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32)
2714#[inline]
2715#[target_feature(enable = "avx2")]
2716#[cfg_attr(test, assert_instr(vpsllvd))]
2717#[stable(feature = "simd_x86", since = "1.27.0")]
2718pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
2719    transmute(psllvd(a.as_i32x4(), count.as_i32x4()))
2720}
2721
2722/// Shifts packed 32-bit integers in `a` left by the amount
2723/// specified by the corresponding element in `count` while
2724/// shifting in zeros, and returns the result.
2725///
2726/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32)
2727#[inline]
2728#[target_feature(enable = "avx2")]
2729#[cfg_attr(test, assert_instr(vpsllvd))]
2730#[stable(feature = "simd_x86", since = "1.27.0")]
2731pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
2732    transmute(psllvd256(a.as_i32x8(), count.as_i32x8()))
2733}
2734
2735/// Shifts packed 64-bit integers in `a` left by the amount
2736/// specified by the corresponding element in `count` while
2737/// shifting in zeros, and returns the result.
2738///
2739/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64)
2740#[inline]
2741#[target_feature(enable = "avx2")]
2742#[cfg_attr(test, assert_instr(vpsllvq))]
2743#[stable(feature = "simd_x86", since = "1.27.0")]
2744pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
2745    transmute(psllvq(a.as_i64x2(), count.as_i64x2()))
2746}
2747
2748/// Shifts packed 64-bit integers in `a` left by the amount
2749/// specified by the corresponding element in `count` while
2750/// shifting in zeros, and returns the result.
2751///
2752/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64)
2753#[inline]
2754#[target_feature(enable = "avx2")]
2755#[cfg_attr(test, assert_instr(vpsllvq))]
2756#[stable(feature = "simd_x86", since = "1.27.0")]
2757pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
2758    transmute(psllvq256(a.as_i64x4(), count.as_i64x4()))
2759}
2760
2761/// Shifts packed 16-bit integers in `a` right by `count` while
2762/// shifting in sign bits.
2763///
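/// A minimal usage sketch (values chosen only to show the sign-preserving
/// shift; the shift amount is taken from the low 64 bits of `count`):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(-32);
/// let count = _mm_setr_epi32(2, 0, 0, 0);
/// let r = _mm256_sra_epi16(a, count);
/// // arithmetic shift keeps the sign: -32 >> 2 == -8
/// let expected = _mm256_set1_epi16(-8);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///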
2764/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16)
2765#[inline]
2766#[target_feature(enable = "avx2")]
2767#[cfg_attr(test, assert_instr(vpsraw))]
2768#[stable(feature = "simd_x86", since = "1.27.0")]
2769pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
2770    transmute(psraw(a.as_i16x16(), count.as_i16x8()))
2771}
2772
2773/// Shifts packed 32-bit integers in `a` right by `count` while
2774/// shifting in sign bits.
2775///
2776/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32)
2777#[inline]
2778#[target_feature(enable = "avx2")]
2779#[cfg_attr(test, assert_instr(vpsrad))]
2780#[stable(feature = "simd_x86", since = "1.27.0")]
2781pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
2782    transmute(psrad(a.as_i32x8(), count.as_i32x4()))
2783}
2784
2785/// Shifts packed 16-bit integers in `a` right by `IMM8` while
2786/// shifting in sign bits.
2787///
2788/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16)
2789#[inline]
2790#[target_feature(enable = "avx2")]
2791#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))]
2792#[rustc_legacy_const_generics(1)]
2793#[stable(feature = "simd_x86", since = "1.27.0")]
2794pub unsafe fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2795    static_assert_uimm_bits!(IMM8, 8);
2796    transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16)))
2797}
2798
2799/// Shifts packed 32-bit integers in `a` right by `IMM8` while
2800/// shifting in sign bits.
2801///
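/// A minimal usage sketch:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(-64);
/// // the shift fills with the sign bit, so -64 >> 3 == -8
/// let r = _mm256_srai_epi32::<3>(a);
/// let expected = _mm256_set1_epi32(-8);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///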
2802/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32)
2803#[inline]
2804#[target_feature(enable = "avx2")]
2805#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))]
2806#[rustc_legacy_const_generics(1)]
2807#[stable(feature = "simd_x86", since = "1.27.0")]
2808pub unsafe fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2809    static_assert_uimm_bits!(IMM8, 8);
2810    transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31))))
2811}
2812
2813/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2814/// corresponding element in `count` while shifting in sign bits.
2815///
2816/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32)
2817#[inline]
2818#[target_feature(enable = "avx2")]
2819#[cfg_attr(test, assert_instr(vpsravd))]
2820#[stable(feature = "simd_x86", since = "1.27.0")]
2821pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
2822    transmute(psravd(a.as_i32x4(), count.as_i32x4()))
2823}
2824
2825/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2826/// corresponding element in `count` while shifting in sign bits.
2827///
2828/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32)
2829#[inline]
2830#[target_feature(enable = "avx2")]
2831#[cfg_attr(test, assert_instr(vpsravd))]
2832#[stable(feature = "simd_x86", since = "1.27.0")]
2833pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
2834    transmute(psravd256(a.as_i32x8(), count.as_i32x8()))
2835}
2836
2837/// Shifts 128-bit lanes in `a` right by `IMM8` bytes while shifting in zeros.
2838///
2839/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256)
2840#[inline]
2841#[target_feature(enable = "avx2")]
2842#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
2843#[rustc_legacy_const_generics(1)]
2844#[stable(feature = "simd_x86", since = "1.27.0")]
2845pub unsafe fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
2846    static_assert_uimm_bits!(IMM8, 8);
2847    _mm256_bsrli_epi128::<IMM8>(a)
2848}
2849
2850/// Shifts 128-bit lanes in `a` right by `IMM8` bytes while shifting in zeros.
2851///
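/// A minimal usage sketch (the all-ones input is chosen only so the surviving
/// byte of each lane is easy to spot):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(1);
/// // shifting each 128-bit lane right by 15 bytes leaves only that lane's
/// // highest byte, now sitting in the lane's lowest position
/// let r = _mm256_bsrli_epi128::<15>(a);
/// let expected = _mm256_setr_epi8(
///     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
///     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///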
2852/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128)
2853#[inline]
2854#[target_feature(enable = "avx2")]
2855#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
2856#[rustc_legacy_const_generics(1)]
2857#[stable(feature = "simd_x86", since = "1.27.0")]
2858pub unsafe fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
2859    static_assert_uimm_bits!(IMM8, 8);
2860    let a = a.as_i8x32();
2861    let zero = i8x32::ZERO;
2862    let r: i8x32 = match IMM8 % 16 {
2863        0 => simd_shuffle!(
2864            a,
2865            zero,
2866            [
2867                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
2868                23, 24, 25, 26, 27, 28, 29, 30, 31,
2869            ],
2870        ),
2871        1 => simd_shuffle!(
2872            a,
2873            zero,
2874            [
2875                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 17, 18, 19, 20, 21, 22, 23,
2876                24, 25, 26, 27, 28, 29, 30, 31, 32,
2877            ],
2878        ),
2879        2 => simd_shuffle!(
2880            a,
2881            zero,
2882            [
2883                2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 18, 19, 20, 21, 22, 23, 24,
2884                25, 26, 27, 28, 29, 30, 31, 32, 32,
2885            ],
2886        ),
2887        3 => simd_shuffle!(
2888            a,
2889            zero,
2890            [
2891                3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 19, 20, 21, 22, 23, 24,
2892                25, 26, 27, 28, 29, 30, 31, 32, 32, 32,
2893            ],
2894        ),
2895        4 => simd_shuffle!(
2896            a,
2897            zero,
2898            [
2899                4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 20, 21, 22, 23, 24, 25,
2900                26, 27, 28, 29, 30, 31, 32, 32, 32, 32,
2901            ],
2902        ),
2903        5 => simd_shuffle!(
2904            a,
2905            zero,
2906            [
2907                5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 21, 22, 23, 24, 25, 26,
2908                27, 28, 29, 30, 31, 32, 32, 32, 32, 32,
2909            ],
2910        ),
2911        6 => simd_shuffle!(
2912            a,
2913            zero,
2914            [
2915                6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 22, 23, 24, 25, 26, 27,
2916                28, 29, 30, 31, 32, 32, 32, 32, 32, 32,
2917            ],
2918        ),
2919        7 => simd_shuffle!(
2920            a,
2921            zero,
2922            [
2923                7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 23, 24, 25, 26, 27,
2924                28, 29, 30, 31, 32, 32, 32, 32, 32, 32, 32,
2925            ],
2926        ),
2927        8 => simd_shuffle!(
2928            a,
2929            zero,
2930            [
2931                8, 9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 24, 25, 26, 27, 28,
2932                29, 30, 31, 32, 32, 32, 32, 32, 32, 32, 32,
2933            ],
2934        ),
2935        9 => simd_shuffle!(
2936            a,
2937            zero,
2938            [
2939                9, 10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 25, 26, 27, 28, 29,
2940                30, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
2941            ],
2942        ),
2943        10 => simd_shuffle!(
2944            a,
2945            zero,
2946            [
2947                10, 11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 26, 27, 28, 29, 30,
2948                31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
2949            ],
2950        ),
2951        11 => simd_shuffle!(
2952            a,
2953            zero,
2954            [
2955                11, 12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 27, 28, 29, 30, 31,
2956                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
2957            ],
2958        ),
2959        12 => simd_shuffle!(
2960            a,
2961            zero,
2962            [
2963                12, 13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 28, 29, 30, 31, 32,
2964                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
2965            ],
2966        ),
2967        13 => simd_shuffle!(
2968            a,
2969            zero,
2970            [
2971                13, 14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 29, 30, 31, 32, 32,
2972                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
2973            ],
2974        ),
2975        14 => simd_shuffle!(
2976            a,
2977            zero,
2978            [
2979                14, 15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 30, 31, 32, 32, 32,
2980                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
2981            ],
2982        ),
2983        15 => simd_shuffle!(
2984            a,
2985            zero,
2986            [
2987                15, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32,
2988                32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
2989            ],
2990        ),
2991        _ => zero,
2992    };
2993    transmute(r)
2994}
2995
2996/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
2997/// zeros.
2998///
2999/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16)
3000#[inline]
3001#[target_feature(enable = "avx2")]
3002#[cfg_attr(test, assert_instr(vpsrlw))]
3003#[stable(feature = "simd_x86", since = "1.27.0")]
3004pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
3005    transmute(psrlw(a.as_i16x16(), count.as_i16x8()))
3006}
3007
3008/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
3009/// zeros.
3010///
3011/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32)
3012#[inline]
3013#[target_feature(enable = "avx2")]
3014#[cfg_attr(test, assert_instr(vpsrld))]
3015#[stable(feature = "simd_x86", since = "1.27.0")]
3016pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
3017    transmute(psrld(a.as_i32x8(), count.as_i32x4()))
3018}
3019
3020/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
3021/// zeros.
3022///
3023/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64)
3024#[inline]
3025#[target_feature(enable = "avx2")]
3026#[cfg_attr(test, assert_instr(vpsrlq))]
3027#[stable(feature = "simd_x86", since = "1.27.0")]
3028pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3029    transmute(psrlq(a.as_i64x4(), count.as_i64x2()))
3030}
3031
3032/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
3033/// zeros.
3034///
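/// A minimal usage sketch:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi16(0x00F0);
/// // logical shift fills with zeros: 0x00F0 >> 4 == 0x000F
/// let r = _mm256_srli_epi16::<4>(a);
/// let expected = _mm256_set1_epi16(0x000F);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///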
3035/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16)
3036#[inline]
3037#[target_feature(enable = "avx2")]
3038#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))]
3039#[rustc_legacy_const_generics(1)]
3040#[stable(feature = "simd_x86", since = "1.27.0")]
3041pub unsafe fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
3042    static_assert_uimm_bits!(IMM8, 8);
3043    if IMM8 >= 16 {
3044        _mm256_setzero_si256()
3045    } else {
3046        transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
3047    }
3048}
3049
3050/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
3051/// zeros.
3052///
3053/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32)
3054#[inline]
3055#[target_feature(enable = "avx2")]
3056#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))]
3057#[rustc_legacy_const_generics(1)]
3058#[stable(feature = "simd_x86", since = "1.27.0")]
3059pub unsafe fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
3060    static_assert_uimm_bits!(IMM8, 8);
3061    if IMM8 >= 32 {
3062        _mm256_setzero_si256()
3063    } else {
3064        transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
3065    }
3066}
3067
3068/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
3069/// zeros.
3070///
3071/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64)
3072#[inline]
3073#[target_feature(enable = "avx2")]
3074#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))]
3075#[rustc_legacy_const_generics(1)]
3076#[stable(feature = "simd_x86", since = "1.27.0")]
3077pub unsafe fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
3078    static_assert_uimm_bits!(IMM8, 8);
3079    if IMM8 >= 64 {
3080        _mm256_setzero_si256()
3081    } else {
3082        transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
3083    }
3084}
3085
3086/// Shifts packed 32-bit integers in `a` right by the amount specified by
3087/// the corresponding element in `count` while shifting in zeros.
3088///
3089/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32)
3090#[inline]
3091#[target_feature(enable = "avx2")]
3092#[cfg_attr(test, assert_instr(vpsrlvd))]
3093#[stable(feature = "simd_x86", since = "1.27.0")]
3094pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3095    transmute(psrlvd(a.as_i32x4(), count.as_i32x4()))
3096}
3097
3098/// Shifts packed 32-bit integers in `a` right by the amount specified by
3099/// the corresponding element in `count` while shifting in zeros.
3100///
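/// A minimal usage sketch (counts chosen only to show the per-element shifts,
/// including one that is out of range):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(32);
/// let counts = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 33);
/// // each element is shifted by its own count; counts of 32 or more zero the element
/// let r = _mm256_srlv_epi32(a, counts);
/// let expected = _mm256_setr_epi32(32, 16, 8, 4, 2, 1, 0, 0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///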
3101/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32)
3102#[inline]
3103#[target_feature(enable = "avx2")]
3104#[cfg_attr(test, assert_instr(vpsrlvd))]
3105#[stable(feature = "simd_x86", since = "1.27.0")]
3106pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3107    transmute(psrlvd256(a.as_i32x8(), count.as_i32x8()))
3108}
3109
3110/// Shifts packed 64-bit integers in `a` right by the amount specified by
3111/// the corresponding element in `count` while shifting in zeros.
3112///
3113/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64)
3114#[inline]
3115#[target_feature(enable = "avx2")]
3116#[cfg_attr(test, assert_instr(vpsrlvq))]
3117#[stable(feature = "simd_x86", since = "1.27.0")]
3118pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3119    transmute(psrlvq(a.as_i64x2(), count.as_i64x2()))
3120}
3121
3122/// Shifts packed 64-bit integers in `a` right by the amount specified by
3123/// the corresponding element in `count` while shifting in zeros.
3124///
3125/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64)
3126#[inline]
3127#[target_feature(enable = "avx2")]
3128#[cfg_attr(test, assert_instr(vpsrlvq))]
3129#[stable(feature = "simd_x86", since = "1.27.0")]
3130pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3131    transmute(psrlvq256(a.as_i64x4(), count.as_i64x4()))
3132}
3133
3134/// Loads 256 bits of integer data from memory into the result using a non-temporal memory hint.
3135/// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be
3136/// generated. To minimize caching, the data is flagged as non-temporal (unlikely to be used again soon).
3137///
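/// A sketch of loading from a 32-byte aligned buffer (the `Aligned` wrapper
/// type below exists only for this example):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// #[repr(align(32))]
/// struct Aligned([i32; 8]);
///
/// // the wrapper guarantees the 32-byte alignment this load requires
/// let data = Aligned([0, 1, 2, 3, 4, 5, 6, 7]);
/// let r = _mm256_stream_load_si256(data.0.as_ptr() as *const __m256i);
/// let expected = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///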
3138/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_load_si256)
3139#[inline]
3140#[target_feature(enable = "avx2")]
3141#[cfg_attr(test, assert_instr(vmovntdqa))]
3142#[stable(feature = "simd_x86_updates", since = "1.82.0")]
3143pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
3144    let dst: __m256i;
3145    crate::arch::asm!(
3146        vpl!("vmovntdqa {a}"),
3147        a = out(ymm_reg) dst,
3148        p = in(reg) mem_addr,
3149        options(pure, readonly, nostack, preserves_flags),
3150    );
3151    dst
3152}
3153
3154/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
3155///
3156/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16)
3157#[inline]
3158#[target_feature(enable = "avx2")]
3159#[cfg_attr(test, assert_instr(vpsubw))]
3160#[stable(feature = "simd_x86", since = "1.27.0")]
3161pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3162    transmute(simd_sub(a.as_i16x16(), b.as_i16x16()))
3163}
3164
3165/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
3166///
3167/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32)
3168#[inline]
3169#[target_feature(enable = "avx2")]
3170#[cfg_attr(test, assert_instr(vpsubd))]
3171#[stable(feature = "simd_x86", since = "1.27.0")]
3172pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3173    transmute(simd_sub(a.as_i32x8(), b.as_i32x8()))
3174}
3175
3176/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
3177///
3178/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64)
3179#[inline]
3180#[target_feature(enable = "avx2")]
3181#[cfg_attr(test, assert_instr(vpsubq))]
3182#[stable(feature = "simd_x86", since = "1.27.0")]
3183pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3184    transmute(simd_sub(a.as_i64x4(), b.as_i64x4()))
3185}
3186
3187/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
3188///
3189/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8)
3190#[inline]
3191#[target_feature(enable = "avx2")]
3192#[cfg_attr(test, assert_instr(vpsubb))]
3193#[stable(feature = "simd_x86", since = "1.27.0")]
3194pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3195    transmute(simd_sub(a.as_i8x32(), b.as_i8x32()))
3196}
3197
3198/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
3199/// `a` using saturation.
3200///
3201/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16)
3202#[inline]
3203#[target_feature(enable = "avx2")]
3204#[cfg_attr(test, assert_instr(vpsubsw))]
3205#[stable(feature = "simd_x86", since = "1.27.0")]
3206pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3207    transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16()))
3208}
3209
3210/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
3211/// `a` using saturation.
3212///
3213/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8)
3214#[inline]
3215#[target_feature(enable = "avx2")]
3216#[cfg_attr(test, assert_instr(vpsubsb))]
3217#[stable(feature = "simd_x86", since = "1.27.0")]
3218pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3219    transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32()))
3220}
3221
3222/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
3223/// 16-bit integers in `a` using saturation.
3224///
3225/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16)
3226#[inline]
3227#[target_feature(enable = "avx2")]
3228#[cfg_attr(test, assert_instr(vpsubusw))]
3229#[stable(feature = "simd_x86", since = "1.27.0")]
3230pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3231    transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16()))
3232}
3233
3234/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
3235/// 8-bit integers in `a` using saturation.
3236///
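/// A minimal usage sketch showing the saturation at zero:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(10);
/// let b = _mm256_set1_epi8(100);
/// // unsigned saturation clamps 10 - 100 to 0 instead of wrapping
/// let r = _mm256_subs_epu8(a, b);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_setzero_si256())), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///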
3237/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8)
3238#[inline]
3239#[target_feature(enable = "avx2")]
3240#[cfg_attr(test, assert_instr(vpsubusb))]
3241#[stable(feature = "simd_x86", since = "1.27.0")]
3242pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3243    transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32()))
3244}
3245
3246/// Unpacks and interleaves 8-bit integers from the high half of each
3247/// 128-bit lane of `a` and `b`.
3248///
3249/// ```rust
3250/// #[cfg(target_arch = "x86")]
3251/// use std::arch::x86::*;
3252/// #[cfg(target_arch = "x86_64")]
3253/// use std::arch::x86_64::*;
3254///
3255/// # fn main() {
3256/// #     if is_x86_feature_detected!("avx2") {
3257/// #         #[target_feature(enable = "avx2")]
3258/// #         unsafe fn worker() {
3259/// let a = _mm256_setr_epi8(
3260///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3261///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3262/// );
3263/// let b = _mm256_setr_epi8(
3264///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3265///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3266///     -30, -31,
3267/// );
3268///
3269/// let c = _mm256_unpackhi_epi8(a, b);
3270///
3271/// let expected = _mm256_setr_epi8(
3272///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3273///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3274///     -31,
3275/// );
3276/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3277///
3278/// #         }
3279/// #         unsafe { worker(); }
3280/// #     }
3281/// # }
3282/// ```
3283///
3284/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8)
3285#[inline]
3286#[target_feature(enable = "avx2")]
3287#[cfg_attr(test, assert_instr(vpunpckhbw))]
3288#[stable(feature = "simd_x86", since = "1.27.0")]
3289pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
3290    #[rustfmt::skip]
3291    let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3292            8, 40, 9, 41, 10, 42, 11, 43,
3293            12, 44, 13, 45, 14, 46, 15, 47,
3294            24, 56, 25, 57, 26, 58, 27, 59,
3295            28, 60, 29, 61, 30, 62, 31, 63,
3296    ]);
3297    transmute(r)
3298}
3299
3300/// Unpacks and interleaves 8-bit integers from the low half of each
3301/// 128-bit lane of `a` and `b`.
3302///
3303/// ```rust
3304/// #[cfg(target_arch = "x86")]
3305/// use std::arch::x86::*;
3306/// #[cfg(target_arch = "x86_64")]
3307/// use std::arch::x86_64::*;
3308///
3309/// # fn main() {
3310/// #     if is_x86_feature_detected!("avx2") {
3311/// #         #[target_feature(enable = "avx2")]
3312/// #         unsafe fn worker() {
3313/// let a = _mm256_setr_epi8(
3314///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3315///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3316/// );
3317/// let b = _mm256_setr_epi8(
3318///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3319///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3320///     -30, -31,
3321/// );
3322///
3323/// let c = _mm256_unpacklo_epi8(a, b);
3324///
3325/// let expected = _mm256_setr_epi8(
3326///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3327///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3328/// );
3329/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3330///
3331/// #         }
3332/// #         unsafe { worker(); }
3333/// #     }
3334/// # }
3335/// ```
3336///
3337/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8)
3338#[inline]
3339#[target_feature(enable = "avx2")]
3340#[cfg_attr(test, assert_instr(vpunpcklbw))]
3341#[stable(feature = "simd_x86", since = "1.27.0")]
3342pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
3343    #[rustfmt::skip]
3344    let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3345        0, 32, 1, 33, 2, 34, 3, 35,
3346        4, 36, 5, 37, 6, 38, 7, 39,
3347        16, 48, 17, 49, 18, 50, 19, 51,
3348        20, 52, 21, 53, 22, 54, 23, 55,
3349    ]);
3350    transmute(r)
3351}
3352
3353/// Unpacks and interleaves 16-bit integers from the high half of each
3354/// 128-bit lane of `a` and `b`.
3355///
3356/// ```rust
3357/// #[cfg(target_arch = "x86")]
3358/// use std::arch::x86::*;
3359/// #[cfg(target_arch = "x86_64")]
3360/// use std::arch::x86_64::*;
3361///
3362/// # fn main() {
3363/// #     if is_x86_feature_detected!("avx2") {
3364/// #         #[target_feature(enable = "avx2")]
3365/// #         unsafe fn worker() {
3366/// let a = _mm256_setr_epi16(
3367///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3368/// );
3369/// let b = _mm256_setr_epi16(
3370///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3371/// );
3372///
3373/// let c = _mm256_unpackhi_epi16(a, b);
3374///
3375/// let expected = _mm256_setr_epi16(
3376///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3377/// );
3378/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3379///
3380/// #         }
3381/// #         unsafe { worker(); }
3382/// #     }
3383/// # }
3384/// ```
3385///
3386/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16)
3387#[inline]
3388#[target_feature(enable = "avx2")]
3389#[cfg_attr(test, assert_instr(vpunpckhwd))]
3390#[stable(feature = "simd_x86", since = "1.27.0")]
3391pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
3392    let r: i16x16 = simd_shuffle!(
3393        a.as_i16x16(),
3394        b.as_i16x16(),
3395        [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
3396    );
3397    transmute(r)
3398}
3399
3400/// Unpacks and interleaves 16-bit integers from the low half of each
3401/// 128-bit lane of `a` and `b`.
3402///
3403/// ```rust
3404/// #[cfg(target_arch = "x86")]
3405/// use std::arch::x86::*;
3406/// #[cfg(target_arch = "x86_64")]
3407/// use std::arch::x86_64::*;
3408///
3409/// # fn main() {
3410/// #     if is_x86_feature_detected!("avx2") {
3411/// #         #[target_feature(enable = "avx2")]
3412/// #         unsafe fn worker() {
3413///
3414/// let a = _mm256_setr_epi16(
3415///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3416/// );
3417/// let b = _mm256_setr_epi16(
3418///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3419/// );
3420///
3421/// let c = _mm256_unpacklo_epi16(a, b);
3422///
3423/// let expected = _mm256_setr_epi16(
3424///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3425/// );
3426/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3427///
3428/// #         }
3429/// #         unsafe { worker(); }
3430/// #     }
3431/// # }
3432/// ```
3433///
3434/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16)
3435#[inline]
3436#[target_feature(enable = "avx2")]
3437#[cfg_attr(test, assert_instr(vpunpcklwd))]
3438#[stable(feature = "simd_x86", since = "1.27.0")]
3439pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
3440    let r: i16x16 = simd_shuffle!(
3441        a.as_i16x16(),
3442        b.as_i16x16(),
3443        [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
3444    );
3445    transmute(r)
3446}
3447
3448/// Unpacks and interleaves 32-bit integers from the high half of each
3449/// 128-bit lane of `a` and `b`.
3450///
3451/// ```rust
3452/// #[cfg(target_arch = "x86")]
3453/// use std::arch::x86::*;
3454/// #[cfg(target_arch = "x86_64")]
3455/// use std::arch::x86_64::*;
3456///
3457/// # fn main() {
3458/// #     if is_x86_feature_detected!("avx2") {
3459/// #         #[target_feature(enable = "avx2")]
3460/// #         unsafe fn worker() {
3461/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3462/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3463///
3464/// let c = _mm256_unpackhi_epi32(a, b);
3465///
3466/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3467/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3468///
3469/// #         }
3470/// #         unsafe { worker(); }
3471/// #     }
3472/// # }
3473/// ```
3474///
3475/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32)
3476#[inline]
3477#[target_feature(enable = "avx2")]
3478#[cfg_attr(test, assert_instr(vunpckhps))]
3479#[stable(feature = "simd_x86", since = "1.27.0")]
3480pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3481    let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3482    transmute(r)
3483}
3484
3485/// Unpacks and interleaves 32-bit integers from the low half of each
3486/// 128-bit lane of `a` and `b`.
3487///
3488/// ```rust
3489/// #[cfg(target_arch = "x86")]
3490/// use std::arch::x86::*;
3491/// #[cfg(target_arch = "x86_64")]
3492/// use std::arch::x86_64::*;
3493///
3494/// # fn main() {
3495/// #     if is_x86_feature_detected!("avx2") {
3496/// #         #[target_feature(enable = "avx2")]
3497/// #         unsafe fn worker() {
3498/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3499/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3500///
3501/// let c = _mm256_unpacklo_epi32(a, b);
3502///
3503/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3504/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3505///
3506/// #         }
3507/// #         unsafe { worker(); }
3508/// #     }
3509/// # }
3510/// ```
3511///
3512/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32)
3513#[inline]
3514#[target_feature(enable = "avx2")]
3515#[cfg_attr(test, assert_instr(vunpcklps))]
3516#[stable(feature = "simd_x86", since = "1.27.0")]
3517pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3518    let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3519    transmute(r)
3520}
3521
3522/// Unpacks and interleaves 64-bit integers from the high half of each
3523/// 128-bit lane of `a` and `b`.
3524///
3525/// ```rust
3526/// #[cfg(target_arch = "x86")]
3527/// use std::arch::x86::*;
3528/// #[cfg(target_arch = "x86_64")]
3529/// use std::arch::x86_64::*;
3530///
3531/// # fn main() {
3532/// #     if is_x86_feature_detected!("avx2") {
3533/// #         #[target_feature(enable = "avx2")]
3534/// #         unsafe fn worker() {
3535/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3536/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3537///
3538/// let c = _mm256_unpackhi_epi64(a, b);
3539///
3540/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3541/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3542///
3543/// #         }
3544/// #         unsafe { worker(); }
3545/// #     }
3546/// # }
3547/// ```
3548///
3549/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64)
3550#[inline]
3551#[target_feature(enable = "avx2")]
3552#[cfg_attr(test, assert_instr(vunpckhpd))]
3553#[stable(feature = "simd_x86", since = "1.27.0")]
3554pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3555    let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3556    transmute(r)
3557}
3558
3559/// Unpacks and interleaves 64-bit integers from the low half of each
3560/// 128-bit lane of `a` and `b`.
3561///
3562/// ```rust
3563/// #[cfg(target_arch = "x86")]
3564/// use std::arch::x86::*;
3565/// #[cfg(target_arch = "x86_64")]
3566/// use std::arch::x86_64::*;
3567///
3568/// # fn main() {
3569/// #     if is_x86_feature_detected!("avx2") {
3570/// #         #[target_feature(enable = "avx2")]
3571/// #         unsafe fn worker() {
3572/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3573/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3574///
3575/// let c = _mm256_unpacklo_epi64(a, b);
3576///
3577/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3578/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3579///
3580/// #         }
3581/// #         unsafe { worker(); }
3582/// #     }
3583/// # }
3584/// ```
3585///
3586/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64)
3587#[inline]
3588#[target_feature(enable = "avx2")]
3589#[cfg_attr(test, assert_instr(vunpcklpd))]
3590#[stable(feature = "simd_x86", since = "1.27.0")]
3591pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3592    let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3593    transmute(r)
3594}
3595
3596/// Computes the bitwise XOR of 256 bits (representing integer data)
3597/// in `a` and `b`.
3598///
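/// A minimal usage sketch:
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(0b0101);
/// let b = _mm256_set1_epi8(0b0011);
/// // 0b0101 ^ 0b0011 == 0b0110 in every byte
/// let r = _mm256_xor_si256(a, b);
/// let expected = _mm256_set1_epi8(0b0110);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///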
3599/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256)
3600#[inline]
3601#[target_feature(enable = "avx2")]
3602#[cfg_attr(test, assert_instr(vxorps))]
3603#[stable(feature = "simd_x86", since = "1.27.0")]
3604pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3605    transmute(simd_xor(a.as_i64x4(), b.as_i64x4()))
3606}
3607
3608/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3609/// integer containing the zero-extended integer data.
3610///
3611/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3612///
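/// A minimal usage sketch (indices chosen only for illustration):
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
///     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// );
/// assert_eq!(_mm256_extract_epi8::<20>(a), 20);
/// // the selected byte is zero-extended, so a negative lane value comes back
/// // in the range 0..=255
/// assert_eq!(_mm256_extract_epi8::<0>(_mm256_set1_epi8(-1)), 255);
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///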
3613/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8)
3614#[inline]
3615#[target_feature(enable = "avx2")]
3616// This intrinsic has no corresponding instruction.
3617#[rustc_legacy_const_generics(1)]
3618#[stable(feature = "simd_x86", since = "1.27.0")]
3619pub unsafe fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 {
3620    static_assert_uimm_bits!(INDEX, 5);
3621    simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32
3622}
3623
3624/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3625/// integer containing the zero-extended integer data.
3626///
3627/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3628///
3629/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16)
3630#[inline]
3631#[target_feature(enable = "avx2")]
3632// This intrinsic has no corresponding instruction.
3633#[rustc_legacy_const_generics(1)]
3634#[stable(feature = "simd_x86", since = "1.27.0")]
3635pub unsafe fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
3636    static_assert_uimm_bits!(INDEX, 4);
3637    simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32
3638}
3639
3640#[allow(improper_ctypes)]
3641extern "C" {
3642    #[link_name = "llvm.x86.avx2.phadd.w"]
3643    fn phaddw(a: i16x16, b: i16x16) -> i16x16;
3644    #[link_name = "llvm.x86.avx2.phadd.d"]
3645    fn phaddd(a: i32x8, b: i32x8) -> i32x8;
3646    #[link_name = "llvm.x86.avx2.phadd.sw"]
3647    fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
3648    #[link_name = "llvm.x86.avx2.phsub.w"]
3649    fn phsubw(a: i16x16, b: i16x16) -> i16x16;
3650    #[link_name = "llvm.x86.avx2.phsub.d"]
3651    fn phsubd(a: i32x8, b: i32x8) -> i32x8;
3652    #[link_name = "llvm.x86.avx2.phsub.sw"]
3653    fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
3654    #[link_name = "llvm.x86.avx2.pmadd.wd"]
3655    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
3656    #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
3657    fn pmaddubsw(a: u8x32, b: u8x32) -> i16x16;
3658    #[link_name = "llvm.x86.avx2.maskload.d"]
3659    fn maskloadd(mem_addr: *const i8, mask: i32x4) -> i32x4;
3660    #[link_name = "llvm.x86.avx2.maskload.d.256"]
3661    fn maskloadd256(mem_addr: *const i8, mask: i32x8) -> i32x8;
3662    #[link_name = "llvm.x86.avx2.maskload.q"]
3663    fn maskloadq(mem_addr: *const i8, mask: i64x2) -> i64x2;
3664    #[link_name = "llvm.x86.avx2.maskload.q.256"]
3665    fn maskloadq256(mem_addr: *const i8, mask: i64x4) -> i64x4;
3666    #[link_name = "llvm.x86.avx2.maskstore.d"]
3667    fn maskstored(mem_addr: *mut i8, mask: i32x4, a: i32x4);
3668    #[link_name = "llvm.x86.avx2.maskstore.d.256"]
3669    fn maskstored256(mem_addr: *mut i8, mask: i32x8, a: i32x8);
3670    #[link_name = "llvm.x86.avx2.maskstore.q"]
3671    fn maskstoreq(mem_addr: *mut i8, mask: i64x2, a: i64x2);
3672    #[link_name = "llvm.x86.avx2.maskstore.q.256"]
3673    fn maskstoreq256(mem_addr: *mut i8, mask: i64x4, a: i64x4);
3674    #[link_name = "llvm.x86.avx2.mpsadbw"]
3675    fn mpsadbw(a: u8x32, b: u8x32, imm8: i32) -> u16x16;
3676    #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
3677    fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
3678    #[link_name = "llvm.x86.avx2.packsswb"]
3679    fn packsswb(a: i16x16, b: i16x16) -> i8x32;
3680    #[link_name = "llvm.x86.avx2.packssdw"]
3681    fn packssdw(a: i32x8, b: i32x8) -> i16x16;
3682    #[link_name = "llvm.x86.avx2.packuswb"]
3683    fn packuswb(a: i16x16, b: i16x16) -> u8x32;
3684    #[link_name = "llvm.x86.avx2.packusdw"]
3685    fn packusdw(a: i32x8, b: i32x8) -> u16x16;
3686    #[link_name = "llvm.x86.avx2.psad.bw"]
3687    fn psadbw(a: u8x32, b: u8x32) -> u64x4;
3688    #[link_name = "llvm.x86.avx2.psign.b"]
3689    fn psignb(a: i8x32, b: i8x32) -> i8x32;
3690    #[link_name = "llvm.x86.avx2.psign.w"]
3691    fn psignw(a: i16x16, b: i16x16) -> i16x16;
3692    #[link_name = "llvm.x86.avx2.psign.d"]
3693    fn psignd(a: i32x8, b: i32x8) -> i32x8;
3694    #[link_name = "llvm.x86.avx2.psll.w"]
3695    fn psllw(a: i16x16, count: i16x8) -> i16x16;
3696    #[link_name = "llvm.x86.avx2.psll.d"]
3697    fn pslld(a: i32x8, count: i32x4) -> i32x8;
3698    #[link_name = "llvm.x86.avx2.psll.q"]
3699    fn psllq(a: i64x4, count: i64x2) -> i64x4;
3700    #[link_name = "llvm.x86.avx2.psllv.d"]
3701    fn psllvd(a: i32x4, count: i32x4) -> i32x4;
3702    #[link_name = "llvm.x86.avx2.psllv.d.256"]
3703    fn psllvd256(a: i32x8, count: i32x8) -> i32x8;
3704    #[link_name = "llvm.x86.avx2.psllv.q"]
3705    fn psllvq(a: i64x2, count: i64x2) -> i64x2;
3706    #[link_name = "llvm.x86.avx2.psllv.q.256"]
3707    fn psllvq256(a: i64x4, count: i64x4) -> i64x4;
3708    #[link_name = "llvm.x86.avx2.psra.w"]
3709    fn psraw(a: i16x16, count: i16x8) -> i16x16;
3710    #[link_name = "llvm.x86.avx2.psra.d"]
3711    fn psrad(a: i32x8, count: i32x4) -> i32x8;
3712    #[link_name = "llvm.x86.avx2.psrav.d"]
3713    fn psravd(a: i32x4, count: i32x4) -> i32x4;
3714    #[link_name = "llvm.x86.avx2.psrav.d.256"]
3715    fn psravd256(a: i32x8, count: i32x8) -> i32x8;
3716    #[link_name = "llvm.x86.avx2.psrl.w"]
3717    fn psrlw(a: i16x16, count: i16x8) -> i16x16;
3718    #[link_name = "llvm.x86.avx2.psrl.d"]
3719    fn psrld(a: i32x8, count: i32x4) -> i32x8;
3720    #[link_name = "llvm.x86.avx2.psrl.q"]
3721    fn psrlq(a: i64x4, count: i64x2) -> i64x4;
3722    #[link_name = "llvm.x86.avx2.psrlv.d"]
3723    fn psrlvd(a: i32x4, count: i32x4) -> i32x4;
3724    #[link_name = "llvm.x86.avx2.psrlv.d.256"]
3725    fn psrlvd256(a: i32x8, count: i32x8) -> i32x8;
3726    #[link_name = "llvm.x86.avx2.psrlv.q"]
3727    fn psrlvq(a: i64x2, count: i64x2) -> i64x2;
3728    #[link_name = "llvm.x86.avx2.psrlv.q.256"]
3729    fn psrlvq256(a: i64x4, count: i64x4) -> i64x4;
3730    #[link_name = "llvm.x86.avx2.pshuf.b"]
3731    fn pshufb(a: u8x32, b: u8x32) -> u8x32;
3732    #[link_name = "llvm.x86.avx2.permd"]
3733    fn permd(a: u32x8, b: u32x8) -> u32x8;
3734    #[link_name = "llvm.x86.avx2.permps"]
3735    fn permps(a: __m256, b: i32x8) -> __m256;
3736    #[link_name = "llvm.x86.avx2.vperm2i128"]
3737    fn vperm2i128(a: i64x4, b: i64x4, imm8: i8) -> i64x4;
3738    #[link_name = "llvm.x86.avx2.gather.d.d"]
3739    fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
3740    #[link_name = "llvm.x86.avx2.gather.d.d.256"]
3741    fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
3742    #[link_name = "llvm.x86.avx2.gather.d.q"]
3743    fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
3744    #[link_name = "llvm.x86.avx2.gather.d.q.256"]
3745    fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
3746    #[link_name = "llvm.x86.avx2.gather.q.d"]
3747    fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
3748    #[link_name = "llvm.x86.avx2.gather.q.d.256"]
3749    fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
3750    #[link_name = "llvm.x86.avx2.gather.q.q"]
3751    fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
3752    #[link_name = "llvm.x86.avx2.gather.q.q.256"]
3753    fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
3754    #[link_name = "llvm.x86.avx2.gather.d.pd"]
3755    fn pgatherdpd(
3756        src: __m128d,
3757        slice: *const i8,
3758        offsets: i32x4,
3759        mask: __m128d,
3760        scale: i8,
3761    ) -> __m128d;
3762    #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
3763    fn vpgatherdpd(
3764        src: __m256d,
3765        slice: *const i8,
3766        offsets: i32x4,
3767        mask: __m256d,
3768        scale: i8,
3769    ) -> __m256d;
3770    #[link_name = "llvm.x86.avx2.gather.q.pd"]
3771    fn pgatherqpd(
3772        src: __m128d,
3773        slice: *const i8,
3774        offsets: i64x2,
3775        mask: __m128d,
3776        scale: i8,
3777    ) -> __m128d;
3778    #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
3779    fn vpgatherqpd(
3780        src: __m256d,
3781        slice: *const i8,
3782        offsets: i64x4,
3783        mask: __m256d,
3784        scale: i8,
3785    ) -> __m256d;
3786    #[link_name = "llvm.x86.avx2.gather.d.ps"]
3787    fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
3788        -> __m128;
3789    #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
3790    fn vpgatherdps(
3791        src: __m256,
3792        slice: *const i8,
3793        offsets: i32x8,
3794        mask: __m256,
3795        scale: i8,
3796    ) -> __m256;
3797    #[link_name = "llvm.x86.avx2.gather.q.ps"]
3798    fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
3799        -> __m128;
3800    #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
3801    fn vpgatherqps(
3802        src: __m128,
3803        slice: *const i8,
3804        offsets: i64x4,
3805        mask: __m128,
3806        scale: i8,
3807    ) -> __m128;
3808}
3809
3810#[cfg(test)]
3811mod tests {
3812
3813    use stdarch_test::simd_test;
3814
3815    use crate::core_arch::x86::*;
3816
3817    #[simd_test(enable = "avx2")]
3818    unsafe fn test_mm256_abs_epi32() {
3819        #[rustfmt::skip]
3820        let a = _mm256_setr_epi32(
3821            0, 1, -1, i32::MAX,
3822            i32::MIN, 100, -100, -32,
3823        );
3824        let r = _mm256_abs_epi32(a);
3825        #[rustfmt::skip]
3826        let e = _mm256_setr_epi32(
3827            0, 1, 1, i32::MAX,
3828            i32::MAX.wrapping_add(1), 100, 100, 32,
3829        );
3830        assert_eq_m256i(r, e);
3831    }
3832
3833    #[simd_test(enable = "avx2")]
3834    unsafe fn test_mm256_abs_epi16() {
3835        #[rustfmt::skip]
3836        let a = _mm256_setr_epi16(
3837            0,  1, -1, 2, -2, 3, -3, 4,
3838            -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
3839        );
3840        let r = _mm256_abs_epi16(a);
3841        #[rustfmt::skip]
3842        let e = _mm256_setr_epi16(
3843            0, 1, 1, 2, 2, 3, 3, 4,
3844            4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
3845        );
3846        assert_eq_m256i(r, e);
3847    }
3848
3849    #[simd_test(enable = "avx2")]
3850    unsafe fn test_mm256_abs_epi8() {
3851        #[rustfmt::skip]
3852        let a = _mm256_setr_epi8(
3853            0, 1, -1, 2, -2, 3, -3, 4,
3854            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3855            0, 1, -1, 2, -2, 3, -3, 4,
3856            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3857        );
3858        let r = _mm256_abs_epi8(a);
3859        #[rustfmt::skip]
3860        let e = _mm256_setr_epi8(
3861            0, 1, 1, 2, 2, 3, 3, 4,
3862            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3863            0, 1, 1, 2, 2, 3, 3, 4,
3864            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3865        );
3866        assert_eq_m256i(r, e);
3867    }
3868
3869    #[simd_test(enable = "avx2")]
3870    unsafe fn test_mm256_add_epi64() {
3871        let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
3872        let b = _mm256_setr_epi64x(-1, 0, 1, 2);
3873        let r = _mm256_add_epi64(a, b);
3874        let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
3875        assert_eq_m256i(r, e);
3876    }
3877
3878    #[simd_test(enable = "avx2")]
3879    unsafe fn test_mm256_add_epi32() {
3880        let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
3881        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3882        let r = _mm256_add_epi32(a, b);
3883        let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
3884        assert_eq_m256i(r, e);
3885    }
3886
3887    #[simd_test(enable = "avx2")]
3888    unsafe fn test_mm256_add_epi16() {
3889        #[rustfmt::skip]
3890        let a = _mm256_setr_epi16(
3891            0, 1, 2, 3, 4, 5, 6, 7,
3892            8, 9, 10, 11, 12, 13, 14, 15,
3893        );
3894        #[rustfmt::skip]
3895        let b = _mm256_setr_epi16(
3896            0, 1, 2, 3, 4, 5, 6, 7,
3897            8, 9, 10, 11, 12, 13, 14, 15,
3898        );
3899        let r = _mm256_add_epi16(a, b);
3900        #[rustfmt::skip]
3901        let e = _mm256_setr_epi16(
3902            0, 2, 4, 6, 8, 10, 12, 14,
3903            16, 18, 20, 22, 24, 26, 28, 30,
3904        );
3905        assert_eq_m256i(r, e);
3906    }
3907
3908    #[simd_test(enable = "avx2")]
3909    unsafe fn test_mm256_add_epi8() {
3910        #[rustfmt::skip]
3911        let a = _mm256_setr_epi8(
3912            0, 1, 2, 3, 4, 5, 6, 7,
3913            8, 9, 10, 11, 12, 13, 14, 15,
3914            16, 17, 18, 19, 20, 21, 22, 23,
3915            24, 25, 26, 27, 28, 29, 30, 31,
3916        );
3917        #[rustfmt::skip]
3918        let b = _mm256_setr_epi8(
3919            0, 1, 2, 3, 4, 5, 6, 7,
3920            8, 9, 10, 11, 12, 13, 14, 15,
3921            16, 17, 18, 19, 20, 21, 22, 23,
3922            24, 25, 26, 27, 28, 29, 30, 31,
3923        );
3924        let r = _mm256_add_epi8(a, b);
3925        #[rustfmt::skip]
3926        let e = _mm256_setr_epi8(
3927            0, 2, 4, 6, 8, 10, 12, 14,
3928            16, 18, 20, 22, 24, 26, 28, 30,
3929            32, 34, 36, 38, 40, 42, 44, 46,
3930            48, 50, 52, 54, 56, 58, 60, 62,
3931        );
3932        assert_eq_m256i(r, e);
3933    }
3934
3935    #[simd_test(enable = "avx2")]
3936    unsafe fn test_mm256_adds_epi8() {
3937        #[rustfmt::skip]
3938        let a = _mm256_setr_epi8(
3939            0, 1, 2, 3, 4, 5, 6, 7,
3940            8, 9, 10, 11, 12, 13, 14, 15,
3941            16, 17, 18, 19, 20, 21, 22, 23,
3942            24, 25, 26, 27, 28, 29, 30, 31,
3943        );
3944        #[rustfmt::skip]
3945        let b = _mm256_setr_epi8(
3946            32, 33, 34, 35, 36, 37, 38, 39,
3947            40, 41, 42, 43, 44, 45, 46, 47,
3948            48, 49, 50, 51, 52, 53, 54, 55,
3949            56, 57, 58, 59, 60, 61, 62, 63,
3950        );
3951        let r = _mm256_adds_epi8(a, b);
3952        #[rustfmt::skip]
3953        let e = _mm256_setr_epi8(
3954            32, 34, 36, 38, 40, 42, 44, 46,
3955            48, 50, 52, 54, 56, 58, 60, 62,
3956            64, 66, 68, 70, 72, 74, 76, 78,
3957            80, 82, 84, 86, 88, 90, 92, 94,
3958        );
3959        assert_eq_m256i(r, e);
3960    }
3961
3962    #[simd_test(enable = "avx2")]
3963    unsafe fn test_mm256_adds_epi8_saturate_positive() {
3964        let a = _mm256_set1_epi8(0x7F);
3965        let b = _mm256_set1_epi8(1);
3966        let r = _mm256_adds_epi8(a, b);
3967        assert_eq_m256i(r, a);
3968    }
3969
3970    #[simd_test(enable = "avx2")]
3971    unsafe fn test_mm256_adds_epi8_saturate_negative() {
3972        let a = _mm256_set1_epi8(-0x80);
3973        let b = _mm256_set1_epi8(-1);
3974        let r = _mm256_adds_epi8(a, b);
3975        assert_eq_m256i(r, a);
3976    }
3977
3978    #[simd_test(enable = "avx2")]
3979    unsafe fn test_mm256_adds_epi16() {
3980        #[rustfmt::skip]
3981        let a = _mm256_setr_epi16(
3982            0, 1, 2, 3, 4, 5, 6, 7,
3983            8, 9, 10, 11, 12, 13, 14, 15,
3984        );
3985        #[rustfmt::skip]
3986        let b = _mm256_setr_epi16(
3987            32, 33, 34, 35, 36, 37, 38, 39,
3988            40, 41, 42, 43, 44, 45, 46, 47,
3989        );
3990        let r = _mm256_adds_epi16(a, b);
3991        #[rustfmt::skip]
3992        let e = _mm256_setr_epi16(
3993            32, 34, 36, 38, 40, 42, 44, 46,
3994            48, 50, 52, 54, 56, 58, 60, 62,
3995        );
3996
3997        assert_eq_m256i(r, e);
3998    }
3999
4000    #[simd_test(enable = "avx2")]
4001    unsafe fn test_mm256_adds_epi16_saturate_positive() {
4002        let a = _mm256_set1_epi16(0x7FFF);
4003        let b = _mm256_set1_epi16(1);
4004        let r = _mm256_adds_epi16(a, b);
4005        assert_eq_m256i(r, a);
4006    }
4007
4008    #[simd_test(enable = "avx2")]
4009    unsafe fn test_mm256_adds_epi16_saturate_negative() {
4010        let a = _mm256_set1_epi16(-0x8000);
4011        let b = _mm256_set1_epi16(-1);
4012        let r = _mm256_adds_epi16(a, b);
4013        assert_eq_m256i(r, a);
4014    }
4015
4016    #[simd_test(enable = "avx2")]
4017    unsafe fn test_mm256_adds_epu8() {
4018        #[rustfmt::skip]
4019        let a = _mm256_setr_epi8(
4020            0, 1, 2, 3, 4, 5, 6, 7,
4021            8, 9, 10, 11, 12, 13, 14, 15,
4022            16, 17, 18, 19, 20, 21, 22, 23,
4023            24, 25, 26, 27, 28, 29, 30, 31,
4024        );
4025        #[rustfmt::skip]
4026        let b = _mm256_setr_epi8(
4027            32, 33, 34, 35, 36, 37, 38, 39,
4028            40, 41, 42, 43, 44, 45, 46, 47,
4029            48, 49, 50, 51, 52, 53, 54, 55,
4030            56, 57, 58, 59, 60, 61, 62, 63,
4031        );
4032        let r = _mm256_adds_epu8(a, b);
4033        #[rustfmt::skip]
4034        let e = _mm256_setr_epi8(
4035            32, 34, 36, 38, 40, 42, 44, 46,
4036            48, 50, 52, 54, 56, 58, 60, 62,
4037            64, 66, 68, 70, 72, 74, 76, 78,
4038            80, 82, 84, 86, 88, 90, 92, 94,
4039        );
4040        assert_eq_m256i(r, e);
4041    }
4042
4043    #[simd_test(enable = "avx2")]
4044    unsafe fn test_mm256_adds_epu8_saturate() {
4045        let a = _mm256_set1_epi8(!0);
4046        let b = _mm256_set1_epi8(1);
4047        let r = _mm256_adds_epu8(a, b);
4048        assert_eq_m256i(r, a);
4049    }
4050
4051    #[simd_test(enable = "avx2")]
4052    unsafe fn test_mm256_adds_epu16() {
4053        #[rustfmt::skip]
4054        let a = _mm256_setr_epi16(
4055            0, 1, 2, 3, 4, 5, 6, 7,
4056            8, 9, 10, 11, 12, 13, 14, 15,
4057        );
4058        #[rustfmt::skip]
4059        let b = _mm256_setr_epi16(
4060            32, 33, 34, 35, 36, 37, 38, 39,
4061            40, 41, 42, 43, 44, 45, 46, 47,
4062        );
4063        let r = _mm256_adds_epu16(a, b);
4064        #[rustfmt::skip]
4065        let e = _mm256_setr_epi16(
4066            32, 34, 36, 38, 40, 42, 44, 46,
4067            48, 50, 52, 54, 56, 58, 60, 62,
4068        );
4069
4070        assert_eq_m256i(r, e);
4071    }
4072
4073    #[simd_test(enable = "avx2")]
4074    unsafe fn test_mm256_adds_epu16_saturate() {
4075        let a = _mm256_set1_epi16(!0);
4076        let b = _mm256_set1_epi16(1);
4077        let r = _mm256_adds_epu16(a, b);
4078        assert_eq_m256i(r, a);
4079    }
4080
4081    #[simd_test(enable = "avx2")]
4082    unsafe fn test_mm256_and_si256() {
4083        let a = _mm256_set1_epi8(5);
4084        let b = _mm256_set1_epi8(3);
4085        let got = _mm256_and_si256(a, b);
4086        assert_eq_m256i(got, _mm256_set1_epi8(1));
4087    }
4088
4089    #[simd_test(enable = "avx2")]
4090    unsafe fn test_mm256_andnot_si256() {
4091        let a = _mm256_set1_epi8(5);
4092        let b = _mm256_set1_epi8(3);
4093        let got = _mm256_andnot_si256(a, b);
4094        assert_eq_m256i(got, _mm256_set1_epi8(2));
4095    }
4096
4097    #[simd_test(enable = "avx2")]
4098    unsafe fn test_mm256_avg_epu8() {
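        // avg rounds up: (3 + 9 + 1) >> 1 == 6.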
4099        let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
4100        let r = _mm256_avg_epu8(a, b);
4101        assert_eq_m256i(r, _mm256_set1_epi8(6));
4102    }
4103
4104    #[simd_test(enable = "avx2")]
4105    unsafe fn test_mm256_avg_epu16() {
4106        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4107        let r = _mm256_avg_epu16(a, b);
4108        assert_eq_m256i(r, _mm256_set1_epi16(6));
4109    }
4110
4111    #[simd_test(enable = "avx2")]
4112    unsafe fn test_mm_blend_epi32() {
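        // Each bit of the immediate picks the corresponding 32-bit element
        // from `b` (bit set) or `a` (bit clear).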
4113        let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
4114        let e = _mm_setr_epi32(9, 3, 3, 3);
4115        let r = _mm_blend_epi32::<0x01>(a, b);
4116        assert_eq_m128i(r, e);
4117
4118        let r = _mm_blend_epi32::<0x0E>(b, a);
4119        assert_eq_m128i(r, e);
4120    }
4121
4122    #[simd_test(enable = "avx2")]
4123    unsafe fn test_mm256_blend_epi32() {
4124        let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
4125        let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
4126        let r = _mm256_blend_epi32::<0x01>(a, b);
4127        assert_eq_m256i(r, e);
4128
4129        let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
4130        let r = _mm256_blend_epi32::<0x82>(a, b);
4131        assert_eq_m256i(r, e);
4132
4133        let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
4134        let r = _mm256_blend_epi32::<0x7C>(a, b);
4135        assert_eq_m256i(r, e);
4136    }
4137
4138    #[simd_test(enable = "avx2")]
4139    unsafe fn test_mm256_blend_epi16() {
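        // The 8-bit control is applied to each 128-bit half, so bit 0 selects
        // both word 0 and word 8 from `b`.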
4140        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4141        let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
4142        let r = _mm256_blend_epi16::<0x01>(a, b);
4143        assert_eq_m256i(r, e);
4144
4145        let r = _mm256_blend_epi16::<0xFE>(b, a);
4146        assert_eq_m256i(r, e);
4147    }
4148
4149    #[simd_test(enable = "avx2")]
4150    unsafe fn test_mm256_blendv_epi8() {
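        // A byte is taken from `b` wherever the corresponding mask byte has its
        // most significant bit set.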
4151        let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
4152        let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1);
4153        let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2);
4154        let r = _mm256_blendv_epi8(a, b, mask);
4155        assert_eq_m256i(r, e);
4156    }
4157
4158    #[simd_test(enable = "avx2")]
4159    unsafe fn test_mm_broadcastb_epi8() {
4160        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4161        let res = _mm_broadcastb_epi8(a);
4162        assert_eq_m128i(res, _mm_set1_epi8(0x2a));
4163    }
4164
4165    #[simd_test(enable = "avx2")]
4166    unsafe fn test_mm256_broadcastb_epi8() {
4167        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4168        let res = _mm256_broadcastb_epi8(a);
4169        assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
4170    }
4171
4172    #[simd_test(enable = "avx2")]
4173    unsafe fn test_mm_broadcastd_epi32() {
4174        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4175        let res = _mm_broadcastd_epi32(a);
4176        assert_eq_m128i(res, _mm_set1_epi32(0x2a));
4177    }
4178
4179    #[simd_test(enable = "avx2")]
4180    unsafe fn test_mm256_broadcastd_epi32() {
4181        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4182        let res = _mm256_broadcastd_epi32(a);
4183        assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
4184    }
4185
4186    #[simd_test(enable = "avx2")]
4187    unsafe fn test_mm_broadcastq_epi64() {
4188        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4189        let res = _mm_broadcastq_epi64(a);
4190        assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
4191    }
4192
4193    #[simd_test(enable = "avx2")]
4194    unsafe fn test_mm256_broadcastq_epi64() {
4195        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4196        let res = _mm256_broadcastq_epi64(a);
4197        assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
4198    }
4199
4200    #[simd_test(enable = "avx2")]
4201    unsafe fn test_mm_broadcastsd_pd() {
4202        let a = _mm_setr_pd(6.88, 3.44);
4203        let res = _mm_broadcastsd_pd(a);
4204        assert_eq_m128d(res, _mm_set1_pd(6.88));
4205    }
4206
4207    #[simd_test(enable = "avx2")]
4208    unsafe fn test_mm256_broadcastsd_pd() {
4209        let a = _mm_setr_pd(6.88, 3.44);
4210        let res = _mm256_broadcastsd_pd(a);
4211        assert_eq_m256d(res, _mm256_set1_pd(6.88));
4212    }
4213
4214    #[simd_test(enable = "avx2")]
4215    unsafe fn test_mm_broadcastsi128_si256() {
4216        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4217        let res = _mm_broadcastsi128_si256(a);
4218        let retval = _mm256_setr_epi64x(
4219            0x0987654321012334,
4220            0x5678909876543210,
4221            0x0987654321012334,
4222            0x5678909876543210,
4223        );
4224        assert_eq_m256i(res, retval);
4225    }
4226
4227    #[simd_test(enable = "avx2")]
4228    unsafe fn test_mm256_broadcastsi128_si256() {
4229        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4230        let res = _mm256_broadcastsi128_si256(a);
4231        let retval = _mm256_setr_epi64x(
4232            0x0987654321012334,
4233            0x5678909876543210,
4234            0x0987654321012334,
4235            0x5678909876543210,
4236        );
4237        assert_eq_m256i(res, retval);
4238    }
4239
4240    #[simd_test(enable = "avx2")]
4241    unsafe fn test_mm_broadcastss_ps() {
4242        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4243        let res = _mm_broadcastss_ps(a);
4244        assert_eq_m128(res, _mm_set1_ps(6.88));
4245    }
4246
4247    #[simd_test(enable = "avx2")]
4248    unsafe fn test_mm256_broadcastss_ps() {
4249        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4250        let res = _mm256_broadcastss_ps(a);
4251        assert_eq_m256(res, _mm256_set1_ps(6.88));
4252    }
4253
4254    #[simd_test(enable = "avx2")]
4255    unsafe fn test_mm_broadcastw_epi16() {
4256        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4257        let res = _mm_broadcastw_epi16(a);
4258        assert_eq_m128i(res, _mm_set1_epi16(0x22b));
4259    }
4260
4261    #[simd_test(enable = "avx2")]
4262    unsafe fn test_mm256_broadcastw_epi16() {
4263        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4264        let res = _mm256_broadcastw_epi16(a);
4265        assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
4266    }
4267
4268    #[simd_test(enable = "avx2")]
4269    unsafe fn test_mm256_cmpeq_epi8() {
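        // Equal elements become all ones (!0), the rest zero; only index 2 matches.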
4270        #[rustfmt::skip]
4271        let a = _mm256_setr_epi8(
4272            0, 1, 2, 3, 4, 5, 6, 7,
4273            8, 9, 10, 11, 12, 13, 14, 15,
4274            16, 17, 18, 19, 20, 21, 22, 23,
4275            24, 25, 26, 27, 28, 29, 30, 31,
4276        );
4277        #[rustfmt::skip]
4278        let b = _mm256_setr_epi8(
4279            31, 30, 2, 28, 27, 26, 25, 24,
4280            23, 22, 21, 20, 19, 18, 17, 16,
4281            15, 14, 13, 12, 11, 10, 9, 8,
4282            7, 6, 5, 4, 3, 2, 1, 0,
4283        );
4284        let r = _mm256_cmpeq_epi8(a, b);
4285        assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0));
4286    }
4287
4288    #[simd_test(enable = "avx2")]
4289    unsafe fn test_mm256_cmpeq_epi16() {
4290        #[rustfmt::skip]
4291        let a = _mm256_setr_epi16(
4292            0, 1, 2, 3, 4, 5, 6, 7,
4293            8, 9, 10, 11, 12, 13, 14, 15,
4294        );
4295        #[rustfmt::skip]
4296        let b = _mm256_setr_epi16(
4297            15, 14, 2, 12, 11, 10, 9, 8,
4298            7, 6, 5, 4, 3, 2, 1, 0,
4299        );
4300        let r = _mm256_cmpeq_epi16(a, b);
4301        assert_eq_m256i(r, _mm256_insert_epi16::<2>(_mm256_set1_epi16(0), !0));
4302    }
4303
4304    #[simd_test(enable = "avx2")]
4305    unsafe fn test_mm256_cmpeq_epi32() {
4306        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4307        let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
4308        let r = _mm256_cmpeq_epi32(a, b);
4309        let e = _mm256_set1_epi32(0);
4310        let e = _mm256_insert_epi32::<2>(e, !0);
4311        assert_eq_m256i(r, e);
4312    }
4313
4314    #[simd_test(enable = "avx2")]
4315    unsafe fn test_mm256_cmpeq_epi64() {
4316        let a = _mm256_setr_epi64x(0, 1, 2, 3);
4317        let b = _mm256_setr_epi64x(3, 2, 2, 0);
4318        let r = _mm256_cmpeq_epi64(a, b);
4319        assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0));
4320    }
4321
4322    #[simd_test(enable = "avx2")]
4323    unsafe fn test_mm256_cmpgt_epi8() {
4324        let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5);
4325        let b = _mm256_set1_epi8(0);
4326        let r = _mm256_cmpgt_epi8(a, b);
4327        assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0));
4328    }
4329
4330    #[simd_test(enable = "avx2")]
4331    unsafe fn test_mm256_cmpgt_epi16() {
4332        let a = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 5);
4333        let b = _mm256_set1_epi16(0);
4334        let r = _mm256_cmpgt_epi16(a, b);
4335        assert_eq_m256i(r, _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), !0));
4336    }
4337
4338    #[simd_test(enable = "avx2")]
4339    unsafe fn test_mm256_cmpgt_epi32() {
4340        let a = _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), 5);
4341        let b = _mm256_set1_epi32(0);
4342        let r = _mm256_cmpgt_epi32(a, b);
4343        assert_eq_m256i(r, _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), !0));
4344    }
4345
4346    #[simd_test(enable = "avx2")]
4347    unsafe fn test_mm256_cmpgt_epi64() {
4348        let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5);
4349        let b = _mm256_set1_epi64x(0);
4350        let r = _mm256_cmpgt_epi64(a, b);
4351        assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0));
4352    }
4353
4354    #[simd_test(enable = "avx2")]
4355    unsafe fn test_mm256_cvtepi8_epi16() {
4356        #[rustfmt::skip]
4357        let a = _mm_setr_epi8(
4358            0, 0, -1, 1, -2, 2, -3, 3,
4359            -4, 4, -5, 5, -6, 6, -7, 7,
4360        );
4361        #[rustfmt::skip]
4362        let r = _mm256_setr_epi16(
4363            0, 0, -1, 1, -2, 2, -3, 3,
4364            -4, 4, -5, 5, -6, 6, -7, 7,
4365        );
4366        assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4367    }
4368
4369    #[simd_test(enable = "avx2")]
4370    unsafe fn test_mm256_cvtepi8_epi32() {
4371        #[rustfmt::skip]
4372        let a = _mm_setr_epi8(
4373            0, 0, -1, 1, -2, 2, -3, 3,
4374            -4, 4, -5, 5, -6, 6, -7, 7,
4375        );
4376        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4377        assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
4378    }
4379
4380    #[simd_test(enable = "avx2")]
4381    unsafe fn test_mm256_cvtepi8_epi64() {
4382        #[rustfmt::skip]
4383        let a = _mm_setr_epi8(
4384            0, 0, -1, 1, -2, 2, -3, 3,
4385            -4, 4, -5, 5, -6, 6, -7, 7,
4386        );
4387        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4388        assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
4389    }
4390
4391    #[simd_test(enable = "avx2")]
4392    unsafe fn test_mm256_cvtepi16_epi32() {
4393        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4394        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4395        assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
4396    }
4397
4398    #[simd_test(enable = "avx2")]
4399    unsafe fn test_mm256_cvtepi16_epi64() {
4400        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4401        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4402        assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
4403    }
4404
4405    #[simd_test(enable = "avx2")]
4406    unsafe fn test_mm256_cvtepi32_epi64() {
4407        let a = _mm_setr_epi32(0, 0, -1, 1);
4408        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4409        assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
4410    }
4411
4412    #[simd_test(enable = "avx2")]
4413    unsafe fn test_mm256_cvtepu16_epi32() {
4414        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4415        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4416        assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
4417    }
4418
4419    #[simd_test(enable = "avx2")]
4420    unsafe fn test_mm256_cvtepu16_epi64() {
4421        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4422        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4423        assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
4424    }
4425
4426    #[simd_test(enable = "avx2")]
4427    unsafe fn test_mm256_cvtepu32_epi64() {
4428        let a = _mm_setr_epi32(0, 1, 2, 3);
4429        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4430        assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
4431    }
4432
4433    #[simd_test(enable = "avx2")]
4434    unsafe fn test_mm256_cvtepu8_epi16() {
4435        #[rustfmt::skip]
4436        let a = _mm_setr_epi8(
4437            0, 1, 2, 3, 4, 5, 6, 7,
4438            8, 9, 10, 11, 12, 13, 14, 15,
4439        );
4440        #[rustfmt::skip]
4441        let r = _mm256_setr_epi16(
4442            0, 1, 2, 3, 4, 5, 6, 7,
4443            8, 9, 10, 11, 12, 13, 14, 15,
4444        );
4445        assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
4446    }
4447
4448    #[simd_test(enable = "avx2")]
4449    unsafe fn test_mm256_cvtepu8_epi32() {
4450        #[rustfmt::skip]
4451        let a = _mm_setr_epi8(
4452            0, 1, 2, 3, 4, 5, 6, 7,
4453            8, 9, 10, 11, 12, 13, 14, 15,
4454        );
4455        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4456        assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
4457    }
4458
4459    #[simd_test(enable = "avx2")]
4460    unsafe fn test_mm256_cvtepu8_epi64() {
4461        #[rustfmt::skip]
4462        let a = _mm_setr_epi8(
4463            0, 1, 2, 3, 4, 5, 6, 7,
4464            8, 9, 10, 11, 12, 13, 14, 15,
4465        );
4466        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4467        assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
4468    }
4469
4470    #[simd_test(enable = "avx2")]
4471    unsafe fn test_mm256_extracti128_si256() {
4472        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4473        let r = _mm256_extracti128_si256::<1>(a);
4474        let e = _mm_setr_epi64x(3, 4);
4475        assert_eq_m128i(r, e);
4476    }
4477
4478    #[simd_test(enable = "avx2")]
4479    unsafe fn test_mm256_hadd_epi16() {
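        // Horizontal add works per 128-bit lane: four adjacent-pair sums from `a`
        // (2 + 2 == 4), then four from `b` (4 + 4 == 8).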
4480        let a = _mm256_set1_epi16(2);
4481        let b = _mm256_set1_epi16(4);
4482        let r = _mm256_hadd_epi16(a, b);
4483        let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
4484        assert_eq_m256i(r, e);
4485    }
4486
4487    #[simd_test(enable = "avx2")]
4488    unsafe fn test_mm256_hadd_epi32() {
4489        let a = _mm256_set1_epi32(2);
4490        let b = _mm256_set1_epi32(4);
4491        let r = _mm256_hadd_epi32(a, b);
4492        let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
4493        assert_eq_m256i(r, e);
4494    }
4495
4496    #[simd_test(enable = "avx2")]
4497    unsafe fn test_mm256_hadds_epi16() {
4498        let a = _mm256_set1_epi16(2);
4499        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4500        let a = _mm256_insert_epi16::<1>(a, 1);
4501        let b = _mm256_set1_epi16(4);
4502        let r = _mm256_hadds_epi16(a, b);
4503        #[rustfmt::skip]
4504        let e = _mm256_setr_epi16(
4505            0x7FFF, 4, 4, 4, 8, 8, 8, 8,
4506            4, 4, 4, 4, 8, 8, 8, 8,
4507        );
4508        assert_eq_m256i(r, e);
4509    }
4510
4511    #[simd_test(enable = "avx2")]
4512    unsafe fn test_mm256_hsub_epi16() {
4513        let a = _mm256_set1_epi16(2);
4514        let b = _mm256_set1_epi16(4);
4515        let r = _mm256_hsub_epi16(a, b);
4516        let e = _mm256_set1_epi16(0);
4517        assert_eq_m256i(r, e);
4518    }
4519
4520    #[simd_test(enable = "avx2")]
4521    unsafe fn test_mm256_hsub_epi32() {
4522        let a = _mm256_set1_epi32(2);
4523        let b = _mm256_set1_epi32(4);
4524        let r = _mm256_hsub_epi32(a, b);
4525        let e = _mm256_set1_epi32(0);
4526        assert_eq_m256i(r, e);
4527    }
4528
4529    #[simd_test(enable = "avx2")]
4530    unsafe fn test_mm256_hsubs_epi16() {
4531        let a = _mm256_set1_epi16(2);
4532        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4533        let a = _mm256_insert_epi16::<1>(a, -1);
4534        let b = _mm256_set1_epi16(4);
4535        let r = _mm256_hsubs_epi16(a, b);
4536        let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
4537        assert_eq_m256i(r, e);
4538    }
4539
4540    #[simd_test(enable = "avx2")]
4541    unsafe fn test_mm256_madd_epi16() {
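        // madd multiplies corresponding 16-bit elements and sums adjacent
        // products into 32 bits: 2*4 + 2*4 == 16.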
4542        let a = _mm256_set1_epi16(2);
4543        let b = _mm256_set1_epi16(4);
4544        let r = _mm256_madd_epi16(a, b);
4545        let e = _mm256_set1_epi32(16);
4546        assert_eq_m256i(r, e);
4547    }
4548
4549    #[simd_test(enable = "avx2")]
4550    unsafe fn test_mm256_inserti128_si256() {
4551        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4552        let b = _mm_setr_epi64x(7, 8);
4553        let r = _mm256_inserti128_si256::<1>(a, b);
4554        let e = _mm256_setr_epi64x(1, 2, 7, 8);
4555        assert_eq_m256i(r, e);
4556    }
4557
4558    #[simd_test(enable = "avx2")]
4559    unsafe fn test_mm256_maddubs_epi16() {
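        // maddubs multiplies unsigned bytes of `a` by signed bytes of `b` and
        // sums adjacent products with signed saturation: 2*4 + 2*4 == 16.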
4560        let a = _mm256_set1_epi8(2);
4561        let b = _mm256_set1_epi8(4);
4562        let r = _mm256_maddubs_epi16(a, b);
4563        let e = _mm256_set1_epi16(16);
4564        assert_eq_m256i(r, e);
4565    }
4566
4567    #[simd_test(enable = "avx2")]
4568    unsafe fn test_mm_maskload_epi32() {
4569        let nums = [1, 2, 3, 4];
4570        let a = &nums as *const i32;
4571        let mask = _mm_setr_epi32(-1, 0, 0, -1);
4572        let r = _mm_maskload_epi32(a, mask);
4573        let e = _mm_setr_epi32(1, 0, 0, 4);
4574        assert_eq_m128i(r, e);
4575    }
4576
4577    #[simd_test(enable = "avx2")]
4578    unsafe fn test_mm256_maskload_epi32() {
4579        let nums = [1, 2, 3, 4, 5, 6, 7, 8];
4580        let a = &nums as *const i32;
4581        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4582        let r = _mm256_maskload_epi32(a, mask);
4583        let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
4584        assert_eq_m256i(r, e);
4585    }
4586
4587    #[simd_test(enable = "avx2")]
4588    unsafe fn test_mm_maskload_epi64() {
4589        let nums = [1_i64, 2_i64];
4590        let a = &nums as *const i64;
4591        let mask = _mm_setr_epi64x(0, -1);
4592        let r = _mm_maskload_epi64(a, mask);
4593        let e = _mm_setr_epi64x(0, 2);
4594        assert_eq_m128i(r, e);
4595    }
4596
4597    #[simd_test(enable = "avx2")]
4598    unsafe fn test_mm256_maskload_epi64() {
4599        let nums = [1_i64, 2_i64, 3_i64, 4_i64];
4600        let a = &nums as *const i64;
4601        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4602        let r = _mm256_maskload_epi64(a, mask);
4603        let e = _mm256_setr_epi64x(0, 2, 3, 0);
4604        assert_eq_m256i(r, e);
4605    }
4606
4607    #[simd_test(enable = "avx2")]
4608    unsafe fn test_mm_maskstore_epi32() {
4609        let a = _mm_setr_epi32(1, 2, 3, 4);
4610        let mut arr = [-1, -1, -1, -1];
4611        let mask = _mm_setr_epi32(-1, 0, 0, -1);
4612        _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4613        let e = [1, -1, -1, 4];
4614        assert_eq!(arr, e);
4615    }
4616
4617    #[simd_test(enable = "avx2")]
4618    unsafe fn test_mm256_maskstore_epi32() {
4619        let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
4620        let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
4621        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4622        _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4623        let e = [1, -1, -1, 42, -1, 6, 7, -1];
4624        assert_eq!(arr, e);
4625    }
4626
4627    #[simd_test(enable = "avx2")]
4628    unsafe fn test_mm_maskstore_epi64() {
4629        let a = _mm_setr_epi64x(1_i64, 2_i64);
4630        let mut arr = [-1_i64, -1_i64];
4631        let mask = _mm_setr_epi64x(0, -1);
4632        _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4633        let e = [-1, 2];
4634        assert_eq!(arr, e);
4635    }
4636
4637    #[simd_test(enable = "avx2")]
4638    unsafe fn test_mm256_maskstore_epi64() {
4639        let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
4640        let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
4641        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4642        _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4643        let e = [-1, 2, 3, -1];
4644        assert_eq!(arr, e);
4645    }
4646
4647    #[simd_test(enable = "avx2")]
4648    unsafe fn test_mm256_max_epi16() {
4649        let a = _mm256_set1_epi16(2);
4650        let b = _mm256_set1_epi16(4);
4651        let r = _mm256_max_epi16(a, b);
4652        assert_eq_m256i(r, b);
4653    }
4654
4655    #[simd_test(enable = "avx2")]
4656    unsafe fn test_mm256_max_epi32() {
4657        let a = _mm256_set1_epi32(2);
4658        let b = _mm256_set1_epi32(4);
4659        let r = _mm256_max_epi32(a, b);
4660        assert_eq_m256i(r, b);
4661    }
4662
4663    #[simd_test(enable = "avx2")]
4664    unsafe fn test_mm256_max_epi8() {
4665        let a = _mm256_set1_epi8(2);
4666        let b = _mm256_set1_epi8(4);
4667        let r = _mm256_max_epi8(a, b);
4668        assert_eq_m256i(r, b);
4669    }
4670
4671    #[simd_test(enable = "avx2")]
4672    unsafe fn test_mm256_max_epu16() {
4673        let a = _mm256_set1_epi16(2);
4674        let b = _mm256_set1_epi16(4);
4675        let r = _mm256_max_epu16(a, b);
4676        assert_eq_m256i(r, b);
4677    }
4678
4679    #[simd_test(enable = "avx2")]
4680    unsafe fn test_mm256_max_epu32() {
4681        let a = _mm256_set1_epi32(2);
4682        let b = _mm256_set1_epi32(4);
4683        let r = _mm256_max_epu32(a, b);
4684        assert_eq_m256i(r, b);
4685    }
4686
4687    #[simd_test(enable = "avx2")]
4688    unsafe fn test_mm256_max_epu8() {
4689        let a = _mm256_set1_epi8(2);
4690        let b = _mm256_set1_epi8(4);
4691        let r = _mm256_max_epu8(a, b);
4692        assert_eq_m256i(r, b);
4693    }
4694
4695    #[simd_test(enable = "avx2")]
4696    unsafe fn test_mm256_min_epi16() {
4697        let a = _mm256_set1_epi16(2);
4698        let b = _mm256_set1_epi16(4);
4699        let r = _mm256_min_epi16(a, b);
4700        assert_eq_m256i(r, a);
4701    }
4702
4703    #[simd_test(enable = "avx2")]
4704    unsafe fn test_mm256_min_epi32() {
4705        let a = _mm256_set1_epi32(2);
4706        let b = _mm256_set1_epi32(4);
4707        let r = _mm256_min_epi32(a, b);
4708        assert_eq_m256i(r, a);
4709    }
4710
4711    #[simd_test(enable = "avx2")]
4712    unsafe fn test_mm256_min_epi8() {
4713        let a = _mm256_set1_epi8(2);
4714        let b = _mm256_set1_epi8(4);
4715        let r = _mm256_min_epi8(a, b);
4716        assert_eq_m256i(r, a);
4717    }
4718
4719    #[simd_test(enable = "avx2")]
4720    unsafe fn test_mm256_min_epu16() {
4721        let a = _mm256_set1_epi16(2);
4722        let b = _mm256_set1_epi16(4);
4723        let r = _mm256_min_epu16(a, b);
4724        assert_eq_m256i(r, a);
4725    }
4726
4727    #[simd_test(enable = "avx2")]
4728    unsafe fn test_mm256_min_epu32() {
4729        let a = _mm256_set1_epi32(2);
4730        let b = _mm256_set1_epi32(4);
4731        let r = _mm256_min_epu32(a, b);
4732        assert_eq_m256i(r, a);
4733    }
4734
4735    #[simd_test(enable = "avx2")]
4736    unsafe fn test_mm256_min_epu8() {
4737        let a = _mm256_set1_epi8(2);
4738        let b = _mm256_set1_epi8(4);
4739        let r = _mm256_min_epu8(a, b);
4740        assert_eq_m256i(r, a);
4741    }
4742
4743    #[simd_test(enable = "avx2")]
4744    unsafe fn test_mm256_movemask_epi8() {
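        // Collects the sign bit of each of the 32 bytes into an i32; all bytes
        // negative gives all 32 bits set, i.e. -1.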
4745        let a = _mm256_set1_epi8(-1);
4746        let r = _mm256_movemask_epi8(a);
4747        let e = -1;
4748        assert_eq!(r, e);
4749    }
4750
4751    #[simd_test(enable = "avx2")]
4752    unsafe fn test_mm256_mpsadbw_epu8() {
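        // Each result word is a sum of absolute differences over a 4-byte group:
        // |2 - 4| * 4 == 8.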
4753        let a = _mm256_set1_epi8(2);
4754        let b = _mm256_set1_epi8(4);
4755        let r = _mm256_mpsadbw_epu8::<0>(a, b);
4756        let e = _mm256_set1_epi16(8);
4757        assert_eq_m256i(r, e);
4758    }
4759
4760    #[simd_test(enable = "avx2")]
4761    unsafe fn test_mm256_mul_epi32() {
4762        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4763        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4764        let r = _mm256_mul_epi32(a, b);
4765        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4766        assert_eq_m256i(r, e);
4767    }
4768
4769    #[simd_test(enable = "avx2")]
4770    unsafe fn test_mm256_mul_epu32() {
4771        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4772        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4773        let r = _mm256_mul_epu32(a, b);
4774        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4775        assert_eq_m256i(r, e);
4776    }
4777
4778    #[simd_test(enable = "avx2")]
4779    unsafe fn test_mm256_mulhi_epi16() {
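        // Keeps the high 16 bits of the 32-bit product: (6535 * 6535) >> 16 == 651.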
4780        let a = _mm256_set1_epi16(6535);
4781        let b = _mm256_set1_epi16(6535);
4782        let r = _mm256_mulhi_epi16(a, b);
4783        let e = _mm256_set1_epi16(651);
4784        assert_eq_m256i(r, e);
4785    }
4786
4787    #[simd_test(enable = "avx2")]
4788    unsafe fn test_mm256_mulhi_epu16() {
4789        let a = _mm256_set1_epi16(6535);
4790        let b = _mm256_set1_epi16(6535);
4791        let r = _mm256_mulhi_epu16(a, b);
4792        let e = _mm256_set1_epi16(651);
4793        assert_eq_m256i(r, e);
4794    }
4795
4796    #[simd_test(enable = "avx2")]
4797    unsafe fn test_mm256_mullo_epi16() {
4798        let a = _mm256_set1_epi16(2);
4799        let b = _mm256_set1_epi16(4);
4800        let r = _mm256_mullo_epi16(a, b);
4801        let e = _mm256_set1_epi16(8);
4802        assert_eq_m256i(r, e);
4803    }
4804
4805    #[simd_test(enable = "avx2")]
4806    unsafe fn test_mm256_mullo_epi32() {
4807        let a = _mm256_set1_epi32(2);
4808        let b = _mm256_set1_epi32(4);
4809        let r = _mm256_mullo_epi32(a, b);
4810        let e = _mm256_set1_epi32(8);
4811        assert_eq_m256i(r, e);
4812    }
4813
4814    #[simd_test(enable = "avx2")]
4815    unsafe fn test_mm256_mulhrs_epi16() {
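        // mulhrs is a rounded high multiply: (((a * b) >> 14) + 1) >> 1,
        // so 0x4000 * 0x4000 yields 0x2000 in every lane.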
4816        let a = _mm256_set1_epi16(0x4000);
4817        let b = _mm256_set1_epi16(0x4000);
4818        let r = _mm256_mulhrs_epi16(a, b);
4819        let e = _mm256_set1_epi16(0x2000);
4820        assert_eq_m256i(r, e);
4821    }
4822
4823    #[simd_test(enable = "avx2")]
4824    unsafe fn test_mm256_or_si256() {
4825        let a = _mm256_set1_epi8(-1);
4826        let b = _mm256_set1_epi8(0);
4827        let r = _mm256_or_si256(a, b);
4828        assert_eq_m256i(r, a);
4829    }
4830
4831    #[simd_test(enable = "avx2")]
4832    unsafe fn test_mm256_packs_epi16() {
4833        let a = _mm256_set1_epi16(2);
4834        let b = _mm256_set1_epi16(4);
4835        let r = _mm256_packs_epi16(a, b);
4836        #[rustfmt::skip]
4837        let e = _mm256_setr_epi8(
4838            2, 2, 2, 2, 2, 2, 2, 2,
4839            4, 4, 4, 4, 4, 4, 4, 4,
4840            2, 2, 2, 2, 2, 2, 2, 2,
4841            4, 4, 4, 4, 4, 4, 4, 4,
4842        );
4843
4844        assert_eq_m256i(r, e);
4845    }
4846
4847    #[simd_test(enable = "avx2")]
4848    unsafe fn test_mm256_packs_epi32() {
4849        let a = _mm256_set1_epi32(2);
4850        let b = _mm256_set1_epi32(4);
4851        let r = _mm256_packs_epi32(a, b);
4852        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
4853
4854        assert_eq_m256i(r, e);
4855    }
4856
4857    #[simd_test(enable = "avx2")]
4858    unsafe fn test_mm256_packus_epi16() {
4859        let a = _mm256_set1_epi16(2);
4860        let b = _mm256_set1_epi16(4);
4861        let r = _mm256_packus_epi16(a, b);
4862        #[rustfmt::skip]
4863        let e = _mm256_setr_epi8(
4864            2, 2, 2, 2, 2, 2, 2, 2,
4865            4, 4, 4, 4, 4, 4, 4, 4,
4866            2, 2, 2, 2, 2, 2, 2, 2,
4867            4, 4, 4, 4, 4, 4, 4, 4,
4868        );
4869
4870        assert_eq_m256i(r, e);
4871    }
4872
4873    #[simd_test(enable = "avx2")]
4874    unsafe fn test_mm256_packus_epi32() {
4875        let a = _mm256_set1_epi32(2);
4876        let b = _mm256_set1_epi32(4);
4877        let r = _mm256_packus_epi32(a, b);
4878        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
4879
4880        assert_eq_m256i(r, e);
4881    }
4882
4883    #[simd_test(enable = "avx2")]
4884    unsafe fn test_mm256_sad_epu8() {
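        // sad sums |a - b| over each group of 8 bytes into a 64-bit result:
        // 8 * |2 - 4| == 16.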
4885        let a = _mm256_set1_epi8(2);
4886        let b = _mm256_set1_epi8(4);
4887        let r = _mm256_sad_epu8(a, b);
4888        let e = _mm256_set1_epi64x(16);
4889        assert_eq_m256i(r, e);
4890    }
4891
4892    #[simd_test(enable = "avx2")]
4893    unsafe fn test_mm256_shufflehi_epi16() {
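        // Shuffles only the upper four words of each 128-bit lane; the 2-bit
        // fields of 0b00_01_01_11 pick words 3, 1, 1, 0 of that group.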
4894        #[rustfmt::skip]
4895        let a = _mm256_setr_epi16(
4896            0, 1, 2, 3, 11, 22, 33, 44,
4897            4, 5, 6, 7, 55, 66, 77, 88,
4898        );
4899        #[rustfmt::skip]
4900        let e = _mm256_setr_epi16(
4901            0, 1, 2, 3, 44, 22, 22, 11,
4902            4, 5, 6, 7, 88, 66, 66, 55,
4903        );
4904        let r = _mm256_shufflehi_epi16::<0b00_01_01_11>(a);
4905        assert_eq_m256i(r, e);
4906    }
4907
4908    #[simd_test(enable = "avx2")]
4909    unsafe fn test_mm256_shufflelo_epi16() {
4910        #[rustfmt::skip]
4911        let a = _mm256_setr_epi16(
4912            11, 22, 33, 44, 0, 1, 2, 3,
4913            55, 66, 77, 88, 4, 5, 6, 7,
4914        );
4915        #[rustfmt::skip]
4916        let e = _mm256_setr_epi16(
4917            44, 22, 22, 11, 0, 1, 2, 3,
4918            88, 66, 66, 55, 4, 5, 6, 7,
4919        );
4920        let r = _mm256_shufflelo_epi16::<0b00_01_01_11>(a);
4921        assert_eq_m256i(r, e);
4922    }
4923
4924    #[simd_test(enable = "avx2")]
4925    unsafe fn test_mm256_sign_epi16() {
4926        let a = _mm256_set1_epi16(2);
4927        let b = _mm256_set1_epi16(-1);
4928        let r = _mm256_sign_epi16(a, b);
4929        let e = _mm256_set1_epi16(-2);
4930        assert_eq_m256i(r, e);
4931    }
4932
4933    #[simd_test(enable = "avx2")]
4934    unsafe fn test_mm256_sign_epi32() {
4935        let a = _mm256_set1_epi32(2);
4936        let b = _mm256_set1_epi32(-1);
4937        let r = _mm256_sign_epi32(a, b);
4938        let e = _mm256_set1_epi32(-2);
4939        assert_eq_m256i(r, e);
4940    }
4941
4942    #[simd_test(enable = "avx2")]
4943    unsafe fn test_mm256_sign_epi8() {
4944        let a = _mm256_set1_epi8(2);
4945        let b = _mm256_set1_epi8(-1);
4946        let r = _mm256_sign_epi8(a, b);
4947        let e = _mm256_set1_epi8(-2);
4948        assert_eq_m256i(r, e);
4949    }
4950
4951    #[simd_test(enable = "avx2")]
4952    unsafe fn test_mm256_sll_epi16() {
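        // The shift count is taken from the low 64 bits of `b` (4 here).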
4953        let a = _mm256_set1_epi16(0xFF);
4954        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
4955        let r = _mm256_sll_epi16(a, b);
4956        assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
4957    }
4958
4959    #[simd_test(enable = "avx2")]
4960    unsafe fn test_mm256_sll_epi32() {
4961        let a = _mm256_set1_epi32(0xFFFF);
4962        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
4963        let r = _mm256_sll_epi32(a, b);
4964        assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
4965    }
4966
4967    #[simd_test(enable = "avx2")]
4968    unsafe fn test_mm256_sll_epi64() {
4969        let a = _mm256_set1_epi64x(0xFFFFFFFF);
4970        let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4);
4971        let r = _mm256_sll_epi64(a, b);
4972        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
4973    }
4974
4975    #[simd_test(enable = "avx2")]
4976    unsafe fn test_mm256_slli_epi16() {
4977        assert_eq_m256i(
4978            _mm256_slli_epi16::<4>(_mm256_set1_epi16(0xFF)),
4979            _mm256_set1_epi16(0xFF0),
4980        );
4981    }
4982
4983    #[simd_test(enable = "avx2")]
4984    unsafe fn test_mm256_slli_epi32() {
4985        assert_eq_m256i(
4986            _mm256_slli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
4987            _mm256_set1_epi32(0xFFFF0),
4988        );
4989    }
4990
4991    #[simd_test(enable = "avx2")]
4992    unsafe fn test_mm256_slli_epi64() {
4993        assert_eq_m256i(
4994            _mm256_slli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
4995            _mm256_set1_epi64x(0xFFFFFFFF0),
4996        );
4997    }
4998
4999    #[simd_test(enable = "avx2")]
5000    unsafe fn test_mm256_slli_si256() {
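        // Byte shift left within each 128-bit lane, not across the full 256 bits.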
5001        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5002        let r = _mm256_slli_si256::<3>(a);
5003        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
5004    }
5005
5006    #[simd_test(enable = "avx2")]
5007    unsafe fn test_mm_sllv_epi32() {
5008        let a = _mm_set1_epi32(2);
5009        let b = _mm_set1_epi32(1);
5010        let r = _mm_sllv_epi32(a, b);
5011        let e = _mm_set1_epi32(4);
5012        assert_eq_m128i(r, e);
5013    }
5014
5015    #[simd_test(enable = "avx2")]
5016    unsafe fn test_mm256_sllv_epi32() {
5017        let a = _mm256_set1_epi32(2);
5018        let b = _mm256_set1_epi32(1);
5019        let r = _mm256_sllv_epi32(a, b);
5020        let e = _mm256_set1_epi32(4);
5021        assert_eq_m256i(r, e);
5022    }
5023
5024    #[simd_test(enable = "avx2")]
5025    unsafe fn test_mm_sllv_epi64() {
5026        let a = _mm_set1_epi64x(2);
5027        let b = _mm_set1_epi64x(1);
5028        let r = _mm_sllv_epi64(a, b);
5029        let e = _mm_set1_epi64x(4);
5030        assert_eq_m128i(r, e);
5031    }
5032
5033    #[simd_test(enable = "avx2")]
5034    unsafe fn test_mm256_sllv_epi64() {
5035        let a = _mm256_set1_epi64x(2);
5036        let b = _mm256_set1_epi64x(1);
5037        let r = _mm256_sllv_epi64(a, b);
5038        let e = _mm256_set1_epi64x(4);
5039        assert_eq_m256i(r, e);
5040    }
5041
5042    #[simd_test(enable = "avx2")]
5043    unsafe fn test_mm256_sra_epi16() {
5044        let a = _mm256_set1_epi16(-1);
5045        let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
5046        let r = _mm256_sra_epi16(a, b);
5047        assert_eq_m256i(r, _mm256_set1_epi16(-1));
5048    }
5049
5050    #[simd_test(enable = "avx2")]
5051    unsafe fn test_mm256_sra_epi32() {
5052        let a = _mm256_set1_epi32(-1);
5053        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 1);
5054        let r = _mm256_sra_epi32(a, b);
5055        assert_eq_m256i(r, _mm256_set1_epi32(-1));
5056    }
5057
5058    #[simd_test(enable = "avx2")]
5059    unsafe fn test_mm256_srai_epi16() {
5060        assert_eq_m256i(
5061            _mm256_srai_epi16::<1>(_mm256_set1_epi16(-1)),
5062            _mm256_set1_epi16(-1),
5063        );
5064    }
5065
5066    #[simd_test(enable = "avx2")]
5067    unsafe fn test_mm256_srai_epi32() {
5068        assert_eq_m256i(
5069            _mm256_srai_epi32::<1>(_mm256_set1_epi32(-1)),
5070            _mm256_set1_epi32(-1),
5071        );
5072    }
5073
5074    #[simd_test(enable = "avx2")]
5075    unsafe fn test_mm_srav_epi32() {
5076        let a = _mm_set1_epi32(4);
5077        let count = _mm_set1_epi32(1);
5078        let r = _mm_srav_epi32(a, count);
5079        let e = _mm_set1_epi32(2);
5080        assert_eq_m128i(r, e);
5081    }
5082
5083    #[simd_test(enable = "avx2")]
5084    unsafe fn test_mm256_srav_epi32() {
5085        let a = _mm256_set1_epi32(4);
5086        let count = _mm256_set1_epi32(1);
5087        let r = _mm256_srav_epi32(a, count);
5088        let e = _mm256_set1_epi32(2);
5089        assert_eq_m256i(r, e);
5090    }
5091
5092    #[simd_test(enable = "avx2")]
5093    unsafe fn test_mm256_srli_si256() {
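        // Byte shift right within each 128-bit lane; bytes shifted out are zeroed.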
5094        #[rustfmt::skip]
5095        let a = _mm256_setr_epi8(
5096            1, 2, 3, 4, 5, 6, 7, 8,
5097            9, 10, 11, 12, 13, 14, 15, 16,
5098            17, 18, 19, 20, 21, 22, 23, 24,
5099            25, 26, 27, 28, 29, 30, 31, 32,
5100        );
5101        let r = _mm256_srli_si256::<3>(a);
5102        #[rustfmt::skip]
5103        let e = _mm256_setr_epi8(
5104            4, 5, 6, 7, 8, 9, 10, 11,
5105            12, 13, 14, 15, 16, 0, 0, 0,
5106            20, 21, 22, 23, 24, 25, 26, 27,
5107            28, 29, 30, 31, 32, 0, 0, 0,
5108        );
5109        assert_eq_m256i(r, e);
5110    }
5111
5112    #[simd_test(enable = "avx2")]
5113    unsafe fn test_mm256_srl_epi16() {
5114        let a = _mm256_set1_epi16(0xFF);
5115        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
5116        let r = _mm256_srl_epi16(a, b);
5117        assert_eq_m256i(r, _mm256_set1_epi16(0xF));
5118    }
5119
5120    #[simd_test(enable = "avx2")]
5121    unsafe fn test_mm256_srl_epi32() {
5122        let a = _mm256_set1_epi32(0xFFFF);
5123        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
5124        let r = _mm256_srl_epi32(a, b);
5125        assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
5126    }
5127
5128    #[simd_test(enable = "avx2")]
5129    unsafe fn test_mm256_srl_epi64() {
5130        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5131        let b = _mm_setr_epi64x(4, 0);
5132        let r = _mm256_srl_epi64(a, b);
5133        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
5134    }
5135
5136    #[simd_test(enable = "avx2")]
5137    unsafe fn test_mm256_srli_epi16() {
5138        assert_eq_m256i(
5139            _mm256_srli_epi16::<4>(_mm256_set1_epi16(0xFF)),
5140            _mm256_set1_epi16(0xF),
5141        );
5142    }
5143
5144    #[simd_test(enable = "avx2")]
5145    unsafe fn test_mm256_srli_epi32() {
5146        assert_eq_m256i(
5147            _mm256_srli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
5148            _mm256_set1_epi32(0xFFF),
5149        );
5150    }
5151
5152    #[simd_test(enable = "avx2")]
5153    unsafe fn test_mm256_srli_epi64() {
5154        assert_eq_m256i(
5155            _mm256_srli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
5156            _mm256_set1_epi64x(0xFFFFFFF),
5157        );
5158    }
5159
5160    #[simd_test(enable = "avx2")]
5161    unsafe fn test_mm_srlv_epi32() {
5162        let a = _mm_set1_epi32(2);
5163        let count = _mm_set1_epi32(1);
5164        let r = _mm_srlv_epi32(a, count);
5165        let e = _mm_set1_epi32(1);
5166        assert_eq_m128i(r, e);
5167    }
5168
5169    #[simd_test(enable = "avx2")]
5170    unsafe fn test_mm256_srlv_epi32() {
5171        let a = _mm256_set1_epi32(2);
5172        let count = _mm256_set1_epi32(1);
5173        let r = _mm256_srlv_epi32(a, count);
5174        let e = _mm256_set1_epi32(1);
5175        assert_eq_m256i(r, e);
5176    }
5177
5178    #[simd_test(enable = "avx2")]
5179    unsafe fn test_mm_srlv_epi64() {
5180        let a = _mm_set1_epi64x(2);
5181        let count = _mm_set1_epi64x(1);
5182        let r = _mm_srlv_epi64(a, count);
5183        let e = _mm_set1_epi64x(1);
5184        assert_eq_m128i(r, e);
5185    }
5186
5187    #[simd_test(enable = "avx2")]
5188    unsafe fn test_mm256_srlv_epi64() {
5189        let a = _mm256_set1_epi64x(2);
5190        let count = _mm256_set1_epi64x(1);
5191        let r = _mm256_srlv_epi64(a, count);
5192        let e = _mm256_set1_epi64x(1);
5193        assert_eq_m256i(r, e);
5194    }
5195
5196    #[simd_test(enable = "avx2")]
5197    unsafe fn test_mm256_stream_load_si256() {
5198        let a = _mm256_set_epi64x(5, 6, 7, 8);
5199        let r = _mm256_stream_load_si256(core::ptr::addr_of!(a) as *const _);
5200        assert_eq_m256i(a, r);
5201    }
5202
5203    #[simd_test(enable = "avx2")]
5204    unsafe fn test_mm256_sub_epi16() {
5205        let a = _mm256_set1_epi16(4);
5206        let b = _mm256_set1_epi16(2);
5207        let r = _mm256_sub_epi16(a, b);
5208        assert_eq_m256i(r, b);
5209    }
5210
5211    #[simd_test(enable = "avx2")]
5212    unsafe fn test_mm256_sub_epi32() {
5213        let a = _mm256_set1_epi32(4);
5214        let b = _mm256_set1_epi32(2);
5215        let r = _mm256_sub_epi32(a, b);
5216        assert_eq_m256i(r, b);
5217    }
5218
5219    #[simd_test(enable = "avx2")]
5220    unsafe fn test_mm256_sub_epi64() {
5221        let a = _mm256_set1_epi64x(4);
5222        let b = _mm256_set1_epi64x(2);
5223        let r = _mm256_sub_epi64(a, b);
5224        assert_eq_m256i(r, b);
5225    }
5226
5227    #[simd_test(enable = "avx2")]
5228    unsafe fn test_mm256_sub_epi8() {
5229        let a = _mm256_set1_epi8(4);
5230        let b = _mm256_set1_epi8(2);
5231        let r = _mm256_sub_epi8(a, b);
5232        assert_eq_m256i(r, b);
5233    }
5234
5235    #[simd_test(enable = "avx2")]
5236    unsafe fn test_mm256_subs_epi16() {
5237        let a = _mm256_set1_epi16(4);
5238        let b = _mm256_set1_epi16(2);
5239        let r = _mm256_subs_epi16(a, b);
5240        assert_eq_m256i(r, b);
5241    }
5242
5243    #[simd_test(enable = "avx2")]
5244    unsafe fn test_mm256_subs_epi8() {
5245        let a = _mm256_set1_epi8(4);
5246        let b = _mm256_set1_epi8(2);
5247        let r = _mm256_subs_epi8(a, b);
5248        assert_eq_m256i(r, b);
5249    }
5250
5251    #[simd_test(enable = "avx2")]
5252    unsafe fn test_mm256_subs_epu16() {
5253        let a = _mm256_set1_epi16(4);
5254        let b = _mm256_set1_epi16(2);
5255        let r = _mm256_subs_epu16(a, b);
5256        assert_eq_m256i(r, b);
5257    }
5258
5259    #[simd_test(enable = "avx2")]
5260    unsafe fn test_mm256_subs_epu8() {
5261        let a = _mm256_set1_epi8(4);
5262        let b = _mm256_set1_epi8(2);
5263        let r = _mm256_subs_epu8(a, b);
5264        assert_eq_m256i(r, b);
5265    }
5266
5267    #[simd_test(enable = "avx2")]
5268    unsafe fn test_mm256_xor_si256() {
5269        let a = _mm256_set1_epi8(5);
5270        let b = _mm256_set1_epi8(3);
5271        let r = _mm256_xor_si256(a, b);
5272        assert_eq_m256i(r, _mm256_set1_epi8(6));
5273    }
5274
5275    #[simd_test(enable = "avx2")]
5276    unsafe fn test_mm256_alignr_epi8() {
5277        #[rustfmt::skip]
5278        let a = _mm256_setr_epi8(
5279            1, 2, 3, 4, 5, 6, 7, 8,
5280            9, 10, 11, 12, 13, 14, 15, 16,
5281            17, 18, 19, 20, 21, 22, 23, 24,
5282            25, 26, 27, 28, 29, 30, 31, 32,
5283        );
5284        #[rustfmt::skip]
5285        let b = _mm256_setr_epi8(
5286            -1, -2, -3, -4, -5, -6, -7, -8,
5287            -9, -10, -11, -12, -13, -14, -15, -16,
5288            -17, -18, -19, -20, -21, -22, -23, -24,
5289            -25, -26, -27, -28, -29, -30, -31, -32,
5290        );
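        // alignr concatenates the corresponding 128-bit lanes of `a` (high) and
        // `b` (low) and extracts 16 bytes starting at offset IMM8: offsets of 32
        // or more give zero, and 16 returns `a` unchanged.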
5291        let r = _mm256_alignr_epi8::<33>(a, b);
5292        assert_eq_m256i(r, _mm256_set1_epi8(0));
5293
5294        let r = _mm256_alignr_epi8::<17>(a, b);
5295        #[rustfmt::skip]
5296        let expected = _mm256_setr_epi8(
5297            2, 3, 4, 5, 6, 7, 8, 9,
5298            10, 11, 12, 13, 14, 15, 16, 0,
5299            18, 19, 20, 21, 22, 23, 24, 25,
5300            26, 27, 28, 29, 30, 31, 32, 0,
5301        );
5302        assert_eq_m256i(r, expected);
5303
5304        let r = _mm256_alignr_epi8::<4>(a, b);
5305        #[rustfmt::skip]
5306        let expected = _mm256_setr_epi8(
5307            -5, -6, -7, -8, -9, -10, -11, -12,
5308            -13, -14, -15, -16, 1, 2, 3, 4,
5309            -21, -22, -23, -24, -25, -26, -27, -28,
5310            -29, -30, -31, -32, 17, 18, 19, 20,
5311        );
5312        assert_eq_m256i(r, expected);
5313
5314        let r = _mm256_alignr_epi8::<15>(a, b);
5315        #[rustfmt::skip]
5316        let expected = _mm256_setr_epi8(
5317            -16, 1, 2, 3, 4, 5, 6, 7,
5318            8, 9, 10, 11, 12, 13, 14, 15,
5319            -32, 17, 18, 19, 20, 21, 22, 23,
5320            24, 25, 26, 27, 28, 29, 30, 31,
5321        );
5322        assert_eq_m256i(r, expected);
5323
5324        let r = _mm256_alignr_epi8::<0>(a, b);
5325        assert_eq_m256i(r, b);
5326
5327        let r = _mm256_alignr_epi8::<16>(a, b);
5328        assert_eq_m256i(r, a);
5329    }
5330
5331    #[simd_test(enable = "avx2")]
5332    unsafe fn test_mm256_shuffle_epi8() {
5333        #[rustfmt::skip]
5334        let a = _mm256_setr_epi8(
5335            1, 2, 3, 4, 5, 6, 7, 8,
5336            9, 10, 11, 12, 13, 14, 15, 16,
5337            17, 18, 19, 20, 21, 22, 23, 24,
5338            25, 26, 27, 28, 29, 30, 31, 32,
5339        );
5340        #[rustfmt::skip]
5341        let b = _mm256_setr_epi8(
5342            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5343            12, 5, 5, 10, 4, 1, 8, 0,
5344            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5345            12, 5, 5, 10, 4, 1, 8, 0,
5346        );
5347        #[rustfmt::skip]
5348        let expected = _mm256_setr_epi8(
5349            5, 0, 5, 4, 9, 13, 7, 4,
5350            13, 6, 6, 11, 5, 2, 9, 1,
5351            21, 0, 21, 20, 25, 29, 23, 20,
5352            29, 22, 22, 27, 21, 18, 25, 17,
5353        );
5354        let r = _mm256_shuffle_epi8(a, b);
5355        assert_eq_m256i(r, expected);
5356    }
5357
5358    #[simd_test(enable = "avx2")]
5359    unsafe fn test_mm256_permutevar8x32_epi32() {
5360        let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5361        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5362        let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5363        let r = _mm256_permutevar8x32_epi32(a, b);
5364        assert_eq_m256i(r, expected);
5365    }
5366
5367    #[simd_test(enable = "avx2")]
5368    unsafe fn test_mm256_permute4x64_epi64() {
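        // Each 2-bit field of the immediate selects a source 64-bit lane:
        // 0b00_01_00_11 picks lanes 3, 0, 1, 0.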
5369        let a = _mm256_setr_epi64x(100, 200, 300, 400);
5370        let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5371        let r = _mm256_permute4x64_epi64::<0b00010011>(a);
5372        assert_eq_m256i(r, expected);
5373    }
5374
5375    #[simd_test(enable = "avx2")]
5376    unsafe fn test_mm256_permute2x128_si256() {
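        // The low four bits choose the result's low 128 bits (3 = high half of
        // `b`); the next four choose the high 128 bits (1 = high half of `a`).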
5377        let a = _mm256_setr_epi64x(100, 200, 500, 600);
5378        let b = _mm256_setr_epi64x(300, 400, 700, 800);
5379        let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
5380        let e = _mm256_setr_epi64x(700, 800, 500, 600);
5381        assert_eq_m256i(r, e);
5382    }
5383
5384    #[simd_test(enable = "avx2")]
5385    unsafe fn test_mm256_permute4x64_pd() {
5386        let a = _mm256_setr_pd(1., 2., 3., 4.);
5387        let r = _mm256_permute4x64_pd::<0b00_01_00_11>(a);
5388        let e = _mm256_setr_pd(4., 1., 2., 1.);
5389        assert_eq_m256d(r, e);
5390    }
5391
5392    #[simd_test(enable = "avx2")]
5393    unsafe fn test_mm256_permutevar8x32_ps() {
5394        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5395        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5396        let r = _mm256_permutevar8x32_ps(a, b);
5397        let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5398        assert_eq_m256(r, e);
5399    }
5400
5401    #[simd_test(enable = "avx2")]
5402    unsafe fn test_mm_i32gather_epi32() {
5403        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5404        // A multiplier of 4 is word-addressing
5405        let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5406        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5407    }
5408
5409    #[simd_test(enable = "avx2")]
5410    unsafe fn test_mm_mask_i32gather_epi32() {
5411        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5412        // A multiplier of 4 is word-addressing
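        // Elements whose mask value has the top bit set are gathered from memory;
        // the rest are copied from the `src` argument (256 here).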
5413        let r = _mm_mask_i32gather_epi32::<4>(
5414            _mm_set1_epi32(256),
5415            arr.as_ptr(),
5416            _mm_setr_epi32(0, 16, 64, 96),
5417            _mm_setr_epi32(-1, -1, -1, 0),
5418        );
5419        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5420    }
5421
5422    #[simd_test(enable = "avx2")]
5423    unsafe fn test_mm256_i32gather_epi32() {
5424        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5425        // A multiplier of 4 is word-addressing
5426        let r =
5427            _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5428        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5429    }
5430
5431    #[simd_test(enable = "avx2")]
5432    unsafe fn test_mm256_mask_i32gather_epi32() {
5433        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5434        // A multiplier of 4 is word-addressing
5435        let r = _mm256_mask_i32gather_epi32::<4>(
5436            _mm256_set1_epi32(256),
5437            arr.as_ptr(),
5438            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5439            _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
5440        );
5441        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
5442    }
5443
5444    #[simd_test(enable = "avx2")]
5445    unsafe fn test_mm_i32gather_ps() {
5446        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5447        // A multiplier of 4 is word-addressing for f32s
5448        let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5449        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5450    }
5451
5452    #[simd_test(enable = "avx2")]
5453    unsafe fn test_mm_mask_i32gather_ps() {
5454        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5455        // A multiplier of 4 is word-addressing for f32s
5456        let r = _mm_mask_i32gather_ps::<4>(
5457            _mm_set1_ps(256.0),
5458            arr.as_ptr(),
5459            _mm_setr_epi32(0, 16, 64, 96),
5460            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5461        );
5462        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5463    }
5464
5465    #[simd_test(enable = "avx2")]
5466    unsafe fn test_mm256_i32gather_ps() {
5467        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5468        // A multiplier of 4 is word-addressing for f32s
5469        let r =
5470            _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5471        assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
5472    }
5473
5474    #[simd_test(enable = "avx2")]
5475    unsafe fn test_mm256_mask_i32gather_ps() {
5476        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5477        // A multiplier of 4 is word-addressing for f32s
5478        let r = _mm256_mask_i32gather_ps::<4>(
5479            _mm256_set1_ps(256.0),
5480            arr.as_ptr(),
5481            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5482            _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
5483        );
5484        assert_eq_m256(
5485            r,
5486            _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
5487        );
5488    }
5489
5490    #[simd_test(enable = "avx2")]
5491    unsafe fn test_mm_i32gather_epi64() {
5492        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5493        // A multiplier of 8 is word-addressing for i64s
5494        let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
5495        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5496    }
5497
5498    #[simd_test(enable = "avx2")]
5499    unsafe fn test_mm_mask_i32gather_epi64() {
5500        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5501        // A multiplier of 8 is word-addressing for i64s
5502        let r = _mm_mask_i32gather_epi64::<8>(
5503            _mm_set1_epi64x(256),
5504            arr.as_ptr(),
5505            _mm_setr_epi32(16, 16, 16, 16),
5506            _mm_setr_epi64x(-1, 0),
5507        );
5508        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5509    }
5510
5511    #[simd_test(enable = "avx2")]
5512    unsafe fn test_mm256_i32gather_epi64() {
5513        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5514        // A multiplier of 8 is word-addressing for i64s
5515        let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5516        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5517    }
5518
5519    #[simd_test(enable = "avx2")]
5520    unsafe fn test_mm256_mask_i32gather_epi64() {
5521        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5522        // A multiplier of 8 is word-addressing for i64s
5523        let r = _mm256_mask_i32gather_epi64::<8>(
5524            _mm256_set1_epi64x(256),
5525            arr.as_ptr(),
5526            _mm_setr_epi32(0, 16, 64, 96),
5527            _mm256_setr_epi64x(-1, -1, -1, 0),
5528        );
5529        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5530    }
5531
5532    #[simd_test(enable = "avx2")]
5533    unsafe fn test_mm_i32gather_pd() {
5534        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5535        // A multiplier of 8 is word-addressing for f64s
5536        let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
5537        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5538    }
5539
5540    #[simd_test(enable = "avx2")]
5541    unsafe fn test_mm_mask_i32gather_pd() {
5542        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5543        // A multiplier of 8 is word-addressing for f64s
5544        let r = _mm_mask_i32gather_pd::<8>(
5545            _mm_set1_pd(256.0),
5546            arr.as_ptr(),
5547            _mm_setr_epi32(16, 16, 16, 16),
5548            _mm_setr_pd(-1.0, 0.0),
5549        );
5550        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5551    }
5552
5553    #[simd_test(enable = "avx2")]
5554    unsafe fn test_mm256_i32gather_pd() {
5555        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5556        // A multiplier of 8 is word-addressing for f64s
5557        let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5558        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5559    }
5560
5561    #[simd_test(enable = "avx2")]
5562    unsafe fn test_mm256_mask_i32gather_pd() {
5563        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5564        // A multiplier of 8 is word-addressing for f64s
5565        let r = _mm256_mask_i32gather_pd::<8>(
5566            _mm256_set1_pd(256.0),
5567            arr.as_ptr(),
5568            _mm_setr_epi32(0, 16, 64, 96),
5569            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5570        );
5571        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5572    }
5573
5574    #[simd_test(enable = "avx2")]
5575    unsafe fn test_mm_i64gather_epi32() {
5576        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5577        // A multiplier of 4 is word-addressing
        let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r = _mm_mask_i64gather_epi32::<4>(
            _mm_set1_epi32(256),
            arr.as_ptr(),
            _mm_setr_epi64x(0, 16),
            _mm_setr_epi32(-1, 0, -1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_epi32() {
        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
        // A multiplier of 4 is word-addressing
        let r = _mm256_mask_i64gather_epi32::<4>(
            _mm_set1_epi32(256),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm_setr_epi32(-1, -1, -1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm_mask_i64gather_ps::<4>(
            _mm_set1_ps(256.0),
            arr.as_ptr(),
            _mm_setr_epi64x(0, 16),
            _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
        );
        assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is word-addressing for f32s
        let r = _mm256_mask_i64gather_ps::<4>(
            _mm_set1_ps(256.0),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
        );
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm_mask_i64gather_epi64::<8>(
            _mm_set1_epi64x(256),
            arr.as_ptr(),
            _mm_setr_epi64x(16, 16),
            _mm_setr_epi64x(-1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is word-addressing for i64s
        let r = _mm256_mask_i64gather_epi64::<8>(
            _mm256_set1_epi64x(256),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm256_setr_epi64x(-1, -1, -1, 0),
        );
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm_mask_i64gather_pd::<8>(
            _mm_set1_pd(256.0),
            arr.as_ptr(),
            _mm_setr_epi64x(16, 16),
            _mm_setr_pd(-1.0, 0.0),
        );
        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is word-addressing for f64s
        let r = _mm256_mask_i64gather_pd::<8>(
            _mm256_set1_pd(256.0),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
        );
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_extract_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31
        );
        let r1 = _mm256_extract_epi8::<0>(a);
        let r2 = _mm256_extract_epi8::<3>(a);
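        // The extracted byte is zero-extended into the i32 result, so the -1
        // lane reads back as 0xFF rather than -1.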
        assert_eq!(r1, 0xFF);
        assert_eq!(r2, 3);
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_extract_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r1 = _mm256_extract_epi16::<0>(a);
        let r2 = _mm256_extract_epi16::<3>(a);
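        // Likewise zero-extended, so the -1 lane reads back as 0xFFFF.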
        assert_eq!(r1, 0xFFFF);
        assert_eq!(r2, 3);
    }
}