Skip to main content

core/stdarch/crates/core_arch/src/x86/
sse.rs

1//! Streaming SIMD Extensions (SSE)
2
3use crate::{
4    core_arch::{simd::*, x86::*},
5    intrinsics::simd::*,
6    intrinsics::sqrtf32,
7    mem, ptr,
8};
9
10#[cfg(test)]
11use stdarch_test::assert_instr;
12
13/// Adds the first component of `a` and `b`, the other components are copied
14/// from `a`.
15///
16/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss)
17#[inline]
18#[target_feature(enable = "sse")]
19#[cfg_attr(test, assert_instr(addss))]
20#[stable(feature = "simd_x86", since = "1.27.0")]
21#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
22pub const fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
23    unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) + _mm_cvtss_f32(b)) }
24}
25
26/// Adds packed single-precision (32-bit) floating-point elements in `a` and
27/// `b`.
28///
29/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps)
30#[inline]
31#[target_feature(enable = "sse")]
32#[cfg_attr(test, assert_instr(addps))]
33#[stable(feature = "simd_x86", since = "1.27.0")]
34#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
35pub const fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
36    unsafe { simd_add(a, b) }
37}
38
39/// Subtracts the first component of `b` from `a`, the other components are
40/// copied from `a`.
41///
42/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss)
43#[inline]
44#[target_feature(enable = "sse")]
45#[cfg_attr(test, assert_instr(subss))]
46#[stable(feature = "simd_x86", since = "1.27.0")]
47#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
48pub const fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
49    unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) - _mm_cvtss_f32(b)) }
50}
51
52/// Subtracts packed single-precision (32-bit) floating-point elements in `a` and
53/// `b`.
54///
55/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps)
56#[inline]
57#[target_feature(enable = "sse")]
58#[cfg_attr(test, assert_instr(subps))]
59#[stable(feature = "simd_x86", since = "1.27.0")]
60#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
61pub const fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
62    unsafe { simd_sub(a, b) }
63}
64
65/// Multiplies the first component of `a` and `b`, the other components are
66/// copied from `a`.
67///
68/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss)
69#[inline]
70#[target_feature(enable = "sse")]
71#[cfg_attr(test, assert_instr(mulss))]
72#[stable(feature = "simd_x86", since = "1.27.0")]
73#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
74pub const fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
75    unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) * _mm_cvtss_f32(b)) }
76}
77
78/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
79/// `b`.
80///
81/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps)
82#[inline]
83#[target_feature(enable = "sse")]
84#[cfg_attr(test, assert_instr(mulps))]
85#[stable(feature = "simd_x86", since = "1.27.0")]
86#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
87pub const fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
88    unsafe { simd_mul(a, b) }
89}
90
91/// Divides the first component of `b` by `a`, the other components are
92/// copied from `a`.
93///
94/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss)
95#[inline]
96#[target_feature(enable = "sse")]
97#[cfg_attr(test, assert_instr(divss))]
98#[stable(feature = "simd_x86", since = "1.27.0")]
99#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
100pub const fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
101    unsafe { simd_insert!(a, 0, _mm_cvtss_f32(a) / _mm_cvtss_f32(b)) }
102}
103
104/// Divides packed single-precision (32-bit) floating-point elements in `a` and
105/// `b`.
106///
107/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps)
108#[inline]
109#[target_feature(enable = "sse")]
110#[cfg_attr(test, assert_instr(divps))]
111#[stable(feature = "simd_x86", since = "1.27.0")]
112#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
113pub const fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
114    unsafe { simd_div(a, b) }
115}
116
117/// Returns the square root of the first single-precision (32-bit)
118/// floating-point element in `a`, the other elements are unchanged.
119///
120/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss)
121#[inline]
122#[target_feature(enable = "sse")]
123#[cfg_attr(test, assert_instr(sqrtss))]
124#[stable(feature = "simd_x86", since = "1.27.0")]
125pub fn _mm_sqrt_ss(a: __m128) -> __m128 {
126    unsafe { simd_insert!(a, 0, sqrtf32(_mm_cvtss_f32(a))) }
127}
128
129/// Returns the square root of packed single-precision (32-bit) floating-point
130/// elements in `a`.
131///
132/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps)
133#[inline]
134#[target_feature(enable = "sse")]
135#[cfg_attr(test, assert_instr(sqrtps))]
136#[stable(feature = "simd_x86", since = "1.27.0")]
137pub fn _mm_sqrt_ps(a: __m128) -> __m128 {
138    unsafe { simd_fsqrt(a) }
139}
140
141/// Returns the approximate reciprocal of the first single-precision
142/// (32-bit) floating-point element in `a`, the other elements are unchanged.
143///
144/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss)
145#[inline]
146#[target_feature(enable = "sse")]
147#[cfg_attr(test, assert_instr(rcpss))]
148#[stable(feature = "simd_x86", since = "1.27.0")]
149pub fn _mm_rcp_ss(a: __m128) -> __m128 {
150    unsafe { rcpss(a) }
151}
152
153/// Returns the approximate reciprocal of packed single-precision (32-bit)
154/// floating-point elements in `a`.
155///
156/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps)
157#[inline]
158#[target_feature(enable = "sse")]
159#[cfg_attr(test, assert_instr(rcpps))]
160#[stable(feature = "simd_x86", since = "1.27.0")]
161pub fn _mm_rcp_ps(a: __m128) -> __m128 {
162    unsafe { rcpps(a) }
163}
164
165/// Returns the approximate reciprocal square root of the first single-precision
166/// (32-bit) floating-point element in `a`, the other elements are unchanged.
167///
168/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss)
169#[inline]
170#[target_feature(enable = "sse")]
171#[cfg_attr(test, assert_instr(rsqrtss))]
172#[stable(feature = "simd_x86", since = "1.27.0")]
173pub fn _mm_rsqrt_ss(a: __m128) -> __m128 {
174    unsafe { rsqrtss(a) }
175}
176
177/// Returns the approximate reciprocal square root of packed single-precision
178/// (32-bit) floating-point elements in `a`.
179///
180/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps)
181#[inline]
182#[target_feature(enable = "sse")]
183#[cfg_attr(test, assert_instr(rsqrtps))]
184#[stable(feature = "simd_x86", since = "1.27.0")]
185pub fn _mm_rsqrt_ps(a: __m128) -> __m128 {
186    unsafe { rsqrtps(a) }
187}
188
189/// Compares the first single-precision (32-bit) floating-point element of `a`
190/// and `b`, and return the minimum value in the first element of the return
191/// value, the other elements are copied from `a`.
192///
193/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss)
194#[inline]
195#[target_feature(enable = "sse")]
196#[cfg_attr(test, assert_instr(minss))]
197#[stable(feature = "simd_x86", since = "1.27.0")]
198pub fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
199    unsafe { minss(a, b) }
200}
201
202/// Compares packed single-precision (32-bit) floating-point elements in `a` and
203/// `b`, and return the corresponding minimum values.
204///
205/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps)
206#[inline]
207#[target_feature(enable = "sse")]
208#[cfg_attr(test, assert_instr(minps))]
209#[stable(feature = "simd_x86", since = "1.27.0")]
210pub fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
211    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`.
212    unsafe { minps(a, b) }
213}
214
215/// Compares the first single-precision (32-bit) floating-point element of `a`
216/// and `b`, and return the maximum value in the first element of the return
217/// value, the other elements are copied from `a`.
218///
219/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss)
220#[inline]
221#[target_feature(enable = "sse")]
222#[cfg_attr(test, assert_instr(maxss))]
223#[stable(feature = "simd_x86", since = "1.27.0")]
224pub fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
225    unsafe { maxss(a, b) }
226}
227
228/// Compares packed single-precision (32-bit) floating-point elements in `a` and
229/// `b`, and return the corresponding maximum values.
230///
231/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps)
232#[inline]
233#[target_feature(enable = "sse")]
234#[cfg_attr(test, assert_instr(maxps))]
235#[stable(feature = "simd_x86", since = "1.27.0")]
236pub fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
237    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`.
238    unsafe { maxps(a, b) }
239}
240
241/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
242///
243/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps)
244#[inline]
245#[target_feature(enable = "sse")]
246// i586 only seems to generate plain `and` instructions, so ignore it.
247#[cfg_attr(
248    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
249    assert_instr(andps)
250)]
251#[stable(feature = "simd_x86", since = "1.27.0")]
252#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
253pub const fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
254    unsafe {
255        let a: __m128i = mem::transmute(a);
256        let b: __m128i = mem::transmute(b);
257        mem::transmute(simd_and(a, b))
258    }
259}
260
261/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
262/// elements.
263///
264/// Computes `!a & b` for each bit in `a` and `b`.
265///
266/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps)
267#[inline]
268#[target_feature(enable = "sse")]
269// i586 only seems to generate plain `not` and `and` instructions, so ignore
270// it.
271#[cfg_attr(
272    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
273    assert_instr(andnps)
274)]
275#[stable(feature = "simd_x86", since = "1.27.0")]
276#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
277pub const fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
278    unsafe {
279        let a: __m128i = mem::transmute(a);
280        let b: __m128i = mem::transmute(b);
281        let mask: __m128i = mem::transmute(i32x4::splat(-1));
282        mem::transmute(simd_and(simd_xor(mask, a), b))
283    }
284}
285
286/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
287///
288/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps)
289#[inline]
290#[target_feature(enable = "sse")]
291// i586 only seems to generate plain `or` instructions, so we ignore it.
292#[cfg_attr(
293    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
294    assert_instr(orps)
295)]
296#[stable(feature = "simd_x86", since = "1.27.0")]
297#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
298pub const fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
299    unsafe {
300        let a: __m128i = mem::transmute(a);
301        let b: __m128i = mem::transmute(b);
302        mem::transmute(simd_or(a, b))
303    }
304}
305
306/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
307/// elements.
308///
309/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps)
310#[inline]
311#[target_feature(enable = "sse")]
312// i586 only seems to generate plain `xor` instructions, so we ignore it.
313#[cfg_attr(
314    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
315    assert_instr(xorps)
316)]
317#[stable(feature = "simd_x86", since = "1.27.0")]
318#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
319pub const fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
320    unsafe {
321        let a: __m128i = mem::transmute(a);
322        let b: __m128i = mem::transmute(b);
323        mem::transmute(simd_xor(a, b))
324    }
325}
326
327/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
328/// the result will be `0xffffffff` if the two inputs are equal, or `0`
329/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
330///
331/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss)
332#[inline]
333#[target_feature(enable = "sse")]
334#[cfg_attr(test, assert_instr(cmpeqss))]
335#[stable(feature = "simd_x86", since = "1.27.0")]
336pub fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
337    unsafe { cmpss(a, b, 0) }
338}
339
340/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
341/// of the result will be `0xffffffff` if `a.extract(0)` is less than
342/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
343/// upper 96 bits of `a`.
344///
345/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss)
346#[inline]
347#[target_feature(enable = "sse")]
348#[cfg_attr(test, assert_instr(cmpltss))]
349#[stable(feature = "simd_x86", since = "1.27.0")]
350pub fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
351    unsafe { cmpss(a, b, 1) }
352}
353
354/// Compares the lowest `f32` of both inputs for less than or equal. The lowest
355/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
356/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
357/// are the upper 96 bits of `a`.
358///
359/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss)
360#[inline]
361#[target_feature(enable = "sse")]
362#[cfg_attr(test, assert_instr(cmpless))]
363#[stable(feature = "simd_x86", since = "1.27.0")]
364pub fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
365    unsafe { cmpss(a, b, 2) }
366}
367
368/// Compares the lowest `f32` of both inputs for greater than. The lowest 32
369/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
370/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
371/// are the upper 96 bits of `a`.
372///
373/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss)
374#[inline]
375#[target_feature(enable = "sse")]
376#[cfg_attr(test, assert_instr(cmpltss))]
377#[stable(feature = "simd_x86", since = "1.27.0")]
378pub fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
379    unsafe { simd_shuffle!(a, cmpss(b, a, 1), [4, 1, 2, 3]) }
380}
381
382/// Compares the lowest `f32` of both inputs for greater than or equal. The
383/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
384/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits
385/// of the result are the upper 96 bits of `a`.
386///
387/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss)
388#[inline]
389#[target_feature(enable = "sse")]
390#[cfg_attr(test, assert_instr(cmpless))]
391#[stable(feature = "simd_x86", since = "1.27.0")]
392pub fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
393    unsafe { simd_shuffle!(a, cmpss(b, a, 2), [4, 1, 2, 3]) }
394}
395
396/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
397/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
398/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
399/// upper 96 bits of `a`.
400///
401/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss)
402#[inline]
403#[target_feature(enable = "sse")]
404#[cfg_attr(test, assert_instr(cmpneqss))]
405#[stable(feature = "simd_x86", since = "1.27.0")]
406pub fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
407    unsafe { cmpss(a, b, 4) }
408}
409
410/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
411/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
412/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
413/// upper 96 bits of `a`.
414///
415/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss)
416#[inline]
417#[target_feature(enable = "sse")]
418#[cfg_attr(test, assert_instr(cmpnltss))]
419#[stable(feature = "simd_x86", since = "1.27.0")]
420pub fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
421    unsafe { cmpss(a, b, 5) }
422}
423
424/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
425/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
426/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
427/// of the result are the upper 96 bits of `a`.
428///
429/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss)
430#[inline]
431#[target_feature(enable = "sse")]
432#[cfg_attr(test, assert_instr(cmpnless))]
433#[stable(feature = "simd_x86", since = "1.27.0")]
434pub fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
435    unsafe { cmpss(a, b, 6) }
436}
437
438/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
439/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
440/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
441/// the upper 96 bits of `a`.
442///
443/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss)
444#[inline]
445#[target_feature(enable = "sse")]
446#[cfg_attr(test, assert_instr(cmpnltss))]
447#[stable(feature = "simd_x86", since = "1.27.0")]
448pub fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
449    unsafe { simd_shuffle!(a, cmpss(b, a, 5), [4, 1, 2, 3]) }
450}
451
452/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
453/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
454/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
455/// bits of the result are the upper 96 bits of `a`.
456///
457/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss)
458#[inline]
459#[target_feature(enable = "sse")]
460#[cfg_attr(test, assert_instr(cmpnless))]
461#[stable(feature = "simd_x86", since = "1.27.0")]
462pub fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
463    unsafe { simd_shuffle!(a, cmpss(b, a, 6), [4, 1, 2, 3]) }
464}
465
466/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
467/// the result will be `0xffffffff` if neither of `a.extract(0)` or
468/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
469/// are the upper 96 bits of `a`.
470///
471/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss)
472#[inline]
473#[target_feature(enable = "sse")]
474#[cfg_attr(test, assert_instr(cmpordss))]
475#[stable(feature = "simd_x86", since = "1.27.0")]
476pub fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
477    unsafe { cmpss(a, b, 7) }
478}
479
480/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
481/// of the result will be `0xffffffff` if any of `a.extract(0)` or
482/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
483/// are the upper 96 bits of `a`.
484///
485/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss)
486#[inline]
487#[target_feature(enable = "sse")]
488#[cfg_attr(test, assert_instr(cmpunordss))]
489#[stable(feature = "simd_x86", since = "1.27.0")]
490pub fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
491    unsafe { cmpss(a, b, 3) }
492}
493
494/// Compares each of the four floats in `a` to the corresponding element in `b`.
495/// The result in the output vector will be `0xffffffff` if the input elements
496/// were equal, or `0` otherwise.
497///
498/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps)
499#[inline]
500#[target_feature(enable = "sse")]
501#[cfg_attr(test, assert_instr(cmpeqps))]
502#[stable(feature = "simd_x86", since = "1.27.0")]
503pub fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
504    unsafe { cmpps(a, b, 0) }
505}
506
507/// Compares each of the four floats in `a` to the corresponding element in `b`.
508/// The result in the output vector will be `0xffffffff` if the input element
509/// in `a` is less than the corresponding element in `b`, or `0` otherwise.
510///
511/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps)
512#[inline]
513#[target_feature(enable = "sse")]
514#[cfg_attr(test, assert_instr(cmpltps))]
515#[stable(feature = "simd_x86", since = "1.27.0")]
516pub fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
517    unsafe { cmpps(a, b, 1) }
518}
519
520/// Compares each of the four floats in `a` to the corresponding element in `b`.
521/// The result in the output vector will be `0xffffffff` if the input element
522/// in `a` is less than or equal to the corresponding element in `b`, or `0`
523/// otherwise.
524///
525/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps)
526#[inline]
527#[target_feature(enable = "sse")]
528#[cfg_attr(test, assert_instr(cmpleps))]
529#[stable(feature = "simd_x86", since = "1.27.0")]
530pub fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
531    unsafe { cmpps(a, b, 2) }
532}
533
534/// Compares each of the four floats in `a` to the corresponding element in `b`.
535/// The result in the output vector will be `0xffffffff` if the input element
536/// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
537///
538/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps)
539#[inline]
540#[target_feature(enable = "sse")]
541#[cfg_attr(test, assert_instr(cmpltps))]
542#[stable(feature = "simd_x86", since = "1.27.0")]
543pub fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
544    unsafe { cmpps(b, a, 1) }
545}
546
547/// Compares each of the four floats in `a` to the corresponding element in `b`.
548/// The result in the output vector will be `0xffffffff` if the input element
549/// in `a` is greater than or equal to the corresponding element in `b`, or `0`
550/// otherwise.
551///
552/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps)
553#[inline]
554#[target_feature(enable = "sse")]
555#[cfg_attr(test, assert_instr(cmpleps))]
556#[stable(feature = "simd_x86", since = "1.27.0")]
557pub fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
558    unsafe { cmpps(b, a, 2) }
559}
560
561/// Compares each of the four floats in `a` to the corresponding element in `b`.
562/// The result in the output vector will be `0xffffffff` if the input elements
563/// are **not** equal, or `0` otherwise.
564///
565/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps)
566#[inline]
567#[target_feature(enable = "sse")]
568#[cfg_attr(test, assert_instr(cmpneqps))]
569#[stable(feature = "simd_x86", since = "1.27.0")]
570pub fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
571    unsafe { cmpps(a, b, 4) }
572}
573
574/// Compares each of the four floats in `a` to the corresponding element in `b`.
575/// The result in the output vector will be `0xffffffff` if the input element
576/// in `a` is **not** less than the corresponding element in `b`, or `0`
577/// otherwise.
578///
579/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps)
580#[inline]
581#[target_feature(enable = "sse")]
582#[cfg_attr(test, assert_instr(cmpnltps))]
583#[stable(feature = "simd_x86", since = "1.27.0")]
584pub fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
585    unsafe { cmpps(a, b, 5) }
586}
587
588/// Compares each of the four floats in `a` to the corresponding element in `b`.
589/// The result in the output vector will be `0xffffffff` if the input element
590/// in `a` is **not** less than or equal to the corresponding element in `b`, or
591/// `0` otherwise.
592///
593/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps)
594#[inline]
595#[target_feature(enable = "sse")]
596#[cfg_attr(test, assert_instr(cmpnleps))]
597#[stable(feature = "simd_x86", since = "1.27.0")]
598pub fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
599    unsafe { cmpps(a, b, 6) }
600}
601
602/// Compares each of the four floats in `a` to the corresponding element in `b`.
603/// The result in the output vector will be `0xffffffff` if the input element
604/// in `a` is **not** greater than the corresponding element in `b`, or `0`
605/// otherwise.
606///
607/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps)
608#[inline]
609#[target_feature(enable = "sse")]
610#[cfg_attr(test, assert_instr(cmpnltps))]
611#[stable(feature = "simd_x86", since = "1.27.0")]
612pub fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
613    unsafe { cmpps(b, a, 5) }
614}
615
616/// Compares each of the four floats in `a` to the corresponding element in `b`.
617/// The result in the output vector will be `0xffffffff` if the input element
618/// in `a` is **not** greater than or equal to the corresponding element in `b`,
619/// or `0` otherwise.
620///
621/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps)
622#[inline]
623#[target_feature(enable = "sse")]
624#[cfg_attr(test, assert_instr(cmpnleps))]
625#[stable(feature = "simd_x86", since = "1.27.0")]
626pub fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
627    unsafe { cmpps(b, a, 6) }
628}
629
630/// Compares each of the four floats in `a` to the corresponding element in `b`.
631/// Returns four floats that have one of two possible bit patterns. The element
632/// in the output vector will be `0xffffffff` if the input elements in `a` and
633/// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
634///
635/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps)
636#[inline]
637#[target_feature(enable = "sse")]
638#[cfg_attr(test, assert_instr(cmpordps))]
639#[stable(feature = "simd_x86", since = "1.27.0")]
640pub fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
641    unsafe { cmpps(b, a, 7) }
642}
643
644/// Compares each of the four floats in `a` to the corresponding element in `b`.
645/// Returns four floats that have one of two possible bit patterns. The element
646/// in the output vector will be `0xffffffff` if the input elements in `a` and
647/// `b` are unordered (i.e., at least on of them is a NaN), or 0 otherwise.
648///
649/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps)
650#[inline]
651#[target_feature(enable = "sse")]
652#[cfg_attr(test, assert_instr(cmpunordps))]
653#[stable(feature = "simd_x86", since = "1.27.0")]
654pub fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
655    unsafe { cmpps(b, a, 3) }
656}
657
658/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
659/// `1` if they are equal, or `0` otherwise.
660///
661/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss)
662#[inline]
663#[target_feature(enable = "sse")]
664#[cfg_attr(test, assert_instr(comiss))]
665#[stable(feature = "simd_x86", since = "1.27.0")]
666pub fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
667    unsafe { comieq_ss(a, b) }
668}
669
670/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
671/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
672///
673/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss)
674#[inline]
675#[target_feature(enable = "sse")]
676#[cfg_attr(test, assert_instr(comiss))]
677#[stable(feature = "simd_x86", since = "1.27.0")]
678pub fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
679    unsafe { comilt_ss(a, b) }
680}
681
682/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
683/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
684/// otherwise.
685///
686/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss)
687#[inline]
688#[target_feature(enable = "sse")]
689#[cfg_attr(test, assert_instr(comiss))]
690#[stable(feature = "simd_x86", since = "1.27.0")]
691pub fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
692    unsafe { comile_ss(a, b) }
693}
694
695/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
696/// `1` if the value from `a` is greater than the one from `b`, or `0`
697/// otherwise.
698///
699/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss)
700#[inline]
701#[target_feature(enable = "sse")]
702#[cfg_attr(test, assert_instr(comiss))]
703#[stable(feature = "simd_x86", since = "1.27.0")]
704pub fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
705    unsafe { comigt_ss(a, b) }
706}
707
708/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
709/// `1` if the value from `a` is greater than or equal to the one from `b`, or
710/// `0` otherwise.
711///
712/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss)
713#[inline]
714#[target_feature(enable = "sse")]
715#[cfg_attr(test, assert_instr(comiss))]
716#[stable(feature = "simd_x86", since = "1.27.0")]
717pub fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
718    unsafe { comige_ss(a, b) }
719}
720
721/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
722/// `1` if they are **not** equal, or `0` otherwise.
723///
724/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss)
725#[inline]
726#[target_feature(enable = "sse")]
727#[cfg_attr(test, assert_instr(comiss))]
728#[stable(feature = "simd_x86", since = "1.27.0")]
729pub fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
730    unsafe { comineq_ss(a, b) }
731}
732
733/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
734/// `1` if they are equal, or `0` otherwise. This instruction will not signal
735/// an exception if either argument is a quiet NaN.
736///
737/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss)
738#[inline]
739#[target_feature(enable = "sse")]
740#[cfg_attr(test, assert_instr(ucomiss))]
741#[stable(feature = "simd_x86", since = "1.27.0")]
742pub fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
743    unsafe { ucomieq_ss(a, b) }
744}
745
746/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
747/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
748/// This instruction will not signal an exception if either argument is a quiet
749/// NaN.
750///
751/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss)
752#[inline]
753#[target_feature(enable = "sse")]
754#[cfg_attr(test, assert_instr(ucomiss))]
755#[stable(feature = "simd_x86", since = "1.27.0")]
756pub fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
757    unsafe { ucomilt_ss(a, b) }
758}
759
760/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
761/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
762/// otherwise. This instruction will not signal an exception if either argument
763/// is a quiet NaN.
764///
765/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss)
766#[inline]
767#[target_feature(enable = "sse")]
768#[cfg_attr(test, assert_instr(ucomiss))]
769#[stable(feature = "simd_x86", since = "1.27.0")]
770pub fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
771    unsafe { ucomile_ss(a, b) }
772}
773
774/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
775/// `1` if the value from `a` is greater than the one from `b`, or `0`
776/// otherwise. This instruction will not signal an exception if either argument
777/// is a quiet NaN.
778///
779/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss)
780#[inline]
781#[target_feature(enable = "sse")]
782#[cfg_attr(test, assert_instr(ucomiss))]
783#[stable(feature = "simd_x86", since = "1.27.0")]
784pub fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
785    unsafe { ucomigt_ss(a, b) }
786}
787
788/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
789/// `1` if the value from `a` is greater than or equal to the one from `b`, or
790/// `0` otherwise. This instruction will not signal an exception if either
791/// argument is a quiet NaN.
792///
793/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss)
794#[inline]
795#[target_feature(enable = "sse")]
796#[cfg_attr(test, assert_instr(ucomiss))]
797#[stable(feature = "simd_x86", since = "1.27.0")]
798pub fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
799    unsafe { ucomige_ss(a, b) }
800}
801
802/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
803/// `1` if they are **not** equal, or `0` otherwise. This instruction will not
804/// signal an exception if either argument is a quiet NaN.
805///
806/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss)
807#[inline]
808#[target_feature(enable = "sse")]
809#[cfg_attr(test, assert_instr(ucomiss))]
810#[stable(feature = "simd_x86", since = "1.27.0")]
811pub fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
812    unsafe { ucomineq_ss(a, b) }
813}
814
815/// Converts the lowest 32 bit float in the input vector to a 32 bit integer.
816///
817/// The result is rounded according to the current rounding mode. If the result
818/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
819/// (`i32::MIN`).
820///
821/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
822///
823/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32)
824#[inline]
825#[target_feature(enable = "sse")]
826#[cfg_attr(test, assert_instr(cvtss2si))]
827#[stable(feature = "simd_x86", since = "1.27.0")]
828pub fn _mm_cvtss_si32(a: __m128) -> i32 {
829    unsafe { cvtss2si(a) }
830}
831
832/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
833///
834/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si)
835#[inline]
836#[target_feature(enable = "sse")]
837#[cfg_attr(test, assert_instr(cvtss2si))]
838#[stable(feature = "simd_x86", since = "1.27.0")]
839pub fn _mm_cvt_ss2si(a: __m128) -> i32 {
840    _mm_cvtss_si32(a)
841}
842
843/// Converts the lowest 32 bit float in the input vector to a 32 bit integer
844/// with
845/// truncation.
846///
847/// The result is rounded always using truncation (round towards zero). If the
848/// result cannot be represented as a 32 bit integer the result will be
849/// `0x8000_0000` (`i32::MIN`).
850///
851/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
852///
853/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32)
854#[inline]
855#[target_feature(enable = "sse")]
856#[cfg_attr(test, assert_instr(cvttss2si))]
857#[stable(feature = "simd_x86", since = "1.27.0")]
858pub fn _mm_cvttss_si32(a: __m128) -> i32 {
859    unsafe { cvttss2si(a) }
860}
861
862/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
863///
864/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si)
865#[inline]
866#[target_feature(enable = "sse")]
867#[cfg_attr(test, assert_instr(cvttss2si))]
868#[stable(feature = "simd_x86", since = "1.27.0")]
869pub fn _mm_cvtt_ss2si(a: __m128) -> i32 {
870    _mm_cvttss_si32(a)
871}
872
873/// Extracts the lowest 32 bit float from the input vector.
874///
875/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32)
876#[inline]
877#[target_feature(enable = "sse")]
878// No point in using assert_instrs. In Unix x86_64 calling convention this is a
879// no-op, and on msvc it's just a `mov`.
880#[stable(feature = "simd_x86", since = "1.27.0")]
881#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
882pub const fn _mm_cvtss_f32(a: __m128) -> f32 {
883    unsafe { simd_extract!(a, 0) }
884}
885
886/// Converts a 32 bit integer to a 32 bit float. The result vector is the input
887/// vector `a` with the lowest 32 bit float replaced by the converted integer.
888///
889/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
890/// input).
891///
892/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss)
893#[inline]
894#[target_feature(enable = "sse")]
895#[cfg_attr(test, assert_instr(cvtsi2ss))]
896#[stable(feature = "simd_x86", since = "1.27.0")]
897#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
898pub const fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
899    unsafe { simd_insert!(a, 0, b as f32) }
900}
901
902/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
903///
904/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss)
905#[inline]
906#[target_feature(enable = "sse")]
907#[cfg_attr(test, assert_instr(cvtsi2ss))]
908#[stable(feature = "simd_x86", since = "1.27.0")]
909pub fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
910    _mm_cvtsi32_ss(a, b)
911}
912
913/// Construct a `__m128` with the lowest element set to `a` and the rest set to
914/// zero.
915///
916/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss)
917#[inline]
918#[target_feature(enable = "sse")]
919#[cfg_attr(test, assert_instr(movss))]
920#[stable(feature = "simd_x86", since = "1.27.0")]
921#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
922pub const fn _mm_set_ss(a: f32) -> __m128 {
923    __m128([a, 0.0, 0.0, 0.0])
924}
925
926/// Construct a `__m128` with all element set to `a`.
927///
928/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps)
929#[inline]
930#[target_feature(enable = "sse")]
931#[cfg_attr(test, assert_instr(shufps))]
932#[stable(feature = "simd_x86", since = "1.27.0")]
933#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
934pub const fn _mm_set1_ps(a: f32) -> __m128 {
935    f32x4::splat(a).as_m128()
936}
937
938/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html)
939///
940/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1)
941#[inline]
942#[target_feature(enable = "sse")]
943#[cfg_attr(test, assert_instr(shufps))]
944#[stable(feature = "simd_x86", since = "1.27.0")]
945#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
946pub const fn _mm_set_ps1(a: f32) -> __m128 {
947    _mm_set1_ps(a)
948}
949
950/// Construct a `__m128` from four floating point values highest to lowest.
951///
952/// Note that `a` will be the highest 32 bits of the result, and `d` the
953/// lowest. This matches the standard way of writing bit patterns on x86:
954///
955/// ```text
956///  bit    127 .. 96  95 .. 64  63 .. 32  31 .. 0
957///        +---------+---------+---------+---------+
958///        |    a    |    b    |    c    |    d    |   result
959///        +---------+---------+---------+---------+
960/// ```
961///
962/// Alternatively:
963///
964/// ```text
965/// let v = _mm_set_ps(d, c, b, a);
966/// ```
967///
968/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps)
969#[inline]
970#[target_feature(enable = "sse")]
971#[cfg_attr(test, assert_instr(unpcklps))]
972#[stable(feature = "simd_x86", since = "1.27.0")]
973#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
974pub const fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
975    __m128([d, c, b, a])
976}
977
978/// Construct a `__m128` from four floating point values lowest to highest.
979///
980/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
981/// bits of the result, and `d` the highest.
982///
983/// ```text
984/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
985/// ```
986///
987/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps)
988#[inline]
989#[target_feature(enable = "sse")]
990#[cfg_attr(
991    all(test, any(target_env = "msvc", target_arch = "x86_64")),
992    assert_instr(unpcklps)
993)]
994// On a 32-bit architecture on non-msvc it just copies the operands from the stack.
995#[cfg_attr(
996    all(test, all(not(target_env = "msvc"), target_arch = "x86")),
997    assert_instr(movaps)
998)]
999#[stable(feature = "simd_x86", since = "1.27.0")]
1000#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1001pub const fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
1002    __m128([a, b, c, d])
1003}
1004
1005/// Construct a `__m128` with all elements initialized to zero.
1006///
1007/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps)
1008#[inline]
1009#[target_feature(enable = "sse")]
1010#[cfg_attr(test, assert_instr(xorps))]
1011#[stable(feature = "simd_x86", since = "1.27.0")]
1012#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1013pub const fn _mm_setzero_ps() -> __m128 {
1014    const { unsafe { mem::zeroed() } }
1015}
1016
1017/// A utility function for creating masks to use with Intel shuffle and
1018/// permute intrinsics.
1019#[inline]
1020#[allow(non_snake_case)]
1021#[unstable(feature = "stdarch_x86_mm_shuffle", issue = "111147")]
1022pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
1023    ((z << 6) | (y << 4) | (x << 2) | w) as i32
1024}
1025
1026/// Shuffles packed single-precision (32-bit) floating-point elements in `a` and
1027/// `b` using `MASK`.
1028///
1029/// The lower half of result takes values from `a` and the higher half from
1030/// `b`. Mask is split to 2 control bits each to index the element from inputs.
1031///
1032/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps)
1033///
1034/// Note that there appears to be a mistake within Intel's Intrinsics Guide.
1035/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
1036/// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_).
1037/// Performing an implicit type conversion between an unsigned integer and a signed integer
1038/// does not cause a problem in C, however Rust's commitment to strong typing does not allow this.
1039#[inline]
1040#[target_feature(enable = "sse")]
1041#[cfg_attr(test, assert_instr(shufps, MASK = 3))]
1042#[rustc_legacy_const_generics(2)]
1043#[stable(feature = "simd_x86", since = "1.27.0")]
1044#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1045pub const fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
1046    static_assert_uimm_bits!(MASK, 8);
1047    unsafe {
1048        simd_shuffle!(
1049            a,
1050            b,
1051            [
1052                MASK as u32 & 0b11,
1053                (MASK as u32 >> 2) & 0b11,
1054                ((MASK as u32 >> 4) & 0b11) + 4,
1055                ((MASK as u32 >> 6) & 0b11) + 4,
1056            ],
1057        )
1058    }
1059}
1060
1061/// Unpacks and interleave single-precision (32-bit) floating-point elements
1062/// from the higher half of `a` and `b`.
1063///
1064/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps)
1065#[inline]
1066#[target_feature(enable = "sse")]
1067#[cfg_attr(test, assert_instr(unpckhps))]
1068#[stable(feature = "simd_x86", since = "1.27.0")]
1069#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1070pub const fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
1071    unsafe { simd_shuffle!(a, b, [2, 6, 3, 7]) }
1072}
1073
1074/// Unpacks and interleave single-precision (32-bit) floating-point elements
1075/// from the lower half of `a` and `b`.
1076///
1077/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps)
1078#[inline]
1079#[target_feature(enable = "sse")]
1080#[cfg_attr(test, assert_instr(unpcklps))]
1081#[stable(feature = "simd_x86", since = "1.27.0")]
1082#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1083pub const fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
1084    unsafe { simd_shuffle!(a, b, [0, 4, 1, 5]) }
1085}
1086
1087/// Combine higher half of `a` and `b`. The higher half of `b` occupies the
1088/// lower half of result.
1089///
1090/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps)
1091#[inline]
1092#[target_feature(enable = "sse")]
1093#[cfg_attr(test, assert_instr(movhlps))]
1094#[stable(feature = "simd_x86", since = "1.27.0")]
1095#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1096pub const fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
1097    // TODO; figure why this is a different instruction on msvc?
1098    unsafe { simd_shuffle!(a, b, [6, 7, 2, 3]) }
1099}
1100
1101/// Combine lower half of `a` and `b`. The lower half of `b` occupies the
1102/// higher half of result.
1103///
1104/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps)
1105#[inline]
1106#[target_feature(enable = "sse")]
1107#[cfg_attr(test, assert_instr(movlhps))]
1108#[stable(feature = "simd_x86", since = "1.27.0")]
1109#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1110pub const fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
1111    unsafe { simd_shuffle!(a, b, [0, 1, 4, 5]) }
1112}
1113
1114/// Returns a mask of the most significant bit of each element in `a`.
1115///
1116/// The mask is stored in the 4 least significant bits of the return value.
1117/// All other bits are set to `0`.
1118///
1119/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps)
1120#[inline]
1121#[target_feature(enable = "sse")]
1122#[cfg_attr(test, assert_instr(movmskps))]
1123#[stable(feature = "simd_x86", since = "1.27.0")]
1124#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1125pub const fn _mm_movemask_ps(a: __m128) -> i32 {
1126    // Propagate the highest bit to the rest, because simd_bitmask
1127    // requires all-1 or all-0.
1128    unsafe {
1129        let mask: i32x4 = simd_lt(transmute(a), i32x4::ZERO);
1130        simd_bitmask::<i32x4, u8>(mask) as i32
1131    }
1132}
1133
1134/// Construct a `__m128` with the lowest element read from `p` and the other
1135/// elements set to zero.
1136///
1137/// This corresponds to instructions `VMOVSS` / `MOVSS`.
1138///
1139/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss)
1140#[inline]
1141#[target_feature(enable = "sse")]
1142#[cfg_attr(test, assert_instr(movss))]
1143#[stable(feature = "simd_x86", since = "1.27.0")]
1144#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1145pub const unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
1146    __m128([*p, 0.0, 0.0, 0.0])
1147}
1148
1149/// Construct a `__m128` by duplicating the value read from `p` into all
1150/// elements.
1151///
1152/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
1153/// shuffling.
1154///
1155/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps)
1156#[inline]
1157#[target_feature(enable = "sse")]
1158#[cfg_attr(test, assert_instr(movss))]
1159#[stable(feature = "simd_x86", since = "1.27.0")]
1160#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1161pub const unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
1162    let a = *p;
1163    __m128([a, a, a, a])
1164}
1165
1166/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html)
1167///
1168/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1)
1169#[inline]
1170#[target_feature(enable = "sse")]
1171#[cfg_attr(test, assert_instr(movss))]
1172#[stable(feature = "simd_x86", since = "1.27.0")]
1173#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1174pub const unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
1175    _mm_load1_ps(p)
1176}
1177
1178/// Loads four `f32` values from *aligned* memory into a `__m128`. If the
1179/// pointer is not aligned to a 128-bit boundary (16 bytes) a general
1180/// protection fault will be triggered (fatal program crash).
1181///
1182/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
1183/// memory.
1184///
1185/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
1186///
1187/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps)
1188#[inline]
1189#[target_feature(enable = "sse")]
1190// FIXME: Rust doesn't emit alignment attributes for MSVC x86-32. Ref https://github.com/rust-lang/rust/pull/139261
1191// All aligned load/store intrinsics are affected
1192#[cfg_attr(
1193    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1194    assert_instr(movaps)
1195)]
1196#[stable(feature = "simd_x86", since = "1.27.0")]
1197#[allow(clippy::cast_ptr_alignment)]
1198#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1199pub const unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
1200    *(p as *const __m128)
1201}
1202
1203/// Loads four `f32` values from memory into a `__m128`. There are no
1204/// restrictions
1205/// on memory alignment. For aligned memory
1206/// [`_mm_load_ps`](fn._mm_load_ps.html)
1207/// may be faster.
1208///
1209/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1210///
1211/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps)
1212#[inline]
1213#[target_feature(enable = "sse")]
1214#[cfg_attr(test, assert_instr(movups))]
1215#[stable(feature = "simd_x86", since = "1.27.0")]
1216#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1217pub const unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
1218    // Note: Using `*p` would require `f32` alignment, but `movups` has no
1219    // alignment restrictions.
1220    let mut dst = _mm_undefined_ps();
1221    ptr::copy_nonoverlapping(
1222        p as *const u8,
1223        ptr::addr_of_mut!(dst) as *mut u8,
1224        mem::size_of::<__m128>(),
1225    );
1226    dst
1227}
1228
1229/// Loads four `f32` values from aligned memory into a `__m128` in reverse
1230/// order.
1231///
1232/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1233/// protection fault will be triggered (fatal program crash).
1234///
1235/// Functionally equivalent to the following code sequence (assuming `p`
1236/// satisfies the alignment restrictions):
1237///
1238/// ```text
1239/// let a0 = *p;
1240/// let a1 = *p.add(1);
1241/// let a2 = *p.add(2);
1242/// let a3 = *p.add(3);
1243/// __m128::new(a3, a2, a1, a0)
1244/// ```
1245///
1246/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
1247/// shuffling.
1248///
1249/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps)
1250#[inline]
1251#[target_feature(enable = "sse")]
1252#[cfg_attr(
1253    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1254    assert_instr(movaps)
1255)]
1256#[stable(feature = "simd_x86", since = "1.27.0")]
1257#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1258pub const unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
1259    let a = _mm_load_ps(p);
1260    simd_shuffle!(a, a, [3, 2, 1, 0])
1261}
1262
1263/// Stores the lowest 32 bit float of `a` into memory.
1264///
1265/// This intrinsic corresponds to the `MOVSS` instruction.
1266///
1267/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss)
1268#[inline]
1269#[target_feature(enable = "sse")]
1270#[cfg_attr(test, assert_instr(movss))]
1271#[stable(feature = "simd_x86", since = "1.27.0")]
1272#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1273pub const unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
1274    *p = simd_extract!(a, 0);
1275}
1276
1277/// Stores the lowest 32 bit float of `a` repeated four times into *aligned*
1278/// memory.
1279///
1280/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1281/// protection fault will be triggered (fatal program crash).
1282///
1283/// Functionally equivalent to the following code sequence (assuming `p`
1284/// satisfies the alignment restrictions):
1285///
1286/// ```text
1287/// let x = a.extract(0);
1288/// *p = x;
1289/// *p.add(1) = x;
1290/// *p.add(2) = x;
1291/// *p.add(3) = x;
1292/// ```
1293///
1294/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps)
1295#[inline]
1296#[target_feature(enable = "sse")]
1297#[cfg_attr(
1298    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1299    assert_instr(movaps)
1300)]
1301#[stable(feature = "simd_x86", since = "1.27.0")]
1302#[allow(clippy::cast_ptr_alignment)]
1303#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1304pub const unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
1305    let b: __m128 = simd_shuffle!(a, a, [0, 0, 0, 0]);
1306    *(p as *mut __m128) = b;
1307}
1308
1309/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html)
1310///
1311/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1)
1312#[inline]
1313#[target_feature(enable = "sse")]
1314#[cfg_attr(
1315    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1316    assert_instr(movaps)
1317)]
1318#[stable(feature = "simd_x86", since = "1.27.0")]
1319#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1320pub const unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
1321    _mm_store1_ps(p, a);
1322}
1323
1324/// Stores four 32-bit floats into *aligned* memory.
1325///
1326/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1327/// protection fault will be triggered (fatal program crash).
1328///
1329/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
1330/// memory.
1331///
1332/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
1333///
1334/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps)
1335#[inline]
1336#[target_feature(enable = "sse")]
1337#[cfg_attr(
1338    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1339    assert_instr(movaps)
1340)]
1341#[stable(feature = "simd_x86", since = "1.27.0")]
1342#[allow(clippy::cast_ptr_alignment)]
1343#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1344pub const unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
1345    *(p as *mut __m128) = a;
1346}
1347
1348/// Stores four 32-bit floats into memory. There are no restrictions on memory
1349/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
1350/// faster.
1351///
1352/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
1353///
1354/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps)
1355#[inline]
1356#[target_feature(enable = "sse")]
1357#[cfg_attr(test, assert_instr(movups))]
1358#[stable(feature = "simd_x86", since = "1.27.0")]
1359#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1360pub const unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
1361    ptr::copy_nonoverlapping(
1362        ptr::addr_of!(a) as *const u8,
1363        p as *mut u8,
1364        mem::size_of::<__m128>(),
1365    );
1366}
1367
1368/// Stores four 32-bit floats into *aligned* memory in reverse order.
1369///
1370/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
1371/// protection fault will be triggered (fatal program crash).
1372///
1373/// Functionally equivalent to the following code sequence (assuming `p`
1374/// satisfies the alignment restrictions):
1375///
1376/// ```text
1377/// *p = a.extract(3);
1378/// *p.add(1) = a.extract(2);
1379/// *p.add(2) = a.extract(1);
1380/// *p.add(3) = a.extract(0);
1381/// ```
1382///
1383/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps)
1384#[inline]
1385#[target_feature(enable = "sse")]
1386#[cfg_attr(
1387    all(test, not(all(target_arch = "x86", target_env = "msvc"))),
1388    assert_instr(movaps)
1389)]
1390#[stable(feature = "simd_x86", since = "1.27.0")]
1391#[allow(clippy::cast_ptr_alignment)]
1392#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1393pub const unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
1394    let b: __m128 = simd_shuffle!(a, a, [3, 2, 1, 0]);
1395    *(p as *mut __m128) = b;
1396}
1397
1398/// Returns a `__m128` with the first component from `b` and the remaining
1399/// components from `a`.
1400///
1401/// In other words for any `a` and `b`:
1402/// ```text
1403/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
1404/// ```
1405///
1406/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss)
1407#[inline]
1408#[target_feature(enable = "sse")]
1409#[cfg_attr(test, assert_instr(movss))]
1410#[stable(feature = "simd_x86", since = "1.27.0")]
1411#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1412pub const fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
1413    unsafe { simd_shuffle!(a, b, [4, 1, 2, 3]) }
1414}
1415
1416/// Performs a serializing operation on all non-temporal ("streaming") store instructions that
1417/// were issued by the current thread prior to this instruction.
1418///
1419/// Guarantees that every non-temporal store instruction that precedes this fence, in program order, is
1420/// ordered before any load or store instruction which follows the fence in
1421/// synchronization order.
1422///
1423/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence)
1424/// (but note that Intel is only documenting the hardware-level concerns related to this
1425/// instruction; the Intel documentation does not take into account the extra concerns that arise
1426/// because the Rust memory model is different from the x86 memory model.)
1427///
1428/// # Safety of non-temporal stores
1429///
1430/// After using any non-temporal store intrinsic, but before any other access to the memory that the
1431/// intrinsic mutates, a call to `_mm_sfence` must be performed on the thread that used the
1432/// intrinsic.
1433///
1434/// Non-temporal stores behave very different from regular stores. For the purpose of the Rust
1435/// memory model, these stores are happening asynchronously in a background thread. This means a
1436/// non-temporal store can cause data races with other accesses, even other accesses on the same
1437/// thread. It also means that cross-thread synchronization does not work as expected: let's say the
1438/// intrinsic is called on thread T1, and T1 performs synchronization with some other thread T2. The
1439/// non-temporal store acts as if it happened not in T1 but in a different thread T3, and T2 has not
1440/// synchronized with T3! Calling `_mm_sfence` makes the current thread wait for and synchronize
1441/// with all the non-temporal stores previously started on this thread, which means in particular
1442/// that subsequent synchronization with other threads will then work as intended again.
1443///
1444/// The general pattern to use non-temporal stores correctly is to call `_mm_sfence` before your
1445/// code jumps back to code outside your library. This ensures all stores inside your function
1446/// are synchronized-before the return, and thus transitively synchronized-before everything
1447/// the caller does after your function returns.
1448//
1449// The following is not a doc comment since it's not clear whether we want to put this into the
1450// docs, but it should be written out somewhere.
1451//
1452// Formally, we consider non-temporal stores and sfences to be opaque blobs that the compiler cannot
1453// inspect, and that behave like the following functions. This explains where the docs above come
1454// from.
1455// ```
1456// #[thread_local]
1457// static mut PENDING_NONTEMP_WRITES = AtomicUsize::new(0);
1458//
1459// pub unsafe fn nontemporal_store<T>(ptr: *mut T, val: T) {
1460//     PENDING_NONTEMP_WRITES.fetch_add(1, Relaxed);
1461//     // Spawn a thread that will eventually do our write.
1462//     // We need to fetch a pointer to this thread's pending-write
1463//     // counter, so that we can access it from the background thread.
1464//     let pending_writes = addr_of!(PENDING_NONTEMP_WRITES);
1465//     // If this was actual Rust code we'd have to do some extra work
1466//     // because `ptr`, `val`, `pending_writes` are all `!Send`. We skip that here.
1467//     std::thread::spawn(move || {
1468//         // Do the write in the background thread.
1469//         ptr.write(val);
1470//         // Register the write as done. Crucially, this is `Release`, so it
//         // syncs-with the `Acquire` in `sfence`.
1472//         (&*pending_writes).fetch_sub(1, Release);
1473//     });
1474// }
1475//
1476// pub fn sfence() {
1477//     unsafe {
1478//         // Wait until there are no more pending writes.
1479//         while PENDING_NONTEMP_WRITES.load(Acquire) > 0 {}
1480//     }
1481// }
1482// ```
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_sfence() {
    // `sfence` is the `llvm.x86.sse.sfence` binding declared in this file's `unsafe extern "C"`
    // block. It takes no arguments, but calling an item from that block still needs `unsafe`.
    unsafe { sfence() }
}
1490
1491/// Gets the unsigned 32-bit value of the MXCSR control and status register.
1492///
1493/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust
1494/// floating-point operations may or may not result in this register getting updated with exception
1495/// state, and the register can change between two invocations of this function even when no
1496/// floating-point operations appear in the source code (since floating-point operations appearing
1497/// earlier or later can be reordered).
1498///
1499/// If you need to perform some floating-point operations and check whether they raised an
1500/// exception, use an inline assembly block for the entire sequence of operations.
1501///
1502/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
1503///
1504/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr)
1505#[inline]
1506#[target_feature(enable = "sse")]
1507#[cfg_attr(test, assert_instr(stmxcsr))]
1508#[stable(feature = "simd_x86", since = "1.27.0")]
1509#[deprecated(
1510    since = "1.75.0",
1511    note = "see `_mm_getcsr` documentation - use inline assembly instead"
1512)]
1513pub unsafe fn _mm_getcsr() -> u32 {
1514    unsafe {
1515        let mut result = 0_i32;
1516        stmxcsr(ptr::addr_of_mut!(result) as *mut i8);
1517        result as u32
1518    }
1519}
1520
1521/// Sets the MXCSR register with the 32-bit unsigned integer value.
1522///
1523/// This register controls how SIMD instructions handle floating point
1524/// operations. Modifying this register only affects the current thread.
1525///
1526/// It contains several groups of flags:
1527///
1528/// * *Exception flags* report which exceptions occurred since last they were reset.
1529///
1530/// * *Masking flags* can be used to mask (ignore) certain exceptions. By default
1531///   these flags are all set to 1, so all exceptions are masked. When
1532///   an exception is masked, the processor simply sets the exception flag and
1533///   continues the operation. If the exception is unmasked, the flag is also set
1534///   but additionally an exception handler is invoked.
1535///
1536/// * *Rounding mode flags* control the rounding mode of floating point
1537///   instructions.
1538///
1539/// * The *denormals-are-zero mode flag* turns all numbers which would be
1540///   denormalized (exponent bits are all zeros) into zeros.
1541///
1542/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to
1543/// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and
1544/// will optimize accordingly. This even applies when the register is altered and later reset to its
1545/// original value without any floating-point operations appearing in the source code between those
1546/// operations (since floating-point operations appearing earlier or later can be reordered).
1547///
1548/// If you need to perform some floating-point operations under a different masking flags, rounding
1549/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the
1550/// original MXCSR register state before the end of the block.
1551///
1552/// ## Exception Flags
1553///
1554/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
1555///   Infinity by Infinity).
1556///
1557/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
1558///   number. Mainly this can cause loss of precision.
1559///
1560/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
1561///
1562/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
1563///   result was too large to be represented (e.g., an `f32` with absolute
1564///   value greater than `2^128`).
1565///
1566/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
1567///   result was too small to be represented in a normalized way (e.g., an
1568///   `f32` with absolute value smaller than `2^-126`.)
1569///
1570/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
1571///   precision exception). This means some precision was lost due to rounding.
1572///   For example, the fraction `1/3` cannot be represented accurately in a
1573///   32 or 64 bit float and computing it would cause this exception to be
1574///   raised. Precision exceptions are very common, so they are usually masked.
1575///
1576/// Exception flags can be read and set using the convenience functions
1577/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
1578/// check if an operation caused some overflow:
1579///
1580/// ```rust,ignore
1581/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
1582///                             // perform calculations
1583/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
1584///     // handle overflow
1585/// }
1586/// ```
1587///
1588/// ## Masking Flags
1589///
1590/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
1591/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
1592/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
1593///
1594/// A single masking bit can be set via
1595///
1596/// ```rust,ignore
1597/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
1598/// ```
1599///
1600/// However, since mask bits are by default all set to 1, it is more common to
1601/// want to *disable* certain bits. For example, to unmask the underflow
1602/// exception, use:
1603///
1604/// ```rust,ignore
1605/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow
1606/// exception
1607/// ```
1608///
1609/// Warning: an unmasked exception will cause an exception handler to be
1610/// called.
1611/// The standard handler will simply terminate the process. So, in this case
1612/// any underflow exception would terminate the current process with something
1613/// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
1614///
1615/// ## Rounding Mode
1616///
1617/// The rounding mode is describe using two bits. It can be read and set using
1618/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
1619/// `_MM_SET_ROUNDING_MODE(mode)`.
1620///
1621/// The rounding modes are:
1622///
1623/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
1624///   value. If two values are equally close, round to even (i.e., least
1625///   significant bit will be zero).
1626///
1627/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
1628///
1629/// * `_MM_ROUND_UP`: Round toward positive Infinity.
1630///
1631/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
1632///
1633/// Example:
1634///
1635/// ```rust,ignore
1636/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
1637/// ```
1638///
1639/// ## Denormals-are-zero/Flush-to-zero Mode
1640///
1641/// If this bit is set, values that would be denormalized will be set to zero
1642/// instead. This is turned off by default.
1643///
1644/// You can read and enable/disable this mode via the helper functions
1645/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
1646///
1647/// ```rust,ignore
1648/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
1649/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
1650/// ```
1651///
1652///
1653/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr)
1654#[inline]
1655#[target_feature(enable = "sse")]
1656#[cfg_attr(test, assert_instr(ldmxcsr))]
1657#[stable(feature = "simd_x86", since = "1.27.0")]
1658#[deprecated(
1659    since = "1.75.0",
1660    note = "see `_mm_setcsr` documentation - use inline assembly instead"
1661)]
1662pub unsafe fn _mm_setcsr(val: u32) {
1663    ldmxcsr(ptr::addr_of!(val) as *const i8);
1664}
1665
/// MXCSR exception flag: an invalid operation was performed.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
/// MXCSR exception flag: an operation attempted to operate on a denormalized number.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
/// MXCSR exception flag: division by zero occurred.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
/// MXCSR exception flag: a numeric overflow occurred (result too large to be represented).
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
/// MXCSR exception flag: a numeric underflow occurred (result too small to be represented in a
/// normalized way).
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
/// MXCSR exception flag: an inexact-result (precision) exception occurred.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
/// Bit mask covering all six `_MM_EXCEPT_*` exception flags (`0x0001` through `0x0020`).
///
/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_MASK: u32 = 0x003f;
1687
/// MXCSR masking flag for the invalid-operation exception.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INVALID: u32 = 0x0080;
/// MXCSR masking flag for the denormal-operand exception.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DENORM: u32 = 0x0100;
/// MXCSR masking flag for the division-by-zero exception.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
/// MXCSR masking flag for the overflow exception.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
/// MXCSR masking flag for the underflow exception.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
/// MXCSR masking flag for the inexact-result (precision) exception.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INEXACT: u32 = 0x1000;
/// Bit mask covering all six `_MM_MASK_*` masking flags (`0x0080` through `0x1000`).
///
/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_MASK: u32 = 0x1f80;
1709
/// MXCSR rounding mode: round to nearest, ties to even (the default).
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_NEAREST: u32 = 0x0000;
/// MXCSR rounding mode: round toward negative infinity.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_DOWN: u32 = 0x2000;
/// MXCSR rounding mode: round toward positive infinity.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_UP: u32 = 0x4000;
/// MXCSR rounding mode: round toward zero (truncate).
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;

/// Bit mask selecting the two rounding-mode bits of MXCSR.
///
/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_MASK: u32 = 0x6000;
1726
/// Bit mask selecting the flush-to-zero mode bit of MXCSR.
///
/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
/// Flush-to-zero mode turned on.
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
/// Flush-to-zero mode turned off (the default).
///
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
1736
1737/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1738///
1739/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK)
1740#[inline]
1741#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1742#[allow(non_snake_case)]
1743#[target_feature(enable = "sse")]
1744#[stable(feature = "simd_x86", since = "1.27.0")]
1745#[deprecated(
1746    since = "1.75.0",
1747    note = "see `_mm_getcsr` documentation - use inline assembly instead"
1748)]
1749pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
1750    _mm_getcsr() & _MM_MASK_MASK
1751}
1752
1753/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1754///
1755/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE)
1756#[inline]
1757#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1758#[allow(non_snake_case)]
1759#[target_feature(enable = "sse")]
1760#[stable(feature = "simd_x86", since = "1.27.0")]
1761#[deprecated(
1762    since = "1.75.0",
1763    note = "see `_mm_getcsr` documentation - use inline assembly instead"
1764)]
1765pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
1766    _mm_getcsr() & _MM_EXCEPT_MASK
1767}
1768
1769/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1770///
1771/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE)
1772#[inline]
1773#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1774#[allow(non_snake_case)]
1775#[target_feature(enable = "sse")]
1776#[stable(feature = "simd_x86", since = "1.27.0")]
1777#[deprecated(
1778    since = "1.75.0",
1779    note = "see `_mm_getcsr` documentation - use inline assembly instead"
1780)]
1781pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
1782    _mm_getcsr() & _MM_FLUSH_ZERO_MASK
1783}
1784
1785/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1786///
1787/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE)
1788#[inline]
1789#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1790#[allow(non_snake_case)]
1791#[target_feature(enable = "sse")]
1792#[stable(feature = "simd_x86", since = "1.27.0")]
1793#[deprecated(
1794    since = "1.75.0",
1795    note = "see `_mm_getcsr` documentation - use inline assembly instead"
1796)]
1797pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
1798    _mm_getcsr() & _MM_ROUND_MASK
1799}
1800
1801/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1802///
1803/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK)
1804#[inline]
1805#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1806#[allow(non_snake_case)]
1807#[target_feature(enable = "sse")]
1808#[stable(feature = "simd_x86", since = "1.27.0")]
1809#[deprecated(
1810    since = "1.75.0",
1811    note = "see `_mm_setcsr` documentation - use inline assembly instead"
1812)]
1813pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
1814    _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | (x & _MM_MASK_MASK))
1815}
1816
1817/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1818///
1819/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE)
1820#[inline]
1821#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1822#[allow(non_snake_case)]
1823#[target_feature(enable = "sse")]
1824#[stable(feature = "simd_x86", since = "1.27.0")]
1825#[deprecated(
1826    since = "1.75.0",
1827    note = "see `_mm_setcsr` documentation - use inline assembly instead"
1828)]
1829pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
1830    _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | (x & _MM_EXCEPT_MASK))
1831}
1832
1833/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1834///
1835/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE)
1836#[inline]
1837#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1838#[allow(non_snake_case)]
1839#[target_feature(enable = "sse")]
1840#[stable(feature = "simd_x86", since = "1.27.0")]
1841#[deprecated(
1842    since = "1.75.0",
1843    note = "see `_mm_setcsr` documentation - use inline assembly instead"
1844)]
1845pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
1846    _mm_setcsr((_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | (x & _MM_FLUSH_ZERO_MASK))
1847}
1848
1849/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1850///
1851/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE)
1852#[inline]
1853#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1854#[allow(non_snake_case)]
1855#[target_feature(enable = "sse")]
1856#[stable(feature = "simd_x86", since = "1.27.0")]
1857#[deprecated(
1858    since = "1.75.0",
1859    note = "see `_mm_setcsr` documentation - use inline assembly instead"
1860)]
1861pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
1862    _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | (x & _MM_ROUND_MASK))
1863}
1864
/// Prefetch strategy: fetch into all levels of the cache hierarchy.
///
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T0: i32 = 3;

/// Prefetch strategy: fetch into L2 and higher.
///
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T1: i32 = 2;

/// Prefetch strategy: fetch into L3 and higher, or an implementation-specific choice.
///
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T2: i32 = 1;

/// Prefetch strategy: fetch using the non-temporal access (NTA) hint.
///
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_NTA: i32 = 0;

/// Prefetch strategy: like `_MM_HINT_T0`, but indicating an anticipated write to the address.
///
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_ET0: i32 = 7;

/// Prefetch strategy: like `_MM_HINT_T1`, but indicating an anticipated write to the address.
///
/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_ET1: i32 = 6;
1888
/// Fetch the cache line that contains address `p` using the given `STRATEGY`.
///
/// The `STRATEGY` must be one of:
///
/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
///   cache hierarchy.
///
/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
///
/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
///   an implementation-specific choice (e.g., L2 if there is no L3).
///
/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
///   non-temporal access (NTA) hint. It may be a place closer than main memory
///   but outside of the cache hierarchy. This is used to reduce access latency
///   without polluting the cache.
///
/// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and
///   [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
///   and `_MM_HINT_T1` but indicate an anticipation to write to the address.
///
/// The actual implementation depends on the particular CPU. This instruction
/// is considered a hint, so the CPU is also free to simply ignore the request.
///
/// The amount of prefetched data depends on the cache line size of the
/// specific CPU, but it will be at least 32 bytes.
///
/// Common caveats:
///
/// * Most modern CPUs already automatically prefetch data based on predicted
///   access patterns.
///
/// * Data is usually not fetched if this would cause a TLB miss or a page
///   fault.
///
/// * Too much prefetching can cause unnecessary cache evictions.
///
/// * Prefetching may also fail if there are not enough memory-subsystem
///   resources (e.g., request buffers).
///
/// Note: this intrinsic is safe to use even though it takes a raw pointer argument. In general, this
/// cannot change the behavior of the program, including not trapping on invalid pointers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
#[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
#[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
    // NOTE(review): `static_assert_uimm_bits!` is defined elsewhere; presumably it rejects at
    // compile time any `STRATEGY` that does not fit in 3 unsigned bits — confirm.
    static_assert_uimm_bits!(STRATEGY, 3);
    // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
    // `locality` and `rw` are based on our `STRATEGY`.
    //
    // Bit 2 of `STRATEGY` becomes the `rw` argument: it is 1 only for `_MM_HINT_ET0` (7) and
    // `_MM_HINT_ET1` (6). The low two bits become the `loc` argument: 3 for T0/ET0, 2 for T1/ET1,
    // 1 for T2 and 0 for NTA.
    unsafe {
        prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
    }
}
1949
1950/// Returns vector of type __m128 with indeterminate elements.
1951/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
1952/// picks some valid value and is not equivalent to [`mem::MaybeUninit`].
1953/// In practice, this is typically equivalent to [`mem::zeroed`].
1954///
1955/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps)
1956#[inline]
1957#[target_feature(enable = "sse")]
1958#[stable(feature = "simd_x86", since = "1.27.0")]
1959#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1960pub const fn _mm_undefined_ps() -> __m128 {
1961    const { unsafe { mem::zeroed() } }
1962}
1963
1964/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
1965///
1966/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS)
1967#[inline]
1968#[allow(non_snake_case)]
1969#[target_feature(enable = "sse")]
1970#[stable(feature = "simd_x86", since = "1.27.0")]
1971#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
1972pub const fn _MM_TRANSPOSE4_PS(
1973    row0: &mut __m128,
1974    row1: &mut __m128,
1975    row2: &mut __m128,
1976    row3: &mut __m128,
1977) {
1978    let tmp0 = _mm_unpacklo_ps(*row0, *row1);
1979    let tmp2 = _mm_unpacklo_ps(*row2, *row3);
1980    let tmp1 = _mm_unpackhi_ps(*row0, *row1);
1981    let tmp3 = _mm_unpackhi_ps(*row2, *row3);
1982
1983    *row0 = _mm_movelh_ps(tmp0, tmp2);
1984    *row1 = _mm_movehl_ps(tmp2, tmp0);
1985    *row2 = _mm_movelh_ps(tmp1, tmp3);
1986    *row3 = _mm_movehl_ps(tmp3, tmp1);
1987}
1988
// Raw declarations of the LLVM intrinsics used by this file. Each `link_name` attribute names the
// LLVM intrinsic that the Rust-side function below it resolves to.
#[allow(improper_ctypes)]
unsafe extern "C" {
    // Reciprocal and reciprocal square root (`ss` and `ps` variants).
    #[link_name = "llvm.x86.sse.rcp.ss"]
    fn rcpss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ps"]
    fn rcpps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ss"]
    fn rsqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ps"]
    fn rsqrtps(a: __m128) -> __m128;
    // Minimum and maximum (`ss` and `ps` variants).
    #[link_name = "llvm.x86.sse.min.ss"]
    fn minss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ps"]
    fn minps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ss"]
    fn maxss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ps"]
    fn maxps(a: __m128, b: __m128) -> __m128;
    // Comparison producing a vector; `imm8` is passed through to the intrinsic.
    #[link_name = "llvm.x86.sse.cmp.ps"]
    fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    // `comi*` comparisons returning an `i32`.
    #[link_name = "llvm.x86.sse.comieq.ss"]
    fn comieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comilt.ss"]
    fn comilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comile.ss"]
    fn comile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comigt.ss"]
    fn comigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comige.ss"]
    fn comige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comineq.ss"]
    fn comineq_ss(a: __m128, b: __m128) -> i32;
    // `ucomi*` comparisons returning an `i32`.
    #[link_name = "llvm.x86.sse.ucomieq.ss"]
    fn ucomieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomilt.ss"]
    fn ucomilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomile.ss"]
    fn ucomile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomigt.ss"]
    fn ucomigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomige.ss"]
    fn ucomige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomineq.ss"]
    fn ucomineq_ss(a: __m128, b: __m128) -> i32;
    // Conversions from a vector to `i32`.
    #[link_name = "llvm.x86.sse.cvtss2si"]
    fn cvtss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvttss2si"]
    fn cvttss2si(a: __m128) -> i32;
    // Store fence, used by `_mm_sfence`.
    #[link_name = "llvm.x86.sse.sfence"]
    fn sfence();
    // MXCSR store / load through a pointer, used by `_mm_getcsr` / `_mm_setcsr`.
    #[link_name = "llvm.x86.sse.stmxcsr"]
    fn stmxcsr(p: *mut i8);
    #[link_name = "llvm.x86.sse.ldmxcsr"]
    fn ldmxcsr(p: *const i8);
    // Generic LLVM prefetch, used by `_mm_prefetch`.
    #[link_name = "llvm.prefetch"]
    fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
    // `ss` counterpart of `cmpps` above.
    #[link_name = "llvm.x86.sse.cmp.ss"]
    fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
}
2048
/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
///
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
/// exception _may_ be generated.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps)
///
/// # Safety of non-temporal stores
///
/// After using this intrinsic, but before any other access to the memory that this intrinsic
/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
/// return.
///
/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
    // see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
    crate::arch::asm!(
        // NOTE(review): `vps!` is defined elsewhere; presumably it builds the `movntps` template
        // with `{p}` as the memory operand followed by the `,{a}` suffix — confirm.
        vps!("movntps", ",{a}"),
        // Destination address, passed in a general-purpose register.
        p = in(reg) mem_addr,
        // Vector to store, passed in an XMM register.
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}
2078
2079#[cfg(test)]
2080mod tests {
2081    use crate::core_arch::assert_eq_const as assert_eq;
2082    use crate::{hint::black_box, ptr};
2083    use std::boxed;
2084    use stdarch_test::simd_test;
2085
2086    use crate::core_arch::{simd::*, x86::*};
2087
    // Module-local shorthand for `f32::NAN`.
    const NAN: f32 = f32::NAN;
2089
2090    #[simd_test(enable = "sse")]
2091    const fn test_mm_add_ps() {
2092        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2093        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2094        let r = _mm_add_ps(a, b);
2095        assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0));
2096    }
2097
2098    #[simd_test(enable = "sse")]
2099    const fn test_mm_add_ss() {
2100        let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
2101        let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
2102        let r = _mm_add_ss(a, b);
2103        assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0));
2104    }
2105
2106    #[simd_test(enable = "sse")]
2107    const fn test_mm_sub_ps() {
2108        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2109        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2110        let r = _mm_sub_ps(a, b);
2111        assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0));
2112    }
2113
2114    #[simd_test(enable = "sse")]
2115    const fn test_mm_sub_ss() {
2116        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2117        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2118        let r = _mm_sub_ss(a, b);
2119        assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0));
2120    }
2121
2122    #[simd_test(enable = "sse")]
2123    const fn test_mm_mul_ps() {
2124        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2125        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2126        let r = _mm_mul_ps(a, b);
2127        assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0));
2128    }
2129
2130    #[simd_test(enable = "sse")]
2131    const fn test_mm_mul_ss() {
2132        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2133        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2134        let r = _mm_mul_ss(a, b);
2135        assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0));
2136    }
2137
2138    #[simd_test(enable = "sse")]
2139    const fn test_mm_div_ps() {
2140        let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0);
2141        let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0);
2142        let r = _mm_div_ps(a, b);
2143        assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0));
2144    }
2145
2146    #[simd_test(enable = "sse")]
2147    const fn test_mm_div_ss() {
2148        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2149        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2150        let r = _mm_div_ss(a, b);
2151        assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0));
2152    }
2153
2154    #[simd_test(enable = "sse")]
2155    fn test_mm_sqrt_ss() {
2156        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2157        let r = _mm_sqrt_ss(a);
2158        let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0);
2159        assert_eq_m128(r, e);
2160    }
2161
2162    #[simd_test(enable = "sse")]
2163    fn test_mm_sqrt_ps() {
2164        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2165        let r = _mm_sqrt_ps(a);
2166        let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0);
2167        assert_eq_m128(r, e);
2168    }
2169
2170    #[simd_test(enable = "sse")]
2171    fn test_mm_rcp_ss() {
2172        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2173        let r = _mm_rcp_ss(a);
2174        let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
2175        let rel_err = 0.00048828125;
2176        assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err);
2177        for i in 1..4 {
2178            assert_eq!(get_m128(r, i), get_m128(e, i));
2179        }
2180    }
2181
2182    #[simd_test(enable = "sse")]
2183    fn test_mm_rcp_ps() {
2184        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2185        let r = _mm_rcp_ps(a);
2186        let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
2187        let rel_err = 0.00048828125;
2188        for i in 0..4 {
2189            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2190        }
2191    }
2192
2193    #[simd_test(enable = "sse")]
2194    fn test_mm_rsqrt_ss() {
2195        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2196        let r = _mm_rsqrt_ss(a);
2197        let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0);
2198        let rel_err = 0.00048828125;
2199        for i in 0..4 {
2200            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2201        }
2202    }
2203
2204    #[simd_test(enable = "sse")]
2205    fn test_mm_rsqrt_ps() {
2206        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2207        let r = _mm_rsqrt_ps(a);
2208        let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845);
2209        let rel_err = 0.00048828125;
2210        for i in 0..4 {
2211            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2212        }
2213    }
2214
2215    #[simd_test(enable = "sse")]
2216    fn test_mm_min_ss() {
2217        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2218        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2219        let r = _mm_min_ss(a, b);
2220        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
2221    }
2222
2223    #[simd_test(enable = "sse")]
2224    fn test_mm_min_ps() {
2225        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2226        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2227        let r = _mm_min_ps(a, b);
2228        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
2229
2230        // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min`
2231        // is lowered by the llvm codegen backend to `llvm.minnum.v*` llvm intrinsic. This intrinsic
2232        // doesn't specify how -0.0 is handled. Unfortunately it happens to behave different from
2233        // the `minps` x86 instruction on x86. The `llvm.minnum.v*` llvm intrinsic equals
2234        // `r1` to `a` and `r2` to `b`.
2235        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
2236        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
2237        let r1 = _mm_min_ps(a, b).as_f32x4().to_bits();
2238        let r2 = _mm_min_ps(b, a).as_f32x4().to_bits();
2239        let a = a.as_f32x4().to_bits();
2240        let b = b.as_f32x4().to_bits();
2241        assert_eq!(r1, b);
2242        assert_eq!(r2, a);
2243        assert_ne!(a, b); // sanity check that -0.0 is actually present
2244    }
2245
2246    #[simd_test(enable = "sse")]
2247    fn test_mm_max_ss() {
2248        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2249        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2250        let r = _mm_max_ss(a, b);
2251        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0));
2252    }
2253
2254    #[simd_test(enable = "sse")]
2255    fn test_mm_max_ps() {
2256        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2257        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2258        let r = _mm_max_ps(a, b);
2259        assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));
2260
2261        // Check SSE-specific semantics for -0.0 handling.
2262        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
2263        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
2264        let r1 = _mm_max_ps(a, b).as_f32x4().to_bits();
2265        let r2 = _mm_max_ps(b, a).as_f32x4().to_bits();
2266        let a = a.as_f32x4().to_bits();
2267        let b = b.as_f32x4().to_bits();
2268        assert_eq!(r1, b);
2269        assert_eq!(r2, a);
2270        assert_ne!(a, b); // sanity check that -0.0 is actually present
2271    }
2272
2273    #[simd_test(enable = "sse")]
2274    const fn test_mm_and_ps() {
2275        let a = f32x4::from_bits(u32x4::splat(0b0011)).as_m128();
2276        let b = f32x4::from_bits(u32x4::splat(0b0101)).as_m128();
2277        let r = _mm_and_ps(*black_box(&a), *black_box(&b));
2278        let e = f32x4::from_bits(u32x4::splat(0b0001)).as_m128();
2279        assert_eq_m128(r, e);
2280    }
2281
2282    #[simd_test(enable = "sse")]
2283    const fn test_mm_andnot_ps() {
2284        let a = f32x4::from_bits(u32x4::splat(0b0011)).as_m128();
2285        let b = f32x4::from_bits(u32x4::splat(0b0101)).as_m128();
2286        let r = _mm_andnot_ps(*black_box(&a), *black_box(&b));
2287        let e = f32x4::from_bits(u32x4::splat(0b0100)).as_m128();
2288        assert_eq_m128(r, e);
2289    }
2290
2291    #[simd_test(enable = "sse")]
2292    const fn test_mm_or_ps() {
2293        let a = f32x4::from_bits(u32x4::splat(0b0011)).as_m128();
2294        let b = f32x4::from_bits(u32x4::splat(0b0101)).as_m128();
2295        let r = _mm_or_ps(*black_box(&a), *black_box(&b));
2296        let e = f32x4::from_bits(u32x4::splat(0b0111)).as_m128();
2297        assert_eq_m128(r, e);
2298    }
2299
2300    #[simd_test(enable = "sse")]
2301    const fn test_mm_xor_ps() {
2302        let a = f32x4::from_bits(u32x4::splat(0b0011)).as_m128();
2303        let b = f32x4::from_bits(u32x4::splat(0b0101)).as_m128();
2304        let r = _mm_xor_ps(*black_box(&a), *black_box(&b));
2305        let e = f32x4::from_bits(u32x4::splat(0b0110)).as_m128();
2306        assert_eq_m128(r, e);
2307    }
2308
2309    #[simd_test(enable = "sse")]
2310    fn test_mm_cmpeq_ss() {
2311        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2312        let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
2313        let r = _mm_cmpeq_ss(a, b).as_f32x4().to_bits();
2314        let e = f32x4::new(f32::from_bits(0), 2.0, 3.0, 4.0).to_bits();
2315        assert_eq!(r, e);
2316
2317        let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2318        let r2 = _mm_cmpeq_ss(a, b2).as_f32x4().to_bits();
2319        let e2 = f32x4::new(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0).to_bits();
2320        assert_eq!(r2, e2);
2321    }
2322
2323    #[simd_test(enable = "sse")]
2324    fn test_mm_cmplt_ss() {
2325        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2326        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2327        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2328        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2329
2330        let b1 = 0u32; // a.extract(0) < b.extract(0)
2331        let c1 = 0u32; // a.extract(0) < c.extract(0)
2332        let d1 = !0u32; // a.extract(0) < d.extract(0)
2333
2334        let rb = _mm_cmplt_ss(a, b).as_f32x4().to_bits();
2335        let eb = f32x4::new(f32::from_bits(b1), 2.0, 3.0, 4.0).to_bits();
2336        assert_eq!(rb, eb);
2337
2338        let rc = _mm_cmplt_ss(a, c).as_f32x4().to_bits();
2339        let ec = f32x4::new(f32::from_bits(c1), 2.0, 3.0, 4.0).to_bits();
2340        assert_eq!(rc, ec);
2341
2342        let rd = _mm_cmplt_ss(a, d).as_f32x4().to_bits();
2343        let ed = f32x4::new(f32::from_bits(d1), 2.0, 3.0, 4.0).to_bits();
2344        assert_eq!(rd, ed);
2345    }
2346
2347    #[simd_test(enable = "sse")]
2348    fn test_mm_cmple_ss() {
2349        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2350        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2351        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2352        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2353
2354        let b1 = 0u32; // a.extract(0) <= b.extract(0)
2355        let c1 = !0u32; // a.extract(0) <= c.extract(0)
2356        let d1 = !0u32; // a.extract(0) <= d.extract(0)
2357
2358        let rb = _mm_cmple_ss(a, b).as_f32x4().to_bits();
2359        let eb = f32x4::new(f32::from_bits(b1), 2.0, 3.0, 4.0).to_bits();
2360        assert_eq!(rb, eb);
2361
2362        let rc = _mm_cmple_ss(a, c).as_f32x4().to_bits();
2363        let ec = f32x4::new(f32::from_bits(c1), 2.0, 3.0, 4.0).to_bits();
2364        assert_eq!(rc, ec);
2365
2366        let rd = _mm_cmple_ss(a, d).as_f32x4().to_bits();
2367        let ed = f32x4::new(f32::from_bits(d1), 2.0, 3.0, 4.0).to_bits();
2368        assert_eq!(rd, ed);
2369    }
2370
2371    #[simd_test(enable = "sse")]
2372    fn test_mm_cmpgt_ss() {
2373        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2374        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2375        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2376        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2377
2378        let b1 = !0u32; // a.extract(0) > b.extract(0)
2379        let c1 = 0u32; // a.extract(0) > c.extract(0)
2380        let d1 = 0u32; // a.extract(0) > d.extract(0)
2381
2382        let rb = _mm_cmpgt_ss(a, b).as_f32x4().to_bits();
2383        let eb = f32x4::new(f32::from_bits(b1), 2.0, 3.0, 4.0).to_bits();
2384        assert_eq!(rb, eb);
2385
2386        let rc = _mm_cmpgt_ss(a, c).as_f32x4().to_bits();
2387        let ec = f32x4::new(f32::from_bits(c1), 2.0, 3.0, 4.0).to_bits();
2388        assert_eq!(rc, ec);
2389
2390        let rd = _mm_cmpgt_ss(a, d).as_f32x4().to_bits();
2391        let ed = f32x4::new(f32::from_bits(d1), 2.0, 3.0, 4.0).to_bits();
2392        assert_eq!(rd, ed);
2393    }
2394
2395    #[simd_test(enable = "sse")]
2396    fn test_mm_cmpge_ss() {
2397        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2398        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2399        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2400        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2401
2402        let b1 = !0u32; // a.extract(0) >= b.extract(0)
2403        let c1 = !0u32; // a.extract(0) >= c.extract(0)
2404        let d1 = 0u32; // a.extract(0) >= d.extract(0)
2405
2406        let rb = _mm_cmpge_ss(a, b).as_f32x4().to_bits();
2407        let eb = f32x4::new(f32::from_bits(b1), 2.0, 3.0, 4.0).to_bits();
2408        assert_eq!(rb, eb);
2409
2410        let rc = _mm_cmpge_ss(a, c).as_f32x4().to_bits();
2411        let ec = f32x4::new(f32::from_bits(c1), 2.0, 3.0, 4.0).to_bits();
2412        assert_eq!(rc, ec);
2413
2414        let rd = _mm_cmpge_ss(a, d).as_f32x4().to_bits();
2415        let ed = f32x4::new(f32::from_bits(d1), 2.0, 3.0, 4.0).to_bits();
2416        assert_eq!(rd, ed);
2417    }
2418
2419    #[simd_test(enable = "sse")]
2420    fn test_mm_cmpneq_ss() {
2421        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2422        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2423        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2424        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2425
2426        let b1 = !0u32; // a.extract(0) != b.extract(0)
2427        let c1 = 0u32; // a.extract(0) != c.extract(0)
2428        let d1 = !0u32; // a.extract(0) != d.extract(0)
2429
2430        let rb = _mm_cmpneq_ss(a, b).as_f32x4().to_bits();
2431        let eb = f32x4::new(f32::from_bits(b1), 2.0, 3.0, 4.0).to_bits();
2432        assert_eq!(rb, eb);
2433
2434        let rc = _mm_cmpneq_ss(a, c).as_f32x4().to_bits();
2435        let ec = f32x4::new(f32::from_bits(c1), 2.0, 3.0, 4.0).to_bits();
2436        assert_eq!(rc, ec);
2437
2438        let rd = _mm_cmpneq_ss(a, d).as_f32x4().to_bits();
2439        let ed = f32x4::new(f32::from_bits(d1), 2.0, 3.0, 4.0).to_bits();
2440        assert_eq!(rd, ed);
2441    }
2442
2443    #[simd_test(enable = "sse")]
2444    fn test_mm_cmpnlt_ss() {
2445        // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there
2446        // must be a difference. It may have to do with behavior in the
2447        // presence of NaNs (signaling or quiet). If so, we should add tests
2448        // for those.
2449
2450        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2451        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2452        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2453        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2454
2455        let b1 = !0u32; // a.extract(0) >= b.extract(0)
2456        let c1 = !0u32; // a.extract(0) >= c.extract(0)
2457        let d1 = 0u32; // a.extract(0) >= d.extract(0)
2458
2459        let rb = _mm_cmpnlt_ss(a, b).as_f32x4().to_bits();
2460        let eb = f32x4::new(f32::from_bits(b1), 2.0, 3.0, 4.0).to_bits();
2461        assert_eq!(rb, eb);
2462
2463        let rc = _mm_cmpnlt_ss(a, c).as_f32x4().to_bits();
2464        let ec = f32x4::new(f32::from_bits(c1), 2.0, 3.0, 4.0).to_bits();
2465        assert_eq!(rc, ec);
2466
2467        let rd = _mm_cmpnlt_ss(a, d).as_f32x4().to_bits();
2468        let ed = f32x4::new(f32::from_bits(d1), 2.0, 3.0, 4.0).to_bits();
2469        assert_eq!(rd, ed);
2470    }
2471
2472    #[simd_test(enable = "sse")]
2473    fn test_mm_cmpnle_ss() {
2474        // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there
2475        // must be a difference. It may have to do with behavior in the
2476        // presence
2477        // of NaNs (signaling or quiet). If so, we should add tests for those.
2478
2479        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2480        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2481        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2482        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2483
2484        let b1 = !0u32; // a.extract(0) > b.extract(0)
2485        let c1 = 0u32; // a.extract(0) > c.extract(0)
2486        let d1 = 0u32; // a.extract(0) > d.extract(0)
2487
2488        let rb = _mm_cmpnle_ss(a, b).as_f32x4().to_bits();
2489        let eb = f32x4::new(f32::from_bits(b1), 2.0, 3.0, 4.0).to_bits();
2490        assert_eq!(rb, eb);
2491
2492        let rc = _mm_cmpnle_ss(a, c).as_f32x4().to_bits();
2493        let ec = f32x4::new(f32::from_bits(c1), 2.0, 3.0, 4.0).to_bits();
2494        assert_eq!(rc, ec);
2495
2496        let rd = _mm_cmpnle_ss(a, d).as_f32x4().to_bits();
2497        let ed = f32x4::new(f32::from_bits(d1), 2.0, 3.0, 4.0).to_bits();
2498        assert_eq!(rd, ed);
2499    }
2500
2501    #[simd_test(enable = "sse")]
2502    fn test_mm_cmpngt_ss() {
2503        // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there
2504        // must be a difference. It may have to do with behavior in the
2505        // presence of NaNs (signaling or quiet). If so, we should add tests
2506        // for those.
2507
2508        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2509        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2510        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2511        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2512
2513        let b1 = 0u32; // a.extract(0) <= b.extract(0)
2514        let c1 = !0u32; // a.extract(0) <= c.extract(0)
2515        let d1 = !0u32; // a.extract(0) <= d.extract(0)
2516
2517        let rb = _mm_cmpngt_ss(a, b).as_f32x4().to_bits();
2518        let eb = f32x4::new(f32::from_bits(b1), 2.0, 3.0, 4.0).to_bits();
2519        assert_eq!(rb, eb);
2520
2521        let rc = _mm_cmpngt_ss(a, c).as_f32x4().to_bits();
2522        let ec = f32x4::new(f32::from_bits(c1), 2.0, 3.0, 4.0).to_bits();
2523        assert_eq!(rc, ec);
2524
2525        let rd = _mm_cmpngt_ss(a, d).as_f32x4().to_bits();
2526        let ed = f32x4::new(f32::from_bits(d1), 2.0, 3.0, 4.0).to_bits();
2527        assert_eq!(rd, ed);
2528    }
2529
2530    #[simd_test(enable = "sse")]
2531    fn test_mm_cmpnge_ss() {
2532        // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there
2533        // must be a difference. It may have to do with behavior in the
2534        // presence of NaNs (signaling or quiet). If so, we should add tests
2535        // for those.
2536
2537        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2538        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2539        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2540        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2541
2542        let b1 = 0u32; // a.extract(0) < b.extract(0)
2543        let c1 = 0u32; // a.extract(0) < c.extract(0)
2544        let d1 = !0u32; // a.extract(0) < d.extract(0)
2545
2546        let rb = _mm_cmpnge_ss(a, b).as_f32x4().to_bits();
2547        let eb = f32x4::new(f32::from_bits(b1), 2.0, 3.0, 4.0).to_bits();
2548        assert_eq!(rb, eb);
2549
2550        let rc = _mm_cmpnge_ss(a, c).as_f32x4().to_bits();
2551        let ec = f32x4::new(f32::from_bits(c1), 2.0, 3.0, 4.0).to_bits();
2552        assert_eq!(rc, ec);
2553
2554        let rd = _mm_cmpnge_ss(a, d).as_f32x4().to_bits();
2555        let ed = f32x4::new(f32::from_bits(d1), 2.0, 3.0, 4.0).to_bits();
2556        assert_eq!(rd, ed);
2557    }
2558
2559    #[simd_test(enable = "sse")]
2560    fn test_mm_cmpord_ss() {
2561        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2562        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2563        let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2564        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2565
2566        let b1 = !0u32; // a.extract(0) ord b.extract(0)
2567        let c1 = 0u32; // a.extract(0) ord c.extract(0)
2568        let d1 = !0u32; // a.extract(0) ord d.extract(0)
2569
2570        let rb = _mm_cmpord_ss(a, b).as_f32x4().to_bits();
2571        let eb = f32x4::new(f32::from_bits(b1), 2.0, 3.0, 4.0).to_bits();
2572        assert_eq!(rb, eb);
2573
2574        let rc = _mm_cmpord_ss(a, c).as_f32x4().to_bits();
2575        let ec = f32x4::new(f32::from_bits(c1), 2.0, 3.0, 4.0).to_bits();
2576        assert_eq!(rc, ec);
2577
2578        let rd = _mm_cmpord_ss(a, d).as_f32x4().to_bits();
2579        let ed = f32x4::new(f32::from_bits(d1), 2.0, 3.0, 4.0).to_bits();
2580        assert_eq!(rd, ed);
2581    }
2582
2583    #[simd_test(enable = "sse")]
2584    fn test_mm_cmpunord_ss() {
2585        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2586        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2587        let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2588        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2589
2590        let b1 = 0u32; // a.extract(0) unord b.extract(0)
2591        let c1 = !0u32; // a.extract(0) unord c.extract(0)
2592        let d1 = 0u32; // a.extract(0) unord d.extract(0)
2593
2594        let rb = _mm_cmpunord_ss(a, b).as_f32x4().to_bits();
2595        let eb = f32x4::new(f32::from_bits(b1), 2.0, 3.0, 4.0).to_bits();
2596        assert_eq!(rb, eb);
2597
2598        let rc = _mm_cmpunord_ss(a, c).as_f32x4().to_bits();
2599        let ec = f32x4::new(f32::from_bits(c1), 2.0, 3.0, 4.0).to_bits();
2600        assert_eq!(rc, ec);
2601
2602        let rd = _mm_cmpunord_ss(a, d).as_f32x4().to_bits();
2603        let ed = f32x4::new(f32::from_bits(d1), 2.0, 3.0, 4.0).to_bits();
2604        assert_eq!(rd, ed);
2605    }
2606
2607    #[simd_test(enable = "sse")]
2608    fn test_mm_cmpeq_ps() {
2609        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2610        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2611        let tru = !0u32;
2612        let fls = 0u32;
2613
2614        let e = u32x4::new(fls, fls, tru, fls);
2615        let r = _mm_cmpeq_ps(a, b).as_f32x4().to_bits();
2616        assert_eq!(r, e);
2617    }
2618
2619    #[simd_test(enable = "sse")]
2620    fn test_mm_cmplt_ps() {
2621        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2622        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2623        let tru = !0u32;
2624        let fls = 0u32;
2625
2626        let e = u32x4::new(tru, fls, fls, fls);
2627        let r = _mm_cmplt_ps(a, b).as_f32x4().to_bits();
2628        assert_eq!(r, e);
2629    }
2630
2631    #[simd_test(enable = "sse")]
2632    fn test_mm_cmple_ps() {
2633        let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0);
2634        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2635        let tru = !0u32;
2636        let fls = 0u32;
2637
2638        let e = u32x4::new(tru, fls, tru, fls);
2639        let r = _mm_cmple_ps(a, b).as_f32x4().to_bits();
2640        assert_eq!(r, e);
2641    }
2642
2643    #[simd_test(enable = "sse")]
2644    fn test_mm_cmpgt_ps() {
2645        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2646        let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2647        let tru = !0u32;
2648        let fls = 0u32;
2649
2650        let e = u32x4::new(fls, tru, fls, fls);
2651        let r = _mm_cmpgt_ps(a, b).as_f32x4().to_bits();
2652        assert_eq!(r, e);
2653    }
2654
2655    #[simd_test(enable = "sse")]
2656    fn test_mm_cmpge_ps() {
2657        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2658        let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2659        let tru = !0u32;
2660        let fls = 0u32;
2661
2662        let e = u32x4::new(fls, tru, tru, fls);
2663        let r = _mm_cmpge_ps(a, b).as_f32x4().to_bits();
2664        assert_eq!(r, e);
2665    }
2666
2667    #[simd_test(enable = "sse")]
2668    fn test_mm_cmpneq_ps() {
2669        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2670        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2671        let tru = !0u32;
2672        let fls = 0u32;
2673
2674        let e = u32x4::new(tru, tru, fls, tru);
2675        let r = _mm_cmpneq_ps(a, b).as_f32x4().to_bits();
2676        assert_eq!(r, e);
2677    }
2678
2679    #[simd_test(enable = "sse")]
2680    fn test_mm_cmpnlt_ps() {
2681        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2682        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2683        let tru = !0u32;
2684        let fls = 0u32;
2685
2686        let e = u32x4::new(fls, tru, tru, tru);
2687        let r = _mm_cmpnlt_ps(a, b).as_f32x4().to_bits();
2688        assert_eq!(r, e);
2689    }
2690
2691    #[simd_test(enable = "sse")]
2692    fn test_mm_cmpnle_ps() {
2693        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2694        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2695        let tru = !0u32;
2696        let fls = 0u32;
2697
2698        let e = u32x4::new(fls, tru, fls, tru);
2699        let r = _mm_cmpnle_ps(a, b).as_f32x4().to_bits();
2700        assert_eq!(r, e);
2701    }
2702
2703    #[simd_test(enable = "sse")]
2704    fn test_mm_cmpngt_ps() {
2705        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2706        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2707        let tru = !0u32;
2708        let fls = 0u32;
2709
2710        let e = u32x4::new(tru, fls, tru, tru);
2711        let r = _mm_cmpngt_ps(a, b).as_f32x4().to_bits();
2712        assert_eq!(r, e);
2713    }
2714
2715    #[simd_test(enable = "sse")]
2716    fn test_mm_cmpnge_ps() {
2717        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2718        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2719        let tru = !0u32;
2720        let fls = 0u32;
2721
2722        let e = u32x4::new(tru, fls, fls, tru);
2723        let r = _mm_cmpnge_ps(a, b).as_f32x4().to_bits();
2724        assert_eq!(r, e);
2725    }
2726
2727    #[simd_test(enable = "sse")]
2728    fn test_mm_cmpord_ps() {
2729        let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2730        let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2731        let tru = !0u32;
2732        let fls = 0u32;
2733
2734        let e = u32x4::new(tru, fls, fls, fls);
2735        let r = _mm_cmpord_ps(a, b).as_f32x4().to_bits();
2736        assert_eq!(r, e);
2737    }
2738
2739    #[simd_test(enable = "sse")]
2740    fn test_mm_cmpunord_ps() {
2741        let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2742        let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2743        let tru = !0u32;
2744        let fls = 0u32;
2745
2746        let e = u32x4::new(fls, tru, tru, tru);
2747        let r = _mm_cmpunord_ps(a, b).as_f32x4().to_bits();
2748        assert_eq!(r, e);
2749    }
2750
2751    #[simd_test(enable = "sse")]
2752    fn test_mm_comieq_ss() {
2753        let aa = &[3.0f32, 12.0, 23.0, NAN];
2754        let bb = &[3.0f32, 47.5, 1.5, NAN];
2755
2756        let ee = &[1i32, 0, 0, 0];
2757
2758        for i in 0..4 {
2759            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2760            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2761
2762            let r = _mm_comieq_ss(a, b);
2763
2764            assert_eq!(
2765                ee[i], r,
2766                "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2767                a, b, r, ee[i], i
2768            );
2769        }
2770    }
2771
2772    #[simd_test(enable = "sse")]
2773    fn test_mm_comilt_ss() {
2774        let aa = &[3.0f32, 12.0, 23.0, NAN];
2775        let bb = &[3.0f32, 47.5, 1.5, NAN];
2776
2777        let ee = &[0i32, 1, 0, 0];
2778
2779        for i in 0..4 {
2780            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2781            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2782
2783            let r = _mm_comilt_ss(a, b);
2784
2785            assert_eq!(
2786                ee[i], r,
2787                "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2788                a, b, r, ee[i], i
2789            );
2790        }
2791    }
2792
2793    #[simd_test(enable = "sse")]
2794    fn test_mm_comile_ss() {
2795        let aa = &[3.0f32, 12.0, 23.0, NAN];
2796        let bb = &[3.0f32, 47.5, 1.5, NAN];
2797
2798        let ee = &[1i32, 1, 0, 0];
2799
2800        for i in 0..4 {
2801            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2802            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2803
2804            let r = _mm_comile_ss(a, b);
2805
2806            assert_eq!(
2807                ee[i], r,
2808                "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2809                a, b, r, ee[i], i
2810            );
2811        }
2812    }
2813
2814    #[simd_test(enable = "sse")]
2815    fn test_mm_comigt_ss() {
2816        let aa = &[3.0f32, 12.0, 23.0, NAN];
2817        let bb = &[3.0f32, 47.5, 1.5, NAN];
2818
2819        let ee = &[1i32, 0, 1, 0];
2820
2821        for i in 0..4 {
2822            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2823            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2824
2825            let r = _mm_comige_ss(a, b);
2826
2827            assert_eq!(
2828                ee[i], r,
2829                "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2830                a, b, r, ee[i], i
2831            );
2832        }
2833    }
2834
2835    #[simd_test(enable = "sse")]
2836    fn test_mm_comineq_ss() {
2837        let aa = &[3.0f32, 12.0, 23.0, NAN];
2838        let bb = &[3.0f32, 47.5, 1.5, NAN];
2839
2840        let ee = &[0i32, 1, 1, 1];
2841
2842        for i in 0..4 {
2843            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2844            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2845
2846            let r = _mm_comineq_ss(a, b);
2847
2848            assert_eq!(
2849                ee[i], r,
2850                "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2851                a, b, r, ee[i], i
2852            );
2853        }
2854    }
2855
2856    #[simd_test(enable = "sse")]
2857    fn test_mm_ucomieq_ss() {
2858        let aa = &[3.0f32, 12.0, 23.0, NAN];
2859        let bb = &[3.0f32, 47.5, 1.5, NAN];
2860
2861        let ee = &[1i32, 0, 0, 0];
2862
2863        for i in 0..4 {
2864            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2865            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2866
2867            let r = _mm_ucomieq_ss(a, b);
2868
2869            assert_eq!(
2870                ee[i], r,
2871                "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2872                a, b, r, ee[i], i
2873            );
2874        }
2875    }
2876
2877    #[simd_test(enable = "sse")]
2878    fn test_mm_ucomilt_ss() {
2879        let aa = &[3.0f32, 12.0, 23.0, NAN];
2880        let bb = &[3.0f32, 47.5, 1.5, NAN];
2881
2882        let ee = &[0i32, 1, 0, 0];
2883
2884        for i in 0..4 {
2885            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2886            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2887
2888            let r = _mm_ucomilt_ss(a, b);
2889
2890            assert_eq!(
2891                ee[i], r,
2892                "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2893                a, b, r, ee[i], i
2894            );
2895        }
2896    }
2897
2898    #[simd_test(enable = "sse")]
2899    fn test_mm_ucomile_ss() {
2900        let aa = &[3.0f32, 12.0, 23.0, NAN];
2901        let bb = &[3.0f32, 47.5, 1.5, NAN];
2902
2903        let ee = &[1i32, 1, 0, 0];
2904
2905        for i in 0..4 {
2906            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2907            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2908
2909            let r = _mm_ucomile_ss(a, b);
2910
2911            assert_eq!(
2912                ee[i], r,
2913                "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2914                a, b, r, ee[i], i
2915            );
2916        }
2917    }
2918
2919    #[simd_test(enable = "sse")]
2920    fn test_mm_ucomigt_ss() {
2921        let aa = &[3.0f32, 12.0, 23.0, NAN];
2922        let bb = &[3.0f32, 47.5, 1.5, NAN];
2923
2924        let ee = &[0i32, 0, 1, 0];
2925
2926        for i in 0..4 {
2927            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2928            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2929
2930            let r = _mm_ucomigt_ss(a, b);
2931
2932            assert_eq!(
2933                ee[i], r,
2934                "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2935                a, b, r, ee[i], i
2936            );
2937        }
2938    }
2939
2940    #[simd_test(enable = "sse")]
2941    fn test_mm_ucomige_ss() {
2942        let aa = &[3.0f32, 12.0, 23.0, NAN];
2943        let bb = &[3.0f32, 47.5, 1.5, NAN];
2944
2945        let ee = &[1i32, 0, 1, 0];
2946
2947        for i in 0..4 {
2948            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2949            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2950
2951            let r = _mm_ucomige_ss(a, b);
2952
2953            assert_eq!(
2954                ee[i], r,
2955                "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2956                a, b, r, ee[i], i
2957            );
2958        }
2959    }
2960
2961    #[simd_test(enable = "sse")]
2962    fn test_mm_ucomineq_ss() {
2963        let aa = &[3.0f32, 12.0, 23.0, NAN];
2964        let bb = &[3.0f32, 47.5, 1.5, NAN];
2965
2966        let ee = &[0i32, 1, 1, 1];
2967
2968        for i in 0..4 {
2969            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2970            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2971
2972            let r = _mm_ucomineq_ss(a, b);
2973
2974            assert_eq!(
2975                ee[i], r,
2976                "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2977                a, b, r, ee[i], i
2978            );
2979        }
2980    }
2981
2982    #[simd_test(enable = "sse")]
2983    fn test_mm_cvtss_si32() {
2984        let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
2985        let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520];
2986        for i in 0..inputs.len() {
2987            let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0);
2988            let e = result[i];
2989            let r = _mm_cvtss_si32(x);
2990            assert_eq!(
2991                e, r,
2992                "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
2993                i, x, r, e
2994            );
2995        }
2996    }
2997
2998    #[simd_test(enable = "sse")]
2999    fn test_mm_cvttss_si32() {
3000        let inputs = &[
3001            (42.0f32, 42i32),
3002            (-31.4, -31),
3003            (-33.5, -33),
3004            (-34.5, -34),
3005            (10.999, 10),
3006            (-5.99, -5),
3007            (4.0e10, i32::MIN),
3008            (4.0e-10, 0),
3009            (NAN, i32::MIN),
3010            (2147483500.1, 2147483520),
3011        ];
3012        for (i, &(xi, e)) in inputs.iter().enumerate() {
3013            let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
3014            let r = _mm_cvttss_si32(x);
3015            assert_eq!(
3016                e, r,
3017                "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
3018                i, x, r, e
3019            );
3020        }
3021    }
3022
3023    #[simd_test(enable = "sse")]
3024    const fn test_mm_cvtsi32_ss() {
3025        let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3026
3027        let r = _mm_cvtsi32_ss(a, 4555);
3028        let e = _mm_setr_ps(4555.0, 6.0, 7.0, 8.0);
3029        assert_eq_m128(e, r);
3030
3031        let r = _mm_cvtsi32_ss(a, 322223333);
3032        let e = _mm_setr_ps(322223333.0, 6.0, 7.0, 8.0);
3033        assert_eq_m128(e, r);
3034
3035        let r = _mm_cvtsi32_ss(a, -432);
3036        let e = _mm_setr_ps(-432.0, 6.0, 7.0, 8.0);
3037        assert_eq_m128(e, r);
3038
3039        let r = _mm_cvtsi32_ss(a, -322223333);
3040        let e = _mm_setr_ps(-322223333.0, 6.0, 7.0, 8.0);
3041        assert_eq_m128(e, r);
3042    }
3043
3044    #[simd_test(enable = "sse")]
3045    const fn test_mm_cvtss_f32() {
3046        let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0);
3047        assert_eq!(_mm_cvtss_f32(a), 312.0134);
3048    }
3049
3050    #[simd_test(enable = "sse")]
3051    const fn test_mm_set_ss() {
3052        let r = _mm_set_ss(black_box(4.25));
3053        assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0));
3054    }
3055
3056    #[simd_test(enable = "sse")]
3057    const fn test_mm_set1_ps() {
3058        let r1 = _mm_set1_ps(black_box(4.25));
3059        let r2 = _mm_set_ps1(black_box(4.25));
3060        assert_eq!(get_m128(r1, 0), 4.25);
3061        assert_eq!(get_m128(r1, 1), 4.25);
3062        assert_eq!(get_m128(r1, 2), 4.25);
3063        assert_eq!(get_m128(r1, 3), 4.25);
3064        assert_eq!(get_m128(r2, 0), 4.25);
3065        assert_eq!(get_m128(r2, 1), 4.25);
3066        assert_eq!(get_m128(r2, 2), 4.25);
3067        assert_eq!(get_m128(r2, 3), 4.25);
3068    }
3069
3070    #[simd_test(enable = "sse")]
3071    const fn test_mm_set_ps() {
3072        let r = _mm_set_ps(
3073            black_box(1.0),
3074            black_box(2.0),
3075            black_box(3.0),
3076            black_box(4.0),
3077        );
3078        assert_eq!(get_m128(r, 0), 4.0);
3079        assert_eq!(get_m128(r, 1), 3.0);
3080        assert_eq!(get_m128(r, 2), 2.0);
3081        assert_eq!(get_m128(r, 3), 1.0);
3082    }
3083
3084    #[simd_test(enable = "sse")]
3085    const fn test_mm_setr_ps() {
3086        let r = _mm_setr_ps(
3087            black_box(1.0),
3088            black_box(2.0),
3089            black_box(3.0),
3090            black_box(4.0),
3091        );
3092        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
3093    }
3094
3095    #[simd_test(enable = "sse")]
3096    const fn test_mm_setzero_ps() {
3097        let r = *black_box(&_mm_setzero_ps());
3098        assert_eq_m128(r, _mm_set1_ps(0.0));
3099    }
3100
3101    #[simd_test]
3102    #[allow(non_snake_case)]
3103    const fn test_MM_SHUFFLE() {
3104        assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11);
3105        assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00);
3106        assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01);
3107    }
3108
3109    #[simd_test(enable = "sse")]
3110    const fn test_mm_shuffle_ps() {
3111        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3112        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3113        let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b);
3114        assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0));
3115    }
3116
3117    #[simd_test(enable = "sse")]
3118    const fn test_mm_unpackhi_ps() {
3119        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3120        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3121        let r = _mm_unpackhi_ps(a, b);
3122        assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0));
3123    }
3124
3125    #[simd_test(enable = "sse")]
3126    const fn test_mm_unpacklo_ps() {
3127        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3128        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3129        let r = _mm_unpacklo_ps(a, b);
3130        assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0));
3131    }
3132
3133    #[simd_test(enable = "sse")]
3134    const fn test_mm_movehl_ps() {
3135        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3136        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3137        let r = _mm_movehl_ps(a, b);
3138        assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0));
3139    }
3140
3141    #[simd_test(enable = "sse")]
3142    const fn test_mm_movelh_ps() {
3143        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3144        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3145        let r = _mm_movelh_ps(a, b);
3146        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0));
3147    }
3148
3149    #[simd_test(enable = "sse")]
3150    const fn test_mm_load_ss() {
3151        let a = 42.0f32;
3152        let r = unsafe { _mm_load_ss(ptr::addr_of!(a)) };
3153        assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0));
3154    }
3155
3156    #[simd_test(enable = "sse")]
3157    const fn test_mm_load1_ps() {
3158        let a = 42.0f32;
3159        let r = unsafe { _mm_load1_ps(ptr::addr_of!(a)) };
3160        assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0));
3161    }
3162
3163    #[simd_test(enable = "sse")]
3164    const fn test_mm_load_ps() {
3165        let vals = Memory {
3166            data: [1.0f32, 2.0, 3.0, 4.0],
3167        };
3168
3169        // guaranteed to be aligned to 16 bytes
3170        let p = vals.data.as_ptr();
3171
3172        let r = unsafe { _mm_load_ps(p) };
3173        let e = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3174        assert_eq_m128(r, e);
3175    }
3176
3177    #[simd_test(enable = "sse")]
3178    const fn test_mm_loadu_ps() {
3179        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
3180        let p = unsafe { vals.as_ptr().add(3) };
3181        let r = unsafe { _mm_loadu_ps(black_box(p)) };
3182        assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0));
3183    }
3184
3185    #[simd_test(enable = "sse")]
3186    const fn test_mm_loadr_ps() {
3187        let vals = Memory {
3188            data: [1.0f32, 2.0, 3.0, 4.0],
3189        };
3190
3191        // guaranteed to be aligned to 16 bytes
3192        let p = vals.data.as_ptr();
3193
3194        let r = unsafe { _mm_loadr_ps(p) };
3195        let e = _mm_setr_ps(4.0, 3.0, 2.0, 1.0);
3196        assert_eq_m128(r, e);
3197    }
3198
3199    #[simd_test(enable = "sse")]
3200    const fn test_mm_store_ss() {
3201        let mut vals = [0.0f32; 8];
3202        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3203        unsafe {
3204            _mm_store_ss(vals.as_mut_ptr().add(1), a);
3205        }
3206
3207        assert_eq!(vals[0], 0.0);
3208        assert_eq!(vals[1], 1.0);
3209        assert_eq!(vals[2], 0.0);
3210    }
3211
3212    #[simd_test(enable = "sse")]
3213    const fn test_mm_store1_ps() {
3214        let mut vals = Memory { data: [0.0f32; 4] };
3215        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3216
3217        // guaranteed to be aligned to 16 bytes
3218        let p = vals.data.as_mut_ptr();
3219
3220        unsafe {
3221            _mm_store1_ps(p, *black_box(&a));
3222        }
3223
3224        assert_eq!(vals.data, [1.0, 1.0, 1.0, 1.0]);
3225    }
3226
3227    #[simd_test(enable = "sse")]
3228    const fn test_mm_store_ps() {
3229        let mut vals = Memory { data: [0.0f32; 4] };
3230        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3231
3232        // guaranteed to be aligned to 16 bytes
3233        let p = vals.data.as_mut_ptr();
3234
3235        unsafe {
3236            _mm_store_ps(p, *black_box(&a));
3237        }
3238
3239        assert_eq!(vals.data, [1.0, 2.0, 3.0, 4.0]);
3240    }
3241
3242    #[simd_test(enable = "sse")]
3243    const fn test_mm_storer_ps() {
3244        let mut vals = Memory { data: [0.0f32; 4] };
3245        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3246
3247        // guaranteed to be aligned to 16 bytes
3248        let p = vals.data.as_mut_ptr();
3249
3250        unsafe {
3251            _mm_storer_ps(p, *black_box(&a));
3252        }
3253
3254        assert_eq!(vals.data, [4.0, 3.0, 2.0, 1.0]);
3255    }
3256
3257    #[simd_test(enable = "sse")]
3258    const fn test_mm_storeu_ps() {
3259        #[repr(align(16))]
3260        struct Memory8 {
3261            data: [f32; 8],
3262        }
3263
3264        // guaranteed to be aligned to 16 bytes
3265        let mut vals = Memory8 { data: [0.0f32; 8] };
3266        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3267
3268        // guaranteed to be *not* aligned to 16 bytes
3269        let p = unsafe { vals.data.as_mut_ptr().offset(1) };
3270
3271        unsafe {
3272            _mm_storeu_ps(p, *black_box(&a));
3273        }
3274
3275        assert_eq!(vals.data, [0.0, 1.0, 2.0, 3.0, 4.0, 0.0, 0.0, 0.0]);
3276    }
3277
3278    #[simd_test(enable = "sse")]
3279    const fn test_mm_move_ss() {
3280        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3281        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3282
3283        let r = _mm_move_ss(a, b);
3284        let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0);
3285        assert_eq_m128(e, r);
3286    }
3287
3288    #[simd_test(enable = "sse")]
3289    const fn test_mm_movemask_ps() {
3290        let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0));
3291        assert_eq!(r, 0b0101);
3292
3293        let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0));
3294        assert_eq!(r, 0b0111);
3295    }
3296
    // Smoke test: only checks that `_mm_sfence` can be called; no observable
    // result is asserted.
    #[simd_test(enable = "sse")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    #[cfg_attr(miri, ignore)]
    fn test_mm_sfence() {
        _mm_sfence();
    }
3303
3304    #[simd_test(enable = "sse")]
3305    const fn test_MM_TRANSPOSE4_PS() {
3306        let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3307        let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3308        let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
3309        let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
3310
3311        _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d);
3312
3313        assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0));
3314        assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0));
3315        assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0));
3316        assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0));
3317    }
3318
    /// Four `f32` values in storage aligned to 16 bytes.
    ///
    /// The tests in this module use `data` as the source or destination for
    /// the load/store intrinsics that they call with 16-byte-aligned pointers.
    #[repr(align(16))]
    struct Memory {
        // First (and only) field, so it starts at the struct's aligned address.
        pub data: [f32; 4],
    }
3323
3324    #[simd_test(enable = "sse")]
3325    // Miri cannot support this until it is clear how it fits in the Rust memory model
3326    // (non-temporal store)
3327    #[cfg_attr(miri, ignore)]
3328    fn test_mm_stream_ps() {
3329        let a = _mm_set1_ps(7.0);
3330        let mut mem = Memory { data: [-1.0; 4] };
3331
3332        unsafe {
3333            _mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
3334        }
3335        _mm_sfence();
3336        for i in 0..4 {
3337            assert_eq!(mem.data[i], get_m128(a, i));
3338        }
3339    }
3340}