
core/stdarch/crates/core_arch/src/amdgpu/mod.rs

//! amdgpu intrinsics
//!
//! The reference is the [LLVM amdgpu guide] and the [LLVM implementation].
//! The order of intrinsics here follows the order in the [LLVM implementation].
//!
//! [LLVM amdgpu guide]: https://llvm.org/docs/AMDGPUUsage.html#llvm-ir-intrinsics
//! [LLVM implementation]: https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/IR/IntrinsicsAMDGPU.td

#[allow(improper_ctypes)]
unsafe extern "unadjusted" {
    #[link_name = "llvm.amdgcn.workitem.id.x"]
    safe fn llvm_workitem_id_x() -> u32;
    #[link_name = "llvm.amdgcn.workitem.id.y"]
    safe fn llvm_workitem_id_y() -> u32;
    #[link_name = "llvm.amdgcn.workitem.id.z"]
    safe fn llvm_workitem_id_z() -> u32;

    #[link_name = "llvm.amdgcn.workgroup.id.x"]
    safe fn llvm_workgroup_id_x() -> u32;
    #[link_name = "llvm.amdgcn.workgroup.id.y"]
    safe fn llvm_workgroup_id_y() -> u32;
    #[link_name = "llvm.amdgcn.workgroup.id.z"]
    safe fn llvm_workgroup_id_z() -> u32;

    #[link_name = "llvm.amdgcn.groupstaticsize"]
    safe fn llvm_groupstaticsize() -> u32;
    #[link_name = "llvm.amdgcn.dispatch.id"]
    safe fn llvm_dispatch_id() -> u64;

    #[link_name = "llvm.amdgcn.wavefrontsize"]
    safe fn llvm_wavefrontsize() -> u32;

    #[link_name = "llvm.amdgcn.s.barrier"]
    safe fn llvm_s_barrier();
    #[link_name = "llvm.amdgcn.s.barrier.signal"]
    fn llvm_s_barrier_signal(barrier_type: i32);
    #[link_name = "llvm.amdgcn.s.barrier.signal.isfirst"]
    fn llvm_s_barrier_signal_isfirst(barrier_type: i32) -> bool;
    #[link_name = "llvm.amdgcn.s.barrier.wait"]
    fn llvm_s_barrier_wait(barrier_type: i16);
    #[link_name = "llvm.amdgcn.s.get.barrier.state"]
    fn llvm_s_get_barrier_state(barrier_type: i32) -> u32;
    #[link_name = "llvm.amdgcn.wave.barrier"]
    safe fn llvm_wave_barrier();
    #[link_name = "llvm.amdgcn.sched.barrier"]
    fn llvm_sched_barrier(mask: u32);
    #[link_name = "llvm.amdgcn.sched.group.barrier"]
    fn llvm_sched_group_barrier(mask: u32, size: u32, sync_id: u32);

    #[link_name = "llvm.amdgcn.s.sleep"]
    safe fn llvm_s_sleep(count: u32);

    #[link_name = "llvm.amdgcn.s.sethalt"]
    safe fn llvm_s_sethalt(value: u32) -> !;

    #[link_name = "llvm.amdgcn.s.getpc"]
    safe fn llvm_s_getpc() -> i64;

    #[link_name = "llvm.amdgcn.mbcnt.lo"]
    safe fn llvm_mbcnt_lo(value: u32, init: u32) -> u32;
    #[link_name = "llvm.amdgcn.mbcnt.hi"]
    safe fn llvm_mbcnt_hi(value: u32, init: u32) -> u32;

    #[link_name = "llvm.amdgcn.ballot"]
    safe fn llvm_ballot(b: bool) -> u64;

    #[link_name = "llvm.amdgcn.inverse.ballot"]
    safe fn llvm_inverse_ballot(value: u64) -> bool;

    #[link_name = "llvm.amdgcn.wave.reduce.umin"]
    safe fn llvm_wave_reduce_umin(value: u32, strategy: u32) -> u32;
    #[link_name = "llvm.amdgcn.wave.reduce.min"]
    safe fn llvm_wave_reduce_min(value: i32, strategy: u32) -> i32;
    #[link_name = "llvm.amdgcn.wave.reduce.umax"]
    safe fn llvm_wave_reduce_umax(value: u32, strategy: u32) -> u32;
    #[link_name = "llvm.amdgcn.wave.reduce.max"]
    safe fn llvm_wave_reduce_max(value: i32, strategy: u32) -> i32;
    #[link_name = "llvm.amdgcn.wave.reduce.add"]
    safe fn llvm_wave_reduce_add(value: u32, strategy: u32) -> u32;
    #[link_name = "llvm.amdgcn.wave.reduce.and"]
    safe fn llvm_wave_reduce_and(value: u32, strategy: u32) -> u32;
    #[link_name = "llvm.amdgcn.wave.reduce.or"]
    safe fn llvm_wave_reduce_or(value: u32, strategy: u32) -> u32;
    #[link_name = "llvm.amdgcn.wave.reduce.xor"]
    safe fn llvm_wave_reduce_xor(value: u32, strategy: u32) -> u32;

    // The following intrinsics can have multiple sizes

    #[link_name = "llvm.amdgcn.readfirstlane.i32"]
    safe fn llvm_readfirstlane_u32(value: u32) -> u32;
    #[link_name = "llvm.amdgcn.readfirstlane.i64"]
    safe fn llvm_readfirstlane_u64(value: u64) -> u64;
    #[link_name = "llvm.amdgcn.readlane.i32"]
    fn llvm_readlane_u32(value: u32, lane: u32) -> u32;
    #[link_name = "llvm.amdgcn.readlane.i64"]
    fn llvm_readlane_u64(value: u64, lane: u32) -> u64;
    #[link_name = "llvm.amdgcn.writelane.i32"]
    fn llvm_writelane_u32(value: u32, lane: u32, default: u32) -> u32;
    #[link_name = "llvm.amdgcn.writelane.i64"]
    fn llvm_writelane_u64(value: u64, lane: u32, default: u64) -> u64;

    #[link_name = "llvm.amdgcn.endpgm"]
    safe fn llvm_endpgm() -> !;

    #[link_name = "llvm.amdgcn.update.dpp.i32"]
    fn llvm_update_dpp(
        old: u32,
        src: u32,
        dpp_ctrl: u32,
        row_mask: u32,
        bank_mask: u32,
        bound_control: bool,
    ) -> u32;

    #[link_name = "llvm.amdgcn.s.memrealtime"]
    safe fn llvm_s_memrealtime() -> u64;

    #[link_name = "llvm.amdgcn.ds.permute"]
    fn llvm_ds_permute(lane: u32, value: u32) -> u32;
    #[link_name = "llvm.amdgcn.ds.bpermute"]
    fn llvm_ds_bpermute(lane: u32, value: u32) -> u32;
    #[link_name = "llvm.amdgcn.perm"]
    fn llvm_perm(src0: u32, src1: u32, selector: u32) -> u32;

    // gfx10
    #[link_name = "llvm.amdgcn.permlane16.i32"]
    fn llvm_permlane16_u32(
        old: u32,
        src0: u32,
        src1: u32,
        src2: u32,
        fi: bool,
        bound_control: bool,
    ) -> u32;

    // gfx10
    #[link_name = "llvm.amdgcn.permlanex16.i32"]
    fn llvm_permlanex16_u32(
        old: u32,
        src0: u32,
        src1: u32,
        src2: u32,
        fi: bool,
        bound_control: bool,
    ) -> u32;

    #[link_name = "llvm.amdgcn.s.get.waveid.in.workgroup"]
    safe fn llvm_s_get_waveid_in_workgroup() -> u32;

    // gfx11
    #[link_name = "llvm.amdgcn.permlane64.i32"]
    fn llvm_permlane64_u32(value: u32) -> u32;

    // gfx12
    #[link_name = "llvm.amdgcn.permlane16.var"]
    fn llvm_permlane16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32;

    // gfx12
    #[link_name = "llvm.amdgcn.permlanex16.var"]
    fn llvm_permlanex16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32;

    #[link_name = "llvm.amdgcn.wave.id"]
    safe fn llvm_wave_id() -> u32;

    // gfx950
    #[link_name = "llvm.amdgcn.permlane16.swap"]
    fn llvm_permlane16_swap(
        vdst_old: u32,
        vsrc_src0: u32,
        fi: bool,
        bound_control: bool,
    ) -> (u32, u32);

    // gfx950
    #[link_name = "llvm.amdgcn.permlane32.swap"]
    fn llvm_permlane32_swap(
        vdst_old: u32,
        vsrc_src0: u32,
        fi: bool,
        bound_control: bool,
    ) -> (u32, u32);
}

/// Returns the x coordinate of the workitem index within the workgroup.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn workitem_id_x() -> u32 {
    llvm_workitem_id_x()
}
/// Returns the y coordinate of the workitem index within the workgroup.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn workitem_id_y() -> u32 {
    llvm_workitem_id_y()
}
/// Returns the z coordinate of the workitem index within the workgroup.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn workitem_id_z() -> u32 {
    llvm_workitem_id_z()
}

/// Returns the x coordinate of the workgroup index within the dispatch.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn workgroup_id_x() -> u32 {
    llvm_workgroup_id_x()
}
/// Returns the y coordinate of the workgroup index within the dispatch.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn workgroup_id_y() -> u32 {
    llvm_workgroup_id_y()
}
/// Returns the z coordinate of the workgroup index within the dispatch.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn workgroup_id_z() -> u32 {
    llvm_workgroup_id_z()
}

/// Returns the size of statically allocated shared memory for this program in bytes.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn groupstaticsize() -> u32 {
    llvm_groupstaticsize()
}
/// Returns the id of the dispatch that is currently being executed.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn dispatch_id() -> u64 {
    llvm_dispatch_id()
}

/// Returns the number of threads in a wavefront.
///
/// The result is always a power of 2.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wavefrontsize() -> u32 {
    llvm_wavefrontsize()
}

/// Synchronize all wavefronts in a workgroup.
///
/// Each wavefront in a workgroup waits at the barrier until all wavefronts in the workgroup arrive at the barrier.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn s_barrier() {
    llvm_s_barrier()
}

/// Signal a specific barrier type.
///
/// Only for non-named barriers.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn s_barrier_signal<const BARRIER_TYPE: i32>() {
    unsafe { llvm_s_barrier_signal(BARRIER_TYPE) }
}

/// Signal a specific barrier type.
///
/// Only for non-named barriers.
/// Provides access to the `s_barrier_signal_first` instruction;
/// additionally ensures that the result value is valid even when
/// the intrinsic is used from a wavefront that is not running in a workgroup.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn s_barrier_signal_isfirst<const BARRIER_TYPE: i32>() -> bool {
    unsafe { llvm_s_barrier_signal_isfirst(BARRIER_TYPE) }
}

/// Wait for a specific barrier type.
///
/// Only for non-named barriers.
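///
/// For example, a sketch (illustrative only) of a split barrier that pairs a signal with a
/// later wait on the same non-named barrier type used by this module's tests:
///
/// ```rust
/// unsafe {
///     s_barrier_signal::<-1>();
///     // ... independent work that does not depend on other wavefronts ...
///     s_barrier_wait::<-1>();
/// }
/// ```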
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn s_barrier_wait<const BARRIER_TYPE: i16>() {
    unsafe { llvm_s_barrier_wait(BARRIER_TYPE) }
}

/// Get the state of a specific barrier type.
///
/// The `BARRIER_TYPE` argument must be uniform, otherwise behavior is undefined.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn s_get_barrier_state<const BARRIER_TYPE: i32>() -> u32 {
    unsafe { llvm_s_get_barrier_state(BARRIER_TYPE) }
}

/// A barrier for only the threads within the current wavefront.
///
/// Does not result in an instruction, but restricts code reordering by the compiler.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_barrier() {
    llvm_wave_barrier()
}

/// Prevent movement of some instruction types.
///
/// Controls the types of instructions that may be allowed to cross the intrinsic during instruction scheduling.
/// The parameter is a mask for the instruction types that can cross the intrinsic.
///
/// - 0x0000: No instructions may be scheduled across `sched_barrier`.
/// - 0x0001: All non-memory, non-side-effect-producing instructions may be scheduled across `sched_barrier`, i.e. allow ALU instructions to pass.
/// - 0x0002: VALU instructions may be scheduled across `sched_barrier`.
/// - 0x0004: SALU instructions may be scheduled across `sched_barrier`.
/// - 0x0008: MFMA/WMMA instructions may be scheduled across `sched_barrier`.
/// - 0x0010: All VMEM instructions may be scheduled across `sched_barrier`.
/// - 0x0020: VMEM read instructions may be scheduled across `sched_barrier`.
/// - 0x0040: VMEM write instructions may be scheduled across `sched_barrier`.
/// - 0x0080: All DS instructions may be scheduled across `sched_barrier`.
/// - 0x0100: All DS read instructions may be scheduled across `sched_barrier`.
/// - 0x0200: All DS write instructions may be scheduled across `sched_barrier`.
/// - 0x0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across `sched_barrier`.
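///
/// For example, a minimal sketch (illustrative only) of a scheduling fence that keeps
/// memory operations from being reordered across it while still letting ALU instructions
/// move freely:
///
/// ```rust
/// // Mask 0x0001: only non-memory, non-side-effect-producing instructions may cross.
/// unsafe { sched_barrier::<0x0001>() };
/// ```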
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn sched_barrier<const MASK: u32>() {
    static_assert_uimm_bits!(MASK, 11);
    unsafe { llvm_sched_barrier(MASK) }
}

/// Creates schedule groups with specific properties to build custom scheduling pipelines.
///
/// The ordering between groups is enforced by the instruction scheduler.
/// The intrinsic applies to the code that precedes the intrinsic.
/// The intrinsic takes three values that control the behavior of the schedule groups.
///
/// - `mask`: Classify instruction groups using the [`sched_barrier`] mask values.
/// - `size`: The number of instructions that are in the group.
/// - `sync_id`: Order is enforced between groups with matching values.
///
/// The mask can include multiple instruction types. It is undefined behavior to set values beyond the range of valid masks.
///
/// Combining multiple `sched_group_barrier` intrinsics enables an ordering of specific instruction types during instruction scheduling.
/// For example, the following enforces a sequence of 1 VMEM read, followed by 1 VALU instruction, followed by 5 MFMA instructions.
///
/// ```rust
/// // 1 VMEM read
/// unsafe { sched_group_barrier::<32, 1, 0>() };
/// // 1 VALU
/// unsafe { sched_group_barrier::<2, 1, 0>() };
/// // 5 MFMA
/// unsafe { sched_group_barrier::<8, 5, 0>() };
/// ```
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn sched_group_barrier<const MASK: u32, const SIZE: u32, const SYNC_ID: u32>() {
    static_assert_uimm_bits!(MASK, 11);
    unsafe { llvm_sched_group_barrier(MASK, SIZE, SYNC_ID) }
}

/// Sleeps for approximately `COUNT * 64` cycles.
///
/// `COUNT` must be a constant.
/// Only the lower 7 bits of `COUNT` are used.
/// If `COUNT == 0x8000`, sleeps forever until woken up or killed.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn s_sleep<const COUNT: u32>() {
    llvm_s_sleep(COUNT)
}

/// Stop execution of the kernel.
///
/// This usually signals an error state.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn s_sethalt<const VALUE: u32>() -> ! {
    static_assert_uimm_bits!(VALUE, 3);
    llvm_s_sethalt(VALUE)
}

/// Returns the current program counter.
///
/// Provides access to the `s_getpc_b64` instruction, but with the return value sign-extended
/// from the width of the underlying PC hardware register even on processors where the
/// `s_getpc_b64` instruction returns a zero-extended value.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn s_getpc() -> i64 {
    llvm_s_getpc()
}

/// Masked bit count, low 32 lanes.
///
/// Computes the number of bits set in `value`, masked with a thread mask
/// which contains 1 for all active threads less than the current thread within a wavefront.
/// `init` is added to the result.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn mbcnt_lo(value: u32, init: u32) -> u32 {
    llvm_mbcnt_lo(value, init)
}
/// Masked bit count, high 32 lanes.
///
/// Computes the number of bits set in `value`, masked with a thread mask
/// which contains 1 for all active threads less than the current thread within a wavefront.
/// `init` is added to the result.
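///
/// For example, a sketch (illustrative, assuming a wave64 target) of computing the position
/// of the current lane among the active lanes of the wavefront by combining [`ballot`] with
/// `mbcnt_lo` and `mbcnt_hi`:
///
/// ```rust
/// let exec = ballot(true); // mask of currently active lanes
/// let rank = mbcnt_hi((exec >> 32) as u32, mbcnt_lo(exec as u32, 0));
/// ```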
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn mbcnt_hi(value: u32, init: u32) -> u32 {
    llvm_mbcnt_hi(value, init)
}

/// Returns a bitfield containing the result of its `bool` argument
/// in all active lanes, and zero in all inactive lanes.
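///
/// For example, a sketch (illustrative only) of counting how many lanes in the wavefront
/// satisfy a predicate:
///
/// ```rust
/// let value = workitem_id_x();
/// let lanes_above_ten = ballot(value > 10).count_ones();
/// ```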
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn ballot(b: bool) -> u64 {
    llvm_ballot(b)
}

/// Indexes into `value` with the current lane id and returns, for each lane,
/// whether the corresponding bit is set.
///
/// While [`ballot`] converts a `bool` to a mask, `inverse_ballot` converts a mask back to a `bool`.
/// This means `inverse_ballot(ballot(b)) == b`.
/// The converse, `ballot(inverse_ballot(value)) == value`, does not always hold, as inactive lanes are set to zero by `ballot`.
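///
/// A sketch of the round-trip relationship (illustrative only):
///
/// ```rust
/// let flag = workitem_id_x() % 2 == 0;
/// let mask = ballot(flag);                // collect the flag of every active lane
/// assert_eq!(inverse_ballot(mask), flag); // each lane reads back its own bit
/// ```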
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn inverse_ballot(value: u64) -> bool {
    llvm_inverse_ballot(value)
}

/// Performs an arithmetic min reduction on the unsigned values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If the target does not support DPP operations (e.g. gfx6/7), the reduction is performed using the default iterative strategy.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_umin<const STRATEGY: u32>(value: u32) -> u32 {
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_umin(value, STRATEGY)
}
/// Performs an arithmetic min reduction on the signed values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If the target does not support DPP operations (e.g. gfx6/7), the reduction is performed using the default iterative strategy.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_min<const STRATEGY: u32>(value: i32) -> i32 {
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_min(value, STRATEGY)
}

/// Performs an arithmetic max reduction on the unsigned values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If the target does not support DPP operations (e.g. gfx6/7), the reduction is performed using the default iterative strategy.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_umax<const STRATEGY: u32>(value: u32) -> u32 {
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_umax(value, STRATEGY)
}
/// Performs an arithmetic max reduction on the signed values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If the target does not support DPP operations (e.g. gfx6/7), the reduction is performed using the default iterative strategy.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_max<const STRATEGY: u32>(value: i32) -> i32 {
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_max(value, STRATEGY)
}

/// Performs an arithmetic add reduction on the values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If the target does not support DPP operations (e.g. gfx6/7), the reduction is performed using the default iterative strategy.
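///
/// For example, a sketch (illustrative only) of summing one value per lane across the
/// wavefront using the default strategy:
///
/// ```rust
/// let per_lane = workitem_id_x();
/// let wave_sum = wave_reduce_add::<0>(per_lane); // reduced value, same in every active lane
/// ```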
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_add<const STRATEGY: u32>(value: u32) -> u32 {
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_add(value, STRATEGY)
}

/// Performs a bitwise and reduction on the unsigned values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If the target does not support DPP operations (e.g. gfx6/7), the reduction is performed using the default iterative strategy.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_and<const STRATEGY: u32>(value: u32) -> u32 {
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_and(value, STRATEGY)
}
/// Performs a bitwise or reduction on the unsigned values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If the target does not support DPP operations (e.g. gfx6/7), the reduction is performed using the default iterative strategy.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_or<const STRATEGY: u32>(value: u32) -> u32 {
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_or(value, STRATEGY)
}
/// Performs a bitwise xor reduction on the unsigned values provided by each lane in the wavefront.
///
/// The `STRATEGY` argument is a hint for the reduction strategy.
/// - 0: Target default preference
/// - 1: Iterative strategy
/// - 2: DPP
///
/// If the target does not support DPP operations (e.g. gfx6/7), the reduction is performed using the default iterative strategy.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_reduce_xor<const STRATEGY: u32>(value: u32) -> u32 {
    static_assert!(STRATEGY <= 2);
    llvm_wave_reduce_xor(value, STRATEGY)
}

// The following intrinsics can have multiple sizes

/// Get `value` from the first active lane in the wavefront.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn readfirstlane_u32(value: u32) -> u32 {
    llvm_readfirstlane_u32(value)
}
/// Get `value` from the first active lane in the wavefront.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn readfirstlane_u64(value: u64) -> u64 {
    llvm_readfirstlane_u64(value)
}
/// Get `value` from the lane at index `lane` in the wavefront.
///
/// The lane argument must be uniform across the currently active threads
/// of the current wavefront. Otherwise, the result is undefined.
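///
/// For example, a sketch (illustrative only) of broadcasting lane 0's value to the whole
/// wavefront:
///
/// ```rust
/// let local = workitem_id_x();
/// let from_lane0 = unsafe { readlane_u32(local, 0) };
/// ```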
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn readlane_u32(value: u32, lane: u32) -> u32 {
    unsafe { llvm_readlane_u32(value, lane) }
}
/// Get `value` from the lane at index `lane` in the wavefront.
///
/// The lane argument must be uniform across the currently active threads
/// of the current wavefront. Otherwise, the result is undefined.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn readlane_u64(value: u64, lane: u32) -> u64 {
    unsafe { llvm_readlane_u64(value, lane) }
}
/// Returns `value` for the lane at index `lane` in the wavefront
/// and `default` for all other lanes.
///
/// The value to write (`value`) and the lane select (`lane`) must be uniform across the
/// currently active threads of the current wavefront. Otherwise, the result is
/// undefined.
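///
/// For example, a sketch (illustrative only) that overrides lane 0's value and leaves all
/// other lanes unchanged:
///
/// ```rust
/// let local = workitem_id_x();
/// let patched = unsafe { writelane_u32(42, 0, local) }; // lane 0 sees 42, others see `local`
/// ```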
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn writelane_u32(value: u32, lane: u32, default: u32) -> u32 {
    unsafe { llvm_writelane_u32(value, lane, default) }
}
/// Returns `value` for the lane at index `lane` in the wavefront
/// and `default` for all other lanes.
///
/// The value to write (`value`) and the lane select (`lane`) must be uniform across the
/// currently active threads of the current wavefront. Otherwise, the result is
/// undefined.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn writelane_u64(value: u64, lane: u32, default: u64) -> u64 {
    unsafe { llvm_writelane_u64(value, lane, default) }
}

/// Stop execution of the wavefront.
///
/// This usually signals the end of a successful execution.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn endpgm() -> ! {
    llvm_endpgm()
}

/// The `update_dpp` intrinsic represents the `update.dpp` operation in AMDGPU.
/// It takes an old value, a source operand, a DPP control operand, a row mask, a bank mask, and a bound control.
/// This operation is equivalent to a sequence of `v_mov_b32` operations.
///
/// `llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>`
/// should be equivalent to:
/// ```asm
/// v_mov_b32 <dest> <old>
/// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
/// ```
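///
/// A minimal sketch of the call shape (illustrative only; the meaning of the `DPP_CTRL`,
/// `ROW_MASK`, and `BANK_MASK` encodings is hardware-defined, see the ISA documentation):
///
/// ```rust
/// let old = 0u32;
/// let src = workitem_id_x();
/// // All rows and banks enabled (0xf), bound control disabled; the DPP control value
/// // here is only a placeholder.
/// let result = unsafe { update_dpp::<0x0, 0xf, 0xf, false>(old, src) };
/// ```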
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn update_dpp<
    const DPP_CTRL: u32,
    const ROW_MASK: u32,
    const BANK_MASK: u32,
    const BOUND_CONTROL: bool,
>(
    old: u32,
    src: u32,
) -> u32 {
    unsafe { llvm_update_dpp(old, src, DPP_CTRL, ROW_MASK, BANK_MASK, BOUND_CONTROL) }
}

/// Measures time based on a fixed frequency.
///
/// Provides a real-time clock counter that runs at constant speed (typically 100 MHz) independent of ALU clock speeds.
/// The clock is consistent across the chip, so it can be used for measuring time across different wavefronts.
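///
/// For example, a sketch (illustrative only) of measuring the duration of a region of code
/// in ticks of the fixed-frequency counter:
///
/// ```rust
/// let start = s_memrealtime();
/// // ... work to be measured ...
/// let elapsed_ticks = s_memrealtime() - start;
/// ```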
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn s_memrealtime() -> u64 {
    llvm_s_memrealtime()
}

/// Scatter data across all lanes in a wavefront.
///
/// Writes `value` to the lane `lane`.
///
/// Reading from inactive lanes returns `0`.
/// If multiple values are written to the same `lane`, the value from the source lane with the highest index is taken.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn ds_permute(lane: u32, value: u32) -> u32 {
    unsafe { llvm_ds_permute(lane, value) }
}
/// Gather data across all lanes in a wavefront.
///
/// Returns the `value` that lane `lane` passed to `ds_bpermute`.
///
/// Reading from inactive lanes returns `0`.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn ds_bpermute(lane: u32, value: u32) -> u32 {
    unsafe { llvm_ds_bpermute(lane, value) }
}
/// Permute bytes of a 64-bit value.
///
/// `selector` selects how the bytes of the 64-bit value formed by `src0` and `src1` are permuted into the 32-bit result.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn perm(src0: u32, src1: u32, selector: u32) -> u32 {
    unsafe { llvm_perm(src0, src1, selector) }
}

// gfx10
/// Performs an arbitrary gather-style operation within a row (16 contiguous lanes) of the second input operand.
///
/// The third and fourth inputs must be uniform across the current wavefront.
/// These are combined into a single 64-bit value representing lane selects used to swizzle within each row.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn permlane16_u32<const FI: bool, const BOUND_CONTROL: bool>(
    old: u32,
    src0: u32,
    src1: u32,
    src2: u32,
) -> u32 {
    unsafe { llvm_permlane16_u32(old, src0, src1, src2, FI, BOUND_CONTROL) }
}

// gfx10
/// Performs an arbitrary gather-style operation across two rows (16 contiguous lanes each) of the second input operand.
///
/// The third and fourth inputs must be uniform across the current wavefront.
/// These are combined into a single 64-bit value representing lane selects used to swizzle within each row.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn permlanex16_u32<const FI: bool, const BOUND_CONTROL: bool>(
    old: u32,
    src0: u32,
    src1: u32,
    src2: u32,
) -> u32 {
    unsafe { llvm_permlanex16_u32(old, src0, src1, src2, FI, BOUND_CONTROL) }
}

/// Get the index of the current wavefront in the workgroup.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn s_get_waveid_in_workgroup() -> u32 {
    llvm_s_get_waveid_in_workgroup()
}

// gfx11
/// Swap `value` between the upper and lower 32 lanes in a wavefront.
///
/// Does nothing for wave32.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn permlane64_u32(value: u32) -> u32 {
    unsafe { llvm_permlane64_u32(value) }
}

// gfx12
/// Performs an arbitrary gather-style operation within a row (16 contiguous lanes) of the second input operand.
///
/// In contrast to [`permlane16_u32`], allows each lane to specify its own gather lane.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn permlane16_var<const FI: bool, const BOUND_CONTROL: bool>(
    old: u32,
    src0: u32,
    src1: u32,
) -> u32 {
    unsafe { llvm_permlane16_var(old, src0, src1, FI, BOUND_CONTROL) }
}

// gfx12
/// Performs an arbitrary gather-style operation across two rows (16 contiguous lanes each) of the second input operand.
///
/// In contrast to [`permlanex16_u32`], allows each lane to specify its own gather lane.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn permlanex16_var<const FI: bool, const BOUND_CONTROL: bool>(
    old: u32,
    src0: u32,
    src1: u32,
) -> u32 {
    unsafe { llvm_permlanex16_var(old, src0, src1, FI, BOUND_CONTROL) }
}

/// Get the index of the current wavefront in the workgroup.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub fn wave_id() -> u32 {
    llvm_wave_id()
}

// gfx950
/// Provides direct access to the `v_permlane16_swap_b32` instruction on supported targets.
///
/// Swaps the values across lanes of the first two operands.
/// Odd rows of the first operand are swapped with even rows of the second operand (one row is 16 lanes).
/// Returns a pair of the swapped registers.
/// The first element of the returned pair corresponds to the swapped element of the first argument.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn permlane16_swap<const FI: bool, const BOUND_CONTROL: bool>(
    vdst_old: u32,
    vsrc_src0: u32,
) -> (u32, u32) {
    unsafe { llvm_permlane16_swap(vdst_old, vsrc_src0, FI, BOUND_CONTROL) }
}

// gfx950
/// Provides direct access to the `v_permlane32_swap_b32` instruction on supported targets.
///
/// Swaps the values across lanes of the first two operands.
/// Rows 2 and 3 of the first operand are swapped with rows 0 and 1 of the second operand (one row is 16 lanes).
/// Returns a pair of the swapped registers.
/// The first element of the returned pair corresponds to the swapped element of the first argument.
#[inline]
#[unstable(feature = "stdarch_amdgpu", issue = "149988")]
pub unsafe fn permlane32_swap<const FI: bool, const BOUND_CONTROL: bool>(
    vdst_old: u32,
    vsrc_src0: u32,
) -> (u32, u32) {
    unsafe { llvm_permlane32_swap(vdst_old, vsrc_src0, FI, BOUND_CONTROL) }
}

// Functions to generate code, used to check that the intrinsics build.
// Marked as no_mangle, so the compiler does not remove the functions.
// To test, uncomment the `#[cfg(test)]` line below and run
// NORUN=1 NOSTD=1 TARGET=amdgcn-amd-amdhsa CARGO_UNSTABLE_BUILD_STD=core ci/run.sh
//
// Note that depending on the target-cpu set in run.sh, some of these intrinsics are not available
// and compilation fails with `Cannot select: intrinsic %llvm.amdgcn...`.
// Uncomment these intrinsics to check.
#[cfg(test)]
mod tests {
    use super::*;

    #[unsafe(no_mangle)]
    fn test_workitem_id_x() -> u32 {
        workitem_id_x()
    }
    #[unsafe(no_mangle)]
    fn test_workitem_id_y() -> u32 {
        workitem_id_y()
    }
    #[unsafe(no_mangle)]
    fn test_workitem_id_z() -> u32 {
        workitem_id_z()
    }

    #[unsafe(no_mangle)]
    fn test_workgroup_id_x() -> u32 {
        workgroup_id_x()
    }
    #[unsafe(no_mangle)]
    fn test_workgroup_id_y() -> u32 {
        workgroup_id_y()
    }
    #[unsafe(no_mangle)]
    fn test_workgroup_id_z() -> u32 {
        workgroup_id_z()
    }

    #[unsafe(no_mangle)]
    fn test_groupstaticsize() -> u32 {
        groupstaticsize()
    }
    #[unsafe(no_mangle)]
    fn test_dispatch_id() -> u64 {
        dispatch_id()
    }

    #[unsafe(no_mangle)]
    fn test_wavefrontsize() -> u32 {
        wavefrontsize()
    }

    #[unsafe(no_mangle)]
    fn test_s_barrier() {
        s_barrier()
    }

    #[unsafe(no_mangle)]
    fn test_s_barrier_signal() {
        unsafe { s_barrier_signal::<-1>() }
    }

    #[unsafe(no_mangle)]
    fn test_s_barrier_signal_isfirst() -> bool {
        unsafe { s_barrier_signal_isfirst::<-1>() }
    }

    #[unsafe(no_mangle)]
    fn test_s_barrier_wait() {
        unsafe { s_barrier_wait::<-1>() }
    }

    #[unsafe(no_mangle)]
    fn test_s_get_barrier_state() -> u32 {
        unsafe { s_get_barrier_state::<-1>() }
    }

    #[unsafe(no_mangle)]
    fn test_wave_barrier() {
        wave_barrier()
    }

    #[unsafe(no_mangle)]
    fn test_sched_barrier() {
        unsafe { sched_barrier::<1>() }
    }

    #[unsafe(no_mangle)]
    fn test_sched_group_barrier() {
        unsafe { sched_group_barrier::<1, 1, 0>() }
    }

    #[unsafe(no_mangle)]
    fn test_s_sleep() {
        s_sleep::<1>()
    }

    #[unsafe(no_mangle)]
    fn test_s_sethalt() -> ! {
        s_sethalt::<1>()
    }

    #[unsafe(no_mangle)]
    fn test_s_getpc() -> i64 {
        s_getpc()
    }

    #[unsafe(no_mangle)]
    fn test_mbcnt_lo(value: u32, init: u32) -> u32 {
        mbcnt_lo(value, init)
    }
    #[unsafe(no_mangle)]
    fn test_mbcnt_hi(value: u32, init: u32) -> u32 {
        mbcnt_hi(value, init)
    }

    #[unsafe(no_mangle)]
    fn test_ballot(b: bool) -> u64 {
        ballot(b)
    }

    #[unsafe(no_mangle)]
    fn test_inverse_ballot(value: u64) -> bool {
        inverse_ballot(value)
    }

    #[unsafe(no_mangle)]
    fn test_wave_reduce_umin(value: u32) -> u32 {
        wave_reduce_umin::<0>(value)
    }
    #[unsafe(no_mangle)]
    fn test_wave_reduce_min(value: i32) -> i32 {
        wave_reduce_min::<0>(value)
    }

    #[unsafe(no_mangle)]
    fn test_wave_reduce_umax(value: u32) -> u32 {
        wave_reduce_umax::<0>(value)
    }
    #[unsafe(no_mangle)]
    fn test_wave_reduce_max(value: i32) -> i32 {
        wave_reduce_max::<0>(value)
    }

    #[unsafe(no_mangle)]
    fn test_wave_reduce_add(value: u32) -> u32 {
        wave_reduce_add::<0>(value)
    }

    #[unsafe(no_mangle)]
    fn test_wave_reduce_and(value: u32) -> u32 {
        wave_reduce_and::<0>(value)
    }
    #[unsafe(no_mangle)]
    fn test_wave_reduce_or(value: u32) -> u32 {
        wave_reduce_or::<0>(value)
    }
    #[unsafe(no_mangle)]
    fn test_wave_reduce_xor(value: u32) -> u32 {
        wave_reduce_xor::<0>(value)
    }

    #[unsafe(no_mangle)]
    fn test_readfirstlane_u32(value: u32) -> u32 {
        readfirstlane_u32(value)
    }
    #[unsafe(no_mangle)]
    fn test_readfirstlane_u64(value: u64) -> u64 {
        readfirstlane_u64(value)
    }
    #[unsafe(no_mangle)]
    fn test_readlane_u32(value: u32, lane: u32) -> u32 {
        unsafe { readlane_u32(value, lane) }
    }
    #[unsafe(no_mangle)]
    fn test_readlane_u64(value: u64, lane: u32) -> u64 {
        unsafe { readlane_u64(value, lane) }
    }
    #[unsafe(no_mangle)]
    fn test_writelane_u32(value: u32, lane: u32, default: u32) -> u32 {
        unsafe { writelane_u32(value, lane, default) }
    }
    #[unsafe(no_mangle)]
    fn test_writelane_u64(value: u64, lane: u32, default: u64) -> u64 {
        unsafe { writelane_u64(value, lane, default) }
    }

    #[unsafe(no_mangle)]
    fn test_endpgm() -> ! {
        endpgm()
    }

    #[unsafe(no_mangle)]
    fn test_update_dpp(old: u32, src: u32) -> u32 {
        unsafe { update_dpp::<0, 0, 0, true>(old, src) }
    }

    #[unsafe(no_mangle)]
    fn test_s_memrealtime() -> u64 {
        s_memrealtime()
    }

    #[unsafe(no_mangle)]
    fn test_ds_permute(lane: u32, value: u32) -> u32 {
        unsafe { ds_permute(lane, value) }
    }
    #[unsafe(no_mangle)]
    fn test_ds_bpermute(lane: u32, value: u32) -> u32 {
        unsafe { ds_bpermute(lane, value) }
    }
    #[unsafe(no_mangle)]
    fn test_perm(src0: u32, src1: u32, selector: u32) -> u32 {
        unsafe { perm(src0, src1, selector) }
    }

    #[unsafe(no_mangle)]
    fn test_permlane16_u32(old: u32, src0: u32, src1: u32, src2: u32) -> u32 {
        unsafe { permlane16_u32::<false, true>(old, src0, src1, src2) }
    }

    #[unsafe(no_mangle)]
    fn test_permlanex16_u32(old: u32, src0: u32, src1: u32, src2: u32) -> u32 {
        unsafe { permlanex16_u32::<false, true>(old, src0, src1, src2) }
    }

    #[unsafe(no_mangle)]
    fn test_s_get_waveid_in_workgroup() -> u32 {
        s_get_waveid_in_workgroup()
    }

    #[unsafe(no_mangle)]
    fn test_permlane64_u32(value: u32) -> u32 {
        unsafe { permlane64_u32(value) }
    }

    #[unsafe(no_mangle)]
    fn test_permlane16_var(old: u32, src0: u32, src1: u32) -> u32 {
        unsafe { permlane16_var::<false, true>(old, src0, src1) }
    }

    #[unsafe(no_mangle)]
    fn test_permlanex16_var(old: u32, src0: u32, src1: u32) -> u32 {
        unsafe { permlanex16_var::<false, true>(old, src0, src1) }
    }

    #[unsafe(no_mangle)]
    fn test_wave_id() -> u32 {
        wave_id()
    }

    #[unsafe(no_mangle)]
    fn test_permlane16_swap(vdst_old: u32, vsrc_src0: u32) -> (u32, u32) {
        unsafe { permlane16_swap::<false, true>(vdst_old, vsrc_src0) }
    }

    #[unsafe(no_mangle)]
    fn test_permlane32_swap(vdst_old: u32, vsrc_src0: u32) -> (u32, u32) {
        unsafe { permlane32_swap::<false, true>(vdst_old, vsrc_src0) }
    }
}