// core/slice/sort/shared/smallsort.rs
//! This module contains a variety of sort implementations that are optimized for small lengths.
2
use crate::mem::{self, ManuallyDrop, MaybeUninit};
use crate::slice::sort::shared::FreezeMarker;
use crate::{hint, intrinsics, ptr, slice};
6
// It's important to differentiate between SMALL_SORT_THRESHOLD performance for
// small slices and small-sort performance sorting small sub-slices as part of
// the main quicksort loop. For the former, testing showed that the
// representative benchmarks for real-world performance are cold CPU state and
// not single-size hot benchmarks. For the latter the CPU will call them many
// times, so hot benchmarks are fine and more realistic. And it's worth it to
// optimize sorting small sub-slices with more sophisticated solutions than
// insertion sort.
15
/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
pub(crate) trait StableSmallSortTypeImpl: Sized {
    /// For which input length <= return value of this function, is it valid to call `small_sort`.
    fn small_sort_threshold() -> usize;

    /// Sorts `v` using strategies optimized for small sizes.
    ///
    /// `scratch` is auxiliary storage; the specialized `Freeze` implementation
    /// requires it to hold at least `v.len() + 16` elements (it aborts otherwise),
    /// while the fallback implementation ignores it entirely.
    fn small_sort<F: FnMut(&Self, &Self) -> bool>(
        v: &mut [Self],
        scratch: &mut [MaybeUninit<Self>],
        is_less: &mut F,
    );
}
29
30impl<T> StableSmallSortTypeImpl for T {
31    #[inline(always)]
32    default fn small_sort_threshold() -> usize {
33        // Optimal number of comparisons, and good perf.
34        SMALL_SORT_FALLBACK_THRESHOLD
35    }
36
37    #[inline(always)]
38    default fn small_sort<F: FnMut(&T, &T) -> bool>(
39        v: &mut [T],
40        _scratch: &mut [MaybeUninit<T>],
41        is_less: &mut F,
42    ) {
43        if v.len() >= 2 {
44            insertion_sort_shift_left(v, 1, is_less);
45        }
46    }
47}
48
49impl<T: FreezeMarker> StableSmallSortTypeImpl for T {
50    #[inline(always)]
51    fn small_sort_threshold() -> usize {
52        SMALL_SORT_GENERAL_THRESHOLD
53    }
54
55    #[inline(always)]
56    fn small_sort<F: FnMut(&T, &T) -> bool>(
57        v: &mut [T],
58        scratch: &mut [MaybeUninit<T>],
59        is_less: &mut F,
60    ) {
61        small_sort_general_with_scratch(v, scratch, is_less);
62    }
63}
64
/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
///
/// Unlike [`StableSmallSortTypeImpl`], the unstable small-sorts take no scratch
/// buffer parameter; implementations that need scratch space allocate it on the
/// stack internally.
pub(crate) trait UnstableSmallSortTypeImpl: Sized {
    /// For which input length <= return value of this function, is it valid to call `small_sort`.
    fn small_sort_threshold() -> usize;

    /// Sorts `v` using strategies optimized for small sizes.
    fn small_sort<F: FnMut(&Self, &Self) -> bool>(v: &mut [Self], is_less: &mut F);
}
74
75impl<T> UnstableSmallSortTypeImpl for T {
76    #[inline(always)]
77    default fn small_sort_threshold() -> usize {
78        SMALL_SORT_FALLBACK_THRESHOLD
79    }
80
81    #[inline(always)]
82    default fn small_sort<F>(v: &mut [T], is_less: &mut F)
83    where
84        F: FnMut(&T, &T) -> bool,
85    {
86        small_sort_fallback(v, is_less);
87    }
88}
89
90impl<T: FreezeMarker> UnstableSmallSortTypeImpl for T {
91    #[inline(always)]
92    fn small_sort_threshold() -> usize {
93        <T as UnstableSmallSortFreezeTypeImpl>::small_sort_threshold()
94    }
95
96    #[inline(always)]
97    fn small_sort<F>(v: &mut [T], is_less: &mut F)
98    where
99        F: FnMut(&T, &T) -> bool,
100    {
101        <T as UnstableSmallSortFreezeTypeImpl>::small_sort(v, is_less);
102    }
103}
104
/// FIXME(const_trait_impl) use original ipnsort approach with choose_unstable_small_sort,
/// as found here <https://github.com/Voultapher/sort-research-rs/blob/438fad5d0495f65d4b72aa87f0b62fc96611dff3/ipnsort/src/smallsort.rs#L83C10-L83C36>.
pub(crate) trait UnstableSmallSortFreezeTypeImpl: Sized + FreezeMarker {
    /// For which input length <= return value of this function, is it valid to call `small_sort`.
    fn small_sort_threshold() -> usize;

    /// Sorts `v` using strategies optimized for small sizes.
    fn small_sort<F: FnMut(&Self, &Self) -> bool>(v: &mut [Self], is_less: &mut F);
}
112
impl<T: FreezeMarker> UnstableSmallSortFreezeTypeImpl for T {
    #[inline(always)]
    default fn small_sort_threshold() -> usize {
        // Use the general small-sort only if its scratch array fits within the
        // stack-size limit; otherwise fall back to plain insertion sort.
        if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            SMALL_SORT_GENERAL_THRESHOLD
        } else {
            SMALL_SORT_FALLBACK_THRESHOLD
        }
    }

    #[inline(always)]
    default fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        // This condition mirrors `small_sort_threshold` above, so the chosen
        // strategy always matches the advertised threshold.
        if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            small_sort_general(v, is_less);
        } else {
            small_sort_fallback(v, is_less);
        }
    }
}
135
/// SAFETY: Only used for run-time optimization heuristic.
///
/// Marker trait that lets the impl below specialize on `Copy` types; choosing a
/// different small-sort for `Copy` types only changes performance, not results.
#[rustc_unsafe_specialization_marker]
trait CopyMarker {}

impl<T: Copy> CopyMarker for T {}
141
impl<T: FreezeMarker + CopyMarker> UnstableSmallSortFreezeTypeImpl for T {
    #[inline(always)]
    fn small_sort_threshold() -> usize {
        // Prefer the sorting-network small-sort for cheaply swappable `Copy`
        // types, then the general small-sort, then the insertion-sort fallback —
        // each gated on the required stack array fitting within the size limit.
        if has_efficient_in_place_swap::<T>()
            && (size_of::<T>() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE
        {
            SMALL_SORT_NETWORK_THRESHOLD
        } else if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            SMALL_SORT_GENERAL_THRESHOLD
        } else {
            SMALL_SORT_FALLBACK_THRESHOLD
        }
    }

    #[inline(always)]
    fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        // This cascade mirrors `small_sort_threshold` above, so the chosen
        // strategy always matches the advertised threshold.
        if has_efficient_in_place_swap::<T>()
            && (size_of::<T>() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE
        {
            small_sort_network(v, is_less);
        } else if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            small_sort_general(v, is_less);
        } else {
            small_sort_fallback(v, is_less);
        }
    }
}
172
/// Optimal number of comparisons, and good perf.
const SMALL_SORT_FALLBACK_THRESHOLD: usize = 16;

/// From a comparison perspective 20 was ~2% more efficient for fully random input, but for
/// wall-clock performance choosing 32 yielded better performance overall.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] !
const SMALL_SORT_GENERAL_THRESHOLD: usize = 32;

/// [`small_sort_general`] uses [`sort8_stable`] as primitive and does a kind of ping-pong merge,
/// where the output of the first two [`sort8_stable`] calls is stored at the end of the scratch
/// buffer. This simplifies panic handling and avoids additional copies. This affects the required
/// scratch buffer size.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] !
pub(crate) const SMALL_SORT_GENERAL_SCRATCH_LEN: usize = SMALL_SORT_GENERAL_THRESHOLD + 16;

/// SAFETY: If you change this value, you have to adjust [`small_sort_network`] !
const SMALL_SORT_NETWORK_THRESHOLD: usize = 32;
/// Scratch length used by [`small_sort_network`] for the final merge; it must
/// be able to hold the full input, hence equal to the threshold.
const SMALL_SORT_NETWORK_SCRATCH_LEN: usize = SMALL_SORT_NETWORK_THRESHOLD;

/// Using a stack array, could cause a stack overflow if the type `T` is very large. To be
/// conservative we limit the usage of small-sorts that require a stack array to types that fit
/// within this limit.
const MAX_STACK_ARRAY_SIZE: usize = 4096;
198
199fn small_sort_fallback<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
200    if v.len() >= 2 {
201        insertion_sort_shift_left(v, 1, is_less);
202    }
203}
204
/// Sorts `v` via [`small_sort_general_with_scratch`], providing the required
/// scratch space as an uninitialized stack array.
///
/// Callers gate this on `size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN`
/// fitting within [`MAX_STACK_ARRAY_SIZE`] to avoid stack overflow.
fn small_sort_general<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    let mut stack_array = MaybeUninit::<[T; SMALL_SORT_GENERAL_SCRATCH_LEN]>::uninit();

    // SAFETY: The memory is backed by `stack_array`, and the operation is safe as long as the len
    // is the same.
    let scratch = unsafe {
        slice::from_raw_parts_mut(
            stack_array.as_mut_ptr() as *mut MaybeUninit<T>,
            SMALL_SORT_GENERAL_SCRATCH_LEN,
        )
    };

    small_sort_general_with_scratch(v, scratch, is_less);
}
219
/// Stable small-sort for `Freeze` types: sorts each half of `v` into `scratch`
/// (seeding with [`sort8_stable`] or [`sort4_stable`], then extending by
/// insertion), and finally merges the two sorted halves back into `v` with
/// [`bidirectional_merge`].
///
/// Aborts if `scratch` is shorter than `v.len() + 16`; the extra 16 elements at
/// the end of `scratch` serve as temporary output for the two `sort8_stable`
/// calls (ping-pong style), which avoids extra copies and keeps panic handling
/// simple.
fn small_sort_general_with_scratch<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    let len = v.len();
    if len < 2 {
        return;
    }

    // Too-small scratch would make the pointer arithmetic below go out of
    // bounds, so this is a hard error rather than a recoverable one.
    if scratch.len() < len + 16 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();
    let len_div_2 = len / 2;

    // SAFETY: See individual comments.
    unsafe {
        let scratch_base = scratch.as_mut_ptr() as *mut T;

        // Seed each half with a presorted prefix of 8, 4 or 1 element(s),
        // depending on input length and element size.
        let presorted_len = if const { size_of::<T>() <= 16 } && len >= 16 {
            // SAFETY: scratch_base is valid and has enough space.
            sort8_stable(v_base, scratch_base, scratch_base.add(len), is_less);
            sort8_stable(
                v_base.add(len_div_2),
                scratch_base.add(len_div_2),
                scratch_base.add(len + 8),
                is_less,
            );

            8
        } else if len >= 8 {
            // SAFETY: scratch_base is valid and has enough space.
            sort4_stable(v_base, scratch_base, is_less);
            sort4_stable(v_base.add(len_div_2), scratch_base.add(len_div_2), is_less);

            4
        } else {
            ptr::copy_nonoverlapping(v_base, scratch_base, 1);
            ptr::copy_nonoverlapping(v_base.add(len_div_2), scratch_base.add(len_div_2), 1);

            1
        };

        // Grow each presorted prefix to cover its whole half via insertion.
        for offset in [0, len_div_2] {
            // SAFETY: at this point dst is initialized with presorted_len elements.
            // We extend this to desired_len, src is valid for desired_len elements.
            let src = v_base.add(offset);
            let dst = scratch_base.add(offset);
            let desired_len = if offset == 0 { len_div_2 } else { len - len_div_2 };

            for i in presorted_len..desired_len {
                ptr::copy_nonoverlapping(src.add(i), dst.add(i), 1);
                insert_tail(dst, dst.add(i), is_less);
            }
        }

        // SAFETY: see comment in `CopyOnDrop::drop`.
        let drop_guard = CopyOnDrop { src: scratch_base, dst: v_base, len };

        // SAFETY: at this point scratch_base is fully initialized, allowing us
        // to use it as the source of our merge back into the original array.
        // If a panic occurs we ensure the original array is restored to a valid
        // permutation of the input through drop_guard. This technique is similar
        // to ping-pong merging.
        bidirectional_merge(
            slice::from_raw_parts(drop_guard.src, drop_guard.len),
            drop_guard.dst,
            is_less,
        );
        mem::forget(drop_guard);
    }
}
294
/// Panic-safety guard: on drop, copies `len` elements from `src` to `dst`.
/// Used to restore a slice to a valid permutation of its input if a
/// user-provided comparison function panics mid-operation.
struct CopyOnDrop<T> {
    // Source of the copy performed on drop; must stay initialized for `len` elements.
    src: *const T,
    // Destination of the copy performed on drop; must stay writable for `len` elements.
    dst: *mut T,
    // Number of elements copied on drop.
    len: usize,
}
300
impl<T> Drop for CopyOnDrop<T> {
    /// Copies `len` elements from `src` to `dst`. Runs both on normal scope
    /// exit and during unwinding; callers `mem::forget` the guard when the
    /// restore copy is not wanted.
    fn drop(&mut self) {
        // SAFETY: `src` must contain `len` initialized elements, and dst must
        // be valid to write `len` elements.
        unsafe {
            ptr::copy_nonoverlapping(self.src, self.dst, self.len);
        }
    }
}
310
/// Unstable small-sort for cheaply swappable `Copy` types: sorts `v` (or each
/// half of it) with a fixed sorting network plus insertion sort, then — for
/// longer inputs — merges the two sorted halves through a stack scratch array.
///
/// Aborts if `v.len()` exceeds [`SMALL_SORT_NETWORK_SCRATCH_LEN`], since the
/// final merge writes the full input into the stack array.
fn small_sort_network<T, F>(v: &mut [T], is_less: &mut F)
where
    T: FreezeMarker,
    F: FnMut(&T, &T) -> bool,
{
    // This implementation is tuned to be efficient for integer types.

    let len = v.len();
    if len < 2 {
        return;
    }

    if len > SMALL_SORT_NETWORK_SCRATCH_LEN {
        intrinsics::abort();
    }

    let mut stack_array = MaybeUninit::<[T; SMALL_SORT_NETWORK_SCRATCH_LEN]>::uninit();

    let len_div_2 = len / 2;
    // Below 18 elements a single network + insertion pass over the whole slice
    // is used, skipping the split-and-merge step entirely.
    let no_merge = len < 18;

    let v_base = v.as_mut_ptr();
    let initial_region_len = if no_merge { len } else { len_div_2 };
    // SAFETY: Both possible values of `initial_region_len` are in-bounds.
    let mut region = unsafe { slice::from_raw_parts_mut(v_base, initial_region_len) };

    // Avoid compiler unrolling, we *really* don't want that to happen here for binary-size reasons.
    loop {
        // Seed the region with an optimally network-sorted prefix when it is
        // long enough, then finish it with insertion sort.
        let presorted_len = if region.len() >= 13 {
            sort13_optimal(region, is_less);
            13
        } else if region.len() >= 9 {
            sort9_optimal(region, is_less);
            9
        } else {
            1
        };

        insertion_sort_shift_left(region, presorted_len, is_less);

        if no_merge {
            return;
        }

        // After the second (right-half) region is sorted, exit the loop to merge.
        if region.as_ptr() != v_base {
            break;
        }

        // SAFETY: The right side of `v` based on `len_div_2` is guaranteed in-bounds.
        unsafe { region = slice::from_raw_parts_mut(v_base.add(len_div_2), len - len_div_2) };
    }

    // SAFETY: We checked that T is Freeze and thus observation safe.
    // Should is_less panic v was not modified in parity_merge and retains it's original input.
    // scratch and v must not alias and scratch has v.len() space.
    unsafe {
        let scratch_base = stack_array.as_mut_ptr() as *mut T;
        bidirectional_merge(slice::from_raw_parts_mut(v_base, len), scratch_base, is_less);
        ptr::copy_nonoverlapping(scratch_base, v_base, len);
    }
}
372
/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
/// value at position `b_pos` is less than the one at position `a_pos`.
///
/// Purposefully not marked `#[inline]`, despite us wanting it to be inlined for integers like
/// types. `is_less` could be a huge function and we want to give the compiler an option to
/// not inline this function. For the same reasons that this function is very perf critical
/// it should be in the same module as the functions that use it.
///
/// # Safety
/// `a_pos` and `b_pos` each added to `v_base` must yield valid, properly aligned
/// pointers into the same allocation (see the inline SAFETY comment below).
unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // SAFETY: the caller must guarantee that `a_pos` and `b_pos` each added to `v_base` yield valid
    // pointers into `v_base`, and are properly aligned, and part of the same allocation.
    unsafe {
        let v_a = v_base.add(a_pos);
        let v_b = v_base.add(b_pos);

        // PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should still be
        // in a well defined state, without duplicates.

        // Important to only swap if it is more and not if it is equal. is_less should return false for
        // equal, so we don't swap.
        let should_swap = is_less(&*v_b, &*v_a);

        // This is a branchless version of swap if.
        // The equivalent code with a branch would be:
        //
        // if should_swap {
        //     ptr::swap(v_a, v_b, 1);
        // }

        // The goal is to generate cmov instructions here.
        let v_a_swap = hint::select_unpredictable(should_swap, v_b, v_a);
        let v_b_swap = hint::select_unpredictable(should_swap, v_a, v_b);

        // Read the value that will land in `v_b` first, so both writes below are
        // sourced from stable locations regardless of `should_swap`.
        let v_b_swap_tmp = ManuallyDrop::new(ptr::read(v_b_swap));
        ptr::copy(v_a_swap, v_a, 1);
        ptr::copy_nonoverlapping(&*v_b_swap_tmp, v_b, 1);
    }
}
413
/// Sorts the first 9 elements of `v` with a fast fixed function.
///
/// Uses an optimal 9-input sorting network (25 compare-exchange operations).
/// Aborts if `v` has fewer than 9 elements.
///
/// Should `is_less` generate substantial amounts of code the compiler can choose to not inline
/// `swap_if_less`. If the code of a sort impl changes so as to call this function in multiple
/// places, `#[inline(never)]` is recommended to keep binary-size in check. The current design of
/// `small_sort_network` makes sure to only call this once.
fn sort9_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    if v.len() < 9 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();

    // Optimal sorting network see:
    // https://bertdobbelaere.github.io/sorting_networks.html.

    // SAFETY: We checked the len.
    unsafe {
        swap_if_less(v_base, 0, 3, is_less);
        swap_if_less(v_base, 1, 7, is_less);
        swap_if_less(v_base, 2, 5, is_less);
        swap_if_less(v_base, 4, 8, is_less);
        swap_if_less(v_base, 0, 7, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 3, 8, is_less);
        swap_if_less(v_base, 5, 6, is_less);
        swap_if_less(v_base, 0, 2, is_less);
        swap_if_less(v_base, 1, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 1, 4, is_less);
        swap_if_less(v_base, 3, 6, is_less);
        swap_if_less(v_base, 5, 7, is_less);
        swap_if_less(v_base, 0, 1, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 3, 5, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 6, 7, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
    }
}
462
/// Sorts the first 13 elements of `v` with a fast fixed function.
///
/// Uses an optimal 13-input sorting network (45 compare-exchange operations).
/// Aborts if `v` has fewer than 13 elements.
///
/// Should `is_less` generate substantial amounts of code the compiler can choose to not inline
/// `swap_if_less`. If the code of a sort impl changes so as to call this function in multiple
/// places, `#[inline(never)]` is recommended to keep binary-size in check. The current design of
/// `small_sort_network` makes sure to only call this once.
fn sort13_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    if v.len() < 13 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();

    // Optimal sorting network see:
    // https://bertdobbelaere.github.io/sorting_networks.html.

    // SAFETY: We checked the len.
    unsafe {
        swap_if_less(v_base, 0, 12, is_less);
        swap_if_less(v_base, 1, 10, is_less);
        swap_if_less(v_base, 2, 9, is_less);
        swap_if_less(v_base, 3, 7, is_less);
        swap_if_less(v_base, 5, 11, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 1, 6, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 11, is_less);
        swap_if_less(v_base, 7, 9, is_less);
        swap_if_less(v_base, 8, 10, is_less);
        swap_if_less(v_base, 0, 4, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 6, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 11, 12, is_less);
        swap_if_less(v_base, 4, 6, is_less);
        swap_if_less(v_base, 5, 9, is_less);
        swap_if_less(v_base, 8, 11, is_less);
        swap_if_less(v_base, 10, 12, is_less);
        swap_if_less(v_base, 0, 5, is_less);
        swap_if_less(v_base, 3, 8, is_less);
        swap_if_less(v_base, 4, 7, is_less);
        swap_if_less(v_base, 6, 11, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 0, 1, is_less);
        swap_if_less(v_base, 2, 5, is_less);
        swap_if_less(v_base, 6, 9, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 10, 11, is_less);
        swap_if_less(v_base, 1, 3, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 7, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 6, 7, is_less);
        swap_if_less(v_base, 8, 9, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
    }
}
531
/// Sorts range [begin, tail] assuming [begin, tail) is already sorted.
///
/// Uses a gap-guard technique: the tail element is lifted out once, larger
/// elements are shifted right into the gap, and a [`CopyOnDrop`] guard makes
/// sure the lifted element lands back in the array even if `is_less` panics.
///
/// # Safety
/// begin < tail and p must be valid and initialized for all begin <= p <= tail.
unsafe fn insert_tail<T, F: FnMut(&T, &T) -> bool>(begin: *mut T, tail: *mut T, is_less: &mut F) {
    // SAFETY: see individual comments.
    unsafe {
        // SAFETY: in-bounds as tail > begin.
        let mut sift = tail.sub(1);
        // Fast path: the tail element is already in its final position.
        if !is_less(&*tail, &*sift) {
            return;
        }

        // SAFETY: after this read tail is never read from again, as we only ever
        // read from sift, sift < tail and we only ever decrease sift. Thus this is
        // effectively a move, not a copy. Should a panic occur, or we have found
        // the correct insertion position, gap_guard ensures the element is moved
        // back into the array.
        let tmp = ManuallyDrop::new(tail.read());
        let mut gap_guard = CopyOnDrop { src: &*tmp, dst: tail, len: 1 };

        loop {
            // SAFETY: we move sift into the gap (which is valid), and point the
            // gap guard destination at sift, ensuring that if a panic occurs the
            // gap is once again filled.
            ptr::copy_nonoverlapping(sift, gap_guard.dst, 1);
            gap_guard.dst = sift;

            if sift == begin {
                break;
            }

            // SAFETY: we checked that sift != begin, thus this is in-bounds.
            sift = sift.sub(1);
            if !is_less(&tmp, &*sift) {
                break;
            }
        }
    }
}
572
/// Sort `v` assuming `v[..offset]` is already sorted.
///
/// Each element in `v[offset..]` is inserted into the sorted prefix via
/// [`insert_tail`]. Aborts if `offset` is 0 or greater than `v.len()`, since
/// either would break the pointer arithmetic below.
pub fn insertion_sort_shift_left<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    offset: usize,
    is_less: &mut F,
) {
    let len = v.len();
    if offset == 0 || offset > len {
        intrinsics::abort();
    }

    // SAFETY: see individual comments.
    unsafe {
        // We write this basic loop directly using pointers, as when we use a
        // for loop LLVM likes to unroll this loop which we do not want.
        // SAFETY: v_end is the one-past-end pointer, and we checked that
        // offset <= len, thus tail is also in-bounds.
        let v_base = v.as_mut_ptr();
        let v_end = v_base.add(len);
        let mut tail = v_base.add(offset);
        while tail != v_end {
            // SAFETY: v_base and tail are both valid pointers to elements, and
            // v_base < tail since we checked offset != 0.
            insert_tail(v_base, tail, is_less);

            // SAFETY: we checked that tail is not yet the one-past-end pointer.
            tail = tail.add(1);
        }
    }
}
603
/// Stably sorts four elements starting at `v_base`, writing the sorted result
/// to `dst[0..4]` using exactly 5 comparisons and one copy per element.
///
/// SAFETY: The caller MUST guarantee that `v_base` is valid for 4 reads and
/// `dst` is valid for 4 writes. The result will be stored in `dst[0..4]`.
pub unsafe fn sort4_stable<T, F: FnMut(&T, &T) -> bool>(
    v_base: *const T,
    dst: *mut T,
    is_less: &mut F,
) {
    // By limiting select to picking pointers, we are guaranteed good cmov code-gen
    // regardless of type T's size. Further this only does 5 instead of 6
    // comparisons compared to a stable transposition 4 element sorting-network,
    // and always copies each element exactly once.

    // SAFETY: all pointers have offset at most 3 from v_base and dst, and are
    // thus in-bounds by the precondition.
    unsafe {
        // Stably create two pairs a <= b and c <= d.
        let c1 = is_less(&*v_base.add(1), &*v_base);
        let c2 = is_less(&*v_base.add(3), &*v_base.add(2));
        let a = v_base.add(c1 as usize);
        let b = v_base.add(!c1 as usize);
        let c = v_base.add(2 + c2 as usize);
        let d = v_base.add(2 + (!c2 as usize));

        // Compare (a, c) and (b, d) to identify max/min. We're left with two
        // unknown elements, but because we are a stable sort we must know which
        // one is leftmost and which one is rightmost.
        // c3, c4 | min max unknown_left unknown_right
        //  0,  0 |  a   d    b         c
        //  0,  1 |  a   b    c         d
        //  1,  0 |  c   d    a         b
        //  1,  1 |  c   b    a         d
        let c3 = is_less(&*c, &*a);
        let c4 = is_less(&*d, &*b);
        let min = hint::select_unpredictable(c3, c, a);
        let max = hint::select_unpredictable(c4, b, d);
        let unknown_left = hint::select_unpredictable(c3, a, hint::select_unpredictable(c4, c, b));
        let unknown_right = hint::select_unpredictable(c4, d, hint::select_unpredictable(c3, b, c));

        // Sort the last two unknown elements.
        let c5 = is_less(&*unknown_right, &*unknown_left);
        let lo = hint::select_unpredictable(c5, unknown_right, unknown_left);
        let hi = hint::select_unpredictable(c5, unknown_left, unknown_right);

        ptr::copy_nonoverlapping(min, dst, 1);
        ptr::copy_nonoverlapping(lo, dst.add(1), 1);
        ptr::copy_nonoverlapping(hi, dst.add(2), 1);
        ptr::copy_nonoverlapping(max, dst.add(3), 1);
    }
}
653
/// Stably sorts eight elements: each half is sorted into `scratch_base` with
/// [`sort4_stable`], then the halves are merged into `dst` with
/// [`bidirectional_merge`].
///
/// SAFETY: The caller MUST guarantee that `v_base` is valid for 8 reads and
/// writes, `scratch_base` and `dst` MUST be valid for 8 writes. The result will
/// be stored in `dst[0..8]`.
unsafe fn sort8_stable<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v_base: *mut T,
    dst: *mut T,
    scratch_base: *mut T,
    is_less: &mut F,
) {
    // SAFETY: these pointers are all in-bounds by the precondition of our function.
    unsafe {
        sort4_stable(v_base, scratch_base, is_less);
        sort4_stable(v_base.add(4), scratch_base.add(4), is_less);
    }

    // SAFETY: scratch_base[0..8] is now initialized, allowing us to merge back
    // into dst.
    unsafe {
        bidirectional_merge(slice::from_raw_parts(scratch_base, 8), dst, is_less);
    }
}
675
/// One branchless step of an upward (left-to-right) merge: copies the smaller
/// of `*left_src` / `*right_src` to `dst` (ties favor the left run, preserving
/// stability), advances the consumed source pointer and `dst` by one, and
/// returns the updated pointer triple.
///
/// # Safety
/// `left_src` and `right_src` must be valid to read and `dst` valid to write,
/// without aliasing (see the inline SAFETY comment).
#[inline(always)]
unsafe fn merge_up<T, F: FnMut(&T, &T) -> bool>(
    mut left_src: *const T,
    mut right_src: *const T,
    mut dst: *mut T,
    is_less: &mut F,
) -> (*const T, *const T, *mut T) {
    // This is a branchless merge utility function.
    // The equivalent code with a branch would be:
    //
    // if !is_less(&*right_src, &*left_src) {
    //     ptr::copy_nonoverlapping(left_src, dst, 1);
    //     left_src = left_src.add(1);
    // } else {
    //     ptr::copy_nonoverlapping(right_src, dst, 1);
    //     right_src = right_src.add(1);
    // }
    // dst = dst.add(1);

    // SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
    // to read and `dst` is valid to write, while not aliasing.
    unsafe {
        let is_l = !is_less(&*right_src, &*left_src);
        let src = if is_l { left_src } else { right_src };
        ptr::copy_nonoverlapping(src, dst, 1);
        right_src = right_src.add(!is_l as usize);
        left_src = left_src.add(is_l as usize);
        dst = dst.add(1);
    }

    (left_src, right_src, dst)
}
708
/// One branchless step of a downward (right-to-left) merge: copies the larger
/// of `*left_src` / `*right_src` to `dst` (ties favor the right run), steps the
/// consumed source pointer and `dst` down by one, and returns the updated
/// pointer triple. `wrapping_sub` is used because a source pointer may step one
/// element before its run without ever being dereferenced there.
///
/// # Safety
/// `left_src` and `right_src` must be valid to read and `dst` valid to write,
/// without aliasing (see the inline SAFETY comment).
#[inline(always)]
unsafe fn merge_down<T, F: FnMut(&T, &T) -> bool>(
    mut left_src: *const T,
    mut right_src: *const T,
    mut dst: *mut T,
    is_less: &mut F,
) -> (*const T, *const T, *mut T) {
    // This is a branchless merge utility function.
    // The equivalent code with a branch would be:
    //
    // if !is_less(&*right_src, &*left_src) {
    //     ptr::copy_nonoverlapping(right_src, dst, 1);
    //     right_src = right_src.wrapping_sub(1);
    // } else {
    //     ptr::copy_nonoverlapping(left_src, dst, 1);
    //     left_src = left_src.wrapping_sub(1);
    // }
    // dst = dst.sub(1);

    // SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
    // to read and `dst` is valid to write, while not aliasing.
    unsafe {
        let is_l = !is_less(&*right_src, &*left_src);
        let src = if is_l { right_src } else { left_src };
        ptr::copy_nonoverlapping(src, dst, 1);
        right_src = right_src.wrapping_sub(is_l as usize);
        left_src = left_src.wrapping_sub(!is_l as usize);
        dst = dst.sub(1);
    }

    (left_src, right_src, dst)
}
741
/// Merge v assuming v[..len / 2] and v[len / 2..] are sorted.
///
/// Original idea for bi-directional merging by Igor van den Hoven (quadsort),
/// adapted to only use merge up and down. In contrast to the original
/// parity_merge function, it performs 2 writes instead of 4 per iteration.
///
/// Panics via [`panic_on_ord_violation`] if the comparison function fails to
/// implement a strict weak ordering, rather than exposing an inconsistent `dst`.
///
/// # Safety
/// The caller must guarantee that `dst` is valid for v.len() writes.
/// Also `v.as_ptr()` and `dst` must not alias and v.len() must be >= 2.
///
/// Note that T must be Freeze, the comparison function is evaluated on outdated
/// temporary 'copies' that may not end up in the final array.
unsafe fn bidirectional_merge<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v: &[T],
    dst: *mut T,
    is_less: &mut F,
) {
    // It helps to visualize the merge:
    //
    // Initial:
    //
    //  |dst (in dst)
    //  |left               |right
    //  v                   v
    // [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
    //                     ^                   ^
    //                     |left_rev           |right_rev
    //                                         |dst_rev (in dst)
    //
    // After:
    //
    //                      |dst (in dst)
    //        |left         |           |right
    //        v             v           v
    // [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
    //       ^             ^           ^
    //       |left_rev     |           |right_rev
    //                     |dst_rev (in dst)
    //
    // In each iteration one of left or right moves up one position, and one of
    // left_rev or right_rev moves down one position, whereas dst always moves
    // up one position and dst_rev always moves down one position. Assuming
    // the input was sorted and the comparison function is correctly implemented
    // at the end we will have left == left_rev + 1, and right == right_rev + 1,
    // fully consuming the input having written it to dst.

    let len = v.len();
    let src = v.as_ptr();

    let len_div_2 = len / 2;

    // SAFETY: The caller has to ensure that len >= 2.
    unsafe {
        intrinsics::assume(len_div_2 != 0); // This can avoid useless code-gen.
    }

    // SAFETY: no matter what the result of the user-provided comparison function
    // is, all 4 read pointers will always be in-bounds. Writing `dst` and `dst_rev`
    // will always be in bounds if the caller guarantees that `dst` is valid for
    // `v.len()` writes.
    unsafe {
        let mut left = src;
        let mut right = src.add(len_div_2);
        let mut dst = dst;

        let mut left_rev = src.add(len_div_2 - 1);
        let mut right_rev = src.add(len - 1);
        let mut dst_rev = dst.add(len - 1);

        // Merge from both ends simultaneously: each iteration fills one slot
        // from the front and one from the back of `dst`.
        for _ in 0..len_div_2 {
            (left, right, dst) = merge_up(left, right, dst, is_less);
            (left_rev, right_rev, dst_rev) = merge_down(left_rev, right_rev, dst_rev, is_less);
        }

        let left_end = left_rev.wrapping_add(1);
        let right_end = right_rev.wrapping_add(1);

        // Odd length, so one element is left unconsumed in the input.
        if !len.is_multiple_of(2) {
            let left_nonempty = left < left_end;
            let last_src = if left_nonempty { left } else { right };
            ptr::copy_nonoverlapping(last_src, dst, 1);
            left = left.add(left_nonempty as usize);
            right = right.add((!left_nonempty) as usize);
        }

        // We now should have consumed the full input exactly once. This can only fail if the
        // user-provided comparison function fails to implement a strict weak ordering. In that case
        // we panic and never access the inconsistent state in dst.
        if left != left_end || right != right_end {
            panic_on_ord_violation();
        }
    }
}
836
// Kept out-of-line and cold on unwinding targets so the panic path does not
// pollute the hot sort code; with immediate-abort panics inlining is fine.
#[cfg_attr(not(panic = "immediate-abort"), inline(never), cold)]
#[cfg_attr(panic = "immediate-abort", inline)]
fn panic_on_ord_violation() -> ! {
    // This is indicative of a logic bug in the user-provided comparison function or Ord
    // implementation. They are expected to implement a total order as explained in the Ord
    // documentation.
    //
    // By panicking we inform the user, that they have a logic bug in their program. If a strict
    // weak ordering is not given, the concept of comparison based sorting cannot yield a sorted
    // result. E.g.: a < b < c < a
    //
    // The Ord documentation requires users to implement a total order. Arguably that's
    // unnecessarily strict in the context of sorting. Issues only arise if the weaker requirement
    // of a strict weak ordering is violated.
    //
    // The panic message talks about a total order because that's what the Ord documentation talks
    // about and requires, so as to not confuse users.
    panic!("user-provided comparison function does not correctly implement a total order");
}
856
/// Returns `true` for types that are cheap to swap in place — at most the size
/// of a `u64`. Heuristic that holds true on all tested 64-bit capable
/// architectures.
#[must_use]
pub(crate) const fn has_efficient_in_place_swap<T>() -> bool {
    // size_of::<u64>()
    const MAX_EFFICIENT_SWAP_SIZE: usize = 8;
    size_of::<T>() <= MAX_EFFICIENT_SWAP_SIZE
}