core/slice/sort/shared/smallsort.rs
1//! This module contains a variety of sort implementations that are optimized for small lengths.
2
3use crate::mem::{self, ManuallyDrop, MaybeUninit};
4use crate::slice::sort::shared::FreezeMarker;
5use crate::{hint, intrinsics, ptr, slice};
6
7// It's important to differentiate between SMALL_SORT_THRESHOLD performance for
8// small slices and small-sort performance sorting small sub-slices as part of
9// the main quicksort loop. For the former, testing showed that the
10// representative benchmarks for real-world performance are cold CPU state and
11// not single-size hot benchmarks. For the latter the CPU will call them many
12// times, so hot benchmarks are fine and more realistic. And it's worth it to
13// optimize sorting small sub-slices with more sophisticated solutions than
14// insertion sort.
15
/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
pub(crate) trait StableSmallSortTypeImpl: Sized {
    /// Returns the maximum input length for which it is valid to call `small_sort`,
    /// i.e. `small_sort` may be called for any `v.len()` <= the returned value.
    fn small_sort_threshold() -> usize;

    /// Sorts `v` using strategies optimized for small sizes, using `scratch` as
    /// temporary storage where the chosen strategy requires it.
    fn small_sort<F: FnMut(&Self, &Self) -> bool>(
        v: &mut [Self],
        scratch: &mut [MaybeUninit<Self>],
        is_less: &mut F,
    );
}
29
30impl<T> StableSmallSortTypeImpl for T {
31 #[inline(always)]
32 default fn small_sort_threshold() -> usize {
33 // Optimal number of comparisons, and good perf.
34 SMALL_SORT_FALLBACK_THRESHOLD
35 }
36
37 #[inline(always)]
38 default fn small_sort<F: FnMut(&T, &T) -> bool>(
39 v: &mut [T],
40 _scratch: &mut [MaybeUninit<T>],
41 is_less: &mut F,
42 ) {
43 if v.len() >= 2 {
44 insertion_sort_shift_left(v, 1, is_less);
45 }
46 }
47}
48
impl<T: FreezeMarker> StableSmallSortTypeImpl for T {
    #[inline(always)]
    fn small_sort_threshold() -> usize {
        // `Freeze` types allow the scratch-buffer based general small-sort,
        // which supports a larger threshold than the insertion-sort fallback.
        SMALL_SORT_GENERAL_THRESHOLD
    }

    #[inline(always)]
    fn small_sort<F: FnMut(&T, &T) -> bool>(
        v: &mut [T],
        scratch: &mut [MaybeUninit<T>],
        is_less: &mut F,
    ) {
        small_sort_general_with_scratch(v, scratch, is_less);
    }
}
64
/// Using a trait allows us to specialize on `Freeze` which in turn allows us to make safe
/// abstractions.
pub(crate) trait UnstableSmallSortTypeImpl: Sized {
    /// Returns the maximum input length for which it is valid to call `small_sort`,
    /// i.e. `small_sort` may be called for any `v.len()` <= the returned value.
    fn small_sort_threshold() -> usize;

    /// Sorts `v` using strategies optimized for small sizes.
    fn small_sort<F: FnMut(&Self, &Self) -> bool>(v: &mut [Self], is_less: &mut F);
}
74
impl<T> UnstableSmallSortTypeImpl for T {
    #[inline(always)]
    default fn small_sort_threshold() -> usize {
        // Conservative default for types that are not `Freeze`.
        SMALL_SORT_FALLBACK_THRESHOLD
    }

    #[inline(always)]
    default fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        // Insertion sort needs no scratch space and is valid for any `T`.
        small_sort_fallback(v, is_less);
    }
}
89
impl<T: FreezeMarker> UnstableSmallSortTypeImpl for T {
    #[inline(always)]
    fn small_sort_threshold() -> usize {
        // Delegate to a second specialization layer that additionally
        // distinguishes `Copy` types, see `UnstableSmallSortFreezeTypeImpl`.
        <T as UnstableSmallSortFreezeTypeImpl>::small_sort_threshold()
    }

    #[inline(always)]
    fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        <T as UnstableSmallSortFreezeTypeImpl>::small_sort(v, is_less);
    }
}
104
/// Second specialization layer for the unstable small-sort, letting `Copy`
/// types opt into the sorting-network implementation.
///
/// FIXME(const_trait_impl) use original ipnsort approach with choose_unstable_small_sort,
/// as found here <https://github.com/Voultapher/sort-research-rs/blob/438fad5d0495f65d4b72aa87f0b62fc96611dff3/ipnsort/src/smallsort.rs#L83C10-L83C36>.
pub(crate) trait UnstableSmallSortFreezeTypeImpl: Sized + FreezeMarker {
    /// Returns the maximum input length for which it is valid to call `small_sort`.
    fn small_sort_threshold() -> usize;

    /// Sorts `v` using strategies optimized for small sizes.
    fn small_sort<F: FnMut(&Self, &Self) -> bool>(v: &mut [Self], is_less: &mut F);
}
112
impl<T: FreezeMarker> UnstableSmallSortFreezeTypeImpl for T {
    #[inline(always)]
    default fn small_sort_threshold() -> usize {
        // Use the scratch-buffer based general small-sort only if the stack
        // array it needs stays within the stack-overflow safety limit.
        if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            SMALL_SORT_GENERAL_THRESHOLD
        } else {
            SMALL_SORT_FALLBACK_THRESHOLD
        }
    }

    #[inline(always)]
    default fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        // Must mirror the branch structure of `small_sort_threshold` above so
        // the chosen implementation matches the advertised threshold.
        if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            small_sort_general(v, is_less);
        } else {
            small_sort_fallback(v, is_less);
        }
    }
}
135
/// SAFETY: Only used for run-time optimization heuristic.
#[rustc_unsafe_specialization_marker]
trait CopyMarker {}

// Blanket impl so that specializing on `CopyMarker` is effectively
// specializing on `Copy`.
impl<T: Copy> CopyMarker for T {}
141
impl<T: FreezeMarker + CopyMarker> UnstableSmallSortFreezeTypeImpl for T {
    #[inline(always)]
    fn small_sort_threshold() -> usize {
        // Prefer the sorting-network small-sort for types that can be swapped
        // cheaply in-place, as long as its scratch array fits on the stack.
        if has_efficient_in_place_swap::<T>()
            && (size_of::<T>() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE
        {
            SMALL_SORT_NETWORK_THRESHOLD
        } else if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            SMALL_SORT_GENERAL_THRESHOLD
        } else {
            SMALL_SORT_FALLBACK_THRESHOLD
        }
    }

    #[inline(always)]
    fn small_sort<F>(v: &mut [T], is_less: &mut F)
    where
        F: FnMut(&T, &T) -> bool,
    {
        // Must mirror the branch structure of `small_sort_threshold` above so
        // the chosen implementation matches the advertised threshold.
        if has_efficient_in_place_swap::<T>()
            && (size_of::<T>() * SMALL_SORT_NETWORK_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE
        {
            small_sort_network(v, is_less);
        } else if (size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN) <= MAX_STACK_ARRAY_SIZE {
            small_sort_general(v, is_less);
        } else {
            small_sort_fallback(v, is_less);
        }
    }
}
172
/// Optimal number of comparisons, and good perf.
const SMALL_SORT_FALLBACK_THRESHOLD: usize = 16;

/// From a comparison perspective 20 was ~2% more efficient for fully random input, but for
/// wall-clock performance choosing 32 yielded better performance overall.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] !
const SMALL_SORT_GENERAL_THRESHOLD: usize = 32;

/// [`small_sort_general`] uses [`sort8_stable`] as primitive and does a kind of ping-pong merge,
/// where the output of the first two [`sort8_stable`] calls is stored at the end of the scratch
/// buffer. This simplifies panic handling and avoids additional copies. This affects the required
/// scratch buffer size.
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_general`] !
pub(crate) const SMALL_SORT_GENERAL_SCRATCH_LEN: usize = SMALL_SORT_GENERAL_THRESHOLD + 16;

/// Maximum input length handled by [`small_sort_network`].
///
/// SAFETY: If you change this value, you have to adjust [`small_sort_network`] !
const SMALL_SORT_NETWORK_THRESHOLD: usize = 32;
// The network merges via a scratch buffer covering the full input length.
const SMALL_SORT_NETWORK_SCRATCH_LEN: usize = SMALL_SORT_NETWORK_THRESHOLD;

/// Using a stack array, could cause a stack overflow if the type `T` is very large. To be
/// conservative we limit the usage of small-sorts that require a stack array to types that fit
/// within this limit.
const MAX_STACK_ARRAY_SIZE: usize = 4096;
198
199fn small_sort_fallback<T, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
200 if v.len() >= 2 {
201 insertion_sort_shift_left(v, 1, is_less);
202 }
203}
204
/// Unstable small-sort entry for `Freeze` types that provides its own scratch
/// buffer on the stack. Callers gate on
/// `size_of::<T>() * SMALL_SORT_GENERAL_SCRATCH_LEN <= MAX_STACK_ARRAY_SIZE`
/// to keep this stack array within the stack-overflow safety limit.
fn small_sort_general<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(v: &mut [T], is_less: &mut F) {
    let mut stack_array = MaybeUninit::<[T; SMALL_SORT_GENERAL_SCRATCH_LEN]>::uninit();

    // SAFETY: The memory is backed by `stack_array`, and the operation is safe as long as the len
    // is the same.
    let scratch = unsafe {
        slice::from_raw_parts_mut(
            stack_array.as_mut_ptr() as *mut MaybeUninit<T>,
            SMALL_SORT_GENERAL_SCRATCH_LEN,
        )
    };

    small_sort_general_with_scratch(v, scratch, is_less);
}
219
/// Stable small-sort for `Freeze` types using caller-provided scratch space.
///
/// Each half of `v` is sorted independently into `scratch` (seeded by a fixed
/// stable sort of a prefix, then extended one element at a time by insertion),
/// and the two sorted halves are merged back into `v` with a bidirectional
/// merge. Aborts if `scratch` holds fewer than `v.len() + 16` elements.
fn small_sort_general_with_scratch<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    scratch: &mut [MaybeUninit<T>],
    is_less: &mut F,
) {
    let len = v.len();
    if len < 2 {
        return;
    }

    // The +16 headroom is needed by the sort8_stable path below, which uses
    // scratch[len..len + 16] as intermediate storage; see
    // SMALL_SORT_GENERAL_SCRATCH_LEN.
    if scratch.len() < len + 16 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();
    let len_div_2 = len / 2;

    // SAFETY: See individual comments.
    unsafe {
        let scratch_base = scratch.as_mut_ptr() as *mut T;

        // Seed both halves of the scratch buffer with a sorted prefix of
        // `presorted_len` elements each. The 8-element seed is only used for
        // small types, where the extra copies through scratch are cheap.
        let presorted_len = if const { size_of::<T>() <= 16 } && len >= 16 {
            // SAFETY: scratch_base is valid and has enough space.
            sort8_stable(v_base, scratch_base, scratch_base.add(len), is_less);
            sort8_stable(
                v_base.add(len_div_2),
                scratch_base.add(len_div_2),
                scratch_base.add(len + 8),
                is_less,
            );

            8
        } else if len >= 8 {
            // SAFETY: scratch_base is valid and has enough space.
            sort4_stable(v_base, scratch_base, is_less);
            sort4_stable(v_base.add(len_div_2), scratch_base.add(len_div_2), is_less);

            4
        } else {
            // Too short for a network seed: copy one element per half, which
            // is trivially a sorted prefix of length 1.
            ptr::copy_nonoverlapping(v_base, scratch_base, 1);
            ptr::copy_nonoverlapping(v_base.add(len_div_2), scratch_base.add(len_div_2), 1);

            1
        };

        // Grow each sorted half in scratch from `presorted_len` up to its full
        // length by inserting one element at a time from `v`.
        for offset in [0, len_div_2] {
            // SAFETY: at this point dst is initialized with presorted_len elements.
            // We extend this to desired_len, src is valid for desired_len elements.
            let src = v_base.add(offset);
            let dst = scratch_base.add(offset);
            let desired_len = if offset == 0 { len_div_2 } else { len - len_div_2 };

            for i in presorted_len..desired_len {
                ptr::copy_nonoverlapping(src.add(i), dst.add(i), 1);
                insert_tail(dst, dst.add(i), is_less);
            }
        }

        // SAFETY: see comment in `CopyOnDrop::drop`.
        let drop_guard = CopyOnDrop { src: scratch_base, dst: v_base, len };

        // SAFETY: at this point scratch_base is fully initialized, allowing us
        // to use it as the source of our merge back into the original array.
        // If a panic occurs we ensure the original array is restored to a valid
        // permutation of the input through drop_guard. This technique is similar
        // to ping-pong merging.
        bidirectional_merge(
            slice::from_raw_parts(drop_guard.src, drop_guard.len),
            drop_guard.dst,
            is_less,
        );
        mem::forget(drop_guard);
    }
}
294
/// Copies `len` elements from `src` to `dst` when dropped. Used both as a
/// panic-safety guard (restoring a valid permutation of the input if a
/// comparison panics) and as a gap guard during insertion, see [`insert_tail`].
struct CopyOnDrop<T> {
    src: *const T,
    dst: *mut T,
    len: usize,
}
300
impl<T> Drop for CopyOnDrop<T> {
    fn drop(&mut self) {
        // SAFETY: `src` must contain `len` initialized elements, and dst must
        // be valid to write `len` elements; the two regions must not overlap.
        unsafe {
            ptr::copy_nonoverlapping(self.src, self.dst, self.len);
        }
    }
}
310
/// Unstable small-sort built on fixed sorting networks, tuned for integer-like
/// types with cheap in-place swaps.
///
/// For `len < 18` the whole slice is sorted directly (network seed plus
/// insertion sort). For longer inputs both halves are sorted that way and then
/// merged through a stack scratch buffer. Aborts if
/// `len > SMALL_SORT_NETWORK_SCRATCH_LEN`.
fn small_sort_network<T, F>(v: &mut [T], is_less: &mut F)
where
    T: FreezeMarker,
    F: FnMut(&T, &T) -> bool,
{
    // This implementation is tuned to be efficient for integer types.

    let len = v.len();
    if len < 2 {
        return;
    }

    if len > SMALL_SORT_NETWORK_SCRATCH_LEN {
        intrinsics::abort();
    }

    let mut stack_array = MaybeUninit::<[T; SMALL_SORT_NETWORK_SCRATCH_LEN]>::uninit();

    let len_div_2 = len / 2;
    let no_merge = len < 18;

    let v_base = v.as_mut_ptr();
    let initial_region_len = if no_merge { len } else { len_div_2 };
    // SAFETY: Both possible values of `initial_region_len` are in-bounds.
    let mut region = unsafe { slice::from_raw_parts_mut(v_base, initial_region_len) };

    // Avoid compiler unrolling, we *really* don't want that to happen here for binary-size reasons.
    // The loop body runs once in the `no_merge` case, otherwise twice: first
    // for the left half, then (after re-pointing `region`) for the right half.
    loop {
        // Seed with the largest sorting network that fits the region, then
        // finish with insertion sort on the remainder.
        let presorted_len = if region.len() >= 13 {
            sort13_optimal(region, is_less);
            13
        } else if region.len() >= 9 {
            sort9_optimal(region, is_less);
            9
        } else {
            1
        };

        insertion_sort_shift_left(region, presorted_len, is_less);

        if no_merge {
            return;
        }

        // `region` still pointing at `v_base` means only the left half has
        // been sorted so far.
        if region.as_ptr() != v_base {
            break;
        }

        // SAFETY: The right side of `v` based on `len_div_2` is guaranteed in-bounds.
        unsafe { region = slice::from_raw_parts_mut(v_base.add(len_div_2), len - len_div_2) };
    }

    // SAFETY: We checked that T is Freeze and thus observation safe.
    // Should is_less panic v was not modified in parity_merge and retains its original input.
    // scratch and v must not alias and scratch has v.len() space.
    unsafe {
        let scratch_base = stack_array.as_mut_ptr() as *mut T;
        bidirectional_merge(slice::from_raw_parts_mut(v_base, len), scratch_base, is_less);
        ptr::copy_nonoverlapping(scratch_base, v_base, len);
    }
}
372
/// Swap two values in the slice pointed to by `v_base` at the position `a_pos` and `b_pos` if the
/// value at position `b_pos` is less than the one at position `a_pos`.
///
/// Purposefully not marked `#[inline]`, despite us wanting it to be inlined for integers like
/// types. `is_less` could be a huge function and we want to give the compiler an option to
/// not inline this function. For the same reasons that this function is very perf critical
/// it should be in the same module as the functions that use it.
///
/// # Safety
/// `v_base.add(a_pos)` and `v_base.add(b_pos)` must yield valid, properly
/// aligned pointers into the same allocation, and `a_pos != b_pos`.
unsafe fn swap_if_less<T, F>(v_base: *mut T, a_pos: usize, b_pos: usize, is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    // SAFETY: the caller must guarantee that `a_pos` and `b_pos` each added to `v_base` yield valid
    // pointers into `v_base`, and are properly aligned, and part of the same allocation.
    unsafe {
        let v_a = v_base.add(a_pos);
        let v_b = v_base.add(b_pos);

        // PANIC SAFETY: if is_less panics, no scratch memory was created and the slice should still be
        // in a well defined state, without duplicates.

        // Important to only swap if it is more and not if it is equal. is_less should return false for
        // equal, so we don't swap.
        let should_swap = is_less(&*v_b, &*v_a);

        // This is a branchless version of swap if.
        // The equivalent code with a branch would be:
        //
        // if should_swap {
        //     ptr::swap(v_a, v_b, 1);
        // }

        // The goal is to generate cmov instructions here.
        let v_a_swap = hint::select_unpredictable(should_swap, v_b, v_a);
        let v_b_swap = hint::select_unpredictable(should_swap, v_a, v_b);

        // Materialize the value destined for `v_b` before overwriting `v_a`,
        // since `v_a_swap`/`v_b_swap` may alias `v_a`. ManuallyDrop prevents a
        // double-drop: the bytes are only ever owned by one location at a time.
        let v_b_swap_tmp = ManuallyDrop::new(ptr::read(v_b_swap));
        ptr::copy(v_a_swap, v_a, 1);
        ptr::copy_nonoverlapping(&*v_b_swap_tmp, v_b, 1);
    }
}
413
/// Sorts the first 9 elements of `v` with a fast fixed function.
///
/// Should `is_less` generate substantial amounts of code the compiler can choose to not inline
/// `swap_if_less`. If the code of a sort impl changes so as to call this function in multiple
/// places, `#[inline(never)]` is recommended to keep binary-size in check. The current design of
/// `small_sort_network` makes sure to only call this once.
///
/// Aborts if `v.len() < 9`. The exact sequence of compare-exchanges below is a
/// 25-comparison sorting network; do not reorder it.
fn sort9_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    if v.len() < 9 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();

    // Optimal sorting network see:
    // https://bertdobbelaere.github.io/sorting_networks.html.

    // SAFETY: We checked the len.
    unsafe {
        swap_if_less(v_base, 0, 3, is_less);
        swap_if_less(v_base, 1, 7, is_less);
        swap_if_less(v_base, 2, 5, is_less);
        swap_if_less(v_base, 4, 8, is_less);
        swap_if_less(v_base, 0, 7, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 3, 8, is_less);
        swap_if_less(v_base, 5, 6, is_less);
        swap_if_less(v_base, 0, 2, is_less);
        swap_if_less(v_base, 1, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 1, 4, is_less);
        swap_if_less(v_base, 3, 6, is_less);
        swap_if_less(v_base, 5, 7, is_less);
        swap_if_less(v_base, 0, 1, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 3, 5, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 6, 7, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
    }
}
462
/// Sorts the first 13 elements of `v` with a fast fixed function.
///
/// Should `is_less` generate substantial amounts of code the compiler can choose to not inline
/// `swap_if_less`. If the code of a sort impl changes so as to call this function in multiple
/// places, `#[inline(never)]` is recommended to keep binary-size in check. The current design of
/// `small_sort_network` makes sure to only call this once.
///
/// Aborts if `v.len() < 13`. The exact sequence of compare-exchanges below is a
/// 45-comparison sorting network; do not reorder it.
fn sort13_optimal<T, F>(v: &mut [T], is_less: &mut F)
where
    F: FnMut(&T, &T) -> bool,
{
    if v.len() < 13 {
        intrinsics::abort();
    }

    let v_base = v.as_mut_ptr();

    // Optimal sorting network see:
    // https://bertdobbelaere.github.io/sorting_networks.html.

    // SAFETY: We checked the len.
    unsafe {
        swap_if_less(v_base, 0, 12, is_less);
        swap_if_less(v_base, 1, 10, is_less);
        swap_if_less(v_base, 2, 9, is_less);
        swap_if_less(v_base, 3, 7, is_less);
        swap_if_less(v_base, 5, 11, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 1, 6, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 11, is_less);
        swap_if_less(v_base, 7, 9, is_less);
        swap_if_less(v_base, 8, 10, is_less);
        swap_if_less(v_base, 0, 4, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 6, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 11, 12, is_less);
        swap_if_less(v_base, 4, 6, is_less);
        swap_if_less(v_base, 5, 9, is_less);
        swap_if_less(v_base, 8, 11, is_less);
        swap_if_less(v_base, 10, 12, is_less);
        swap_if_less(v_base, 0, 5, is_less);
        swap_if_less(v_base, 3, 8, is_less);
        swap_if_less(v_base, 4, 7, is_less);
        swap_if_less(v_base, 6, 11, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 0, 1, is_less);
        swap_if_less(v_base, 2, 5, is_less);
        swap_if_less(v_base, 6, 9, is_less);
        swap_if_less(v_base, 7, 8, is_less);
        swap_if_less(v_base, 10, 11, is_less);
        swap_if_less(v_base, 1, 3, is_less);
        swap_if_less(v_base, 2, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
        swap_if_less(v_base, 9, 10, is_less);
        swap_if_less(v_base, 1, 2, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 7, is_less);
        swap_if_less(v_base, 6, 8, is_less);
        swap_if_less(v_base, 2, 3, is_less);
        swap_if_less(v_base, 4, 5, is_less);
        swap_if_less(v_base, 6, 7, is_less);
        swap_if_less(v_base, 8, 9, is_less);
        swap_if_less(v_base, 3, 4, is_less);
        swap_if_less(v_base, 5, 6, is_less);
    }
}
531
/// Sorts range [begin, tail] assuming [begin, tail) is already sorted.
///
/// Sifts the element at `tail` down towards `begin` until it reaches its
/// sorted position, shifting larger elements up by one as it goes.
///
/// # Safety
/// begin < tail and p must be valid and initialized for all begin <= p <= tail.
unsafe fn insert_tail<T, F: FnMut(&T, &T) -> bool>(begin: *mut T, tail: *mut T, is_less: &mut F) {
    // SAFETY: see individual comments.
    unsafe {
        // SAFETY: in-bounds as tail > begin.
        let mut sift = tail.sub(1);
        // Fast path: tail already in position, nothing to move.
        if !is_less(&*tail, &*sift) {
            return;
        }

        // SAFETY: after this read tail is never read from again, as we only ever
        // read from sift, sift < tail and we only ever decrease sift. Thus this is
        // effectively a move, not a copy. Should a panic occur, or we have found
        // the correct insertion position, gap_guard ensures the element is moved
        // back into the array.
        let tmp = ManuallyDrop::new(tail.read());
        let mut gap_guard = CopyOnDrop { src: &*tmp, dst: tail, len: 1 };

        loop {
            // SAFETY: we move sift into the gap (which is valid), and point the
            // gap guard destination at sift, ensuring that if a panic occurs the
            // gap is once again filled.
            ptr::copy_nonoverlapping(sift, gap_guard.dst, 1);
            gap_guard.dst = sift;

            if sift == begin {
                break;
            }

            // SAFETY: we checked that sift != begin, thus this is in-bounds.
            sift = sift.sub(1);
            if !is_less(&tmp, &*sift) {
                break;
            }
        }
        // gap_guard drops here, writing tmp into the final gap position.
    }
}
572
/// Sort `v` assuming `v[..offset]` is already sorted.
///
/// Inserts each element of `v[offset..]` in turn into the growing sorted
/// prefix. Aborts if `offset == 0` or `offset > v.len()`.
pub fn insertion_sort_shift_left<T, F: FnMut(&T, &T) -> bool>(
    v: &mut [T],
    offset: usize,
    is_less: &mut F,
) {
    let len = v.len();
    if offset == 0 || offset > len {
        intrinsics::abort();
    }

    // SAFETY: see individual comments.
    unsafe {
        // We write this basic loop directly using pointers, as when we use a
        // for loop LLVM likes to unroll this loop which we do not want.
        // SAFETY: v_end is the one-past-end pointer, and we checked that
        // offset <= len, thus tail is also in-bounds.
        let v_base = v.as_mut_ptr();
        let v_end = v_base.add(len);
        let mut tail = v_base.add(offset);
        while tail != v_end {
            // SAFETY: v_base and tail are both valid pointers to elements, and
            // v_base < tail since we checked offset != 0.
            insert_tail(v_base, tail, is_less);

            // SAFETY: we checked that tail is not yet the one-past-end pointer.
            tail = tail.add(1);
        }
    }
}
603
/// Stably sorts 4 elements read from `v_base` into `dst[0..4]` using 5
/// comparisons.
///
/// SAFETY: The caller MUST guarantee that `v_base` is valid for 4 reads and
/// `dst` is valid for 4 writes. The result will be stored in `dst[0..4]`.
pub unsafe fn sort4_stable<T, F: FnMut(&T, &T) -> bool>(
    v_base: *const T,
    dst: *mut T,
    is_less: &mut F,
) {
    // By limiting select to picking pointers, we are guaranteed good cmov code-gen
    // regardless of type T's size. Further this only does 5 instead of 6
    // comparisons compared to a stable transposition 4 element sorting-network,
    // and always copies each element exactly once.

    // SAFETY: all pointers have offset at most 3 from v_base and dst, and are
    // thus in-bounds by the precondition.
    unsafe {
        // Stably create two pairs a <= b and c <= d.
        let c1 = is_less(&*v_base.add(1), &*v_base);
        let c2 = is_less(&*v_base.add(3), &*v_base.add(2));
        let a = v_base.add(c1 as usize);
        let b = v_base.add(!c1 as usize);
        let c = v_base.add(2 + c2 as usize);
        let d = v_base.add(2 + (!c2 as usize));

        // Compare (a, c) and (b, d) to identify max/min. We're left with two
        // unknown elements, but because we are a stable sort we must know which
        // one is leftmost and which one is rightmost.
        // c3, c4 | min max unknown_left unknown_right
        //  0,  0 |  a   d    b         c
        //  0,  1 |  a   b    c         d
        //  1,  0 |  c   d    a         b
        //  1,  1 |  c   b    a         d
        let c3 = is_less(&*c, &*a);
        let c4 = is_less(&*d, &*b);
        let min = hint::select_unpredictable(c3, c, a);
        let max = hint::select_unpredictable(c4, b, d);
        let unknown_left = hint::select_unpredictable(c3, a, hint::select_unpredictable(c4, c, b));
        let unknown_right = hint::select_unpredictable(c4, d, hint::select_unpredictable(c3, b, c));

        // Sort the last two unknown elements.
        let c5 = is_less(&*unknown_right, &*unknown_left);
        let lo = hint::select_unpredictable(c5, unknown_right, unknown_left);
        let hi = hint::select_unpredictable(c5, unknown_left, unknown_right);

        ptr::copy_nonoverlapping(min, dst, 1);
        ptr::copy_nonoverlapping(lo, dst.add(1), 1);
        ptr::copy_nonoverlapping(hi, dst.add(2), 1);
        ptr::copy_nonoverlapping(max, dst.add(3), 1);
    }
}
653
/// Stably sorts 8 elements: two 4-element stable sorts into `scratch_base`,
/// then a bidirectional merge into `dst`.
///
/// SAFETY: The caller MUST guarantee that `v_base` is valid for 8 reads and
/// writes, `scratch_base` and `dst` MUST be valid for 8 writes. The result will
/// be stored in `dst[0..8]`.
unsafe fn sort8_stable<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v_base: *mut T,
    dst: *mut T,
    scratch_base: *mut T,
    is_less: &mut F,
) {
    // SAFETY: these pointers are all in-bounds by the precondition of our function.
    unsafe {
        sort4_stable(v_base, scratch_base, is_less);
        sort4_stable(v_base.add(4), scratch_base.add(4), is_less);
    }

    // SAFETY: scratch_base[0..8] is now initialized, allowing us to merge back
    // into dst.
    unsafe {
        bidirectional_merge(slice::from_raw_parts(scratch_base, 8), dst, is_less);
    }
}
675
/// Branchless forward merge step: copies the smaller head of the two sorted
/// runs to `dst`, then advances the consumed source pointer and `dst` by one.
/// Returns the updated `(left_src, right_src, dst)` pointers.
///
/// SAFETY: `left_src` and `right_src` must be valid to read, `dst` valid to
/// write, and `dst` must not alias either source.
#[inline(always)]
unsafe fn merge_up<T, F: FnMut(&T, &T) -> bool>(
    mut left_src: *const T,
    mut right_src: *const T,
    mut dst: *mut T,
    is_less: &mut F,
) -> (*const T, *const T, *mut T) {
    // This is a branchless merge utility function.
    // The equivalent code with a branch would be:
    //
    // if !is_less(&*right_src, &*left_src) {
    //     ptr::copy_nonoverlapping(left_src, dst, 1);
    //     left_src = left_src.add(1);
    // } else {
    //     ptr::copy_nonoverlapping(right_src, dst, 1);
    //     right_src = right_src.add(1);
    // }
    // dst = dst.add(1);
    //
    // Taking left on ties (`!is_less`) preserves stability.

    // SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
    // to read and `dst` is valid to write, while not aliasing.
    unsafe {
        let is_l = !is_less(&*right_src, &*left_src);
        let src = if is_l { left_src } else { right_src };
        ptr::copy_nonoverlapping(src, dst, 1);
        right_src = right_src.add(!is_l as usize);
        left_src = left_src.add(is_l as usize);
        dst = dst.add(1);
    }

    (left_src, right_src, dst)
}
708
/// Branchless backward merge step: copies the larger tail of the two sorted
/// runs to `dst`, then retreats the consumed source pointer and `dst` by one.
/// Returns the updated `(left_src, right_src, dst)` pointers. `wrapping_sub`
/// is used because a source pointer may legally move one-before-begin when its
/// run is exhausted; such a pointer is never dereferenced.
///
/// SAFETY: `left_src` and `right_src` must be valid to read, `dst` valid to
/// write, and `dst` must not alias either source.
#[inline(always)]
unsafe fn merge_down<T, F: FnMut(&T, &T) -> bool>(
    mut left_src: *const T,
    mut right_src: *const T,
    mut dst: *mut T,
    is_less: &mut F,
) -> (*const T, *const T, *mut T) {
    // This is a branchless merge utility function.
    // The equivalent code with a branch would be:
    //
    // if !is_less(&*right_src, &*left_src) {
    //     ptr::copy_nonoverlapping(right_src, dst, 1);
    //     right_src = right_src.wrapping_sub(1);
    // } else {
    //     ptr::copy_nonoverlapping(left_src, dst, 1);
    //     left_src = left_src.wrapping_sub(1);
    // }
    // dst = dst.sub(1);
    //
    // Taking right on ties (`!is_less`) mirrors merge_up and preserves
    // stability when merging from the back.

    // SAFETY: The caller must guarantee that `left_src`, `right_src` are valid
    // to read and `dst` is valid to write, while not aliasing.
    unsafe {
        let is_l = !is_less(&*right_src, &*left_src);
        let src = if is_l { right_src } else { left_src };
        ptr::copy_nonoverlapping(src, dst, 1);
        right_src = right_src.wrapping_sub(is_l as usize);
        left_src = left_src.wrapping_sub(!is_l as usize);
        dst = dst.sub(1);
    }

    (left_src, right_src, dst)
}
741
/// Merge v assuming v[..len / 2] and v[len / 2..] are sorted.
///
/// Original idea for bi-directional merging by Igor van den Hoven (quadsort),
/// adapted to only use merge up and down. In contrast to the original
/// parity_merge function, it performs 2 writes instead of 4 per iteration.
///
/// # Safety
/// The caller must guarantee that `dst` is valid for v.len() writes.
/// Also `v.as_ptr()` and `dst` must not alias and v.len() must be >= 2.
///
/// Note that T must be Freeze, the comparison function is evaluated on outdated
/// temporary 'copies' that may not end up in the final array.
///
/// # Panics
/// Panics (via [`panic_on_ord_violation`]) if the two runs were not fully
/// consumed, which can only happen when `is_less` does not implement a strict
/// weak ordering.
unsafe fn bidirectional_merge<T: FreezeMarker, F: FnMut(&T, &T) -> bool>(
    v: &[T],
    dst: *mut T,
    is_less: &mut F,
) {
    // It helps to visualize the merge:
    //
    // Initial:
    //
    //  |dst (in dst)
    //  |left               |right
    //  v                   v
    // [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
    //                     ^                   ^
    //                     |left_rev           |right_rev
    //                                         |dst_rev (in dst)
    //
    // After:
    //
    //                      |dst (in dst)
    //        |left         |           |right
    //        v             v           v
    // [xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx]
    //       ^             ^           ^
    //       |left_rev     |           |right_rev
    //                     |dst_rev (in dst)
    //
    // In each iteration one of left or right moves up one position, and one of
    // left_rev or right_rev moves down one position, whereas dst always moves
    // up one position and dst_rev always moves down one position. Assuming
    // the input was sorted and the comparison function is correctly implemented
    // at the end we will have left == left_rev + 1, and right == right_rev + 1,
    // fully consuming the input having written it to dst.

    let len = v.len();
    let src = v.as_ptr();

    let len_div_2 = len / 2;

    // SAFETY: The caller has to ensure that len >= 2.
    unsafe {
        intrinsics::assume(len_div_2 != 0); // This can avoid useless code-gen.
    }

    // SAFETY: no matter what the result of the user-provided comparison function
    // is, all 4 read pointers will always be in-bounds. Writing `dst` and `dst_rev`
    // will always be in bounds if the caller guarantees that `dst` is valid for
    // `v.len()` writes.
    unsafe {
        let mut left = src;
        let mut right = src.add(len_div_2);
        let mut dst = dst;

        let mut left_rev = src.add(len_div_2 - 1);
        let mut right_rev = src.add(len - 1);
        let mut dst_rev = dst.add(len - 1);

        for _ in 0..len_div_2 {
            (left, right, dst) = merge_up(left, right, dst, is_less);
            (left_rev, right_rev, dst_rev) = merge_down(left_rev, right_rev, dst_rev, is_less);
        }

        // `*_rev` pointers may sit one-before-begin of their run (see
        // merge_down), so compute the expected end positions with wrapping
        // arithmetic; these pointers are only compared, never dereferenced.
        let left_end = left_rev.wrapping_add(1);
        let right_end = right_rev.wrapping_add(1);

        // Odd length, so one element is left unconsumed in the input.
        if !len.is_multiple_of(2) {
            let left_nonempty = left < left_end;
            let last_src = if left_nonempty { left } else { right };
            ptr::copy_nonoverlapping(last_src, dst, 1);
            left = left.add(left_nonempty as usize);
            right = right.add((!left_nonempty) as usize);
        }

        // We now should have consumed the full input exactly once. This can only fail if the
        // user-provided comparison function fails to implement a strict weak ordering. In that case
        // we panic and never access the inconsistent state in dst.
        if left != left_end || right != right_end {
            panic_on_ord_violation();
        }
    }
}
836
/// Reports a detected violation of the comparison contract by panicking.
/// Kept out-of-line and cold (except under immediate-abort panics) so the
/// error path does not pollute the hot merge code.
#[cfg_attr(not(panic = "immediate-abort"), inline(never), cold)]
#[cfg_attr(panic = "immediate-abort", inline)]
fn panic_on_ord_violation() -> ! {
    // This is indicative of a logic bug in the user-provided comparison function or Ord
    // implementation. They are expected to implement a total order as explained in the Ord
    // documentation.
    //
    // By panicking we inform the user, that they have a logic bug in their program. If a strict
    // weak ordering is not given, the concept of comparison based sorting cannot yield a sorted
    // result. E.g.: a < b < c < a
    //
    // The Ord documentation requires users to implement a total order. Arguably that's
    // unnecessarily strict in the context of sorting. Issues only arise if the weaker requirement
    // of a strict weak ordering is violated.
    //
    // The panic message talks about a total order because that's what the Ord documentation talks
    // about and requires, so as to not confuse users.
    panic!("user-provided comparison function does not correctly implement a total order");
}
856
/// Returns whether `T` is small enough that swapping two values in-place is
/// expected to be cheap (register-sized moves rather than memcpy).
#[must_use]
pub(crate) const fn has_efficient_in_place_swap<T>() -> bool {
    // Heuristic that holds true on all tested 64-bit capable architectures:
    // anything up to one machine word (the size of a u64) swaps efficiently.
    size_of::<T>() <= size_of::<u64>()
}