From 6d81a994d8625607e7156a3ba6eb1e7692bbc600 Mon Sep 17 00:00:00 2001 From: Calcitem Date: Sat, 5 Oct 2019 13:45:43 +0800 Subject: [PATCH] =?UTF-8?q?sort:=20=E5=B0=86=20std::sort=20=E6=8E=92?= =?UTF-8?q?=E5=BA=8F=E7=AE=97=E6=B3=95=E6=9B=BF=E6=8D=A2=E4=B8=BA=20sqrt?= =?UTF-8?q?=5Fsort=5Fsort=5Fins=20=E7=AE=97=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 不需要通过转换为 vector 再调用 std::sort 排序。 自对弈时长由 52s 缩短到 45s。 排序算法代码来源: https://github.com/swenson/sort --- millgame.pro | 1 + millgame.vcxproj | 1 + millgame.vcxproj.filters | 3 + src/ai/search.cpp | 85 +- src/ai/search.h | 41 + src/base/sort.h | 3278 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 3394 insertions(+), 15 deletions(-) create mode 100644 src/base/sort.h diff --git a/millgame.pro b/millgame.pro index f438d0f0..0e14b2cd 100644 --- a/millgame.pro +++ b/millgame.pro @@ -60,6 +60,7 @@ HEADERS += \ src/base/hashMap.h \ src/base/memmgr.h \ src/base/misc.h \ + src/base/sort.h \ src/base/stack.h \ src/base/thread.h \ src/ai/search.h \ diff --git a/millgame.vcxproj b/millgame.vcxproj index df8166ac..0226a197 100644 --- a/millgame.vcxproj +++ b/millgame.vcxproj @@ -451,6 +451,7 @@ + diff --git a/millgame.vcxproj.filters b/millgame.vcxproj.filters index f211d584..c11031cf 100644 --- a/millgame.vcxproj.filters +++ b/millgame.vcxproj.filters @@ -126,6 +126,9 @@ base + + base + diff --git a/src/ai/search.cpp b/src/ai/search.cpp index 1d101ed9..c0999b19 100644 --- a/src/ai/search.cpp +++ b/src/ai/search.cpp @@ -33,6 +33,14 @@ #include "types.h" #include "option.h" +#define SORT_NAME nodep +#define SORT_TYPE AIAlgorithm::Node* +#define SORT_CMP(x, y) (-AIAlgorithm::nodeCompare((x), (y))) + +player_t gSideToMove; + +#include "sort.h" + using namespace CTSL; // 用于检测重复局面 (Position) @@ -286,6 +294,58 @@ bool AIAlgorithm::nodeGreater(const Node *first, const Node *second) #endif } +int AIAlgorithm::nodeCompare(const Node * first, const Node * second) +{ + int ret = 0; + + if (gSideToMove == PLAYER_BLACK) { + if (first->value > second->value) { + ret = 1; + goto out; + } + + if (first->value < second->value) { + ret = -1; + goto out; + } + + if (first->value == second->value) { + if (!first->pruned && second->pruned) { + ret = 1; + goto out; + } else if (first->pruned && !second->pruned) { + ret = -1; + goto out; + } + } + } + + if (gSideToMove == PLAYER_WHITE) { + if (first->value < second->value) { + ret = 1; + goto out; + } + + if (first->value > second->value) { + ret = -1; + goto out; + } + + if (first->value == second->value) { + if (!first->pruned && second->pruned) { + ret = 1; + goto out; + } else if (first->pruned && !second->pruned) { + ret = -1; + goto out; + } + } + } + +out: + return ret; +} + void AIAlgorithm::sortMoves(Node *node) { // 这个函数对效率的影响很大,排序好的话,剪枝较早,节省时间,但不能在此函数耗费太多时间 @@ -297,34 +357,29 @@ void AIAlgorithm::sortMoves(Node *node) //#define DEBUG_SORT #ifdef DEBUG_SORT for (int i = 0; i < node->childrenSize; i++) { - loggerDebug("* [%d] %x = %d\n", i, &(node->children[i]), node->children[i]->value); + loggerDebug("* [%d] %p: %d = %d (%d)\n", + i, &(node->children[i]), node->children[i]->move, node->children[i]->value, !node->children[i]->pruned); } loggerDebug("\n"); #endif - // TODO: 暂时使用 std::sort 排序, 后续需实现自己的排序函数 +#define NODE_PTR_SORT_FUN(x) nodep_##x - vector vec; - vec.reserve(NODE_CHILDREN_SIZE); + gSideToMove = tempGame.position.sideToMove; // TODO: 暂时用全局变量 - for (int i = 0; i < node->childrenSize; i++) { - vec.push_back(node->children[i]); - } - - std::stable_sort(vec.begin(), vec.end(), cmp); - - for (int i = 0; i < node->childrenSize; i++) { - node->children[i] = vec[i]; - } + // 此处选用排序算法 + NODE_PTR_SORT_FUN(sqrt_sort_sort_ins)(node->children, node->childrenSize); #ifdef DEBUG_SORT if (tempGame.position.sideToMove == PLAYER_BLACK) { for (int i = 0; i < node->childrenSize; i++) { - loggerDebug("+ [%d] %x = %d\n", i, &(node->children[i]), node->children[i]->value); + loggerDebug("+ [%d] %p: %d = %d (%d)\n", + i, &(node->children[i]), node->children[i]->move, node->children[i]->value, !node->children[i]->pruned); } } else { for (int i = 0; i < node->childrenSize; i++) { - loggerDebug("- [%d] %x = %d\n", i, &(node->children[i]), node->children[i]->value); + loggerDebug("- [%d] %p: %d = %d (%d)\n", + i, &(node->children[i]), node->children[i]->move, node->children[i]->value, !node->children[i]->pruned); } } loggerDebug("\n----------------------------------------\n"); diff --git a/src/ai/search.h b/src/ai/search.h index 40a9633d..a649950a 100644 --- a/src/ai/search.h +++ b/src/ai/search.h @@ -100,6 +100,46 @@ public: #endif /* TRANSPOSITION_TABLE_ENABLE */ hash_t hash; // 哈希值 #endif /* DEBUG_AB_TREE */ + + bool operator < (const Node &other) + { + if (value < other.value) { + return true; + } + + if (value == other.value) { + if (pruned && !other.pruned) { + return true; + } + } + + return false; + } + + bool operator > (const Node &other) + { + if (value > other.value) { + return true; + } + + if (value == other.value) { + if (!pruned && other.pruned) { + return true; + } + } + + return false; + } + + bool operator == (const Node &other) + { + if (value == other.value && + pruned == other.pruned) { + return true; + } + + return false; + } }; #ifdef MEMORY_POOL @@ -139,6 +179,7 @@ public: #ifdef MEMORY_POOL static bool nodeLess(const Node *first, const Node *second); static bool nodeGreater(const Node *first, const Node *second); + static int nodeCompare(const Node *first, const Node *second); #endif // MEMORY_POOL #ifdef ENDGAME_LEARNING diff --git a/src/base/sort.h b/src/base/sort.h new file mode 100644 index 00000000..10d82556 --- /dev/null +++ b/src/base/sort.h @@ -0,0 +1,3278 @@ +/* Copyright (c) 2010-2019 Christopher Swenson. */ +/* Copyright (c) 2012 Vojtech Fried. */ +/* Copyright (c) 2012 Google Inc. All Rights Reserved. */ + +#include +#include +#include +#include + +#ifndef SORT_NAME +#error "Must declare SORT_NAME" +#endif + +#ifndef SORT_TYPE +#error "Must declare SORT_TYPE" +#endif + +#ifndef SORT_CMP +#define SORT_CMP(x, y) ((x) < (y) ? -1 : ((y) < (x) ? 1 : 0)) +#endif + +#ifndef TIM_SORT_STACK_SIZE +#define TIM_SORT_STACK_SIZE 128 +#endif + +#ifndef TIM_SORT_MIN_GALLOP +#define TIM_SORT_MIN_GALLOP 7 +#endif + +#ifndef SORT_SWAP +#define SORT_SWAP(x,y) {SORT_TYPE _sort_swap_temp = (x); (x) = (y); (y) = _sort_swap_temp;} +#endif + +/* Common, type-agnostic functions and constants that we don't want to declare twice. */ +#ifndef SORT_COMMON_H +#define SORT_COMMON_H + +#ifndef MAX +#define MAX(x,y) (((x) > (y) ? (x) : (y))) +#endif + +#ifndef MIN +#define MIN(x,y) (((x) < (y) ? (x) : (y))) +#endif + +static int compute_minrun(const uint64_t); + +/* From http://oeis.org/classic/A102549 */ +static const uint64_t shell_gaps[48] = {1, 4, 10, 23, 57, 132, 301, 701, 1750, 4376, 10941, 27353, 68383, 170958, 427396, 1068491, 2671228, 6678071, 16695178, 41737946, 104344866, 260862166, 652155416, 1630388541, 4075971353LL, 10189928383LL, 25474820958LL, 63687052396LL, 159217630991LL, 398044077478LL, 995110193696LL, 2487775484241LL, 6219438710603LL, 15548596776508LL, 38871491941271LL, 97178729853178LL, 242946824632946LL, 607367061582366LL, 1518417653955916LL, 3796044134889791LL, 9490110337224478LL, 23725275843061196LL, 59313189607652991LL, 148282974019132478LL, 370707435047831196LL, 926768587619577991LL, 2316921469048944978LL, 5792303672622362446LL}; + +#ifndef CLZ +/* clang-only */ +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif +#if __has_builtin(__builtin_clzll) || (defined(__GNUC__) && ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ > 3))) +#define CLZ __builtin_clzll +#else + +static int clzll(uint64_t); + +/* adapted from Hacker's Delight */ +static int clzll(uint64_t x) { + int n; + + if (x == 0) { + return 64; + } + + n = 0; + + if (x <= 0x00000000FFFFFFFFL) { + n = n + 32; + x = x << 32; + } + + if (x <= 0x0000FFFFFFFFFFFFL) { + n = n + 16; + x = x << 16; + } + + if (x <= 0x00FFFFFFFFFFFFFFL) { + n = n + 8; + x = x << 8; + } + + if (x <= 0x0FFFFFFFFFFFFFFFL) { + n = n + 4; + x = x << 4; + } + + if (x <= 0x3FFFFFFFFFFFFFFFL) { + n = n + 2; + x = x << 2; + } + + if (x <= 0x7FFFFFFFFFFFFFFFL) { + n = n + 1; + } + + return n; +} + +#define CLZ clzll +#endif +#endif + +static __inline int compute_minrun(const uint64_t size) { + const int top_bit = 64 - CLZ(size); + const int shift = MAX(top_bit, 6) - 6; + const int minrun = size >> shift; + const uint64_t mask = (1ULL << shift) - 1; + + if (mask & size) { + return minrun + 1; + } + + return minrun; +} + +static __inline size_t rbnd(size_t len) { + int k; + + if (len < 16) { + return 2; + } + + k = 62 - CLZ(len); + return 1ULL << ((2 * k) / 3); +} + +#endif /* SORT_COMMON_H */ + +#define SORT_CONCAT(x, y) x ## _ ## y +#define SORT_MAKE_STR1(x, y) SORT_CONCAT(x,y) +#define SORT_MAKE_STR(x) SORT_MAKE_STR1(SORT_NAME,x) + +#ifndef SMALL_SORT_BND +#define SMALL_SORT_BND 16 +#endif +#ifndef SMALL_SORT +#define SMALL_SORT BITONIC_SORT +/*#define SMALL_SORT BINARY_INSERTION_SORT*/ +#endif + +#define BITONIC_SORT SORT_MAKE_STR(bitonic_sort) +#define BINARY_INSERTION_FIND SORT_MAKE_STR(binary_insertion_find) +#define BINARY_INSERTION_SORT_START SORT_MAKE_STR(binary_insertion_sort_start) +#define BINARY_INSERTION_SORT SORT_MAKE_STR(binary_insertion_sort) +#define REVERSE_ELEMENTS SORT_MAKE_STR(reverse_elements) +#define COUNT_RUN SORT_MAKE_STR(count_run) +#define CHECK_INVARIANT SORT_MAKE_STR(check_invariant) +#define TIM_SORT SORT_MAKE_STR(tim_sort) +#define TIM_SORT_GALLOP SORT_MAKE_STR(tim_sort_gallop) +#define TIM_SORT_RESIZE SORT_MAKE_STR(tim_sort_resize) +#define TIM_SORT_MERGE SORT_MAKE_STR(tim_sort_merge) +#define TIM_SORT_MERGE_LEFT SORT_MAKE_STR(tim_sort_merge_left) +#define TIM_SORT_MERGE_RIGHT SORT_MAKE_STR(tim_sort_merge_right) +#define TIM_SORT_COLLAPSE SORT_MAKE_STR(tim_sort_collapse) +#define HEAP_SORT SORT_MAKE_STR(heap_sort) +#define MEDIAN SORT_MAKE_STR(median) +#define QUICK_SORT SORT_MAKE_STR(quick_sort) +#define MERGE_SORT SORT_MAKE_STR(merge_sort) +#define MERGE_SORT_RECURSIVE SORT_MAKE_STR(merge_sort_recursive) +#define MERGE_SORT_IN_PLACE SORT_MAKE_STR(merge_sort_in_place) +#define MERGE_SORT_IN_PLACE_RMERGE SORT_MAKE_STR(merge_sort_in_place_rmerge) +#define MERGE_SORT_IN_PLACE_BACKMERGE SORT_MAKE_STR(merge_sort_in_place_backmerge) +#define MERGE_SORT_IN_PLACE_FRONTMERGE SORT_MAKE_STR(merge_sort_in_place_frontmerge) +#define MERGE_SORT_IN_PLACE_ASWAP SORT_MAKE_STR(merge_sort_in_place_aswap) +#define SELECTION_SORT SORT_MAKE_STR(selection_sort) +#define SHELL_SORT SORT_MAKE_STR(shell_sort) +#define QUICK_SORT_PARTITION SORT_MAKE_STR(quick_sort_partition) +#define QUICK_SORT_RECURSIVE SORT_MAKE_STR(quick_sort_recursive) +#define HEAP_SIFT_DOWN SORT_MAKE_STR(heap_sift_down) +#define HEAPIFY SORT_MAKE_STR(heapify) +#define TIM_SORT_RUN_T SORT_MAKE_STR(tim_sort_run_t) +#define TEMP_STORAGE_T SORT_MAKE_STR(temp_storage_t) +#define PUSH_NEXT SORT_MAKE_STR(push_next) +#define GRAIL_SWAP1 SORT_MAKE_STR(grail_swap1) +#define REC_STABLE_SORT SORT_MAKE_STR(rec_stable_sort) +#define GRAIL_REC_MERGE SORT_MAKE_STR(grail_rec_merge) +#define GRAIL_SORT_DYN_BUFFER SORT_MAKE_STR(grail_sort_dyn_buffer) +#define GRAIL_SORT_FIXED_BUFFER SORT_MAKE_STR(grail_sort_fixed_buffer) +#define GRAIL_COMMON_SORT SORT_MAKE_STR(grail_common_sort) +#define GRAIL_SORT SORT_MAKE_STR(grail_sort) +#define GRAIL_COMBINE_BLOCKS SORT_MAKE_STR(grail_combine_blocks) +#define GRAIL_LAZY_STABLE_SORT SORT_MAKE_STR(grail_lazy_stable_sort) +#define GRAIL_MERGE_WITHOUT_BUFFER SORT_MAKE_STR(grail_merge_without_buffer) +#define GRAIL_ROTATE SORT_MAKE_STR(grail_rotate) +#define GRAIL_BIN_SEARCH_LEFT SORT_MAKE_STR(grail_bin_search_left) +#define GRAIL_BUILD_BLOCKS SORT_MAKE_STR(grail_build_blocks) +#define GRAIL_FIND_KEYS SORT_MAKE_STR(grail_find_keys) +#define GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF SORT_MAKE_STR(grail_merge_buffers_left_with_x_buf) +#define GRAIL_BIN_SEARCH_RIGHT SORT_MAKE_STR(grail_bin_search_right) +#define GRAIL_MERGE_BUFFERS_LEFT SORT_MAKE_STR(grail_merge_buffers_left) +#define GRAIL_SMART_MERGE_WITH_X_BUF SORT_MAKE_STR(grail_smart_merge_with_x_buf) +#define GRAIL_MERGE_LEFT_WITH_X_BUF SORT_MAKE_STR(grail_merge_left_with_x_buf) +#define GRAIL_SMART_MERGE_WITHOUT_BUFFER SORT_MAKE_STR(grail_smart_merge_without_buffer) +#define GRAIL_SMART_MERGE_WITH_BUFFER SORT_MAKE_STR(grail_smart_merge_with_buffer) +#define GRAIL_MERGE_RIGHT SORT_MAKE_STR(grail_merge_right) +#define GRAIL_MERGE_LEFT SORT_MAKE_STR(grail_merge_left) +#define GRAIL_SWAP_N SORT_MAKE_STR(grail_swap_n) +#define SQRT_SORT SORT_MAKE_STR(sqrt_sort) +#define SQRT_SORT_BUILD_BLOCKS SORT_MAKE_STR(sqrt_sort_build_blocks) +#define SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF SORT_MAKE_STR(sqrt_sort_merge_buffers_left_with_x_buf) +#define SQRT_SORT_MERGE_DOWN SORT_MAKE_STR(sqrt_sort_merge_down) +#define SQRT_SORT_MERGE_LEFT_WITH_X_BUF SORT_MAKE_STR(sqrt_sort_merge_left_with_x_buf) +#define SQRT_SORT_MERGE_RIGHT SORT_MAKE_STR(sqrt_sort_merge_right) +#define SQRT_SORT_SWAP_N SORT_MAKE_STR(sqrt_sort_swap_n) +#define SQRT_SORT_SWAP_1 SORT_MAKE_STR(sqrt_sort_swap_1) +#define SQRT_SORT_SMART_MERGE_WITH_X_BUF SORT_MAKE_STR(sqrt_sort_smart_merge_with_x_buf) +#define SQRT_SORT_SORT_INS SORT_MAKE_STR(sqrt_sort_sort_ins) +#define SQRT_SORT_COMBINE_BLOCKS SORT_MAKE_STR(sqrt_sort_combine_blocks) +#define SQRT_SORT_COMMON_SORT SORT_MAKE_STR(sqrt_sort_common_sort) +#define BUBBLE_SORT SORT_MAKE_STR(bubble_sort) + +#ifndef MAX +#define MAX(x,y) (((x) > (y) ? (x) : (y))) +#endif +#ifndef MIN +#define MIN(x,y) (((x) < (y) ? (x) : (y))) +#endif +#ifndef SORT_CSWAP +#define SORT_CSWAP(x, y) { if(SORT_CMP((x),(y)) > 0) {SORT_SWAP((x),(y));}} +#endif + +typedef struct { + size_t start; + size_t length; +} TIM_SORT_RUN_T; + + +void SHELL_SORT(SORT_TYPE *dst, const size_t size); +void BINARY_INSERTION_SORT(SORT_TYPE *dst, const size_t size); +void HEAP_SORT(SORT_TYPE *dst, const size_t size); +void QUICK_SORT(SORT_TYPE *dst, const size_t size); +void MERGE_SORT(SORT_TYPE *dst, const size_t size); +void MERGE_SORT_IN_PLACE(SORT_TYPE *dst, const size_t size); +void SELECTION_SORT(SORT_TYPE *dst, const size_t size); +void TIM_SORT(SORT_TYPE *dst, const size_t size); +void BUBBLE_SORT(SORT_TYPE *dst, const size_t size); +void BITONIC_SORT(SORT_TYPE *dst, const size_t size); + +/* The full implementation of a bitonic sort is not here. Since we only want to use + sorting networks for small length lists we create optimal sorting networks for + lists of length <= 16 and call out to BINARY_INSERTION_SORT for anything larger + than 16. + Optimal sorting networks for small length lists. + Taken from https://pages.ripco.net/~jgamble/nw.html */ +#define BITONIC_SORT_2 SORT_MAKE_STR(bitonic_sort_2) +static __inline void BITONIC_SORT_2(SORT_TYPE *dst) { + SORT_CSWAP(dst[0], dst[1]); +} + + +#define BITONIC_SORT_3 SORT_MAKE_STR(bitonic_sort_3) +static __inline void BITONIC_SORT_3(SORT_TYPE *dst) { + SORT_CSWAP(dst[1], dst[2]); + SORT_CSWAP(dst[0], dst[2]); + SORT_CSWAP(dst[0], dst[1]); +} + + +#define BITONIC_SORT_4 SORT_MAKE_STR(bitonic_sort_4) +static __inline void BITONIC_SORT_4(SORT_TYPE *dst) { + SORT_CSWAP(dst[0], dst[1]); + SORT_CSWAP(dst[2], dst[3]); + SORT_CSWAP(dst[0], dst[2]); + SORT_CSWAP(dst[1], dst[3]); + SORT_CSWAP(dst[1], dst[2]); +} + + +#define BITONIC_SORT_5 SORT_MAKE_STR(bitonic_sort_5) +static __inline void BITONIC_SORT_5(SORT_TYPE *dst) { + SORT_CSWAP(dst[0], dst[1]); + SORT_CSWAP(dst[3], dst[4]); + SORT_CSWAP(dst[2], dst[4]); + SORT_CSWAP(dst[2], dst[3]); + SORT_CSWAP(dst[1], dst[4]); + SORT_CSWAP(dst[0], dst[3]); + SORT_CSWAP(dst[0], dst[2]); + SORT_CSWAP(dst[1], dst[3]); + SORT_CSWAP(dst[1], dst[2]); +} + + +#define BITONIC_SORT_6 SORT_MAKE_STR(bitonic_sort_6) +static __inline void BITONIC_SORT_6(SORT_TYPE *dst) { + SORT_CSWAP(dst[1], dst[2]); + SORT_CSWAP(dst[4], dst[5]); + SORT_CSWAP(dst[0], dst[2]); + SORT_CSWAP(dst[3], dst[5]); + SORT_CSWAP(dst[0], dst[1]); + SORT_CSWAP(dst[3], dst[4]); + SORT_CSWAP(dst[2], dst[5]); + SORT_CSWAP(dst[0], dst[3]); + SORT_CSWAP(dst[1], dst[4]); + SORT_CSWAP(dst[2], dst[4]); + SORT_CSWAP(dst[1], dst[3]); + SORT_CSWAP(dst[2], dst[3]); +} + + +#define BITONIC_SORT_7 SORT_MAKE_STR(bitonic_sort_7) +static __inline void BITONIC_SORT_7(SORT_TYPE *dst) { + SORT_CSWAP(dst[1], dst[2]); + SORT_CSWAP(dst[3], dst[4]); + SORT_CSWAP(dst[5], dst[6]); + SORT_CSWAP(dst[0], dst[2]); + SORT_CSWAP(dst[3], dst[5]); + SORT_CSWAP(dst[4], dst[6]); + SORT_CSWAP(dst[0], dst[1]); + SORT_CSWAP(dst[4], dst[5]); + SORT_CSWAP(dst[2], dst[6]); + SORT_CSWAP(dst[0], dst[4]); + SORT_CSWAP(dst[1], dst[5]); + SORT_CSWAP(dst[0], dst[3]); + SORT_CSWAP(dst[2], dst[5]); + SORT_CSWAP(dst[1], dst[3]); + SORT_CSWAP(dst[2], dst[4]); + SORT_CSWAP(dst[2], dst[3]); +} + + +#define BITONIC_SORT_8 SORT_MAKE_STR(bitonic_sort_8) +static __inline void BITONIC_SORT_8(SORT_TYPE *dst) { + SORT_CSWAP(dst[0], dst[1]); + SORT_CSWAP(dst[2], dst[3]); + SORT_CSWAP(dst[4], dst[5]); + SORT_CSWAP(dst[6], dst[7]); + SORT_CSWAP(dst[0], dst[2]); + SORT_CSWAP(dst[1], dst[3]); + SORT_CSWAP(dst[4], dst[6]); + SORT_CSWAP(dst[5], dst[7]); + SORT_CSWAP(dst[1], dst[2]); + SORT_CSWAP(dst[5], dst[6]); + SORT_CSWAP(dst[0], dst[4]); + SORT_CSWAP(dst[3], dst[7]); + SORT_CSWAP(dst[1], dst[5]); + SORT_CSWAP(dst[2], dst[6]); + SORT_CSWAP(dst[1], dst[4]); + SORT_CSWAP(dst[3], dst[6]); + SORT_CSWAP(dst[2], dst[4]); + SORT_CSWAP(dst[3], dst[5]); + SORT_CSWAP(dst[3], dst[4]); +} + + +#define BITONIC_SORT_9 SORT_MAKE_STR(bitonic_sort_9) +static __inline void BITONIC_SORT_9(SORT_TYPE *dst) { + SORT_CSWAP(dst[0], dst[1]); + SORT_CSWAP(dst[3], dst[4]); + SORT_CSWAP(dst[6], dst[7]); + SORT_CSWAP(dst[1], dst[2]); + SORT_CSWAP(dst[4], dst[5]); + SORT_CSWAP(dst[7], dst[8]); + SORT_CSWAP(dst[0], dst[1]); + SORT_CSWAP(dst[3], dst[4]); + SORT_CSWAP(dst[6], dst[7]); + SORT_CSWAP(dst[2], dst[5]); + SORT_CSWAP(dst[0], dst[3]); + SORT_CSWAP(dst[1], dst[4]); + SORT_CSWAP(dst[5], dst[8]); + SORT_CSWAP(dst[3], dst[6]); + SORT_CSWAP(dst[4], dst[7]); + SORT_CSWAP(dst[2], dst[5]); + SORT_CSWAP(dst[0], dst[3]); + SORT_CSWAP(dst[1], dst[4]); + SORT_CSWAP(dst[5], dst[7]); + SORT_CSWAP(dst[2], dst[6]); + SORT_CSWAP(dst[1], dst[3]); + SORT_CSWAP(dst[4], dst[6]); + SORT_CSWAP(dst[2], dst[4]); + SORT_CSWAP(dst[5], dst[6]); + SORT_CSWAP(dst[2], dst[3]); +} + + +#define BITONIC_SORT_10 SORT_MAKE_STR(bitonic_sort_10) +static __inline void BITONIC_SORT_10(SORT_TYPE *dst) { + SORT_CSWAP(dst[4], dst[9]); + SORT_CSWAP(dst[3], dst[8]); + SORT_CSWAP(dst[2], dst[7]); + SORT_CSWAP(dst[1], dst[6]); + SORT_CSWAP(dst[0], dst[5]); + SORT_CSWAP(dst[1], dst[4]); + SORT_CSWAP(dst[6], dst[9]); + SORT_CSWAP(dst[0], dst[3]); + SORT_CSWAP(dst[5], dst[8]); + SORT_CSWAP(dst[0], dst[2]); + SORT_CSWAP(dst[3], dst[6]); + SORT_CSWAP(dst[7], dst[9]); + SORT_CSWAP(dst[0], dst[1]); + SORT_CSWAP(dst[2], dst[4]); + SORT_CSWAP(dst[5], dst[7]); + SORT_CSWAP(dst[8], dst[9]); + SORT_CSWAP(dst[1], dst[2]); + SORT_CSWAP(dst[4], dst[6]); + SORT_CSWAP(dst[7], dst[8]); + SORT_CSWAP(dst[3], dst[5]); + SORT_CSWAP(dst[2], dst[5]); + SORT_CSWAP(dst[6], dst[8]); + SORT_CSWAP(dst[1], dst[3]); + SORT_CSWAP(dst[4], dst[7]); + SORT_CSWAP(dst[2], dst[3]); + SORT_CSWAP(dst[6], dst[7]); + SORT_CSWAP(dst[3], dst[4]); + SORT_CSWAP(dst[5], dst[6]); + SORT_CSWAP(dst[4], dst[5]); +} + + +#define BITONIC_SORT_11 SORT_MAKE_STR(bitonic_sort_11) +static __inline void BITONIC_SORT_11(SORT_TYPE *dst) { + SORT_CSWAP(dst[0], dst[1]); + SORT_CSWAP(dst[2], dst[3]); + SORT_CSWAP(dst[4], dst[5]); + SORT_CSWAP(dst[6], dst[7]); + SORT_CSWAP(dst[8], dst[9]); + SORT_CSWAP(dst[1], dst[3]); + SORT_CSWAP(dst[5], dst[7]); + SORT_CSWAP(dst[0], dst[2]); + SORT_CSWAP(dst[4], dst[6]); + SORT_CSWAP(dst[8], dst[10]); + SORT_CSWAP(dst[1], dst[2]); + SORT_CSWAP(dst[5], dst[6]); + SORT_CSWAP(dst[9], dst[10]); + SORT_CSWAP(dst[0], dst[4]); + SORT_CSWAP(dst[3], dst[7]); + SORT_CSWAP(dst[1], dst[5]); + SORT_CSWAP(dst[6], dst[10]); + SORT_CSWAP(dst[4], dst[8]); + SORT_CSWAP(dst[5], dst[9]); + SORT_CSWAP(dst[2], dst[6]); + SORT_CSWAP(dst[0], dst[4]); + SORT_CSWAP(dst[3], dst[8]); + SORT_CSWAP(dst[1], dst[5]); + SORT_CSWAP(dst[6], dst[10]); + SORT_CSWAP(dst[2], dst[3]); + SORT_CSWAP(dst[8], dst[9]); + SORT_CSWAP(dst[1], dst[4]); + SORT_CSWAP(dst[7], dst[10]); + SORT_CSWAP(dst[3], dst[5]); + SORT_CSWAP(dst[6], dst[8]); + SORT_CSWAP(dst[2], dst[4]); + SORT_CSWAP(dst[7], dst[9]); + SORT_CSWAP(dst[5], dst[6]); + SORT_CSWAP(dst[3], dst[4]); + SORT_CSWAP(dst[7], dst[8]); +} + + +#define BITONIC_SORT_12 SORT_MAKE_STR(bitonic_sort_12) +static __inline void BITONIC_SORT_12(SORT_TYPE *dst) { + SORT_CSWAP(dst[0], dst[1]); + SORT_CSWAP(dst[2], dst[3]); + SORT_CSWAP(dst[4], dst[5]); + SORT_CSWAP(dst[6], dst[7]); + SORT_CSWAP(dst[8], dst[9]); + SORT_CSWAP(dst[10], dst[11]); + SORT_CSWAP(dst[1], dst[3]); + SORT_CSWAP(dst[5], dst[7]); + SORT_CSWAP(dst[9], dst[11]); + SORT_CSWAP(dst[0], dst[2]); + SORT_CSWAP(dst[4], dst[6]); + SORT_CSWAP(dst[8], dst[10]); + SORT_CSWAP(dst[1], dst[2]); + SORT_CSWAP(dst[5], dst[6]); + SORT_CSWAP(dst[9], dst[10]); + SORT_CSWAP(dst[0], dst[4]); + SORT_CSWAP(dst[7], dst[11]); + SORT_CSWAP(dst[1], dst[5]); + SORT_CSWAP(dst[6], dst[10]); + SORT_CSWAP(dst[3], dst[7]); + SORT_CSWAP(dst[4], dst[8]); + SORT_CSWAP(dst[5], dst[9]); + SORT_CSWAP(dst[2], dst[6]); + SORT_CSWAP(dst[0], dst[4]); + SORT_CSWAP(dst[7], dst[11]); + SORT_CSWAP(dst[3], dst[8]); + SORT_CSWAP(dst[1], dst[5]); + SORT_CSWAP(dst[6], dst[10]); + SORT_CSWAP(dst[2], dst[3]); + SORT_CSWAP(dst[8], dst[9]); + SORT_CSWAP(dst[1], dst[4]); + SORT_CSWAP(dst[7], dst[10]); + SORT_CSWAP(dst[3], dst[5]); + SORT_CSWAP(dst[6], dst[8]); + SORT_CSWAP(dst[2], dst[4]); + SORT_CSWAP(dst[7], dst[9]); + SORT_CSWAP(dst[5], dst[6]); + SORT_CSWAP(dst[3], dst[4]); + SORT_CSWAP(dst[7], dst[8]); +} + + +#define BITONIC_SORT_13 SORT_MAKE_STR(bitonic_sort_13) +static __inline void BITONIC_SORT_13(SORT_TYPE *dst) { + SORT_CSWAP(dst[1], dst[7]); + SORT_CSWAP(dst[9], dst[11]); + SORT_CSWAP(dst[3], dst[4]); + SORT_CSWAP(dst[5], dst[8]); + SORT_CSWAP(dst[0], dst[12]); + SORT_CSWAP(dst[2], dst[6]); + SORT_CSWAP(dst[0], dst[1]); + SORT_CSWAP(dst[2], dst[3]); + SORT_CSWAP(dst[4], dst[6]); + SORT_CSWAP(dst[8], dst[11]); + SORT_CSWAP(dst[7], dst[12]); + SORT_CSWAP(dst[5], dst[9]); + SORT_CSWAP(dst[0], dst[2]); + SORT_CSWAP(dst[3], dst[7]); + SORT_CSWAP(dst[10], dst[11]); + SORT_CSWAP(dst[1], dst[4]); + SORT_CSWAP(dst[6], dst[12]); + SORT_CSWAP(dst[7], dst[8]); + SORT_CSWAP(dst[11], dst[12]); + SORT_CSWAP(dst[4], dst[9]); + SORT_CSWAP(dst[6], dst[10]); + SORT_CSWAP(dst[3], dst[4]); + SORT_CSWAP(dst[5], dst[6]); + SORT_CSWAP(dst[8], dst[9]); + SORT_CSWAP(dst[10], dst[11]); + SORT_CSWAP(dst[1], dst[7]); + SORT_CSWAP(dst[2], dst[6]); + SORT_CSWAP(dst[9], dst[11]); + SORT_CSWAP(dst[1], dst[3]); + SORT_CSWAP(dst[4], dst[7]); + SORT_CSWAP(dst[8], dst[10]); + SORT_CSWAP(dst[0], dst[5]); + SORT_CSWAP(dst[2], dst[5]); + SORT_CSWAP(dst[6], dst[8]); + SORT_CSWAP(dst[9], dst[10]); + SORT_CSWAP(dst[1], dst[2]); + SORT_CSWAP(dst[3], dst[5]); + SORT_CSWAP(dst[7], dst[8]); + SORT_CSWAP(dst[4], dst[6]); + SORT_CSWAP(dst[2], dst[3]); + SORT_CSWAP(dst[4], dst[5]); + SORT_CSWAP(dst[6], dst[7]); + SORT_CSWAP(dst[8], dst[9]); + SORT_CSWAP(dst[3], dst[4]); + SORT_CSWAP(dst[5], dst[6]); +} + + +#define BITONIC_SORT_14 SORT_MAKE_STR(bitonic_sort_14) +static __inline void BITONIC_SORT_14(SORT_TYPE *dst) { + SORT_CSWAP(dst[0], dst[1]); + SORT_CSWAP(dst[2], dst[3]); + SORT_CSWAP(dst[4], dst[5]); + SORT_CSWAP(dst[6], dst[7]); + SORT_CSWAP(dst[8], dst[9]); + SORT_CSWAP(dst[10], dst[11]); + SORT_CSWAP(dst[12], dst[13]); + SORT_CSWAP(dst[0], dst[2]); + SORT_CSWAP(dst[4], dst[6]); + SORT_CSWAP(dst[8], dst[10]); + SORT_CSWAP(dst[1], dst[3]); + SORT_CSWAP(dst[5], dst[7]); + SORT_CSWAP(dst[9], dst[11]); + SORT_CSWAP(dst[0], dst[4]); + SORT_CSWAP(dst[8], dst[12]); + SORT_CSWAP(dst[1], dst[5]); + SORT_CSWAP(dst[9], dst[13]); + SORT_CSWAP(dst[2], dst[6]); + SORT_CSWAP(dst[3], dst[7]); + SORT_CSWAP(dst[0], dst[8]); + SORT_CSWAP(dst[1], dst[9]); + SORT_CSWAP(dst[2], dst[10]); + SORT_CSWAP(dst[3], dst[11]); + SORT_CSWAP(dst[4], dst[12]); + SORT_CSWAP(dst[5], dst[13]); + SORT_CSWAP(dst[5], dst[10]); + SORT_CSWAP(dst[6], dst[9]); + SORT_CSWAP(dst[3], dst[12]); + SORT_CSWAP(dst[7], dst[11]); + SORT_CSWAP(dst[1], dst[2]); + SORT_CSWAP(dst[4], dst[8]); + SORT_CSWAP(dst[1], dst[4]); + SORT_CSWAP(dst[7], dst[13]); + SORT_CSWAP(dst[2], dst[8]); + SORT_CSWAP(dst[5], dst[6]); + SORT_CSWAP(dst[9], dst[10]); + SORT_CSWAP(dst[2], dst[4]); + SORT_CSWAP(dst[11], dst[13]); + SORT_CSWAP(dst[3], dst[8]); + SORT_CSWAP(dst[7], dst[12]); + SORT_CSWAP(dst[6], dst[8]); + SORT_CSWAP(dst[10], dst[12]); + SORT_CSWAP(dst[3], dst[5]); + SORT_CSWAP(dst[7], dst[9]); + SORT_CSWAP(dst[3], dst[4]); + SORT_CSWAP(dst[5], dst[6]); + SORT_CSWAP(dst[7], dst[8]); + SORT_CSWAP(dst[9], dst[10]); + SORT_CSWAP(dst[11], dst[12]); + SORT_CSWAP(dst[6], dst[7]); + SORT_CSWAP(dst[8], dst[9]); +} + + +#define BITONIC_SORT_15 SORT_MAKE_STR(bitonic_sort_15) +static __inline void BITONIC_SORT_15(SORT_TYPE *dst) { + SORT_CSWAP(dst[0], dst[1]); + SORT_CSWAP(dst[2], dst[3]); + SORT_CSWAP(dst[4], dst[5]); + SORT_CSWAP(dst[6], dst[7]); + SORT_CSWAP(dst[8], dst[9]); + SORT_CSWAP(dst[10], dst[11]); + SORT_CSWAP(dst[12], dst[13]); + SORT_CSWAP(dst[0], dst[2]); + SORT_CSWAP(dst[4], dst[6]); + SORT_CSWAP(dst[8], dst[10]); + SORT_CSWAP(dst[12], dst[14]); + SORT_CSWAP(dst[1], dst[3]); + SORT_CSWAP(dst[5], dst[7]); + SORT_CSWAP(dst[9], dst[11]); + SORT_CSWAP(dst[0], dst[4]); + SORT_CSWAP(dst[8], dst[12]); + SORT_CSWAP(dst[1], dst[5]); + SORT_CSWAP(dst[9], dst[13]); + SORT_CSWAP(dst[2], dst[6]); + SORT_CSWAP(dst[10], dst[14]); + SORT_CSWAP(dst[3], dst[7]); + SORT_CSWAP(dst[0], dst[8]); + SORT_CSWAP(dst[1], dst[9]); + SORT_CSWAP(dst[2], dst[10]); + SORT_CSWAP(dst[3], dst[11]); + SORT_CSWAP(dst[4], dst[12]); + SORT_CSWAP(dst[5], dst[13]); + SORT_CSWAP(dst[6], dst[14]); + SORT_CSWAP(dst[5], dst[10]); + SORT_CSWAP(dst[6], dst[9]); + SORT_CSWAP(dst[3], dst[12]); + SORT_CSWAP(dst[13], dst[14]); + SORT_CSWAP(dst[7], dst[11]); + SORT_CSWAP(dst[1], dst[2]); + SORT_CSWAP(dst[4], dst[8]); + SORT_CSWAP(dst[1], dst[4]); + SORT_CSWAP(dst[7], dst[13]); + SORT_CSWAP(dst[2], dst[8]); + SORT_CSWAP(dst[11], dst[14]); + SORT_CSWAP(dst[5], dst[6]); + SORT_CSWAP(dst[9], dst[10]); + SORT_CSWAP(dst[2], dst[4]); + SORT_CSWAP(dst[11], dst[13]); + SORT_CSWAP(dst[3], dst[8]); + SORT_CSWAP(dst[7], dst[12]); + SORT_CSWAP(dst[6], dst[8]); + SORT_CSWAP(dst[10], dst[12]); + SORT_CSWAP(dst[3], dst[5]); + SORT_CSWAP(dst[7], dst[9]); + SORT_CSWAP(dst[3], dst[4]); + SORT_CSWAP(dst[5], dst[6]); + SORT_CSWAP(dst[7], dst[8]); + SORT_CSWAP(dst[9], dst[10]); + SORT_CSWAP(dst[11], dst[12]); + SORT_CSWAP(dst[6], dst[7]); + SORT_CSWAP(dst[8], dst[9]); +} + + +#define BITONIC_SORT_16 SORT_MAKE_STR(bitonic_sort_16) +static __inline void BITONIC_SORT_16(SORT_TYPE *dst) { + SORT_CSWAP(dst[0], dst[1]); + SORT_CSWAP(dst[2], dst[3]); + SORT_CSWAP(dst[4], dst[5]); + SORT_CSWAP(dst[6], dst[7]); + SORT_CSWAP(dst[8], dst[9]); + SORT_CSWAP(dst[10], dst[11]); + SORT_CSWAP(dst[12], dst[13]); + SORT_CSWAP(dst[14], dst[15]); + SORT_CSWAP(dst[0], dst[2]); + SORT_CSWAP(dst[4], dst[6]); + SORT_CSWAP(dst[8], dst[10]); + SORT_CSWAP(dst[12], dst[14]); + SORT_CSWAP(dst[1], dst[3]); + SORT_CSWAP(dst[5], dst[7]); + SORT_CSWAP(dst[9], dst[11]); + SORT_CSWAP(dst[13], dst[15]); + SORT_CSWAP(dst[0], dst[4]); + SORT_CSWAP(dst[8], dst[12]); + SORT_CSWAP(dst[1], dst[5]); + SORT_CSWAP(dst[9], dst[13]); + SORT_CSWAP(dst[2], dst[6]); + SORT_CSWAP(dst[10], dst[14]); + SORT_CSWAP(dst[3], dst[7]); + SORT_CSWAP(dst[11], dst[15]); + SORT_CSWAP(dst[0], dst[8]); + SORT_CSWAP(dst[1], dst[9]); + SORT_CSWAP(dst[2], dst[10]); + SORT_CSWAP(dst[3], dst[11]); + SORT_CSWAP(dst[4], dst[12]); + SORT_CSWAP(dst[5], dst[13]); + SORT_CSWAP(dst[6], dst[14]); + SORT_CSWAP(dst[7], dst[15]); + SORT_CSWAP(dst[5], dst[10]); + SORT_CSWAP(dst[6], dst[9]); + SORT_CSWAP(dst[3], dst[12]); + SORT_CSWAP(dst[13], dst[14]); + SORT_CSWAP(dst[7], dst[11]); + SORT_CSWAP(dst[1], dst[2]); + SORT_CSWAP(dst[4], dst[8]); + SORT_CSWAP(dst[1], dst[4]); + SORT_CSWAP(dst[7], dst[13]); + SORT_CSWAP(dst[2], dst[8]); + SORT_CSWAP(dst[11], dst[14]); + SORT_CSWAP(dst[5], dst[6]); + SORT_CSWAP(dst[9], dst[10]); + SORT_CSWAP(dst[2], dst[4]); + SORT_CSWAP(dst[11], dst[13]); + SORT_CSWAP(dst[3], dst[8]); + SORT_CSWAP(dst[7], dst[12]); + SORT_CSWAP(dst[6], dst[8]); + SORT_CSWAP(dst[10], dst[12]); + SORT_CSWAP(dst[3], dst[5]); + SORT_CSWAP(dst[7], dst[9]); + SORT_CSWAP(dst[3], dst[4]); + SORT_CSWAP(dst[5], dst[6]); + SORT_CSWAP(dst[7], dst[8]); + SORT_CSWAP(dst[9], dst[10]); + SORT_CSWAP(dst[11], dst[12]); + SORT_CSWAP(dst[6], dst[7]); + SORT_CSWAP(dst[8], dst[9]); +} + +void BITONIC_SORT(SORT_TYPE *dst, const size_t size) { + switch (size) { + case 0: + case 1: + break; + + case 2: + BITONIC_SORT_2(dst); + break; + + case 3: + BITONIC_SORT_3(dst); + break; + + case 4: + BITONIC_SORT_4(dst); + break; + + case 5: + BITONIC_SORT_5(dst); + break; + + case 6: + BITONIC_SORT_6(dst); + break; + + case 7: + BITONIC_SORT_7(dst); + break; + + case 8: + BITONIC_SORT_8(dst); + break; + + case 9: + BITONIC_SORT_9(dst); + break; + + case 10: + BITONIC_SORT_10(dst); + break; + + case 11: + BITONIC_SORT_11(dst); + break; + + case 12: + BITONIC_SORT_12(dst); + break; + + case 13: + BITONIC_SORT_13(dst); + break; + + case 14: + BITONIC_SORT_14(dst); + break; + + case 15: + BITONIC_SORT_15(dst); + break; + + case 16: + BITONIC_SORT_16(dst); + break; + + default: + BINARY_INSERTION_SORT(dst, size); + } +} + + +/* Shell sort implementation based on Wikipedia article + http://en.wikipedia.org/wiki/Shell_sort +*/ +void SHELL_SORT(SORT_TYPE *dst, const size_t size) { + /* don't bother sorting an array of size 0 or 1 */ + /* TODO: binary search to find first gap? */ + int inci = 47; + size_t inc = shell_gaps[inci]; + size_t i; + + if (size <= 1) { + return; + } + + while (inc > (size >> 1)) { + inc = shell_gaps[--inci]; + } + + while (1) { + for (i = inc; i < size; i++) { + SORT_TYPE temp = dst[i]; + size_t j = i; + + while ((j >= inc) && (SORT_CMP(dst[j - inc], temp) > 0)) { + dst[j] = dst[j - inc]; + j -= inc; + } + + dst[j] = temp; + } + + if (inc == 1) { + break; + } + + inc = shell_gaps[--inci]; + } +} + +/* Function used to do a binary search for binary insertion sort */ +static __inline size_t BINARY_INSERTION_FIND(SORT_TYPE *dst, const SORT_TYPE x, + const size_t size) { + size_t l, c, r; + SORT_TYPE cx; + l = 0; + r = size - 1; + c = r >> 1; + + /* check for out of bounds at the beginning. */ + if (SORT_CMP(x, dst[0]) < 0) { + return 0; + } else if (SORT_CMP(x, dst[r]) > 0) { + return r; + } + + cx = dst[c]; + + while (1) { + const int val = SORT_CMP(x, cx); + + if (val < 0) { + if (c - l <= 1) { + return c; + } + + r = c; + } else { /* allow = for stability. The binary search favors the right. */ + if (r - c <= 1) { + return c + 1; + } + + l = c; + } + + c = l + ((r - l) >> 1); + cx = dst[c]; + } +} + +/* Binary insertion sort, but knowing that the first "start" entries are sorted. Used in timsort. */ +static void BINARY_INSERTION_SORT_START(SORT_TYPE *dst, const size_t start, const size_t size) { + size_t i; + + for (i = start; i < size; i++) { + size_t j; + SORT_TYPE x; + size_t location; + + /* If this entry is already correct, just move along */ + if (SORT_CMP(dst[i - 1], dst[i]) <= 0) { + continue; + } + + /* Else we need to find the right place, shift everything over, and squeeze in */ + x = dst[i]; + location = BINARY_INSERTION_FIND(dst, x, i); + + for (j = i - 1; j >= location; j--) { + dst[j + 1] = dst[j]; + + if (j == 0) { /* check edge case because j is unsigned */ + break; + } + } + + dst[location] = x; + } +} + +/* Binary insertion sort */ +void BINARY_INSERTION_SORT(SORT_TYPE *dst, const size_t size) { + /* don't bother sorting an array of size <= 1 */ + if (size <= 1) { + return; + } + + BINARY_INSERTION_SORT_START(dst, 1, size); +} + +/* Selection sort */ +void SELECTION_SORT(SORT_TYPE *dst, const size_t size) { + size_t i, j; + + /* don't bother sorting an array of size <= 1 */ + if (size <= 1) { + return; + } + + for (i = 0; i < size; i++) { + for (j = i + 1; j < size; j++) { + if (SORT_CMP(dst[j], dst[i]) < 0) { + SORT_SWAP(dst[i], dst[j]); + } + } + } +} + +/* In-place mergesort */ +void MERGE_SORT_IN_PLACE_ASWAP(SORT_TYPE * dst1, SORT_TYPE * dst2, size_t len) { + do { + SORT_SWAP(*dst1, *dst2); + dst1++; + dst2++; + } while (--len); +} + +void MERGE_SORT_IN_PLACE_FRONTMERGE(SORT_TYPE *dst1, size_t l1, SORT_TYPE *dst2, size_t l2) { + SORT_TYPE *dst0 = dst2 - l1; + + if (SORT_CMP(dst1[l1 - 1], dst2[0]) <= 0) { + MERGE_SORT_IN_PLACE_ASWAP(dst1, dst0, l1); + return; + } + + do { + while (SORT_CMP(*dst2, *dst1) > 0) { + SORT_SWAP(*dst1, *dst0); + dst1++; + dst0++; + + if (--l1 == 0) { + return; + } + } + + SORT_SWAP(*dst2, *dst0); + dst2++; + dst0++; + } while (--l2); + + do { + SORT_SWAP(*dst1, *dst0); + dst1++; + dst0++; + } while (--l1); +} + +size_t MERGE_SORT_IN_PLACE_BACKMERGE(SORT_TYPE * dst1, size_t l1, SORT_TYPE * dst2, size_t l2) { + size_t res; + SORT_TYPE *dst0 = dst2 + l1; + + if (SORT_CMP(dst1[1 - l1], dst2[0]) >= 0) { + MERGE_SORT_IN_PLACE_ASWAP(dst1 - l1 + 1, dst0 - l1 + 1, l1); + return l1; + } + + do { + while (SORT_CMP(*dst2, *dst1) < 0) { + SORT_SWAP(*dst1, *dst0); + dst1--; + dst0--; + + if (--l1 == 0) { + return 0; + } + } + + SORT_SWAP(*dst2, *dst0); + dst2--; + dst0--; + } while (--l2); + + res = l1; + + do { + SORT_SWAP(*dst1, *dst0); + dst1--; + dst0--; + } while (--l1); + + return res; +} + +/* merge dst[p0..p1) by buffer dst[p1..p1+r) */ +void MERGE_SORT_IN_PLACE_RMERGE(SORT_TYPE *dst, size_t len, size_t lp, size_t r) { + size_t i, lq; + int cv; + + if (SORT_CMP(dst[lp], dst[lp - 1]) >= 0) { + return; + } + + lq = lp; + + for (i = 0; i < len; i += r) { + /* select smallest dst[p0+n*r] */ + size_t q = i, j; + + for (j = lp; j <= lq; j += r) { + cv = SORT_CMP(dst[j], dst[q]); + + if (cv == 0) { + cv = SORT_CMP(dst[j + r - 1], dst[q + r - 1]); + } + + if (cv < 0) { + q = j; + } + } + + if (q != i) { + MERGE_SORT_IN_PLACE_ASWAP(dst + i, dst + q, r); /* swap it with current position */ + + if (q == lq && q < (len - r)) { + lq += r; + } + } + + if (i != 0 && SORT_CMP(dst[i], dst[i - 1]) < 0) { + MERGE_SORT_IN_PLACE_ASWAP(dst + len, dst + i, r); /* swap current position with buffer */ + MERGE_SORT_IN_PLACE_BACKMERGE(dst + (len + r - 1), r, dst + (i - 1), + r); /* buffer :merge: dst[i-r..i) -> dst[i-r..i+r) */ + } + + if (lp == i) { + lp += r; + } + } +} + +/* In-place Merge Sort implementation. (c)2012, Andrey Astrelin, astrelin@tochka.ru */ +void MERGE_SORT_IN_PLACE(SORT_TYPE *dst, const size_t len) { + /* don't bother sorting an array of size <= 1 */ + size_t r = rbnd(len); + size_t lr = (len / r - 1) * r; + SORT_TYPE *dst1 = dst - 1; + size_t p, m, q, q1, p0; + + if (len <= 1) { + return; + } + + if (len <= SMALL_SORT_BND) { + SMALL_SORT(dst, len); + return; + } + + for (p = 2; p <= lr; p += 2) { + dst1 += 2; + + if (SORT_CMP(dst1[0], dst1[-1]) < 0) { + SORT_SWAP(dst1[0], dst1[-1]); + } + + if (p & 2) { + continue; + } + + m = len - p; + q = 2; + + while ((p & q) == 0) { + if (SORT_CMP(dst1[1 - q], dst1[-(int) q]) < 0) { + break; + } + + q *= 2; + } + + if (p & q) { + continue; + } + + if (q < m) { + p0 = len - q; + MERGE_SORT_IN_PLACE_ASWAP(dst + p - q, dst + p0, q); + + for (;;) { + q1 = 2 * q; + + if ((q1 > m) || (p & q1)) { + break; + } + + p0 = len - q1; + MERGE_SORT_IN_PLACE_FRONTMERGE(dst + (p - q1), q, dst + p0 + q, q); + q = q1; + } + + MERGE_SORT_IN_PLACE_BACKMERGE(dst + (len - 1), q, dst1 - q, q); + q *= 2; + } + + q1 = q; + + while (q1 > m) { + q1 /= 2; + } + + while ((q & p) == 0) { + q *= 2; + MERGE_SORT_IN_PLACE_RMERGE(dst + (p - q), q, q / 2, q1); + } + } + + q1 = 0; + + for (q = r; q < lr; q *= 2) { + if ((lr & q) != 0) { + q1 += q; + + if (q1 != q) { + MERGE_SORT_IN_PLACE_RMERGE(dst + (lr - q1), q1, q, r); + } + } + } + + m = len - lr; + MERGE_SORT_IN_PLACE(dst + lr, m); + MERGE_SORT_IN_PLACE_ASWAP(dst, dst + lr, m); + m += MERGE_SORT_IN_PLACE_BACKMERGE(dst + (m - 1), m, dst + (lr - 1), lr - m); + MERGE_SORT_IN_PLACE(dst, m); +} + +/* Standard merge sort */ +void MERGE_SORT_RECURSIVE(SORT_TYPE *newdst, SORT_TYPE *dst, const size_t size) { + const size_t middle = size / 2; + size_t out = 0; + size_t i = 0; + size_t j = middle; + + /* don't bother sorting an array of size <= 1 */ + if (size <= 1) { + return; + } + + if (size <= SMALL_SORT_BND) { + BINARY_INSERTION_SORT(dst, size); + return; + } + + MERGE_SORT_RECURSIVE(newdst, dst, middle); + MERGE_SORT_RECURSIVE(newdst, &dst[middle], size - middle); + + while (out != size) { + if (i < middle) { + if (j < size) { + if (SORT_CMP(dst[i], dst[j]) <= 0) { + newdst[out] = dst[i++]; + } else { + newdst[out] = dst[j++]; + } + } else { + newdst[out] = dst[i++]; + } + } else { + newdst[out] = dst[j++]; + } + + out++; + } + + memcpy(dst, newdst, size * sizeof(SORT_TYPE)); +} + +/* Standard merge sort */ +void MERGE_SORT(SORT_TYPE *dst, const size_t size) { + SORT_TYPE *newdst; + + /* don't bother sorting an array of size <= 1 */ + if (size <= 1) { + return; + } + + if (size <= SMALL_SORT_BND) { + BINARY_INSERTION_SORT(dst, size); + return; + } + + newdst = (SORT_TYPE *) malloc(size * sizeof(SORT_TYPE)); + MERGE_SORT_RECURSIVE(newdst, dst, size); + free(newdst); +} + + +static __inline size_t QUICK_SORT_PARTITION(SORT_TYPE *dst, const size_t left, + const size_t right, const size_t pivot) { + SORT_TYPE value = dst[pivot]; + size_t index = left; + size_t i; + int not_all_same = 0; + /* move the pivot to the right */ + SORT_SWAP(dst[pivot], dst[right]); + + for (i = left; i < right; i++) { + int cmp = SORT_CMP(dst[i], value); + /* check if everything is all the same */ + not_all_same |= cmp; + + if (cmp < 0) { + SORT_SWAP(dst[i], dst[index]); + index++; + } + } + + SORT_SWAP(dst[right], dst[index]); + + /* avoid degenerate case */ + if (not_all_same == 0) { + return SIZE_MAX; + } + + return index; +} + +/* Based on Knuth vol. 3 +static __inline size_t QUICK_SORT_HOARE_PARTITION(SORT_TYPE *dst, const size_t l, + const size_t r, const size_t pivot) { + SORT_TYPE value; + size_t i = l + 1; + size_t j = r; + + if (pivot != l) { + SORT_SWAP(dst[pivot], dst[l]); + } + value = dst[l]; + + while (1) { + while (SORT_CMP(dst[i], value) < 0) { + i++; + } + while (SORT_CMP(value, dst[j]) < 0) { + j--; + } + if (j <= i) { + SORT_SWAP(dst[l], dst[j]); + return j; + } + SORT_SWAP(dst[i], dst[j]); + i++; + j--; + } + return 0; +} +*/ + + +/* Return the median index of the objects at the three indices. */ +static __inline size_t MEDIAN(const SORT_TYPE *dst, const size_t a, const size_t b, + const size_t c) { + const int AB = SORT_CMP(dst[a], dst[b]) < 0; + + if (AB) { + /* a < b */ + const int BC = SORT_CMP(dst[b], dst[c]) < 0; + + if (BC) { + /* a < b < c */ + return b; + } else { + /* a < b, c < b */ + const int AC = SORT_CMP(dst[a], dst[c]) < 0; + + if (AC) { + /* a < c < b */ + return c; + } else { + /* c < a < b */ + return a; + } + } + } else { + /* b < a */ + const int AC = SORT_CMP(dst[a], dst[b]) < 0; + + if (AC) { + /* b < a < c */ + return a; + } else { + /* b < a, c < a */ + const int BC = SORT_CMP(dst[b], dst[c]) < 0; + + if (BC) { + /* b < c < a */ + return c; + } else { + /* c < b < a */ + return b; + } + } + } +} + +static void QUICK_SORT_RECURSIVE(SORT_TYPE *dst, const size_t original_left, + const size_t original_right) { + size_t left; + size_t right; + size_t pivot; + size_t new_pivot; + size_t middle; + int loop_count = 0; + const int max_loops = 64 - CLZ(original_right - original_left); /* ~lg N */ + left = original_left; + right = original_right; + + while (1) { + if (right <= left) { + return; + } + + if ((right - left + 1U) <= SMALL_SORT_BND) { + SMALL_SORT(&dst[left], right - left + 1U); + return; + } + + if (++loop_count >= max_loops) { + /* we have recursed / looped too many times; switch to heap sort */ + HEAP_SORT(&dst[left], right - left + 1U); + return; + } + + /* median of 5 */ + middle = left + ((right - left) >> 1); + pivot = MEDIAN((const SORT_TYPE *) dst, left, middle, right); + pivot = MEDIAN((const SORT_TYPE *) dst, left + ((middle - left) >> 1), pivot, + middle + ((right - middle) >> 1)); + new_pivot = QUICK_SORT_PARTITION(dst, left, right, pivot); + + /* check for partition all equal */ + if (new_pivot == SIZE_MAX) { + return; + } + + /* recurse only on the small part to avoid degenerate stack sizes */ + /* and manually do tail call on the large part */ + if (new_pivot - 1U - left > right - new_pivot - 1U) { + /* left is bigger than right */ + QUICK_SORT_RECURSIVE(dst, new_pivot + 1U, right); + /* tail call for left */ + right = new_pivot - 1U; + } else { + /* right is bigger than left */ + QUICK_SORT_RECURSIVE(dst, left, new_pivot - 1U); + /* tail call for right */ + left = new_pivot + 1U; + } + } +} + +void QUICK_SORT(SORT_TYPE *dst, const size_t size) { + /* don't bother sorting an array of size 1 */ + if (size <= 1) { + return; + } + + QUICK_SORT_RECURSIVE(dst, 0U, size - 1U); +} + + +/* timsort implementation, based on timsort.txt */ + +static __inline void REVERSE_ELEMENTS(SORT_TYPE *dst, size_t start, size_t end) { + while (1) { + if (start >= end) { + return; + } + + SORT_SWAP(dst[start], dst[end]); + start++; + end--; + } +} + +static size_t COUNT_RUN(SORT_TYPE *dst, const size_t start, const size_t size) { + size_t curr; + + if (size - start == 1) { + return 1; + } + + if (start >= size - 2) { + if (SORT_CMP(dst[size - 2], dst[size - 1]) > 0) { + SORT_SWAP(dst[size - 2], dst[size - 1]); + } + + return 2; + } + + curr = start + 2; + + if (SORT_CMP(dst[start], dst[start + 1]) <= 0) { + /* increasing run */ + while (1) { + if (curr == size - 1) { + break; + } + + if (SORT_CMP(dst[curr - 1], dst[curr]) > 0) { + break; + } + + curr++; + } + + return curr - start; + } else { + /* decreasing run */ + while (1) { + if (curr == size - 1) { + break; + } + + if (SORT_CMP(dst[curr - 1], dst[curr]) <= 0) { + break; + } + + curr++; + } + + /* reverse in-place */ + REVERSE_ELEMENTS(dst, start, curr - 1); + return curr - start; + } +} + +static int CHECK_INVARIANT(TIM_SORT_RUN_T *stack, const int stack_curr) { + size_t A, B, C; + + if (stack_curr < 2) { + return 1; + } + + if (stack_curr == 2) { + const size_t A1 = stack[stack_curr - 2].length; + const size_t B1 = stack[stack_curr - 1].length; + + if (A1 <= B1) { + return 0; + } + + return 1; + } + + A = stack[stack_curr - 3].length; + B = stack[stack_curr - 2].length; + C = stack[stack_curr - 1].length; + + if ((A <= B + C) || (B <= C)) { + return 0; + } + + return 1; +} + +typedef struct { + size_t alloc; + SORT_TYPE *storage; +} TEMP_STORAGE_T; + +static void TIM_SORT_RESIZE(TEMP_STORAGE_T *store, const size_t new_size) { + if ((store->storage == NULL) || (store->alloc < new_size)) { + SORT_TYPE *tempstore = (SORT_TYPE *)realloc(store->storage, new_size * sizeof(SORT_TYPE)); + + if (tempstore == NULL) { + fprintf(stderr, "Error allocating temporary storage for tim sort: need %lu bytes", + (unsigned long)(sizeof(SORT_TYPE) * new_size)); + exit(1); + } + + store->storage = tempstore; + store->alloc = new_size; + } +} + + +static size_t TIM_SORT_GALLOP(SORT_TYPE *dst, const size_t size, const SORT_TYPE key, size_t anchor, + int right) { + int last_ofs = 0; + int ofs, max_ofs, ofs_sign, cmp; + size_t l, c, r; + cmp = SORT_CMP(key, dst[anchor]); + + if (cmp < 0 || (!right && cmp == 0)) { + /* short cut */ + if (anchor == 0) { + return 0; + } + + ofs = -1; + ofs_sign = -1; + max_ofs = -anchor; /* ensure anchor+max_ofs is valid idx */ + } else { + if (anchor == size - 1) { + return size; + } + + ofs = 1; + ofs_sign = 1; + max_ofs = size - anchor - 1; + } + + for (;;) { + /* deal with overflow */ + if (max_ofs / ofs <= 1) { + ofs = max_ofs; + + if (ofs < 0) { + cmp = SORT_CMP(key, dst[0]); + + if ((right && cmp < 0) || (!right && cmp <= 0)) { + return 0; + } + } else { + cmp = SORT_CMP(dst[size - 1], key); + + if ((right && cmp <= 0) || (!right && cmp < 0)) { + return size; + } + } + + break; + } + + c = anchor + ofs; + /* right, 0> 1); + cmp = SORT_CMP(key, dst[c]); + + if ((right && cmp < 0) || (!right && cmp <= 0)) { + r = c; + } else { + l = c; + } + } + + return r; +} + + + +static void TIM_SORT_MERGE_LEFT(SORT_TYPE *A_src, SORT_TYPE *B_src, const size_t A, const size_t B, + SORT_TYPE* storage, int *min_gallop_p) { + size_t pdst, pa, pb, k; + int a_count, b_count; + int min_gallop = *min_gallop_p; + SORT_TYPE *dst = A_src; + memcpy(storage, dst, A * sizeof(SORT_TYPE)); + A_src = storage; + pdst = pa = pb = 0; + /* first element must in B, otherwise skipped in the caller */ + dst[pdst++] = B_src[pb++]; + + if (B == 1) { + goto copyA; + } + + for (;;) { + a_count = b_count = 0; + + for (;;) { + if (SORT_CMP(A_src[pa], B_src[pb]) <= 0) { + dst[pdst++] = A_src[pa++]; + ++a_count; + b_count = 0; + + /* No need to check if pa == A because the last element must be in A + * so pb will reach to B first. You can check pa == A-1 and do + * some optimization if you wish.*/ + if (min_gallop <= a_count) { + break; + } + } else { + dst[pdst++] = B_src[pb++]; + ++b_count; + a_count = 0; + + if (pb == B) { + goto copyA; + } + + if (min_gallop <= b_count) { + break; + } + } + } + + ++min_gallop; + + for (;;) { + if (min_gallop != 0) { + min_gallop --; + } + + k = TIM_SORT_GALLOP(&A_src[pa], A - pa, B_src[pb], 0, 1); + memcpy(&dst[pdst], &A_src[pa], k * sizeof(SORT_TYPE)); + pdst += k; + pa += k; + /* now we know the next must be in B */ + dst[pdst++] = B_src[pb++]; + + if (pb == B) { + goto copyA; + } + + if (a_count && k < TIM_SORT_MIN_GALLOP) { + ++min_gallop; + break; + } + + k = TIM_SORT_GALLOP(&B_src[pb], B - pb, A_src[pa], 0, 0); + memmove(&dst[pdst], &B_src[pb], k * sizeof(SORT_TYPE)); + pdst += k; + pb += k; + + if (pb == B) { + goto copyA; + } + + dst[pdst++] = A_src[pa++]; + + if (b_count && k < TIM_SORT_MIN_GALLOP) { + ++min_gallop; + break; + } + } + } + +copyA: + memcpy(&dst[pdst], &A_src[pa], (A - pa) * sizeof(SORT_TYPE)); + *min_gallop_p = min_gallop; + return; +} + + +static void TIM_SORT_MERGE_RIGHT(SORT_TYPE *A_src, SORT_TYPE *B_src, const size_t A, const size_t B, + SORT_TYPE* storage, int *min_gallop_p) { + size_t k; + int pdst, pa, pb, a_count, b_count; + int min_gallop = *min_gallop_p; + SORT_TYPE *dst = A_src; + pa = A - 1; + pb = B - 1; + pdst = A + B - 1; + memcpy(storage, B_src, B * sizeof(SORT_TYPE)); + B_src = storage; + /* last element must in A, otherwise skipped in the caller */ + dst[pdst--] = A_src[pa--]; + + if (A == 1) { + goto copyB; + } + + for (;;) { + a_count = b_count = 0; + + for (;;) { + if (SORT_CMP(A_src[pa], B_src[pb]) <= 0) { + dst[pdst--] = B_src[pb--]; + ++b_count; + a_count = 0; + + if (min_gallop <= b_count) { + break; + } + + /* No need to check if pb == -1 because the first element must be in B + * so pa will reach to -1 first. You can check pb == 0 and do + * some optimization if you wish.*/ + } else { + dst[pdst--] = A_src[pa--]; + ++a_count; + b_count = 0; + + if (pa == -1) { + goto copyB; + } + + if (min_gallop <= a_count) { + break; + } + } + } + + ++min_gallop; + + for (;;) { + if (min_gallop != 0) { + min_gallop --; + } + + k = TIM_SORT_GALLOP(A_src, pa + 1, B_src[pb], pa, 1); + /* Understand the margin by considering k==0 */ + memmove(&dst[pb + k + 1], &A_src[k], (pa + 1 - k) * sizeof(SORT_TYPE)); + pdst = pb + k; + pa = k - 1; + + if (pa == -1) { + goto copyB; + } + + /* now we know the next must be in B */ + dst[pdst--] = B_src[pb--]; + + if (a_count && pa + 1 - k < TIM_SORT_MIN_GALLOP) { + ++min_gallop; + break; + } + + k = TIM_SORT_GALLOP(B_src, pb + 1, A_src[pa], pb, 0); + memcpy(&dst[pa + k + 1], &B_src[k], (pb + 1 - k) * sizeof(SORT_TYPE)); + pdst = pa + k; + pb = k - 1; + dst[pdst--] = A_src[pa--]; + + if (pa == -1) { + goto copyB; + } + + if (b_count && pb + 1 - k < TIM_SORT_MIN_GALLOP) { + ++min_gallop; + break; + } + } + } + +copyB: + memcpy(dst, B_src, (pb + 1) * sizeof(SORT_TYPE)); + *min_gallop_p = min_gallop; + return; +} + + +static void TIM_SORT_MERGE(SORT_TYPE *dst, const TIM_SORT_RUN_T *stack, const int stack_curr, + TEMP_STORAGE_T *store, int* min_gallop_p) { + size_t A = stack[stack_curr - 2].length; + size_t B = stack[stack_curr - 1].length; + size_t A_start = stack[stack_curr - 2].start; + size_t B_start = stack[stack_curr - 1].start; + SORT_TYPE *storage; + size_t k; + /* A[k-1] <= B[0] < A[k] */ + k = TIM_SORT_GALLOP(&dst[A_start], A, dst[B_start], 0, 1); + A_start += k; + A -= k; + + if (A == 0) { + *min_gallop_p /= 2; + return; + } + + /* B[k-1] < A[A-1] <= B[k] */ + k = TIM_SORT_GALLOP(&dst[B_start], B, dst[B_start - 1], B - 1, 0); + B = k; + TIM_SORT_RESIZE(store, MIN(A, B)); + storage = store->storage; + + if (A < B) { + TIM_SORT_MERGE_LEFT(&dst[A_start], &dst[B_start], A, B, storage, min_gallop_p); + } else { + TIM_SORT_MERGE_RIGHT(&dst[A_start], &dst[B_start], A, B, storage, min_gallop_p); + } +} + +static int TIM_SORT_COLLAPSE(SORT_TYPE *dst, TIM_SORT_RUN_T *stack, int stack_curr, + TEMP_STORAGE_T *store, const size_t size, int* min_gallop_p) { + while (1) { + size_t A, B, C, D; + int ABC, BCD, CD; + + /* if the stack only has one thing on it, we are done with the collapse */ + if (stack_curr <= 1) { + break; + } + + /* if this is the last merge, just do it */ + if ((stack_curr == 2) && (stack[0].length + stack[1].length == size)) { + TIM_SORT_MERGE(dst, stack, stack_curr, store, min_gallop_p); + stack[0].length += stack[1].length; + stack_curr--; + break; + } + /* check if the invariant is off for a stack of 2 elements */ + else if ((stack_curr == 2) && (stack[0].length <= stack[1].length)) { + TIM_SORT_MERGE(dst, stack, stack_curr, store, min_gallop_p); + stack[0].length += stack[1].length; + stack_curr--; + break; + } else if (stack_curr == 2) { + break; + } + + B = stack[stack_curr - 3].length; + C = stack[stack_curr - 2].length; + D = stack[stack_curr - 1].length; + + if (stack_curr >= 4) { + A = stack[stack_curr - 4].length; + ABC = (A <= B + C); + } else { + ABC = 0; + } + + BCD = (B <= C + D) || ABC; + CD = (C <= D); + + /* Both invariants are good */ + if (!BCD && !CD) { + break; + } + + /* left merge */ + if (BCD && !CD) { + TIM_SORT_MERGE(dst, stack, stack_curr - 1, store, min_gallop_p); + stack[stack_curr - 3].length += stack[stack_curr - 2].length; + stack[stack_curr - 2] = stack[stack_curr - 1]; + stack_curr--; + } else { + /* right merge */ + TIM_SORT_MERGE(dst, stack, stack_curr, store, min_gallop_p); + stack[stack_curr - 2].length += stack[stack_curr - 1].length; + stack_curr--; + } + } + + return stack_curr; +} + +static __inline int PUSH_NEXT(SORT_TYPE *dst, + const size_t size, + TEMP_STORAGE_T *store, + const size_t minrun, + TIM_SORT_RUN_T *run_stack, + size_t *stack_curr, + size_t *curr, + int *min_gallop_p) { + size_t len = COUNT_RUN(dst, *curr, size); + size_t run = minrun; + + if (run > size - *curr) { + run = size - *curr; + } + + if (run > len) { + BINARY_INSERTION_SORT_START(&dst[*curr], len, run); + len = run; + } + + run_stack[*stack_curr].start = *curr; + run_stack[*stack_curr].length = len; + (*stack_curr)++; + *curr += len; + + if (*curr == size) { + /* finish up */ + while (*stack_curr > 1) { + TIM_SORT_MERGE(dst, run_stack, *stack_curr, store, min_gallop_p); + run_stack[*stack_curr - 2].length += run_stack[*stack_curr - 1].length; + (*stack_curr)--; + } + + if (store->storage != NULL) { + free(store->storage); + store->storage = NULL; + } + + return 0; + } + + return 1; +} + +void TIM_SORT(SORT_TYPE *dst, const size_t size) { + size_t minrun; + TEMP_STORAGE_T _store, *store; + TIM_SORT_RUN_T run_stack[TIM_SORT_STACK_SIZE]; + size_t stack_curr = 0; + size_t curr = 0; + int min_gallop = TIM_SORT_MIN_GALLOP; + + /* don't bother sorting an array of size 1 */ + if (size <= 1) { + return; + } + + if (size < 64) { + SMALL_SORT(dst, size); + return; + } + + /* compute the minimum run length */ + minrun = compute_minrun(size); + /* temporary storage for merges */ + store = &_store; + store->alloc = 0; + store->storage = NULL; + + if (!PUSH_NEXT(dst, size, store, minrun, run_stack, &stack_curr, &curr, &min_gallop)) { + return; + } + + if (!PUSH_NEXT(dst, size, store, minrun, run_stack, &stack_curr, &curr, &min_gallop)) { + return; + } + + if (!PUSH_NEXT(dst, size, store, minrun, run_stack, &stack_curr, &curr, &min_gallop)) { + return; + } + + while (1) { + if (!CHECK_INVARIANT(run_stack, stack_curr)) { + stack_curr = TIM_SORT_COLLAPSE(dst, run_stack, stack_curr, store, size, &min_gallop); + continue; + } + + if (!PUSH_NEXT(dst, size, store, minrun, run_stack, &stack_curr, &curr, &min_gallop)) { + return; + } + } +} + +/* heap sort: based on wikipedia */ + +static __inline void HEAP_SIFT_DOWN(SORT_TYPE *dst, const size_t start, const size_t end) { + size_t root = start; + + while ((root << 1) <= end) { + size_t child = root << 1; + + if ((child < end) && (SORT_CMP(dst[child], dst[child + 1]) < 0)) { + child++; + } + + if (SORT_CMP(dst[root], dst[child]) < 0) { + SORT_SWAP(dst[root], dst[child]); + root = child; + } else { + return; + } + } +} + +static __inline void HEAPIFY(SORT_TYPE *dst, const size_t size) { + size_t start = size >> 1; + + while (1) { + HEAP_SIFT_DOWN(dst, start, size - 1); + + if (start == 0) { + break; + } + + start--; + } +} + +void HEAP_SORT(SORT_TYPE *dst, const size_t size) { + size_t end = size - 1; + + /* don't bother sorting an array of size <= 1 */ + if (size <= 1) { + return; + } + + HEAPIFY(dst, size); + + while (end > 0) { + SORT_SWAP(dst[end], dst[0]); + HEAP_SIFT_DOWN(dst, 0, end - 1); + end--; + } +} + +/********* Sqrt sorting *********************************/ +/* */ +/* (c) 2014 by Andrey Astrelin */ +/* */ +/* */ +/* Stable sorting that works in O(N*log(N)) worst time */ +/* and uses O(sqrt(N)) extra memory */ +/* */ +/* Define SORT_TYPE and SORT_CMP */ +/* and then call SqrtSort() function */ +/* */ +/*********************************************************/ + +#define SORT_CMP_A(a,b) SORT_CMP(*(a),*(b)) + +static __inline void SQRT_SORT_SWAP_1(SORT_TYPE *a, SORT_TYPE *b) { + SORT_TYPE c = *a; + *a++ = *b; + *b++ = c; +} + +static __inline void SQRT_SORT_SWAP_N(SORT_TYPE *a, SORT_TYPE *b, int n) { + while (n--) { + SQRT_SORT_SWAP_1(a++, b++); + } +} + + +static void SQRT_SORT_MERGE_RIGHT(SORT_TYPE *arr, int L1, int L2, int M) { + int p0 = L1 + L2 + M - 1, p2 = L1 + L2 - 1, p1 = L1 - 1; + + while (p1 >= 0) { + if (p2 < L1 || SORT_CMP_A(arr + p1, arr + p2) > 0) { + arr[p0--] = arr[p1--]; + } else { + arr[p0--] = arr[p2--]; + } + } + + if (p2 != p0) while (p2 >= L1) { + arr[p0--] = arr[p2--]; + } +} + +/* arr[M..-1] - free, arr[0,L1-1]++arr[L1,L1+L2-1] -> arr[M,M+L1+L2-1] */ +static void SQRT_SORT_MERGE_LEFT_WITH_X_BUF(SORT_TYPE *arr, int L1, int L2, int M) { + int p0 = 0, p1 = L1; + L2 += L1; + + while (p1 < L2) { + if (p0 == L1 || SORT_CMP_A(arr + p0, arr + p1) > 0) { + arr[M++] = arr[p1++]; + } else { + arr[M++] = arr[p0++]; + } + } + + if (M != p0) while (p0 < L1) { + arr[M++] = arr[p0++]; + } +} + +/* arr[0,L1-1] ++ arr2[0,L2-1] -> arr[-L1,L2-1], arr2 is "before" arr1 */ +static void SQRT_SORT_MERGE_DOWN(SORT_TYPE *arr, SORT_TYPE *arr2, int L1, int L2) { + int p0 = 0, p1 = 0, M = -L2; + + while (p1 < L2) { + if (p0 == L1 || SORT_CMP_A(arr + p0, arr2 + p1) >= 0) { + arr[M++] = arr2[p1++]; + } else { + arr[M++] = arr[p0++]; + } + } + + if (M != p0) while (p0 < L1) { + arr[M++] = arr[p0++]; + } +} + +static void SQRT_SORT_SMART_MERGE_WITH_X_BUF(SORT_TYPE *arr, int *alen1, int *atype, int len2, + int lkeys) { + int p0 = -lkeys, p1 = 0, p2 = *alen1, q1 = p2, q2 = p2 + len2; + int ftype = 1 - *atype; /* 1 if inverted */ + + while (p1 < q1 && p2 < q2) { + if (SORT_CMP_A(arr + p1, arr + p2) - ftype < 0) { + arr[p0++] = arr[p1++]; + } else { + arr[p0++] = arr[p2++]; + } + } + + if (p1 < q1) { + *alen1 = q1 - p1; + + while (p1 < q1) { + arr[--q2] = arr[--q1]; + } + } else { + *alen1 = q2 - p2; + *atype = ftype; + } +} + + +/* + arr - starting array. arr[-lblock..-1] - buffer (if havebuf). + lblock - length of regular blocks. First nblocks are stable sorted by 1st elements and key-coded + keys - arrays of keys, in same order as blocks. key0, nblock2=0 is possible. +*/ +static void SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF(int *keys, int midkey, SORT_TYPE *arr, + int nblock, int lblock, int nblock2, int llast) { + int l, prest, lrest, frest, pidx, cidx, fnext; + + if (nblock == 0) { + l = nblock2 * lblock; + SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr, l, llast, -lblock); + return; + } + + lrest = lblock; + frest = keys[0] < midkey ? 0 : 1; + pidx = lblock; + + for (cidx = 1; cidx < nblock; cidx++, pidx += lblock) { + prest = pidx - lrest; + fnext = keys[cidx] < midkey ? 0 : 1; + + if (fnext == frest) { + memcpy(arr + prest - lblock, arr + prest, lrest * sizeof(SORT_TYPE)); + prest = pidx; + lrest = lblock; + } else { + SQRT_SORT_SMART_MERGE_WITH_X_BUF(arr + prest, &lrest, &frest, lblock, lblock); + } + } + + prest = pidx - lrest; + + if (llast) { + if (frest) { + memcpy(arr + prest - lblock, arr + prest, lrest * sizeof(SORT_TYPE)); + prest = pidx; + lrest = lblock * nblock2; + frest = 0; + } else { + lrest += lblock * nblock2; + } + + SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr + prest, lrest, llast, -lblock); + } else { + memcpy(arr + prest - lblock, arr + prest, lrest * sizeof(SORT_TYPE)); + } +} + +/* + build blocks of length K + input: [-K,-1] elements are buffer + output: first K elements are buffer, blocks 2*K and last subblock sorted +*/ +static void SQRT_SORT_BUILD_BLOCKS(SORT_TYPE *arr, int L, int K) { + int m, u, h, p0, p1, rest, restk, p; + + for (m = 1; m < L; m += 2) { + u = 0; + + if (SORT_CMP_A(arr + (m - 1), arr + m) > 0) { + u = 1; + } + + arr[m - 3] = arr[m - 1 + u]; + arr[m - 2] = arr[m - u]; + } + + if (L % 2) { + arr[L - 3] = arr[L - 1]; + } + + arr -= 2; + + for (h = 2; h < K; h *= 2) { + p0 = 0; + p1 = L - 2 * h; + + while (p0 <= p1) { + SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr + p0, h, h, -h); + p0 += 2 * h; + } + + rest = L - p0; + + if (rest > h) { + SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr + p0, h, rest - h, -h); + } else { + for (; p0 < L; p0++) { + arr[p0 - h] = arr[p0]; + } + } + + arr -= h; + } + + restk = L % (2 * K); + p = L - restk; + + if (restk <= K) { + memcpy(arr + p + K, arr + p, restk * sizeof(SORT_TYPE)); + } else { + SQRT_SORT_MERGE_RIGHT(arr + p, K, restk - K, K); + } + + while (p > 0) { + p -= 2 * K; + SQRT_SORT_MERGE_RIGHT(arr + p, K, K, K); + } +} + + +static void SQRT_SORT_SORT_INS(SORT_TYPE *arr, int len) { + int i, j; + + for (i = 1; i < len; i++) { + for (j = i - 1; j >= 0 && SORT_CMP_A(arr + (j + 1), arr + j) < 0; j--) { + SQRT_SORT_SWAP_1(arr + j, arr + (j + 1)); + } + } +} + +/* + keys are on the left of arr. Blocks of length LL combined. We'll combine them in pairs + LL and nkeys are powers of 2. (2*LL/lblock) keys are guarantied +*/ +static void SQRT_SORT_COMBINE_BLOCKS(SORT_TYPE *arr, int len, int LL, int lblock, int *tags) { + int M, b, NBlk, midkey, lrest, u, i, p, v, kc, nbl2, llast; + SORT_TYPE *arr1; + M = len / (2 * LL); + lrest = len % (2 * LL); + + if (lrest <= LL) { + len -= lrest; + lrest = 0; + } + + for (b = 0; b <= M; b++) { + if (b == M && lrest == 0) { + break; + } + + arr1 = arr + b * 2 * LL; + NBlk = (b == M ? lrest : 2 * LL) / lblock; + u = NBlk + (b == M ? 1 : 0); + + for (i = 0; i <= u; i++) { + tags[i] = i; + } + + midkey = LL / lblock; + + for (u = 1; u < NBlk; u++) { + p = u - 1; + + for (v = u; v < NBlk; v++) { + kc = SORT_CMP_A(arr1 + p * lblock, arr1 + v * lblock); + + if (kc > 0 || (kc == 0 && tags[p] > tags[v])) { + p = v; + } + } + + if (p != u - 1) { + SQRT_SORT_SWAP_N(arr1 + (u - 1)*lblock, arr1 + p * lblock, lblock); + i = tags[u - 1]; + tags[u - 1] = tags[p]; + tags[p] = i; + } + } + + nbl2 = llast = 0; + + if (b == M) { + llast = lrest % lblock; + } + + if (llast != 0) { + while (nbl2 < NBlk && SORT_CMP_A(arr1 + NBlk * lblock, arr1 + (NBlk - nbl2 - 1)*lblock) < 0) { + nbl2++; + } + } + + SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF(tags, midkey, arr1, NBlk - nbl2, lblock, nbl2, llast); + } + + for (p = len; --p >= 0;) { + arr[p] = arr[p - lblock]; + } +} + + +static void SQRT_SORT_COMMON_SORT(SORT_TYPE *arr, int Len, SORT_TYPE *extbuf, int *Tags) { + int lblock, cbuf; + + if (Len < 16) { + SQRT_SORT_SORT_INS(arr, Len); + return; + } + + lblock = 1; + + while (lblock * lblock < Len) { + lblock *= 2; + } + + memcpy(extbuf, arr, lblock * sizeof(SORT_TYPE)); + SQRT_SORT_COMMON_SORT(extbuf, lblock, arr, Tags); + SQRT_SORT_BUILD_BLOCKS(arr + lblock, Len - lblock, lblock); + cbuf = lblock; + + while (Len > (cbuf *= 2)) { + SQRT_SORT_COMBINE_BLOCKS(arr + lblock, Len - lblock, cbuf, lblock, Tags); + } + + SQRT_SORT_MERGE_DOWN(arr + lblock, extbuf, Len - lblock, lblock); +} + +static void SQRT_SORT(SORT_TYPE *arr, size_t Len) { + int L = 1; + SORT_TYPE *ExtBuf; + int *Tags; + int NK; + + while (L * L < Len) { + L *= 2; + } + + NK = (Len - 1) / L + 2; + ExtBuf = (SORT_TYPE*)malloc(L * sizeof(SORT_TYPE)); + + if (ExtBuf == NULL) { + return; /* fail */ + } + + Tags = (int*)malloc(NK * sizeof(int)); + + if (Tags == NULL) { + return; + } + + SQRT_SORT_COMMON_SORT(arr, Len, ExtBuf, Tags); + free(Tags); + free(ExtBuf); +} + +/********* Grail sorting *********************************/ +/* */ +/* (c) 2013 by Andrey Astrelin */ +/* */ +/* */ +/* Stable sorting that works in O(N*log(N)) worst time */ +/* and uses O(1) extra memory */ +/* */ +/* Define SORT_TYPE and SORT_CMP */ +/* and then call GrailSort() function */ +/* */ +/* For sorting with fixed external buffer (512 items) */ +/* use GrailSortWithBuffer() */ +/* */ +/* For sorting with dynamic external buffer (O(sqrt(N)) items) */ +/* use GrailSortWithDynBuffer() */ +/* */ +/* Also classic in-place merge sort is implemented */ +/* under the name of RecStableSort() */ +/* */ +/*********************************************************/ + +#define GRAIL_EXT_BUFFER_LENGTH 512 + +static __inline void GRAIL_SWAP1(SORT_TYPE *a, SORT_TYPE *b) { + SORT_TYPE c = *a; + *a = *b; + *b = c; +} + +static __inline void GRAIL_SWAP_N(SORT_TYPE *a, SORT_TYPE *b, int n) { + while (n--) { + GRAIL_SWAP1(a++, b++); + } +} + +static void GRAIL_ROTATE(SORT_TYPE *a, int l1, int l2) { + while (l1 && l2) { + if (l1 <= l2) { + GRAIL_SWAP_N(a, a + l1, l1); + a += l1; + l2 -= l1; + } else { + GRAIL_SWAP_N(a + (l1 - l2), a + l1, l2); + l1 -= l2; + } + } +} + +static int GRAIL_BIN_SEARCH_LEFT(SORT_TYPE *arr, int len, SORT_TYPE *key) { + int a = -1, b = len, c; + + while (a < b - 1) { + c = a + ((b - a) >> 1); + + if (SORT_CMP_A(arr + c, key) >= 0) { + b = c; + } else { + a = c; + } + } + + return b; +} +static int GRAIL_BIN_SEARCH_RIGHT(SORT_TYPE *arr, int len, SORT_TYPE *key) { + int a = -1, b = len, c; + + while (a < b - 1) { + c = a + ((b - a) >> 1); + + if (SORT_CMP_A(arr + c, key) > 0) { + b = c; + } else { + a = c; + } + } + + return b; +} + +/* cost: 2*len+nk^2/2 */ +static int GRAIL_FIND_KEYS(SORT_TYPE *arr, int len, int nkeys) { + int h = 1, h0 = 0; /* first key is always here */ + int u = 1, r; + + while (u < len && h < nkeys) { + r = GRAIL_BIN_SEARCH_LEFT(arr + h0, h, arr + u); + + if (r == h || SORT_CMP_A(arr + u, arr + (h0 + r)) != 0) { + GRAIL_ROTATE(arr + h0, h, u - (h0 + h)); + h0 = u - h; + GRAIL_ROTATE(arr + (h0 + r), h - r, 1); + h++; + } + + u++; + } + + GRAIL_ROTATE(arr, h0, h); + return h; +} + +/* cost: min(L1,L2)^2+max(L1,L2) */ +static void GRAIL_MERGE_WITHOUT_BUFFER(SORT_TYPE *arr, int len1, int len2) { + int h; + + if (len1 < len2) { + while (len1) { + h = GRAIL_BIN_SEARCH_LEFT(arr + len1, len2, arr); + + if (h != 0) { + GRAIL_ROTATE(arr, len1, h); + arr += h; + len2 -= h; + } + + if (len2 == 0) { + break; + } + + do { + arr++; + len1--; + } while (len1 && SORT_CMP_A(arr, arr + len1) <= 0); + } + } else { + while (len2) { + h = GRAIL_BIN_SEARCH_RIGHT(arr, len1, arr + (len1 + len2 - 1)); + + if (h != len1) { + GRAIL_ROTATE(arr + h, len1 - h, len2); + len1 = h; + } + + if (len1 == 0) { + break; + } + + do { + len2--; + } while (len2 && SORT_CMP_A(arr + len1 - 1, arr + len1 + len2 - 1) <= 0); + } + } +} + +/* arr[M..-1] - buffer, arr[0,L1-1]++arr[L1,L1+L2-1] -> arr[M,M+L1+L2-1] */ +static void GRAIL_MERGE_LEFT(SORT_TYPE *arr, int L1, int L2, int M) { + int p0 = 0, p1 = L1; + L2 += L1; + + while (p1 < L2) { + if (p0 == L1 || SORT_CMP_A(arr + p0, arr + p1) > 0) { + GRAIL_SWAP1(arr + (M++), arr + (p1++)); + } else { + GRAIL_SWAP1(arr + (M++), arr + (p0++)); + } + } + + if (M != p0) { + GRAIL_SWAP_N(arr + M, arr + p0, L1 - p0); + } +} +static void GRAIL_MERGE_RIGHT(SORT_TYPE *arr, int L1, int L2, int M) { + int p0 = L1 + L2 + M - 1, p2 = L1 + L2 - 1, p1 = L1 - 1; + + while (p1 >= 0) { + if (p2 < L1 || SORT_CMP_A(arr + p1, arr + p2) > 0) { + GRAIL_SWAP1(arr + (p0--), arr + (p1--)); + } else { + GRAIL_SWAP1(arr + (p0--), arr + (p2--)); + } + } + + if (p2 != p0) while (p2 >= L1) { + GRAIL_SWAP1(arr + (p0--), arr + (p2--)); + } +} + +static void GRAIL_SMART_MERGE_WITH_BUFFER(SORT_TYPE *arr, int *alen1, int *atype, int len2, + int lkeys) { + int p0 = -lkeys, p1 = 0, p2 = *alen1, q1 = p2, q2 = p2 + len2; + int ftype = 1 - *atype; /* 1 if inverted */ + + while (p1 < q1 && p2 < q2) { + if (SORT_CMP_A(arr + p1, arr + p2) - ftype < 0) { + GRAIL_SWAP1(arr + (p0++), arr + (p1++)); + } else { + GRAIL_SWAP1(arr + (p0++), arr + (p2++)); + } + } + + if (p1 < q1) { + *alen1 = q1 - p1; + + while (p1 < q1) { + GRAIL_SWAP1(arr + (--q1), arr + (--q2)); + } + } else { + *alen1 = q2 - p2; + *atype = ftype; + } +} +static void GRAIL_SMART_MERGE_WITHOUT_BUFFER(SORT_TYPE *arr, int *alen1, int *atype, int _len2) { + int len1, len2, ftype, h; + + if (!_len2) { + return; + } + + len1 = *alen1; + len2 = _len2; + ftype = 1 - *atype; + + if (len1 && SORT_CMP_A(arr + (len1 - 1), arr + len1) - ftype >= 0) { + while (len1) { + h = ftype ? GRAIL_BIN_SEARCH_LEFT(arr + len1, len2, arr) : GRAIL_BIN_SEARCH_RIGHT(arr + len1, len2, + arr); + + if (h != 0) { + GRAIL_ROTATE(arr, len1, h); + arr += h; + len2 -= h; + } + + if (len2 == 0) { + *alen1 = len1; + return; + } + + do { + arr++; + len1--; + } while (len1 && SORT_CMP_A(arr, arr + len1) - ftype < 0); + } + } + + *alen1 = len2; + *atype = ftype; +} + +/***** Sort With Extra Buffer *****/ + +/* arr[M..-1] - free, arr[0,L1-1]++arr[L1,L1+L2-1] -> arr[M,M+L1+L2-1] */ +static void GRAIL_MERGE_LEFT_WITH_X_BUF(SORT_TYPE *arr, int L1, int L2, int M) { + int p0 = 0, p1 = L1; + L2 += L1; + + while (p1 < L2) { + if (p0 == L1 || SORT_CMP_A(arr + p0, arr + p1) > 0) { + arr[M++] = arr[p1++]; + } else { + arr[M++] = arr[p0++]; + } + } + + if (M != p0) while (p0 < L1) { + arr[M++] = arr[p0++]; + } +} + +static void GRAIL_SMART_MERGE_WITH_X_BUF(SORT_TYPE *arr, int *alen1, int *atype, int len2, + int lkeys) { + int p0 = -lkeys, p1 = 0, p2 = *alen1, q1 = p2, q2 = p2 + len2; + int ftype = 1 - *atype; /* 1 if inverted */ + + while (p1 < q1 && p2 < q2) { + if (SORT_CMP_A(arr + p1, arr + p2) - ftype < 0) { + arr[p0++] = arr[p1++]; + } else { + arr[p0++] = arr[p2++]; + } + } + + if (p1 < q1) { + *alen1 = q1 - p1; + + while (p1 < q1) { + arr[--q2] = arr[--q1]; + } + } else { + *alen1 = q2 - p2; + *atype = ftype; + } +} + +/* + arr - starting array. arr[-lblock..-1] - buffer (if havebuf). + lblock - length of regular blocks. First nblocks are stable sorted by 1st elements and key-coded + keys - arrays of keys, in same order as blocks. key0, nblock2=0 is possible. +*/ +static void GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF(SORT_TYPE *keys, SORT_TYPE *midkey, SORT_TYPE *arr, + int nblock, int lblock, int nblock2, int llast) { + int l, prest, lrest, frest, pidx, cidx, fnext; + + if (nblock == 0) { + l = nblock2 * lblock; + GRAIL_MERGE_LEFT_WITH_X_BUF(arr, l, llast, -lblock); + return; + } + + lrest = lblock; + frest = SORT_CMP_A(keys, midkey) < 0 ? 0 : 1; + pidx = lblock; + + for (cidx = 1; cidx < nblock; cidx++, pidx += lblock) { + prest = pidx - lrest; + fnext = SORT_CMP_A(keys + cidx, midkey) < 0 ? 0 : 1; + + if (fnext == frest) { + memcpy(arr + prest - lblock, arr + prest, lrest * sizeof(SORT_TYPE)); + prest = pidx; + lrest = lblock; + } else { + GRAIL_SMART_MERGE_WITH_X_BUF(arr + prest, &lrest, &frest, lblock, lblock); + } + } + + prest = pidx - lrest; + + if (llast) { + if (frest) { + memcpy(arr + prest - lblock, arr + prest, lrest * sizeof(SORT_TYPE)); + prest = pidx; + lrest = lblock * nblock2; + frest = 0; + } else { + lrest += lblock * nblock2; + } + + GRAIL_MERGE_LEFT_WITH_X_BUF(arr + prest, lrest, llast, -lblock); + } else { + memcpy(arr + prest - lblock, arr + prest, lrest * sizeof(SORT_TYPE)); + } +} + +/***** End Sort With Extra Buffer *****/ + +/* + build blocks of length K + input: [-K,-1] elements are buffer + output: first K elements are buffer, blocks 2*K and last subblock sorted +*/ +static void GRAIL_BUILD_BLOCKS(SORT_TYPE *arr, int L, int K, SORT_TYPE *extbuf, int LExtBuf) { + int m, u, h, p0, p1, rest, restk, p, kbuf; + kbuf = K < LExtBuf ? K : LExtBuf; + + while (kbuf & (kbuf - 1)) { + kbuf &= kbuf - 1; /* max power or 2 - just in case */ + } + + if (kbuf) { + memcpy(extbuf, arr - kbuf, kbuf * sizeof(SORT_TYPE)); + + for (m = 1; m < L; m += 2) { + u = 0; + + if (SORT_CMP_A(arr + (m - 1), arr + m) > 0) { + u = 1; + } + + arr[m - 3] = arr[m - 1 + u]; + arr[m - 2] = arr[m - u]; + } + + if (L % 2) { + arr[L - 3] = arr[L - 1]; + } + + arr -= 2; + + for (h = 2; h < kbuf; h *= 2) { + p0 = 0; + p1 = L - 2 * h; + + while (p0 <= p1) { + GRAIL_MERGE_LEFT_WITH_X_BUF(arr + p0, h, h, -h); + p0 += 2 * h; + } + + rest = L - p0; + + if (rest > h) { + GRAIL_MERGE_LEFT_WITH_X_BUF(arr + p0, h, rest - h, -h); + } else { + for (; p0 < L; p0++) { + arr[p0 - h] = arr[p0]; + } + } + + arr -= h; + } + + memcpy(arr + L, extbuf, kbuf * sizeof(SORT_TYPE)); + } else { + for (m = 1; m < L; m += 2) { + u = 0; + + if (SORT_CMP_A(arr + (m - 1), arr + m) > 0) { + u = 1; + } + + GRAIL_SWAP1(arr + (m - 3), arr + (m - 1 + u)); + GRAIL_SWAP1(arr + (m - 2), arr + (m - u)); + } + + if (L % 2) { + GRAIL_SWAP1(arr + (L - 1), arr + (L - 3)); + } + + arr -= 2; + h = 2; + } + + for (; h < K; h *= 2) { + p0 = 0; + p1 = L - 2 * h; + + while (p0 <= p1) { + GRAIL_MERGE_LEFT(arr + p0, h, h, -h); + p0 += 2 * h; + } + + rest = L - p0; + + if (rest > h) { + GRAIL_MERGE_LEFT(arr + p0, h, rest - h, -h); + } else { + GRAIL_ROTATE(arr + p0 - h, h, rest); + } + + arr -= h; + } + + restk = L % (2 * K); + p = L - restk; + + if (restk <= K) { + GRAIL_ROTATE(arr + p, restk, K); + } else { + GRAIL_MERGE_RIGHT(arr + p, K, restk - K, K); + } + + while (p > 0) { + p -= 2 * K; + GRAIL_MERGE_RIGHT(arr + p, K, K, K); + } +} + +/* + arr - starting array. arr[-lblock..-1] - buffer (if havebuf). + lblock - length of regular blocks. First nblocks are stable sorted by 1st elements and key-coded + keys - arrays of keys, in same order as blocks. key0, nblock2=0 is possible. +*/ +static void GRAIL_MERGE_BUFFERS_LEFT(SORT_TYPE *keys, SORT_TYPE *midkey, SORT_TYPE *arr, int nblock, + int lblock, int havebuf, int nblock2, int llast) { + int l, prest, lrest, frest, pidx, cidx, fnext; + + if (nblock == 0) { + l = nblock2 * lblock; + + if (havebuf) { + GRAIL_MERGE_LEFT(arr, l, llast, -lblock); + } else { + GRAIL_MERGE_WITHOUT_BUFFER(arr, l, llast); + } + + return; + } + + lrest = lblock; + frest = SORT_CMP_A(keys, midkey) < 0 ? 0 : 1; + pidx = lblock; + + for (cidx = 1; cidx < nblock; cidx++, pidx += lblock) { + prest = pidx - lrest; + fnext = SORT_CMP_A(keys + cidx, midkey) < 0 ? 0 : 1; + + if (fnext == frest) { + if (havebuf) { + GRAIL_SWAP_N(arr + prest - lblock, arr + prest, lrest); + } + + prest = pidx; + lrest = lblock; + } else { + if (havebuf) { + GRAIL_SMART_MERGE_WITH_BUFFER(arr + prest, &lrest, &frest, lblock, lblock); + } else { + GRAIL_SMART_MERGE_WITHOUT_BUFFER(arr + prest, &lrest, &frest, lblock); + } + } + } + + prest = pidx - lrest; + + if (llast) { + if (frest) { + if (havebuf) { + GRAIL_SWAP_N(arr + prest - lblock, arr + prest, lrest); + } + + prest = pidx; + lrest = lblock * nblock2; + frest = 0; + } else { + lrest += lblock * nblock2; + } + + if (havebuf) { + GRAIL_MERGE_LEFT(arr + prest, lrest, llast, -lblock); + } else { + GRAIL_MERGE_WITHOUT_BUFFER(arr + prest, lrest, llast); + } + } else { + if (havebuf) { + GRAIL_SWAP_N(arr + prest, arr + (prest - lblock), lrest); + } + } +} + +static void GRAIL_LAZY_STABLE_SORT(SORT_TYPE *arr, int L) { + int m, h, p0, p1, rest; + + for (m = 1; m < L; m += 2) { + if (SORT_CMP_A(arr + m - 1, arr + m) > 0) { + GRAIL_SWAP1(arr + (m - 1), arr + m); + } + } + + for (h = 2; h < L; h *= 2) { + p0 = 0; + p1 = L - 2 * h; + + while (p0 <= p1) { + GRAIL_MERGE_WITHOUT_BUFFER(arr + p0, h, h); + p0 += 2 * h; + } + + rest = L - p0; + + if (rest > h) { + GRAIL_MERGE_WITHOUT_BUFFER(arr + p0, h, rest - h); + } + } +} + +/* + keys are on the left of arr. Blocks of length LL combined. We'll combine them in pairs + LL and nkeys are powers of 2. (2*LL/lblock) keys are guarantied +*/ +static void GRAIL_COMBINE_BLOCKS(SORT_TYPE *keys, SORT_TYPE *arr, int len, int LL, int lblock, + int havebuf, SORT_TYPE *xbuf) { + int M, b, NBlk, midkey, lrest, u, p, v, kc, nbl2, llast; + SORT_TYPE *arr1; + M = len / (2 * LL); + lrest = len % (2 * LL); + + if (lrest <= LL) { + len -= lrest; + lrest = 0; + } + + if (xbuf) { + memcpy(xbuf, arr - lblock, lblock * sizeof(SORT_TYPE)); + } + + for (b = 0; b <= M; b++) { + if (b == M && lrest == 0) { + break; + } + + arr1 = arr + b * 2 * LL; + NBlk = (b == M ? lrest : 2 * LL) / lblock; + SMALL_SORT(keys, NBlk + (b == M ? 1 : 0)); + midkey = LL / lblock; + + for (u = 1; u < NBlk; u++) { + p = u - 1; + + for (v = u; v < NBlk; v++) { + kc = SORT_CMP_A(arr1 + p * lblock, arr1 + v * lblock); + + if (kc > 0 || (kc == 0 && SORT_CMP_A(keys + p, keys + v) > 0)) { + p = v; + } + } + + if (p != u - 1) { + GRAIL_SWAP_N(arr1 + (u - 1)*lblock, arr1 + p * lblock, lblock); + GRAIL_SWAP1(keys + (u - 1), keys + p); + + if (midkey == u - 1 || midkey == p) { + midkey ^= (u - 1)^p; + } + } + } + + nbl2 = llast = 0; + + if (b == M) { + llast = lrest % lblock; + } + + if (llast != 0) { + while (nbl2 < NBlk && SORT_CMP_A(arr1 + NBlk * lblock, arr1 + (NBlk - nbl2 - 1)*lblock) < 0) { + nbl2++; + } + } + + if (xbuf) { + GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF(keys, keys + midkey, arr1, NBlk - nbl2, lblock, nbl2, llast); + } else { + GRAIL_MERGE_BUFFERS_LEFT(keys, keys + midkey, arr1, NBlk - nbl2, lblock, havebuf, nbl2, llast); + } + } + + if (xbuf) { + for (p = len; --p >= 0;) { + arr[p] = arr[p - lblock]; + } + + memcpy(arr - lblock, xbuf, lblock * sizeof(SORT_TYPE)); + } else if (havebuf) while (--len >= 0) { + GRAIL_SWAP1(arr + len, arr + len - lblock); + } +} + + +static void GRAIL_COMMON_SORT(SORT_TYPE *arr, int Len, SORT_TYPE *extbuf, int LExtBuf) { + int lblock, nkeys, findkeys, ptr, cbuf, lb, nk; + int havebuf, chavebuf; + long long s; + + if (Len <= SMALL_SORT_BND) { + SMALL_SORT(arr, Len); + return; + } + + lblock = 1; + + while (lblock * lblock < Len) { + lblock *= 2; + } + + nkeys = (Len - 1) / lblock + 1; + findkeys = GRAIL_FIND_KEYS(arr, Len, nkeys + lblock); + havebuf = 1; + + if (findkeys < nkeys + lblock) { + if (findkeys < 4) { + GRAIL_LAZY_STABLE_SORT(arr, Len); + return; + } + + nkeys = lblock; + + while (nkeys > findkeys) { + nkeys /= 2; + } + + havebuf = 0; + lblock = 0; + } + + ptr = lblock + nkeys; + cbuf = havebuf ? lblock : nkeys; + + if (havebuf) { + GRAIL_BUILD_BLOCKS(arr + ptr, Len - ptr, cbuf, extbuf, LExtBuf); + } else { + GRAIL_BUILD_BLOCKS(arr + ptr, Len - ptr, cbuf, NULL, 0); + } + + /* 2*cbuf are built */ + while (Len - ptr > (cbuf *= 2)) { + lb = lblock; + chavebuf = havebuf; + + if (!havebuf) { + if (nkeys > 4 && nkeys / 8 * nkeys >= cbuf) { + lb = nkeys / 2; + chavebuf = 1; + } else { + nk = 1; + s = (long long)cbuf * findkeys / 2; + + while (nk < nkeys && s != 0) { + nk *= 2; + s /= 8; + } + + lb = (2 * cbuf) / nk; + } + } + + GRAIL_COMBINE_BLOCKS(arr, arr + ptr, Len - ptr, cbuf, lb, chavebuf, chavebuf + && lb <= LExtBuf ? extbuf : NULL); + } + + SMALL_SORT(arr, ptr); + GRAIL_MERGE_WITHOUT_BUFFER(arr, ptr, Len - ptr); +} + +static void GRAIL_SORT(SORT_TYPE *arr, size_t Len) { + GRAIL_COMMON_SORT(arr, Len, NULL, 0); +} + +static void GRAIL_SORT_FIXED_BUFFER(SORT_TYPE *arr, size_t Len) { + SORT_TYPE ExtBuf[GRAIL_EXT_BUFFER_LENGTH]; + GRAIL_COMMON_SORT(arr, Len, ExtBuf, GRAIL_EXT_BUFFER_LENGTH); +} + +static void GRAIL_SORT_DYN_BUFFER(SORT_TYPE *arr, size_t Len) { + int L = 1; + SORT_TYPE *ExtBuf; + + while (L * L < Len) { + L *= 2; + } + + ExtBuf = (SORT_TYPE*)malloc(L * sizeof(SORT_TYPE)); + + if (ExtBuf == NULL) { + GRAIL_SORT_FIXED_BUFFER(arr, Len); + } else { + GRAIL_COMMON_SORT(arr, Len, ExtBuf, L); + free(ExtBuf); + } +} + +/****** classic MergeInPlace *************/ + +static void GRAIL_REC_MERGE(SORT_TYPE *A, int L1, int L2) { + int K, k1, k2, m1, m2; + + if (L1 < 3 || L2 < 3) { + GRAIL_MERGE_WITHOUT_BUFFER(A, L1, L2); + return; + } + + if (L1 < L2) { + K = L1 + L2 / 2; + } else { + K = L1 / 2; + } + + k1 = k2 = GRAIL_BIN_SEARCH_LEFT(A, L1, A + K); + + if (k2 < L1 && SORT_CMP_A(A + k2, A + K) == 0) { + k2 = GRAIL_BIN_SEARCH_RIGHT(A + k1, L1 - k1, A + K) + k1; + } + + m1 = GRAIL_BIN_SEARCH_LEFT(A + L1, L2, A + K); + m2 = m1; + + if (m2 < L2 && SORT_CMP_A(A + L1 + m2, A + K) == 0) { + m2 = GRAIL_BIN_SEARCH_RIGHT(A + L1 + m1, L2 - m1, A + K) + m1; + } + + if (k1 == k2) { + GRAIL_ROTATE(A + k2, L1 - k2, m2); + } else { + GRAIL_ROTATE(A + k1, L1 - k1, m1); + + if (m2 != m1) { + GRAIL_ROTATE(A + (k2 + m1), L1 - k2, m2 - m1); + } + } + + GRAIL_REC_MERGE(A + (k2 + m2), L1 - k2, L2 - m2); + GRAIL_REC_MERGE(A, k1, m1); +} + +static void REC_STABLE_SORT(SORT_TYPE *arr, size_t L) { + int m, h, p0, p1, rest; + + for (m = 1; m < L; m += 2) { + if (SORT_CMP_A(arr + m - 1, arr + m) > 0) { + GRAIL_SWAP1(arr + (m - 1), arr + m); + } + } + + for (h = 2; h < L; h *= 2) { + p0 = 0; + p1 = L - 2 * h; + + while (p0 <= p1) { + GRAIL_REC_MERGE(arr + p0, h, h); + p0 += 2 * h; + } + + rest = L - p0; + + if (rest > h) { + GRAIL_REC_MERGE(arr + p0, h, rest - h); + } + } +} + +/* Bubble sort implementation based on Wikipedia article + https://en.wikipedia.org/wiki/Bubble_sort +*/ +void BUBBLE_SORT(SORT_TYPE *dst, const size_t size) { + size_t n = size; + + while (n) { + size_t i, newn = 0U; + + for (i = 1U; i < n; ++i) { + if (SORT_CMP(dst[i - 1U], dst[i]) > 0) { + SORT_SWAP(dst[i - 1U], dst[i]); + newn = i; + } + } + + n = newn; + } +} + +#undef QUICK_SORT +#undef MEDIAN +#undef SORT_CONCAT +#undef SORT_MAKE_STR1 +#undef SORT_MAKE_STR +#undef SORT_NAME +#undef SORT_TYPE +#undef SORT_CMP +#undef TEMP_STORAGE_T +#undef TIM_SORT_RUN_T +#undef PUSH_NEXT +#undef SORT_SWAP +#undef SORT_CONCAT +#undef SORT_MAKE_STR1 +#undef SORT_MAKE_STR +#undef BINARY_INSERTION_FIND +#undef BINARY_INSERTION_SORT_START +#undef BINARY_INSERTION_SORT +#undef REVERSE_ELEMENTS +#undef COUNT_RUN +#undef TIM_SORT +#undef TIM_SORT_RESIZE +#undef TIM_SORT_COLLAPSE +#undef TIM_SORT_RUN_T +#undef TEMP_STORAGE_T +#undef MERGE_SORT +#undef MERGE_SORT_RECURSIVE +#undef MERGE_SORT_IN_PLACE +#undef MERGE_SORT_IN_PLACE_RMERGE +#undef MERGE_SORT_IN_PLACE_BACKMERGE +#undef MERGE_SORT_IN_PLACE_FRONTMERGE +#undef MERGE_SORT_IN_PLACE_ASWAP +#undef GRAIL_SWAP1 +#undef REC_STABLE_SORT +#undef GRAIL_REC_MERGE +#undef GRAIL_SORT_DYN_BUFFER +#undef GRAIL_SORT_FIXED_BUFFER +#undef GRAIL_COMMON_SORT +#undef GRAIL_SORT +#undef GRAIL_COMBINE_BLOCKS +#undef GRAIL_LAZY_STABLE_SORT +#undef GRAIL_MERGE_WITHOUT_BUFFER +#undef GRAIL_ROTATE +#undef GRAIL_BIN_SEARCH_LEFT +#undef GRAIL_BUILD_BLOCKS +#undef GRAIL_FIND_KEYS +#undef GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF +#undef GRAIL_BIN_SEARCH_RIGHT +#undef GRAIL_MERGE_BUFFERS_LEFT +#undef GRAIL_SMART_MERGE_WITH_X_BUF +#undef GRAIL_MERGE_LEFT_WITH_X_BUF +#undef GRAIL_SMART_MERGE_WITHOUT_BUFFER +#undef GRAIL_SMART_MERGE_WITH_BUFFER +#undef GRAIL_MERGE_RIGHT +#undef GRAIL_MERGE_LEFT +#undef GRAIL_SWAP_N +#undef SQRT_SORT +#undef SQRT_SORT_BUILD_BLOCKS +#undef SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF +#undef SQRT_SORT_MERGE_DOWN +#undef SQRT_SORT_MERGE_LEFT_WITH_X_BUF +#undef SQRT_SORT_MERGE_RIGHT +#undef SQRT_SORT_SWAP_N +#undef SQRT_SORT_SWAP_1 +#undef SQRT_SORT_SMART_MERGE_WITH_X_BUF +#undef SQRT_SORT_SORT_INS +#undef SQRT_SORT_COMBINE_BLOCKS +#undef SQRT_SORT_COMMON_SORT +#undef SORT_CMP_A +#undef BUBBLE_SORT