From 6d81a994d8625607e7156a3ba6eb1e7692bbc600 Mon Sep 17 00:00:00 2001
From: Calcitem <calcitem@outlook.com>
Date: Sat, 5 Oct 2019 13:45:43 +0800
Subject: [PATCH] =?UTF-8?q?sort:=20=E5=B0=86=20std::sort=20=E6=8E=92?=
 =?UTF-8?q?=E5=BA=8F=E7=AE=97=E6=B3=95=E6=9B=BF=E6=8D=A2=E4=B8=BA=20sqrt?=
 =?UTF-8?q?=5Fsort=5Fsort=5Fins=20=E7=AE=97=E6=B3=95?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

不需要通过转换为 vector 再调用 std::sort 排序。
自对弈时长由 52s 缩短到 45s。

排序算法代码来源:
https://github.com/swenson/sort
---
 millgame.pro             |    1 +
 millgame.vcxproj         |    1 +
 millgame.vcxproj.filters |    3 +
 src/ai/search.cpp        |   85 +-
 src/ai/search.h          |   41 +
 src/base/sort.h          | 3278 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 3394 insertions(+), 15 deletions(-)
 create mode 100644 src/base/sort.h
diff --git a/millgame.pro b/millgame.pro
index f438d0f0..0e14b2cd 100644
--- a/millgame.pro
+++ b/millgame.pro
@@ -60,6 +60,7 @@ HEADERS  += \
     src/base/hashMap.h \
     src/base/memmgr.h \
     src/base/misc.h \
+    src/base/sort.h \
     src/base/stack.h \
     src/base/thread.h \
     src/ai/search.h \
diff --git a/millgame.vcxproj b/millgame.vcxproj
index df8166ac..0226a197 100644
--- a/millgame.vcxproj
+++ b/millgame.vcxproj
@@ -451,6 +451,7 @@
     <ClInclude Include="src\base\HashNode.h" />
     <ClInclude Include="src\base\memmgr.h" />
     <ClInclude Include="src\base\misc.h" />
+    <ClInclude Include="src\base\sort.h" />
     <ClInclude Include="src\base\stack.h" />
     <QtMoc Include="src\base\thread.h" />
     <ClInclude Include="src\base\zobrist.h" />
diff --git a/millgame.vcxproj.filters b/millgame.vcxproj.filters
index f211d584..c11031cf 100644
--- a/millgame.vcxproj.filters
+++ b/millgame.vcxproj.filters
@@ -126,6 +126,9 @@
     <ClInclude Include="src\base\memmgr.h">
       <Filter>base</Filter>
     </ClInclude>
+    <ClInclude Include="src\base\sort.h">
+      <Filter>base</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <CustomBuild Include="debug\moc_predefs.h.cbt">
diff --git a/src/ai/search.cpp b/src/ai/search.cpp
index 1d101ed9..c0999b19 100644
--- a/src/ai/search.cpp
+++ b/src/ai/search.cpp
@@ -33,6 +33,14 @@
 #include "types.h"
 #include "option.h"
 
+#define SORT_NAME nodep
+#define SORT_TYPE AIAlgorithm::Node*
+#define SORT_CMP(x, y) (-AIAlgorithm::nodeCompare((x), (y)))
+
+player_t gSideToMove;
+
+#include "sort.h"
+
 using namespace CTSL;
 
 // 用于检测重复局面 (Position)
@@ -286,6 +294,58 @@ bool AIAlgorithm::nodeGreater(const Node *first, const Node *second)
 #endif
 }
 
+int AIAlgorithm::nodeCompare(const Node * first, const Node * second)
+{
+    int ret = 0;
+
+    if (gSideToMove == PLAYER_BLACK) {
+        if (first->value > second->value) {
+            ret = 1;
+            goto out;
+        }
+
+        if (first->value < second->value) {
+            ret = -1;
+            goto out;
+        }
+
+        if (first->value == second->value) {
+            if (!first->pruned && second->pruned) {
+                ret = 1;
+                goto out;
+            } else if (first->pruned && !second->pruned) {
+                ret = -1;
+                goto out;
+            }
+        }
+    }
+
+    if (gSideToMove == PLAYER_WHITE) {
+        if (first->value < second->value) {
+            ret = 1;
+            goto out;
+        }
+
+        if (first->value > second->value) {
+            ret = -1;
+            goto out;
+        }
+
+        if (first->value == second->value) {
+            if (!first->pruned && second->pruned) {
+                ret = 1;
+                goto out;
+            } else if (first->pruned && !second->pruned) {
+                ret = -1;
+                goto out;
+            }
+        }
+    }
+
+out:
+    return ret;
+}
+
 void AIAlgorithm::sortMoves(Node *node)
 {
     // 这个函数对效率的影响很大，排序好的话，剪枝较早，节省时间，但不能在此函数耗费太多时间
@@ -297,34 +357,29 @@ void AIAlgorithm::sortMoves(Node *node)
     //#define DEBUG_SORT
 #ifdef DEBUG_SORT
     for (int i = 0; i < node->childrenSize; i++) {
-        loggerDebug("* [%d] %x = %d\n", i, &(node->children[i]), node->children[i]->value);
+        loggerDebug("* [%d] %p: %d = %d (%d)\n",
+                    i, &(node->children[i]), node->children[i]->move, node->children[i]->value, !node->children[i]->pruned);
     }
     loggerDebug("\n");
 #endif
 
-    // TODO: 暂时使用 std::sort 排序, 后续需实现自己的排序函数
+#define NODE_PTR_SORT_FUN(x) nodep_##x
 
-    vector<Node *> vec;
-    vec.reserve(NODE_CHILDREN_SIZE);
+    gSideToMove = tempGame.position.sideToMove; // TODO: 暂时用全局变量
 
-    for (int i = 0; i < node->childrenSize; i++) {
-        vec.push_back(node->children[i]);
-    }
-
-    std::stable_sort(vec.begin(), vec.end(), cmp);
-
-    for (int i = 0; i < node->childrenSize; i++) {
-        node->children[i] = vec[i];
-    }
+    // 此处选用排序算法
+    NODE_PTR_SORT_FUN(sqrt_sort_sort_ins)(node->children, node->childrenSize);
 
 #ifdef DEBUG_SORT
     if (tempGame.position.sideToMove == PLAYER_BLACK) {
         for (int i = 0; i < node->childrenSize; i++) {
-            loggerDebug("+ [%d] %x = %d\n", i, &(node->children[i]), node->children[i]->value);
+            loggerDebug("+ [%d] %p: %d = %d (%d)\n",
+                        i, &(node->children[i]), node->children[i]->move, node->children[i]->value, !node->children[i]->pruned);
         }
      } else {
         for (int i = 0; i < node->childrenSize; i++) {
-            loggerDebug("- [%d] %x = %d\n", i, &(node->children[i]), node->children[i]->value);
+            loggerDebug("- [%d] %p: %d = %d (%d)\n",
+                        i, &(node->children[i]), node->children[i]->move, node->children[i]->value, !node->children[i]->pruned);
         }
     }
     loggerDebug("\n----------------------------------------\n");
diff --git a/src/ai/search.h b/src/ai/search.h
index 40a9633d..a649950a 100644
--- a/src/ai/search.h
+++ b/src/ai/search.h
@@ -100,6 +100,46 @@ public:
 #endif /* TRANSPOSITION_TABLE_ENABLE */
         hash_t hash;                  //  哈希值
 #endif /* DEBUG_AB_TREE */
+
+        bool operator < (const Node &other)
+        {
+            if (value < other.value) {
+                return true;
+            }
+
+            if (value == other.value) {
+                if (pruned && !other.pruned) {
+                    return true;
+                }
+            }
+
+            return false;
+        }
+
+        bool operator > (const Node &other)
+        {
+            if (value > other.value) {
+                return true;
+            }
+
+            if (value == other.value) {
+                if (!pruned && other.pruned) {
+                    return true;
+                }
+            }
+
+            return false;
+        }
+
+        bool operator == (const Node &other)
+        {
+            if (value == other.value &&
+                pruned == other.pruned) {
+                return true;
+            }
+
+            return false;
+        }
     };
 
 #ifdef MEMORY_POOL
@@ -139,6 +179,7 @@ public:
 #ifdef MEMORY_POOL
     static bool nodeLess(const Node *first, const Node *second);
     static bool nodeGreater(const Node *first, const Node *second);
+    static int nodeCompare(const Node *first, const Node *second);
 #endif // MEMORY_POOL
 
 #ifdef ENDGAME_LEARNING
diff --git a/src/base/sort.h b/src/base/sort.h
new file mode 100644
index 00000000..10d82556
--- /dev/null
+++ b/src/base/sort.h
@@ -0,0 +1,3278 @@
+﻿/* Copyright (c) 2010-2019 Christopher Swenson. */
+/* Copyright (c) 2012 Vojtech Fried. */
+/* Copyright (c) 2012 Google Inc. All Rights Reserved. */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#ifndef SORT_NAME
+#error "Must declare SORT_NAME"
+#endif
+
+#ifndef SORT_TYPE
+#error "Must declare SORT_TYPE"
+#endif
+
+#ifndef SORT_CMP
+#define SORT_CMP(x, y)  ((x) < (y) ? -1 : ((y) < (x) ? 1 : 0))
+#endif
+
+#ifndef TIM_SORT_STACK_SIZE
+#define TIM_SORT_STACK_SIZE 128
+#endif
+
+#ifndef TIM_SORT_MIN_GALLOP
+#define TIM_SORT_MIN_GALLOP 7
+#endif
+
+#ifndef SORT_SWAP
+#define SORT_SWAP(x,y) {SORT_TYPE _sort_swap_temp = (x); (x) = (y); (y) = _sort_swap_temp;}
+#endif
+
+/* Common, type-agnostic functions and constants that we don't want to declare twice. */
+#ifndef SORT_COMMON_H
+#define SORT_COMMON_H
+
+#ifndef MAX
+#define MAX(x,y) (((x) > (y) ? (x) : (y)))
+#endif
+
+#ifndef MIN
+#define MIN(x,y) (((x) < (y) ? (x) : (y)))
+#endif
+
+static int compute_minrun(const uint64_t);
+
+/* From http://oeis.org/classic/A102549 */
+static const uint64_t shell_gaps[48] = {1, 4, 10, 23, 57, 132, 301, 701, 1750, 4376, 10941, 27353, 68383, 170958, 427396, 1068491, 2671228, 6678071, 16695178, 41737946, 104344866, 260862166, 652155416, 1630388541, 4075971353LL, 10189928383LL, 25474820958LL, 63687052396LL, 159217630991LL, 398044077478LL, 995110193696LL, 2487775484241LL, 6219438710603LL, 15548596776508LL, 38871491941271LL, 97178729853178LL, 242946824632946LL, 607367061582366LL, 1518417653955916LL, 3796044134889791LL, 9490110337224478LL, 23725275843061196LL, 59313189607652991LL, 148282974019132478LL, 370707435047831196LL, 926768587619577991LL, 2316921469048944978LL, 5792303672622362446LL};
+
+#ifndef CLZ
+/* clang-only */
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+#if __has_builtin(__builtin_clzll) || (defined(__GNUC__) && ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || (__GNUC__ > 3)))
+#define CLZ __builtin_clzll
+#else
+
+static int clzll(uint64_t);
+
+/* adapted from Hacker's Delight */
+static int clzll(uint64_t x) {
+  int n;
+
+  if (x == 0) {
+    return 64;
+  }
+
+  n = 0;
+
+  if (x <= 0x00000000FFFFFFFFL) {
+    n = n + 32;
+    x = x << 32;
+  }
+
+  if (x <= 0x0000FFFFFFFFFFFFL) {
+    n = n + 16;
+    x = x << 16;
+  }
+
+  if (x <= 0x00FFFFFFFFFFFFFFL) {
+    n = n + 8;
+    x = x << 8;
+  }
+
+  if (x <= 0x0FFFFFFFFFFFFFFFL) {
+    n = n + 4;
+    x = x << 4;
+  }
+
+  if (x <= 0x3FFFFFFFFFFFFFFFL) {
+    n = n + 2;
+    x = x << 2;
+  }
+
+  if (x <= 0x7FFFFFFFFFFFFFFFL) {
+    n = n + 1;
+  }
+
+  return n;
+}
+
+#define CLZ clzll
+#endif
+#endif
+
+static __inline int compute_minrun(const uint64_t size) {
+  const int top_bit = 64 - CLZ(size);
+  const int shift = MAX(top_bit, 6) - 6;
+  const int minrun = size >> shift;
+  const uint64_t mask = (1ULL << shift) - 1;
+
+  if (mask & size) {
+    return minrun + 1;
+  }
+
+  return minrun;
+}
+
+static __inline size_t rbnd(size_t len) {
+  int k;
+
+  if (len < 16) {
+    return 2;
+  }
+
+  k = 62 - CLZ(len);
+  return 1ULL << ((2 * k) / 3);
+}
+
+#endif /* SORT_COMMON_H */
+
+#define SORT_CONCAT(x, y) x ## _ ## y
+#define SORT_MAKE_STR1(x, y) SORT_CONCAT(x,y)
+#define SORT_MAKE_STR(x) SORT_MAKE_STR1(SORT_NAME,x)
+
+#ifndef SMALL_SORT_BND
+#define SMALL_SORT_BND 16
+#endif
+#ifndef SMALL_SORT
+#define SMALL_SORT BITONIC_SORT
+/*#define SMALL_SORT BINARY_INSERTION_SORT*/
+#endif
+
+#define BITONIC_SORT                   SORT_MAKE_STR(bitonic_sort)
+#define BINARY_INSERTION_FIND          SORT_MAKE_STR(binary_insertion_find)
+#define BINARY_INSERTION_SORT_START    SORT_MAKE_STR(binary_insertion_sort_start)
+#define BINARY_INSERTION_SORT          SORT_MAKE_STR(binary_insertion_sort)
+#define REVERSE_ELEMENTS               SORT_MAKE_STR(reverse_elements)
+#define COUNT_RUN                      SORT_MAKE_STR(count_run)
+#define CHECK_INVARIANT                SORT_MAKE_STR(check_invariant)
+#define TIM_SORT                       SORT_MAKE_STR(tim_sort)
+#define TIM_SORT_GALLOP                SORT_MAKE_STR(tim_sort_gallop)
+#define TIM_SORT_RESIZE                SORT_MAKE_STR(tim_sort_resize)
+#define TIM_SORT_MERGE                 SORT_MAKE_STR(tim_sort_merge)
+#define TIM_SORT_MERGE_LEFT            SORT_MAKE_STR(tim_sort_merge_left)
+#define TIM_SORT_MERGE_RIGHT           SORT_MAKE_STR(tim_sort_merge_right)
+#define TIM_SORT_COLLAPSE              SORT_MAKE_STR(tim_sort_collapse)
+#define HEAP_SORT                      SORT_MAKE_STR(heap_sort)
+#define MEDIAN                         SORT_MAKE_STR(median)
+#define QUICK_SORT                     SORT_MAKE_STR(quick_sort)
+#define MERGE_SORT                     SORT_MAKE_STR(merge_sort)
+#define MERGE_SORT_RECURSIVE           SORT_MAKE_STR(merge_sort_recursive)
+#define MERGE_SORT_IN_PLACE            SORT_MAKE_STR(merge_sort_in_place)
+#define MERGE_SORT_IN_PLACE_RMERGE     SORT_MAKE_STR(merge_sort_in_place_rmerge)
+#define MERGE_SORT_IN_PLACE_BACKMERGE  SORT_MAKE_STR(merge_sort_in_place_backmerge)
+#define MERGE_SORT_IN_PLACE_FRONTMERGE SORT_MAKE_STR(merge_sort_in_place_frontmerge)
+#define MERGE_SORT_IN_PLACE_ASWAP      SORT_MAKE_STR(merge_sort_in_place_aswap)
+#define SELECTION_SORT                 SORT_MAKE_STR(selection_sort)
+#define SHELL_SORT                     SORT_MAKE_STR(shell_sort)
+#define QUICK_SORT_PARTITION           SORT_MAKE_STR(quick_sort_partition)
+#define QUICK_SORT_RECURSIVE           SORT_MAKE_STR(quick_sort_recursive)
+#define HEAP_SIFT_DOWN                 SORT_MAKE_STR(heap_sift_down)
+#define HEAPIFY                        SORT_MAKE_STR(heapify)
+#define TIM_SORT_RUN_T                 SORT_MAKE_STR(tim_sort_run_t)
+#define TEMP_STORAGE_T                 SORT_MAKE_STR(temp_storage_t)
+#define PUSH_NEXT                      SORT_MAKE_STR(push_next)
+#define GRAIL_SWAP1                    SORT_MAKE_STR(grail_swap1)
+#define REC_STABLE_SORT                SORT_MAKE_STR(rec_stable_sort)
+#define GRAIL_REC_MERGE                SORT_MAKE_STR(grail_rec_merge)
+#define GRAIL_SORT_DYN_BUFFER          SORT_MAKE_STR(grail_sort_dyn_buffer)
+#define GRAIL_SORT_FIXED_BUFFER        SORT_MAKE_STR(grail_sort_fixed_buffer)
+#define GRAIL_COMMON_SORT              SORT_MAKE_STR(grail_common_sort)
+#define GRAIL_SORT                     SORT_MAKE_STR(grail_sort)
+#define GRAIL_COMBINE_BLOCKS           SORT_MAKE_STR(grail_combine_blocks)
+#define GRAIL_LAZY_STABLE_SORT         SORT_MAKE_STR(grail_lazy_stable_sort)
+#define GRAIL_MERGE_WITHOUT_BUFFER     SORT_MAKE_STR(grail_merge_without_buffer)
+#define GRAIL_ROTATE                   SORT_MAKE_STR(grail_rotate)
+#define GRAIL_BIN_SEARCH_LEFT          SORT_MAKE_STR(grail_bin_search_left)
+#define GRAIL_BUILD_BLOCKS             SORT_MAKE_STR(grail_build_blocks)
+#define GRAIL_FIND_KEYS                SORT_MAKE_STR(grail_find_keys)
+#define GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF SORT_MAKE_STR(grail_merge_buffers_left_with_x_buf)
+#define GRAIL_BIN_SEARCH_RIGHT         SORT_MAKE_STR(grail_bin_search_right)
+#define GRAIL_MERGE_BUFFERS_LEFT       SORT_MAKE_STR(grail_merge_buffers_left)
+#define GRAIL_SMART_MERGE_WITH_X_BUF   SORT_MAKE_STR(grail_smart_merge_with_x_buf)
+#define GRAIL_MERGE_LEFT_WITH_X_BUF    SORT_MAKE_STR(grail_merge_left_with_x_buf)
+#define GRAIL_SMART_MERGE_WITHOUT_BUFFER SORT_MAKE_STR(grail_smart_merge_without_buffer)
+#define GRAIL_SMART_MERGE_WITH_BUFFER  SORT_MAKE_STR(grail_smart_merge_with_buffer)
+#define GRAIL_MERGE_RIGHT              SORT_MAKE_STR(grail_merge_right)
+#define GRAIL_MERGE_LEFT               SORT_MAKE_STR(grail_merge_left)
+#define GRAIL_SWAP_N                   SORT_MAKE_STR(grail_swap_n)
+#define SQRT_SORT                      SORT_MAKE_STR(sqrt_sort)
+#define SQRT_SORT_BUILD_BLOCKS         SORT_MAKE_STR(sqrt_sort_build_blocks)
+#define SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF SORT_MAKE_STR(sqrt_sort_merge_buffers_left_with_x_buf)
+#define SQRT_SORT_MERGE_DOWN           SORT_MAKE_STR(sqrt_sort_merge_down)
+#define SQRT_SORT_MERGE_LEFT_WITH_X_BUF SORT_MAKE_STR(sqrt_sort_merge_left_with_x_buf)
+#define SQRT_SORT_MERGE_RIGHT          SORT_MAKE_STR(sqrt_sort_merge_right)
+#define SQRT_SORT_SWAP_N               SORT_MAKE_STR(sqrt_sort_swap_n)
+#define SQRT_SORT_SWAP_1               SORT_MAKE_STR(sqrt_sort_swap_1)
+#define SQRT_SORT_SMART_MERGE_WITH_X_BUF SORT_MAKE_STR(sqrt_sort_smart_merge_with_x_buf)
+#define SQRT_SORT_SORT_INS             SORT_MAKE_STR(sqrt_sort_sort_ins)
+#define SQRT_SORT_COMBINE_BLOCKS       SORT_MAKE_STR(sqrt_sort_combine_blocks)
+#define SQRT_SORT_COMMON_SORT          SORT_MAKE_STR(sqrt_sort_common_sort)
+#define BUBBLE_SORT                    SORT_MAKE_STR(bubble_sort)
+
+#ifndef MAX
+#define MAX(x,y) (((x) > (y) ? (x) : (y)))
+#endif
+#ifndef MIN
+#define MIN(x,y) (((x) < (y) ? (x) : (y)))
+#endif
+#ifndef SORT_CSWAP
+#define SORT_CSWAP(x, y) { if(SORT_CMP((x),(y)) > 0) {SORT_SWAP((x),(y));}}
+#endif
+
+typedef struct {
+  size_t start;
+  size_t length;
+} TIM_SORT_RUN_T;
+
+
+void SHELL_SORT(SORT_TYPE *dst, const size_t size);
+void BINARY_INSERTION_SORT(SORT_TYPE *dst, const size_t size);
+void HEAP_SORT(SORT_TYPE *dst, const size_t size);
+void QUICK_SORT(SORT_TYPE *dst, const size_t size);
+void MERGE_SORT(SORT_TYPE *dst, const size_t size);
+void MERGE_SORT_IN_PLACE(SORT_TYPE *dst, const size_t size);
+void SELECTION_SORT(SORT_TYPE *dst, const size_t size);
+void TIM_SORT(SORT_TYPE *dst, const size_t size);
+void BUBBLE_SORT(SORT_TYPE *dst, const size_t size);
+void BITONIC_SORT(SORT_TYPE *dst, const size_t size);
+
+/* The full implementation of a bitonic sort is not here. Since we only want to use
+   sorting networks for small length lists we create optimal sorting networks for
+   lists of length <= 16 and call out to BINARY_INSERTION_SORT for anything larger
+   than 16.
+   Optimal sorting networks for small length lists.
+   Taken from https://pages.ripco.net/~jgamble/nw.html */
+#define BITONIC_SORT_2          SORT_MAKE_STR(bitonic_sort_2)
+static __inline void BITONIC_SORT_2(SORT_TYPE *dst) {
+  SORT_CSWAP(dst[0], dst[1]);
+}
+
+
+#define BITONIC_SORT_3          SORT_MAKE_STR(bitonic_sort_3)
+static __inline void BITONIC_SORT_3(SORT_TYPE *dst) {
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[0], dst[1]);
+}
+
+
+#define BITONIC_SORT_4          SORT_MAKE_STR(bitonic_sort_4)
+static __inline void BITONIC_SORT_4(SORT_TYPE *dst) {
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[1], dst[2]);
+}
+
+
+#define BITONIC_SORT_5          SORT_MAKE_STR(bitonic_sort_5)
+static __inline void BITONIC_SORT_5(SORT_TYPE *dst) {
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[0], dst[3]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[1], dst[2]);
+}
+
+
+#define BITONIC_SORT_6          SORT_MAKE_STR(bitonic_sort_6)
+static __inline void BITONIC_SORT_6(SORT_TYPE *dst) {
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[2], dst[5]);
+  SORT_CSWAP(dst[0], dst[3]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[2], dst[3]);
+}
+
+
+#define BITONIC_SORT_7          SORT_MAKE_STR(bitonic_sort_7)
+static __inline void BITONIC_SORT_7(SORT_TYPE *dst) {
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[0], dst[3]);
+  SORT_CSWAP(dst[2], dst[5]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[2], dst[3]);
+}
+
+
+#define BITONIC_SORT_8          SORT_MAKE_STR(bitonic_sort_8)
+static __inline void BITONIC_SORT_8(SORT_TYPE *dst) {
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[3], dst[7]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[3], dst[6]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[3], dst[4]);
+}
+
+
+#define BITONIC_SORT_9          SORT_MAKE_STR(bitonic_sort_9)
+static __inline void BITONIC_SORT_9(SORT_TYPE *dst) {
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[7], dst[8]);
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[2], dst[5]);
+  SORT_CSWAP(dst[0], dst[3]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[5], dst[8]);
+  SORT_CSWAP(dst[3], dst[6]);
+  SORT_CSWAP(dst[4], dst[7]);
+  SORT_CSWAP(dst[2], dst[5]);
+  SORT_CSWAP(dst[0], dst[3]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[2], dst[3]);
+}
+
+
+#define BITONIC_SORT_10          SORT_MAKE_STR(bitonic_sort_10)
+static __inline void BITONIC_SORT_10(SORT_TYPE *dst) {
+  SORT_CSWAP(dst[4], dst[9]);
+  SORT_CSWAP(dst[3], dst[8]);
+  SORT_CSWAP(dst[2], dst[7]);
+  SORT_CSWAP(dst[1], dst[6]);
+  SORT_CSWAP(dst[0], dst[5]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[6], dst[9]);
+  SORT_CSWAP(dst[0], dst[3]);
+  SORT_CSWAP(dst[5], dst[8]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[3], dst[6]);
+  SORT_CSWAP(dst[7], dst[9]);
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[7], dst[8]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[2], dst[5]);
+  SORT_CSWAP(dst[6], dst[8]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[4], dst[7]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[4], dst[5]);
+}
+
+
+#define BITONIC_SORT_11          SORT_MAKE_STR(bitonic_sort_11)
+static __inline void BITONIC_SORT_11(SORT_TYPE *dst) {
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[8], dst[10]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[3], dst[7]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[6], dst[10]);
+  SORT_CSWAP(dst[4], dst[8]);
+  SORT_CSWAP(dst[5], dst[9]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[3], dst[8]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[6], dst[10]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[7], dst[10]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[6], dst[8]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[7], dst[9]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[7], dst[8]);
+}
+
+
+#define BITONIC_SORT_12          SORT_MAKE_STR(bitonic_sort_12)
+static __inline void BITONIC_SORT_12(SORT_TYPE *dst) {
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[10], dst[11]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[9], dst[11]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[8], dst[10]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[7], dst[11]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[6], dst[10]);
+  SORT_CSWAP(dst[3], dst[7]);
+  SORT_CSWAP(dst[4], dst[8]);
+  SORT_CSWAP(dst[5], dst[9]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[7], dst[11]);
+  SORT_CSWAP(dst[3], dst[8]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[6], dst[10]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[7], dst[10]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[6], dst[8]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[7], dst[9]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[7], dst[8]);
+}
+
+
+#define BITONIC_SORT_13          SORT_MAKE_STR(bitonic_sort_13)
+static __inline void BITONIC_SORT_13(SORT_TYPE *dst) {
+  SORT_CSWAP(dst[1], dst[7]);
+  SORT_CSWAP(dst[9], dst[11]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[8]);
+  SORT_CSWAP(dst[0], dst[12]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[8], dst[11]);
+  SORT_CSWAP(dst[7], dst[12]);
+  SORT_CSWAP(dst[5], dst[9]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[3], dst[7]);
+  SORT_CSWAP(dst[10], dst[11]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[6], dst[12]);
+  SORT_CSWAP(dst[7], dst[8]);
+  SORT_CSWAP(dst[11], dst[12]);
+  SORT_CSWAP(dst[4], dst[9]);
+  SORT_CSWAP(dst[6], dst[10]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[10], dst[11]);
+  SORT_CSWAP(dst[1], dst[7]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[9], dst[11]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[4], dst[7]);
+  SORT_CSWAP(dst[8], dst[10]);
+  SORT_CSWAP(dst[0], dst[5]);
+  SORT_CSWAP(dst[2], dst[5]);
+  SORT_CSWAP(dst[6], dst[8]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[7], dst[8]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+}
+
+
+#define BITONIC_SORT_14          SORT_MAKE_STR(bitonic_sort_14)
+static __inline void BITONIC_SORT_14(SORT_TYPE *dst) {
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[10], dst[11]);
+  SORT_CSWAP(dst[12], dst[13]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[8], dst[10]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[9], dst[11]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[8], dst[12]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[9], dst[13]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[3], dst[7]);
+  SORT_CSWAP(dst[0], dst[8]);
+  SORT_CSWAP(dst[1], dst[9]);
+  SORT_CSWAP(dst[2], dst[10]);
+  SORT_CSWAP(dst[3], dst[11]);
+  SORT_CSWAP(dst[4], dst[12]);
+  SORT_CSWAP(dst[5], dst[13]);
+  SORT_CSWAP(dst[5], dst[10]);
+  SORT_CSWAP(dst[6], dst[9]);
+  SORT_CSWAP(dst[3], dst[12]);
+  SORT_CSWAP(dst[7], dst[11]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[4], dst[8]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[7], dst[13]);
+  SORT_CSWAP(dst[2], dst[8]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[11], dst[13]);
+  SORT_CSWAP(dst[3], dst[8]);
+  SORT_CSWAP(dst[7], dst[12]);
+  SORT_CSWAP(dst[6], dst[8]);
+  SORT_CSWAP(dst[10], dst[12]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[7], dst[9]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[7], dst[8]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[11], dst[12]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+}
+
+
+#define BITONIC_SORT_15          SORT_MAKE_STR(bitonic_sort_15)
+static __inline void BITONIC_SORT_15(SORT_TYPE *dst) {
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[10], dst[11]);
+  SORT_CSWAP(dst[12], dst[13]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[8], dst[10]);
+  SORT_CSWAP(dst[12], dst[14]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[9], dst[11]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[8], dst[12]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[9], dst[13]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[10], dst[14]);
+  SORT_CSWAP(dst[3], dst[7]);
+  SORT_CSWAP(dst[0], dst[8]);
+  SORT_CSWAP(dst[1], dst[9]);
+  SORT_CSWAP(dst[2], dst[10]);
+  SORT_CSWAP(dst[3], dst[11]);
+  SORT_CSWAP(dst[4], dst[12]);
+  SORT_CSWAP(dst[5], dst[13]);
+  SORT_CSWAP(dst[6], dst[14]);
+  SORT_CSWAP(dst[5], dst[10]);
+  SORT_CSWAP(dst[6], dst[9]);
+  SORT_CSWAP(dst[3], dst[12]);
+  SORT_CSWAP(dst[13], dst[14]);
+  SORT_CSWAP(dst[7], dst[11]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[4], dst[8]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[7], dst[13]);
+  SORT_CSWAP(dst[2], dst[8]);
+  SORT_CSWAP(dst[11], dst[14]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[11], dst[13]);
+  SORT_CSWAP(dst[3], dst[8]);
+  SORT_CSWAP(dst[7], dst[12]);
+  SORT_CSWAP(dst[6], dst[8]);
+  SORT_CSWAP(dst[10], dst[12]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[7], dst[9]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[7], dst[8]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[11], dst[12]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+}
+
+
+#define BITONIC_SORT_16          SORT_MAKE_STR(bitonic_sort_16)
+static __inline void BITONIC_SORT_16(SORT_TYPE *dst) {
+  SORT_CSWAP(dst[0], dst[1]);
+  SORT_CSWAP(dst[2], dst[3]);
+  SORT_CSWAP(dst[4], dst[5]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+  SORT_CSWAP(dst[10], dst[11]);
+  SORT_CSWAP(dst[12], dst[13]);
+  SORT_CSWAP(dst[14], dst[15]);
+  SORT_CSWAP(dst[0], dst[2]);
+  SORT_CSWAP(dst[4], dst[6]);
+  SORT_CSWAP(dst[8], dst[10]);
+  SORT_CSWAP(dst[12], dst[14]);
+  SORT_CSWAP(dst[1], dst[3]);
+  SORT_CSWAP(dst[5], dst[7]);
+  SORT_CSWAP(dst[9], dst[11]);
+  SORT_CSWAP(dst[13], dst[15]);
+  SORT_CSWAP(dst[0], dst[4]);
+  SORT_CSWAP(dst[8], dst[12]);
+  SORT_CSWAP(dst[1], dst[5]);
+  SORT_CSWAP(dst[9], dst[13]);
+  SORT_CSWAP(dst[2], dst[6]);
+  SORT_CSWAP(dst[10], dst[14]);
+  SORT_CSWAP(dst[3], dst[7]);
+  SORT_CSWAP(dst[11], dst[15]);
+  SORT_CSWAP(dst[0], dst[8]);
+  SORT_CSWAP(dst[1], dst[9]);
+  SORT_CSWAP(dst[2], dst[10]);
+  SORT_CSWAP(dst[3], dst[11]);
+  SORT_CSWAP(dst[4], dst[12]);
+  SORT_CSWAP(dst[5], dst[13]);
+  SORT_CSWAP(dst[6], dst[14]);
+  SORT_CSWAP(dst[7], dst[15]);
+  SORT_CSWAP(dst[5], dst[10]);
+  SORT_CSWAP(dst[6], dst[9]);
+  SORT_CSWAP(dst[3], dst[12]);
+  SORT_CSWAP(dst[13], dst[14]);
+  SORT_CSWAP(dst[7], dst[11]);
+  SORT_CSWAP(dst[1], dst[2]);
+  SORT_CSWAP(dst[4], dst[8]);
+  SORT_CSWAP(dst[1], dst[4]);
+  SORT_CSWAP(dst[7], dst[13]);
+  SORT_CSWAP(dst[2], dst[8]);
+  SORT_CSWAP(dst[11], dst[14]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[2], dst[4]);
+  SORT_CSWAP(dst[11], dst[13]);
+  SORT_CSWAP(dst[3], dst[8]);
+  SORT_CSWAP(dst[7], dst[12]);
+  SORT_CSWAP(dst[6], dst[8]);
+  SORT_CSWAP(dst[10], dst[12]);
+  SORT_CSWAP(dst[3], dst[5]);
+  SORT_CSWAP(dst[7], dst[9]);
+  SORT_CSWAP(dst[3], dst[4]);
+  SORT_CSWAP(dst[5], dst[6]);
+  SORT_CSWAP(dst[7], dst[8]);
+  SORT_CSWAP(dst[9], dst[10]);
+  SORT_CSWAP(dst[11], dst[12]);
+  SORT_CSWAP(dst[6], dst[7]);
+  SORT_CSWAP(dst[8], dst[9]);
+}
+
+void BITONIC_SORT(SORT_TYPE *dst, const size_t size) {
+  switch (size) {
+  case 0:
+  case 1:
+    break;
+
+  case 2:
+    BITONIC_SORT_2(dst);
+    break;
+
+  case 3:
+    BITONIC_SORT_3(dst);
+    break;
+
+  case 4:
+    BITONIC_SORT_4(dst);
+    break;
+
+  case 5:
+    BITONIC_SORT_5(dst);
+    break;
+
+  case 6:
+    BITONIC_SORT_6(dst);
+    break;
+
+  case 7:
+    BITONIC_SORT_7(dst);
+    break;
+
+  case 8:
+    BITONIC_SORT_8(dst);
+    break;
+
+  case 9:
+    BITONIC_SORT_9(dst);
+    break;
+
+  case 10:
+    BITONIC_SORT_10(dst);
+    break;
+
+  case 11:
+    BITONIC_SORT_11(dst);
+    break;
+
+  case 12:
+    BITONIC_SORT_12(dst);
+    break;
+
+  case 13:
+    BITONIC_SORT_13(dst);
+    break;
+
+  case 14:
+    BITONIC_SORT_14(dst);
+    break;
+
+  case 15:
+    BITONIC_SORT_15(dst);
+    break;
+
+  case 16:
+    BITONIC_SORT_16(dst);
+    break;
+
+  default:
+    BINARY_INSERTION_SORT(dst, size);
+  }
+}
+
+
+/* Shell sort implementation based on Wikipedia article
+   http://en.wikipedia.org/wiki/Shell_sort
+*/
+void SHELL_SORT(SORT_TYPE *dst, const size_t size) {
+  /* don't bother sorting an array of size 0 or 1 */
+  /* TODO: binary search to find first gap? */
+  int inci = 47;
+  size_t inc = shell_gaps[inci];
+  size_t i;
+
+  if (size <= 1) {
+    return;
+  }
+
+  while (inc > (size >> 1)) {
+    inc = shell_gaps[--inci];
+  }
+
+  while (1) {
+    for (i = inc; i < size; i++) {
+      SORT_TYPE temp = dst[i];
+      size_t j = i;
+
+      while ((j >= inc) && (SORT_CMP(dst[j - inc], temp) > 0)) {
+        dst[j] = dst[j - inc];
+        j -= inc;
+      }
+
+      dst[j] = temp;
+    }
+
+    if (inc == 1) {
+      break;
+    }
+
+    inc = shell_gaps[--inci];
+  }
+}
+
+/* Function used to do a binary search for binary insertion sort */
+static __inline size_t BINARY_INSERTION_FIND(SORT_TYPE *dst, const SORT_TYPE x,
+    const size_t size) {
+  size_t l, c, r;
+  SORT_TYPE cx;
+  l = 0;
+  r = size - 1;
+  c = r >> 1;
+
+  /* check for out of bounds at the beginning. */
+  if (SORT_CMP(x, dst[0]) < 0) {
+    return 0;
+  } else if (SORT_CMP(x, dst[r]) > 0) {
+    return r;
+  }
+
+  cx = dst[c];
+
+  while (1) {
+    const int val = SORT_CMP(x, cx);
+
+    if (val < 0) {
+      if (c - l <= 1) {
+        return c;
+      }
+
+      r = c;
+    } else { /* allow = for stability. The binary search favors the right. */
+      if (r - c <= 1) {
+        return c + 1;
+      }
+
+      l = c;
+    }
+
+    c = l + ((r - l) >> 1);
+    cx = dst[c];
+  }
+}
+
+/* Binary insertion sort, but knowing that the first "start" entries are sorted.  Used in timsort. */
+static void BINARY_INSERTION_SORT_START(SORT_TYPE *dst, const size_t start, const size_t size) {
+  size_t i;
+
+  for (i = start; i < size; i++) {
+    size_t j;
+    SORT_TYPE x;
+    size_t location;
+
+    /* If this entry is already correct, just move along */
+    if (SORT_CMP(dst[i - 1], dst[i]) <= 0) {
+      continue;
+    }
+
+    /* Else we need to find the right place, shift everything over, and squeeze in */
+    x = dst[i];
+    location = BINARY_INSERTION_FIND(dst, x, i);
+
+    for (j = i - 1; j >= location; j--) {
+      dst[j + 1] = dst[j];
+
+      if (j == 0) { /* check edge case because j is unsigned */
+        break;
+      }
+    }
+
+    dst[location] = x;
+  }
+}
+
+/* Binary insertion sort */
+void BINARY_INSERTION_SORT(SORT_TYPE *dst, const size_t size) {
+  /* don't bother sorting an array of size <= 1 */
+  if (size <= 1) {
+    return;
+  }
+
+  BINARY_INSERTION_SORT_START(dst, 1, size);
+}
+
+/* Selection sort */
+void SELECTION_SORT(SORT_TYPE *dst, const size_t size) {
+  size_t i, j;
+
+  /* don't bother sorting an array of size <= 1 */
+  if (size <= 1) {
+    return;
+  }
+
+  for (i = 0; i < size; i++) {
+    for (j = i + 1; j < size; j++) {
+      if (SORT_CMP(dst[j], dst[i]) < 0) {
+        SORT_SWAP(dst[i], dst[j]);
+      }
+    }
+  }
+}
+
+/* In-place mergesort */
+void MERGE_SORT_IN_PLACE_ASWAP(SORT_TYPE * dst1, SORT_TYPE * dst2, size_t len) {
+  do {
+    SORT_SWAP(*dst1, *dst2);
+    dst1++;
+    dst2++;
+  } while (--len);
+}
+
+void MERGE_SORT_IN_PLACE_FRONTMERGE(SORT_TYPE *dst1, size_t l1, SORT_TYPE *dst2, size_t l2) {
+  SORT_TYPE *dst0 = dst2 - l1;
+
+  if (SORT_CMP(dst1[l1 - 1], dst2[0]) <= 0) {
+    MERGE_SORT_IN_PLACE_ASWAP(dst1, dst0, l1);
+    return;
+  }
+
+  do {
+    while (SORT_CMP(*dst2, *dst1) > 0) {
+      SORT_SWAP(*dst1, *dst0);
+      dst1++;
+      dst0++;
+
+      if (--l1 == 0) {
+        return;
+      }
+    }
+
+    SORT_SWAP(*dst2, *dst0);
+    dst2++;
+    dst0++;
+  } while (--l2);
+
+  do {
+    SORT_SWAP(*dst1, *dst0);
+    dst1++;
+    dst0++;
+  } while (--l1);
+}
+
+size_t MERGE_SORT_IN_PLACE_BACKMERGE(SORT_TYPE * dst1, size_t l1, SORT_TYPE * dst2, size_t l2) {
+  size_t res;
+  SORT_TYPE *dst0 = dst2 + l1;
+
+  if (SORT_CMP(dst1[1 - l1], dst2[0]) >= 0) {
+    MERGE_SORT_IN_PLACE_ASWAP(dst1 - l1 + 1, dst0 - l1 + 1, l1);
+    return l1;
+  }
+
+  do {
+    while (SORT_CMP(*dst2, *dst1) < 0) {
+      SORT_SWAP(*dst1, *dst0);
+      dst1--;
+      dst0--;
+
+      if (--l1 == 0) {
+        return 0;
+      }
+    }
+
+    SORT_SWAP(*dst2, *dst0);
+    dst2--;
+    dst0--;
+  } while (--l2);
+
+  res = l1;
+
+  do {
+    SORT_SWAP(*dst1, *dst0);
+    dst1--;
+    dst0--;
+  } while (--l1);
+
+  return res;
+}
+
+/* merge dst[p0..p1) by buffer dst[p1..p1+r) */
+void MERGE_SORT_IN_PLACE_RMERGE(SORT_TYPE *dst, size_t len, size_t lp, size_t r) {
+  size_t i, lq;
+  int cv;
+
+  if (SORT_CMP(dst[lp], dst[lp - 1]) >= 0) {
+    return;
+  }
+
+  lq = lp;
+
+  for (i = 0; i < len; i += r) {
+    /* select smallest dst[p0+n*r] */
+    size_t q = i, j;
+
+    for (j = lp; j <= lq; j += r) {
+      cv = SORT_CMP(dst[j], dst[q]);
+
+      if (cv == 0) {
+        cv = SORT_CMP(dst[j + r - 1], dst[q + r - 1]);
+      }
+
+      if (cv < 0) {
+        q = j;
+      }
+    }
+
+    if (q != i) {
+      MERGE_SORT_IN_PLACE_ASWAP(dst + i, dst + q, r); /* swap it with current position */
+
+      if (q == lq && q < (len - r)) {
+        lq += r;
+      }
+    }
+
+    if (i != 0 && SORT_CMP(dst[i], dst[i - 1]) < 0) {
+      MERGE_SORT_IN_PLACE_ASWAP(dst + len, dst + i, r); /* swap current position with buffer */
+      MERGE_SORT_IN_PLACE_BACKMERGE(dst + (len + r - 1), r, dst + (i - 1),
+                                    r);  /* buffer :merge: dst[i-r..i) -> dst[i-r..i+r) */
+    }
+
+    if (lp == i) {
+      lp += r;
+    }
+  }
+}
+
+/* In-place Merge Sort implementation. (c)2012, Andrey Astrelin, astrelin@tochka.ru */
+void MERGE_SORT_IN_PLACE(SORT_TYPE *dst, const size_t len) {
+  /* don't bother sorting an array of size <= 1 */
+  size_t r = rbnd(len);
+  size_t lr = (len / r - 1) * r;
+  SORT_TYPE *dst1 = dst - 1;
+  size_t p, m, q, q1, p0;
+
+  if (len <= 1) {
+    return;
+  }
+
+  if (len <= SMALL_SORT_BND) {
+    SMALL_SORT(dst, len);
+    return;
+  }
+
+  for (p = 2; p <= lr; p += 2) {
+    dst1 += 2;
+
+    if (SORT_CMP(dst1[0], dst1[-1]) < 0) {
+      SORT_SWAP(dst1[0], dst1[-1]);
+    }
+
+    if (p & 2) {
+      continue;
+    }
+
+    m = len - p;
+    q = 2;
+
+    while ((p & q) == 0) {
+      if (SORT_CMP(dst1[1 - q], dst1[-(int) q]) < 0) {
+        break;
+      }
+
+      q *= 2;
+    }
+
+    if (p & q) {
+      continue;
+    }
+
+    if (q < m) {
+      p0 = len - q;
+      MERGE_SORT_IN_PLACE_ASWAP(dst + p - q, dst + p0, q);
+
+      for (;;) {
+        q1 = 2 * q;
+
+        if ((q1 > m) || (p & q1)) {
+          break;
+        }
+
+        p0 = len - q1;
+        MERGE_SORT_IN_PLACE_FRONTMERGE(dst + (p - q1), q, dst + p0 + q, q);
+        q = q1;
+      }
+
+      MERGE_SORT_IN_PLACE_BACKMERGE(dst + (len - 1), q, dst1 - q, q);
+      q *= 2;
+    }
+
+    q1 = q;
+
+    while (q1 > m) {
+      q1 /= 2;
+    }
+
+    while ((q & p) == 0) {
+      q *= 2;
+      MERGE_SORT_IN_PLACE_RMERGE(dst + (p - q), q, q / 2, q1);
+    }
+  }
+
+  q1 = 0;
+
+  for (q = r; q < lr; q *= 2) {
+    if ((lr & q) != 0) {
+      q1 += q;
+
+      if (q1 != q) {
+        MERGE_SORT_IN_PLACE_RMERGE(dst + (lr - q1), q1, q, r);
+      }
+    }
+  }
+
+  m = len - lr;
+  MERGE_SORT_IN_PLACE(dst + lr, m);
+  MERGE_SORT_IN_PLACE_ASWAP(dst, dst + lr, m);
+  m += MERGE_SORT_IN_PLACE_BACKMERGE(dst + (m - 1), m, dst + (lr - 1), lr - m);
+  MERGE_SORT_IN_PLACE(dst, m);
+}
+
+/* Standard merge sort */
+void MERGE_SORT_RECURSIVE(SORT_TYPE *newdst, SORT_TYPE *dst, const size_t size) {
+  const size_t middle = size / 2;
+  size_t out = 0;
+  size_t i = 0;
+  size_t j = middle;
+
+  /* don't bother sorting an array of size <= 1 */
+  if (size <= 1) {
+    return;
+  }
+
+  if (size <= SMALL_SORT_BND) {
+    BINARY_INSERTION_SORT(dst, size);
+    return;
+  }
+
+  MERGE_SORT_RECURSIVE(newdst, dst, middle);
+  MERGE_SORT_RECURSIVE(newdst, &dst[middle], size - middle);
+
+  while (out != size) {
+    if (i < middle) {
+      if (j < size) {
+        if (SORT_CMP(dst[i], dst[j]) <= 0) {
+          newdst[out] = dst[i++];
+        } else {
+          newdst[out] = dst[j++];
+        }
+      } else {
+        newdst[out] = dst[i++];
+      }
+    } else {
+      newdst[out] = dst[j++];
+    }
+
+    out++;
+  }
+
+  memcpy(dst, newdst, size * sizeof(SORT_TYPE));
+}
+
+/* Standard merge sort */
+void MERGE_SORT(SORT_TYPE *dst, const size_t size) {
+  SORT_TYPE *newdst;
+
+  /* don't bother sorting an array of size <= 1 */
+  if (size <= 1) {
+    return;
+  }
+
+  if (size <= SMALL_SORT_BND) {
+    BINARY_INSERTION_SORT(dst, size);
+    return;
+  }
+
+  newdst = (SORT_TYPE *) malloc(size * sizeof(SORT_TYPE));
+  MERGE_SORT_RECURSIVE(newdst, dst, size);
+  free(newdst);
+}
+
+
+static __inline size_t QUICK_SORT_PARTITION(SORT_TYPE *dst, const size_t left,
+    const size_t right, const size_t pivot) {
+  SORT_TYPE value = dst[pivot];
+  size_t index = left;
+  size_t i;
+  int not_all_same = 0;
+  /* move the pivot to the right */
+  SORT_SWAP(dst[pivot], dst[right]);
+
+  for (i = left; i < right; i++) {
+    int cmp = SORT_CMP(dst[i], value);
+    /* check if everything is all the same */
+    not_all_same |= cmp;
+
+    if (cmp < 0) {
+      SORT_SWAP(dst[i], dst[index]);
+      index++;
+    }
+  }
+
+  SORT_SWAP(dst[right], dst[index]);
+
+  /* avoid degenerate case */
+  if (not_all_same == 0) {
+    return SIZE_MAX;
+  }
+
+  return index;
+}
+
+/* Based on Knuth vol. 3
+static __inline size_t QUICK_SORT_HOARE_PARTITION(SORT_TYPE *dst, const size_t l,
+    const size_t r, const size_t pivot) {
+  SORT_TYPE value;
+  size_t i = l + 1;
+  size_t j = r;
+
+  if (pivot != l) {
+    SORT_SWAP(dst[pivot], dst[l]);
+  }
+  value = dst[l];
+
+  while (1) {
+    while (SORT_CMP(dst[i], value) < 0) {
+      i++;
+    }
+    while (SORT_CMP(value, dst[j]) < 0) {
+      j--;
+    }
+    if (j <= i) {
+      SORT_SWAP(dst[l], dst[j]);
+      return j;
+    }
+    SORT_SWAP(dst[i], dst[j]);
+    i++;
+    j--;
+  }
+  return 0;
+}
+*/
+
+
+/* Return the median index of the objects at the three indices. */
+static __inline size_t MEDIAN(const SORT_TYPE *dst, const size_t a, const size_t b,
+                              const size_t c) {
+  const int AB = SORT_CMP(dst[a], dst[b]) < 0;
+
+  if (AB) {
+    /* a < b */
+    const int BC = SORT_CMP(dst[b], dst[c]) < 0;
+
+    if (BC) {
+      /* a < b < c */
+      return b;
+    } else {
+      /* a < b, c < b */
+      const int AC = SORT_CMP(dst[a], dst[c]) < 0;
+
+      if (AC) {
+        /* a < c < b */
+        return c;
+      } else {
+        /* c < a < b */
+        return a;
+      }
+    }
+  } else {
+    /* b < a */
+    const int AC = SORT_CMP(dst[a], dst[b]) < 0;
+
+    if (AC) {
+      /* b < a < c */
+      return a;
+    } else {
+      /* b < a, c < a */
+      const int BC = SORT_CMP(dst[b], dst[c]) < 0;
+
+      if (BC) {
+        /* b < c < a */
+        return c;
+      } else {
+        /* c < b < a */
+        return b;
+      }
+    }
+  }
+}
+
+static void QUICK_SORT_RECURSIVE(SORT_TYPE *dst, const size_t original_left,
+                                 const size_t original_right) {
+  size_t left;
+  size_t right;
+  size_t pivot;
+  size_t new_pivot;
+  size_t middle;
+  int loop_count = 0;
+  const int max_loops = 64 - CLZ(original_right - original_left); /* ~lg N */
+  left = original_left;
+  right = original_right;
+
+  while (1) {
+    if (right <= left) {
+      return;
+    }
+
+    if ((right - left + 1U) <= SMALL_SORT_BND) {
+      SMALL_SORT(&dst[left], right - left + 1U);
+      return;
+    }
+
+    if (++loop_count >= max_loops) {
+      /* we have recursed / looped too many times; switch to heap sort */
+      HEAP_SORT(&dst[left], right - left + 1U);
+      return;
+    }
+
+    /* median of 5 */
+    middle = left + ((right - left) >> 1);
+    pivot = MEDIAN((const SORT_TYPE *) dst, left, middle, right);
+    pivot = MEDIAN((const SORT_TYPE *) dst, left + ((middle - left) >> 1), pivot,
+                   middle + ((right - middle) >> 1));
+    new_pivot = QUICK_SORT_PARTITION(dst, left, right, pivot);
+
+    /* check for partition all equal */
+    if (new_pivot == SIZE_MAX) {
+      return;
+    }
+
+    /* recurse only on the small part to avoid degenerate stack sizes */
+    /* and manually do tail call on the large part */
+    if (new_pivot - 1U - left > right - new_pivot - 1U) {
+      /* left is bigger than right */
+      QUICK_SORT_RECURSIVE(dst, new_pivot + 1U, right);
+      /* tail call for left */
+      right = new_pivot - 1U;
+    } else {
+      /* right is bigger than left */
+      QUICK_SORT_RECURSIVE(dst, left, new_pivot - 1U);
+      /* tail call for right */
+      left = new_pivot + 1U;
+    }
+  }
+}
+
+void QUICK_SORT(SORT_TYPE *dst, const size_t size) {
+  /* don't bother sorting an array of size 1 */
+  if (size <= 1) {
+    return;
+  }
+
+  QUICK_SORT_RECURSIVE(dst, 0U, size - 1U);
+}
+
+
+/* timsort implementation, based on timsort.txt */
+
+static __inline void REVERSE_ELEMENTS(SORT_TYPE *dst, size_t start, size_t end) {
+  while (1) {
+    if (start >= end) {
+      return;
+    }
+
+    SORT_SWAP(dst[start], dst[end]);
+    start++;
+    end--;
+  }
+}
+
+static size_t COUNT_RUN(SORT_TYPE *dst, const size_t start, const size_t size) {
+  size_t curr;
+
+  if (size - start == 1) {
+    return 1;
+  }
+
+  if (start >= size - 2) {
+    if (SORT_CMP(dst[size - 2], dst[size - 1]) > 0) {
+      SORT_SWAP(dst[size - 2], dst[size - 1]);
+    }
+
+    return 2;
+  }
+
+  curr = start + 2;
+
+  if (SORT_CMP(dst[start], dst[start + 1]) <= 0) {
+    /* increasing run */
+    while (1) {
+      if (curr == size - 1) {
+        break;
+      }
+
+      if (SORT_CMP(dst[curr - 1], dst[curr]) > 0) {
+        break;
+      }
+
+      curr++;
+    }
+
+    return curr - start;
+  } else {
+    /* decreasing run */
+    while (1) {
+      if (curr == size - 1) {
+        break;
+      }
+
+      if (SORT_CMP(dst[curr - 1], dst[curr]) <= 0) {
+        break;
+      }
+
+      curr++;
+    }
+
+    /* reverse in-place */
+    REVERSE_ELEMENTS(dst, start, curr - 1);
+    return curr - start;
+  }
+}
+
+static int CHECK_INVARIANT(TIM_SORT_RUN_T *stack, const int stack_curr) {
+  size_t A, B, C;
+
+  if (stack_curr < 2) {
+    return 1;
+  }
+
+  if (stack_curr == 2) {
+    const size_t A1 = stack[stack_curr - 2].length;
+    const size_t B1 = stack[stack_curr - 1].length;
+
+    if (A1 <= B1) {
+      return 0;
+    }
+
+    return 1;
+  }
+
+  A = stack[stack_curr - 3].length;
+  B = stack[stack_curr - 2].length;
+  C = stack[stack_curr - 1].length;
+
+  if ((A <= B + C) || (B <= C)) {
+    return 0;
+  }
+
+  return 1;
+}
+
+typedef struct {
+  size_t alloc;
+  SORT_TYPE *storage;
+} TEMP_STORAGE_T;
+
+static void TIM_SORT_RESIZE(TEMP_STORAGE_T *store, const size_t new_size) {
+  if ((store->storage == NULL) || (store->alloc < new_size)) {
+    SORT_TYPE *tempstore = (SORT_TYPE *)realloc(store->storage, new_size * sizeof(SORT_TYPE));
+
+    if (tempstore == NULL) {
+      fprintf(stderr, "Error allocating temporary storage for tim sort: need %lu bytes",
+              (unsigned long)(sizeof(SORT_TYPE) * new_size));
+      exit(1);
+    }
+
+    store->storage = tempstore;
+    store->alloc = new_size;
+  }
+}
+
+
+static size_t TIM_SORT_GALLOP(SORT_TYPE *dst, const size_t size, const SORT_TYPE key, size_t anchor,
+                              int right) {
+  int last_ofs = 0;
+  int ofs, max_ofs, ofs_sign, cmp;
+  size_t  l, c, r;
+  cmp = SORT_CMP(key, dst[anchor]);
+
+  if (cmp < 0 || (!right && cmp == 0)) {
+    /* short cut */
+    if (anchor == 0) {
+      return 0;
+    }
+
+    ofs = -1;
+    ofs_sign = -1;
+    max_ofs = -anchor; /* ensure anchor+max_ofs is valid idx */
+  } else {
+    if (anchor == size - 1) {
+      return size;
+    }
+
+    ofs = 1;
+    ofs_sign = 1;
+    max_ofs = size - anchor - 1;
+  }
+
+  for (;;) {
+    /* deal with overflow */
+    if (max_ofs / ofs <= 1) {
+      ofs = max_ofs;
+
+      if (ofs < 0) {
+        cmp = SORT_CMP(key, dst[0]);
+
+        if ((right && cmp < 0) || (!right && cmp <= 0)) {
+          return 0;
+        }
+      } else {
+        cmp = SORT_CMP(dst[size - 1], key);
+
+        if ((right && cmp <= 0) || (!right && cmp < 0)) {
+          return size;
+        }
+      }
+
+      break;
+    }
+
+    c = anchor + ofs;
+    /* right, 0<ofs:  dst[anchor+last_ofs] <=  key  <  dst[anchor+ofs]      */
+    /* left,  0<ofs:  dst[anchor+last_ofs] <   key  <= dst[anchor+ofs]      */
+    /* right, ofs<0:  dst[anchor+ofs]      <=  key  <  dst[anchor+last_ofs] */
+    /* left,  ofs<0:  dst[anchor+ofs]      <   key  <= dst[anchor+last_ofs] */
+    cmp = SORT_CMP(key, dst[c]);
+
+    if (0 < ofs) {
+      if ((right && cmp < 0) || (!right && cmp <= 0)) {
+        break;
+      }
+    } else {
+      if ((right && 0 <= cmp) || (!right && 0 < cmp)) {
+        break;
+      }
+    }
+
+    last_ofs = ofs;
+    ofs = (ofs << 1) + ofs_sign;
+  }
+
+  /* key in region (l, r) , both l and r have already been compared */
+  if (ofs < 0) {
+    l = anchor + ofs;
+    r = anchor + last_ofs;
+  } else {
+    l = anchor + last_ofs;
+    r = anchor + ofs;
+  }
+
+  while (1 < r - l) {
+    c = l + ((r - l) >> 1);
+    cmp = SORT_CMP(key, dst[c]);
+
+    if ((right && cmp < 0) || (!right && cmp <= 0)) {
+      r = c;
+    } else {
+      l = c;
+    }
+  }
+
+  return r;
+}
+
+
+
+static void TIM_SORT_MERGE_LEFT(SORT_TYPE *A_src, SORT_TYPE *B_src, const size_t A, const size_t B,
+                                SORT_TYPE* storage, int *min_gallop_p) {
+  size_t pdst, pa, pb, k;
+  int a_count, b_count;
+  int min_gallop = *min_gallop_p;
+  SORT_TYPE *dst = A_src;
+  memcpy(storage, dst, A * sizeof(SORT_TYPE));
+  A_src = storage;
+  pdst = pa = pb = 0;
+  /* first element must in B, otherwise skipped in the caller  */
+  dst[pdst++] = B_src[pb++];
+
+  if (B == 1) {
+    goto copyA;
+  }
+
+  for (;;) {
+    a_count = b_count = 0;
+
+    for (;;) {
+      if (SORT_CMP(A_src[pa], B_src[pb]) <= 0) {
+        dst[pdst++] = A_src[pa++];
+        ++a_count;
+        b_count = 0;
+
+        /* No need to check if pa == A because the last element must be in A
+         * so pb will reach to B first. You can check pa == A-1 and do
+         * some optimization if you wish.*/
+        if (min_gallop <= a_count) {
+          break;
+        }
+      } else {
+        dst[pdst++] = B_src[pb++];
+        ++b_count;
+        a_count = 0;
+
+        if (pb == B) {
+          goto copyA;
+        }
+
+        if (min_gallop <= b_count) {
+          break;
+        }
+      }
+    }
+
+    ++min_gallop;
+
+    for (;;) {
+      if (min_gallop != 0) {
+        min_gallop --;
+      }
+
+      k = TIM_SORT_GALLOP(&A_src[pa], A - pa, B_src[pb], 0, 1);
+      memcpy(&dst[pdst], &A_src[pa], k * sizeof(SORT_TYPE));
+      pdst += k;
+      pa += k;
+      /* now we know the next must be in B */
+      dst[pdst++] = B_src[pb++];
+
+      if (pb == B) {
+        goto copyA;
+      }
+
+      if (a_count && k < TIM_SORT_MIN_GALLOP) {
+        ++min_gallop;
+        break;
+      }
+
+      k = TIM_SORT_GALLOP(&B_src[pb], B - pb, A_src[pa], 0, 0);
+      memmove(&dst[pdst], &B_src[pb], k * sizeof(SORT_TYPE));
+      pdst += k;
+      pb += k;
+
+      if (pb == B) {
+        goto copyA;
+      }
+
+      dst[pdst++] = A_src[pa++];
+
+      if (b_count && k < TIM_SORT_MIN_GALLOP) {
+        ++min_gallop;
+        break;
+      }
+    }
+  }
+
+copyA:
+  memcpy(&dst[pdst], &A_src[pa], (A - pa) * sizeof(SORT_TYPE));
+  *min_gallop_p = min_gallop;
+  return;
+}
+
+
+static void TIM_SORT_MERGE_RIGHT(SORT_TYPE *A_src, SORT_TYPE *B_src, const size_t A, const size_t B,
+                                 SORT_TYPE* storage, int *min_gallop_p) {
+  size_t k;
+  int pdst, pa, pb, a_count, b_count;
+  int min_gallop = *min_gallop_p;
+  SORT_TYPE *dst = A_src;
+  pa = A - 1;
+  pb = B - 1;
+  pdst = A + B - 1;
+  memcpy(storage, B_src, B * sizeof(SORT_TYPE));
+  B_src = storage;
+  /* last element must in A, otherwise skipped in the caller  */
+  dst[pdst--] = A_src[pa--];
+
+  if (A == 1) {
+    goto copyB;
+  }
+
+  for (;;) {
+    a_count = b_count = 0;
+
+    for (;;) {
+      if (SORT_CMP(A_src[pa], B_src[pb]) <= 0) {
+        dst[pdst--] = B_src[pb--];
+        ++b_count;
+        a_count = 0;
+
+        if (min_gallop <= b_count) {
+          break;
+        }
+
+        /* No need to check if pb == -1 because the first element must be in B
+         * so pa will reach to -1 first. You can check pb == 0 and do
+         * some optimization if you wish.*/
+      } else {
+        dst[pdst--] = A_src[pa--];
+        ++a_count;
+        b_count = 0;
+
+        if (pa == -1) {
+          goto copyB;
+        }
+
+        if (min_gallop <= a_count) {
+          break;
+        }
+      }
+    }
+
+    ++min_gallop;
+
+    for (;;) {
+      if (min_gallop != 0) {
+        min_gallop --;
+      }
+
+      k = TIM_SORT_GALLOP(A_src, pa + 1, B_src[pb], pa, 1);
+      /* Understand the margin by considering k==0 */
+      memmove(&dst[pb + k + 1], &A_src[k], (pa + 1 - k) * sizeof(SORT_TYPE));
+      pdst = pb + k;
+      pa = k - 1;
+
+      if (pa == -1) {
+        goto copyB;
+      }
+
+      /* now we know the next must be in B */
+      dst[pdst--] = B_src[pb--];
+
+      if (a_count && pa + 1 - k < TIM_SORT_MIN_GALLOP) {
+        ++min_gallop;
+        break;
+      }
+
+      k = TIM_SORT_GALLOP(B_src, pb + 1, A_src[pa], pb, 0);
+      memcpy(&dst[pa + k + 1], &B_src[k], (pb + 1 - k) * sizeof(SORT_TYPE));
+      pdst = pa + k;
+      pb = k - 1;
+      dst[pdst--] = A_src[pa--];
+
+      if (pa == -1) {
+        goto copyB;
+      }
+
+      if (b_count && pb + 1 - k < TIM_SORT_MIN_GALLOP) {
+        ++min_gallop;
+        break;
+      }
+    }
+  }
+
+copyB:
+  memcpy(dst, B_src, (pb + 1) * sizeof(SORT_TYPE));
+  *min_gallop_p = min_gallop;
+  return;
+}
+
+
+static void TIM_SORT_MERGE(SORT_TYPE *dst, const TIM_SORT_RUN_T *stack, const int stack_curr,
+                           TEMP_STORAGE_T *store, int* min_gallop_p) {
+  size_t A = stack[stack_curr - 2].length;
+  size_t B = stack[stack_curr - 1].length;
+  size_t A_start = stack[stack_curr - 2].start;
+  size_t B_start = stack[stack_curr - 1].start;
+  SORT_TYPE *storage;
+  size_t k;
+  /* A[k-1] <= B[0] < A[k] */
+  k = TIM_SORT_GALLOP(&dst[A_start], A, dst[B_start], 0, 1);
+  A_start += k;
+  A -= k;
+
+  if (A == 0) {
+    *min_gallop_p /= 2;
+    return;
+  }
+
+  /* B[k-1] < A[A-1] <= B[k] */
+  k = TIM_SORT_GALLOP(&dst[B_start], B, dst[B_start - 1], B - 1, 0);
+  B = k;
+  TIM_SORT_RESIZE(store, MIN(A, B));
+  storage = store->storage;
+
+  if (A < B) {
+    TIM_SORT_MERGE_LEFT(&dst[A_start], &dst[B_start], A, B, storage, min_gallop_p);
+  } else {
+    TIM_SORT_MERGE_RIGHT(&dst[A_start], &dst[B_start], A, B, storage, min_gallop_p);
+  }
+}
+
+static int TIM_SORT_COLLAPSE(SORT_TYPE *dst, TIM_SORT_RUN_T *stack, int stack_curr,
+                             TEMP_STORAGE_T *store, const size_t size, int* min_gallop_p) {
+  while (1) {
+    size_t A, B, C, D;
+    int ABC, BCD, CD;
+
+    /* if the stack only has one thing on it, we are done with the collapse */
+    if (stack_curr <= 1) {
+      break;
+    }
+
+    /* if this is the last merge, just do it */
+    if ((stack_curr == 2) && (stack[0].length + stack[1].length == size)) {
+      TIM_SORT_MERGE(dst, stack, stack_curr, store, min_gallop_p);
+      stack[0].length += stack[1].length;
+      stack_curr--;
+      break;
+    }
+    /* check if the invariant is off for a stack of 2 elements */
+    else if ((stack_curr == 2) && (stack[0].length <= stack[1].length)) {
+      TIM_SORT_MERGE(dst, stack, stack_curr, store, min_gallop_p);
+      stack[0].length += stack[1].length;
+      stack_curr--;
+      break;
+    } else if (stack_curr == 2) {
+      break;
+    }
+
+    B = stack[stack_curr - 3].length;
+    C = stack[stack_curr - 2].length;
+    D = stack[stack_curr - 1].length;
+
+    if (stack_curr >= 4) {
+      A = stack[stack_curr - 4].length;
+      ABC = (A <= B + C);
+    } else {
+      ABC = 0;
+    }
+
+    BCD = (B <= C + D) || ABC;
+    CD = (C <= D);
+
+    /* Both invariants are good */
+    if (!BCD && !CD) {
+      break;
+    }
+
+    /* left merge */
+    if (BCD && !CD) {
+      TIM_SORT_MERGE(dst, stack, stack_curr - 1, store, min_gallop_p);
+      stack[stack_curr - 3].length += stack[stack_curr - 2].length;
+      stack[stack_curr - 2] = stack[stack_curr - 1];
+      stack_curr--;
+    } else {
+      /* right merge */
+      TIM_SORT_MERGE(dst, stack, stack_curr, store, min_gallop_p);
+      stack[stack_curr - 2].length += stack[stack_curr - 1].length;
+      stack_curr--;
+    }
+  }
+
+  return stack_curr;
+}
+
+static __inline int PUSH_NEXT(SORT_TYPE *dst,
+                              const size_t size,
+                              TEMP_STORAGE_T *store,
+                              const size_t minrun,
+                              TIM_SORT_RUN_T *run_stack,
+                              size_t *stack_curr,
+                              size_t *curr,
+                              int *min_gallop_p) {
+  size_t len = COUNT_RUN(dst, *curr, size);
+  size_t run = minrun;
+
+  if (run > size - *curr) {
+    run = size - *curr;
+  }
+
+  if (run > len) {
+    BINARY_INSERTION_SORT_START(&dst[*curr], len, run);
+    len = run;
+  }
+
+  run_stack[*stack_curr].start = *curr;
+  run_stack[*stack_curr].length = len;
+  (*stack_curr)++;
+  *curr += len;
+
+  if (*curr == size) {
+    /* finish up */
+    while (*stack_curr > 1) {
+      TIM_SORT_MERGE(dst, run_stack, *stack_curr, store, min_gallop_p);
+      run_stack[*stack_curr - 2].length += run_stack[*stack_curr - 1].length;
+      (*stack_curr)--;
+    }
+
+    if (store->storage != NULL) {
+      free(store->storage);
+      store->storage = NULL;
+    }
+
+    return 0;
+  }
+
+  return 1;
+}
+
+void TIM_SORT(SORT_TYPE *dst, const size_t size) {
+  size_t minrun;
+  TEMP_STORAGE_T _store, *store;
+  TIM_SORT_RUN_T run_stack[TIM_SORT_STACK_SIZE];
+  size_t stack_curr = 0;
+  size_t curr = 0;
+  int min_gallop = TIM_SORT_MIN_GALLOP;
+
+  /* don't bother sorting an array of size 1 */
+  if (size <= 1) {
+    return;
+  }
+
+  if (size < 64) {
+    SMALL_SORT(dst, size);
+    return;
+  }
+
+  /* compute the minimum run length */
+  minrun = compute_minrun(size);
+  /* temporary storage for merges */
+  store = &_store;
+  store->alloc = 0;
+  store->storage = NULL;
+
+  if (!PUSH_NEXT(dst, size, store, minrun, run_stack, &stack_curr, &curr, &min_gallop)) {
+    return;
+  }
+
+  if (!PUSH_NEXT(dst, size, store, minrun, run_stack, &stack_curr, &curr, &min_gallop)) {
+    return;
+  }
+
+  if (!PUSH_NEXT(dst, size, store, minrun, run_stack, &stack_curr, &curr, &min_gallop)) {
+    return;
+  }
+
+  while (1) {
+    if (!CHECK_INVARIANT(run_stack, stack_curr)) {
+      stack_curr = TIM_SORT_COLLAPSE(dst, run_stack, stack_curr, store, size, &min_gallop);
+      continue;
+    }
+
+    if (!PUSH_NEXT(dst, size, store, minrun, run_stack, &stack_curr, &curr, &min_gallop)) {
+      return;
+    }
+  }
+}
+
+/* heap sort: based on wikipedia */
+
+static __inline void HEAP_SIFT_DOWN(SORT_TYPE *dst, const size_t start, const size_t end) {
+  size_t root = start;
+
+  while ((root << 1) <= end) {
+    size_t child = root << 1;
+
+    if ((child < end) && (SORT_CMP(dst[child], dst[child + 1]) < 0)) {
+      child++;
+    }
+
+    if (SORT_CMP(dst[root], dst[child]) < 0) {
+      SORT_SWAP(dst[root], dst[child]);
+      root = child;
+    } else {
+      return;
+    }
+  }
+}
+
+static __inline void HEAPIFY(SORT_TYPE *dst, const size_t size) {
+  size_t start = size >> 1;
+
+  while (1) {
+    HEAP_SIFT_DOWN(dst, start, size - 1);
+
+    if (start == 0) {
+      break;
+    }
+
+    start--;
+  }
+}
+
+void HEAP_SORT(SORT_TYPE *dst, const size_t size) {
+  size_t end = size - 1;
+
+  /* don't bother sorting an array of size <= 1 */
+  if (size <= 1) {
+    return;
+  }
+
+  HEAPIFY(dst, size);
+
+  while (end > 0) {
+    SORT_SWAP(dst[end], dst[0]);
+    HEAP_SIFT_DOWN(dst, 0, end - 1);
+    end--;
+  }
+}
+
+/********* Sqrt sorting *********************************/
+/*                                                       */
+/* (c) 2014 by Andrey Astrelin                           */
+/*                                                       */
+/*                                                       */
+/* Stable sorting that works in O(N*log(N)) worst time   */
+/* and uses O(sqrt(N)) extra memory                      */
+/*                                                       */
+/* Define SORT_TYPE and SORT_CMP                         */
+/* and then call SqrtSort() function                     */
+/*                                                       */
+/*********************************************************/
+
+#define SORT_CMP_A(a,b) SORT_CMP(*(a),*(b))
+
+static __inline void SQRT_SORT_SWAP_1(SORT_TYPE *a, SORT_TYPE *b) {
+  SORT_TYPE c = *a;
+  *a++ = *b;
+  *b++ = c;
+}
+
+static __inline void SQRT_SORT_SWAP_N(SORT_TYPE *a, SORT_TYPE *b, int n) {
+  while (n--) {
+    SQRT_SORT_SWAP_1(a++, b++);
+  }
+}
+
+
+static void SQRT_SORT_MERGE_RIGHT(SORT_TYPE *arr, int L1, int L2, int M) {
+  int p0 = L1 + L2 + M - 1, p2 = L1 + L2 - 1, p1 = L1 - 1;
+
+  while (p1 >= 0) {
+    if (p2 < L1 || SORT_CMP_A(arr + p1, arr + p2) > 0) {
+      arr[p0--] = arr[p1--];
+    } else {
+      arr[p0--] = arr[p2--];
+    }
+  }
+
+  if (p2 != p0) while (p2 >= L1) {
+      arr[p0--] = arr[p2--];
+    }
+}
+
+/* arr[M..-1] - free, arr[0,L1-1]++arr[L1,L1+L2-1] -> arr[M,M+L1+L2-1] */
+static void SQRT_SORT_MERGE_LEFT_WITH_X_BUF(SORT_TYPE *arr, int L1, int L2, int M) {
+  int p0 = 0, p1 = L1;
+  L2 += L1;
+
+  while (p1 < L2) {
+    if (p0 == L1 || SORT_CMP_A(arr + p0, arr + p1) > 0) {
+      arr[M++] = arr[p1++];
+    } else {
+      arr[M++] = arr[p0++];
+    }
+  }
+
+  if (M != p0) while (p0 < L1) {
+      arr[M++] = arr[p0++];
+    }
+}
+
+/* arr[0,L1-1] ++ arr2[0,L2-1] -> arr[-L1,L2-1],  arr2 is "before" arr1 */
+static void SQRT_SORT_MERGE_DOWN(SORT_TYPE *arr, SORT_TYPE *arr2, int L1, int L2) {
+  int p0 = 0, p1 = 0, M = -L2;
+
+  while (p1 < L2) {
+    if (p0 == L1 || SORT_CMP_A(arr + p0, arr2 + p1) >= 0) {
+      arr[M++] = arr2[p1++];
+    } else {
+      arr[M++] = arr[p0++];
+    }
+  }
+
+  if (M != p0) while (p0 < L1) {
+      arr[M++] = arr[p0++];
+    }
+}
+
+static void SQRT_SORT_SMART_MERGE_WITH_X_BUF(SORT_TYPE *arr, int *alen1, int *atype, int len2,
+    int lkeys) {
+  int p0 = -lkeys, p1 = 0, p2 = *alen1, q1 = p2, q2 = p2 + len2;
+  int ftype = 1 - *atype; /* 1 if inverted */
+
+  while (p1 < q1 && p2 < q2) {
+    if (SORT_CMP_A(arr + p1, arr + p2) - ftype < 0) {
+      arr[p0++] = arr[p1++];
+    } else {
+      arr[p0++] = arr[p2++];
+    }
+  }
+
+  if (p1 < q1) {
+    *alen1 = q1 - p1;
+
+    while (p1 < q1) {
+      arr[--q2] = arr[--q1];
+    }
+  } else {
+    *alen1 = q2 - p2;
+    *atype = ftype;
+  }
+}
+
+
+/*
+  arr - starting array. arr[-lblock..-1] - buffer (if havebuf).
+  lblock - length of regular blocks. First nblocks are stable sorted by 1st elements and key-coded
+  keys - arrays of keys, in same order as blocks. key<midkey means stream A
+  nblock2 are regular blocks from stream A. llast is length of last (irregular) block from stream B, that should go before nblock2 blocks.
+  llast=0 requires nblock2=0 (no irregular blocks). llast>0, nblock2=0 is possible.
+*/
+static void SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF(int *keys, int midkey, SORT_TYPE *arr,
+    int nblock, int lblock, int nblock2, int llast) {
+  int l, prest, lrest, frest, pidx, cidx, fnext;
+
+  if (nblock == 0) {
+    l = nblock2 * lblock;
+    SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr, l, llast, -lblock);
+    return;
+  }
+
+  lrest = lblock;
+  frest = keys[0] < midkey ? 0 : 1;
+  pidx = lblock;
+
+  for (cidx = 1; cidx < nblock; cidx++, pidx += lblock) {
+    prest = pidx - lrest;
+    fnext = keys[cidx] < midkey ? 0 : 1;
+
+    if (fnext == frest) {
+      memcpy(arr + prest - lblock, arr + prest, lrest * sizeof(SORT_TYPE));
+      prest = pidx;
+      lrest = lblock;
+    } else {
+      SQRT_SORT_SMART_MERGE_WITH_X_BUF(arr + prest, &lrest, &frest, lblock, lblock);
+    }
+  }
+
+  prest = pidx - lrest;
+
+  if (llast) {
+    if (frest) {
+      memcpy(arr + prest - lblock, arr + prest, lrest * sizeof(SORT_TYPE));
+      prest = pidx;
+      lrest = lblock * nblock2;
+      frest = 0;
+    } else {
+      lrest += lblock * nblock2;
+    }
+
+    SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr + prest, lrest, llast, -lblock);
+  } else {
+    memcpy(arr + prest - lblock, arr + prest, lrest * sizeof(SORT_TYPE));
+  }
+}
+
+/*
+  build blocks of length K
+  input: [-K,-1] elements are buffer
+  output: first K elements are buffer, blocks 2*K and last subblock sorted
+*/
+static void SQRT_SORT_BUILD_BLOCKS(SORT_TYPE *arr, int L, int K) {
+  int m, u, h, p0, p1, rest, restk, p;
+
+  for (m = 1; m < L; m += 2) {
+    u = 0;
+
+    if (SORT_CMP_A(arr + (m - 1), arr + m) > 0) {
+      u = 1;
+    }
+
+    arr[m - 3] = arr[m - 1 + u];
+    arr[m - 2] = arr[m - u];
+  }
+
+  if (L % 2) {
+    arr[L - 3] = arr[L - 1];
+  }
+
+  arr -= 2;
+
+  for (h = 2; h < K; h *= 2) {
+    p0 = 0;
+    p1 = L - 2 * h;
+
+    while (p0 <= p1) {
+      SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr + p0, h, h, -h);
+      p0 += 2 * h;
+    }
+
+    rest = L - p0;
+
+    if (rest > h) {
+      SQRT_SORT_MERGE_LEFT_WITH_X_BUF(arr + p0, h, rest - h, -h);
+    } else {
+      for (; p0 < L; p0++) {
+        arr[p0 - h] = arr[p0];
+      }
+    }
+
+    arr -= h;
+  }
+
+  restk = L % (2 * K);
+  p = L - restk;
+
+  if (restk <= K) {
+    memcpy(arr + p + K, arr + p, restk * sizeof(SORT_TYPE));
+  } else {
+    SQRT_SORT_MERGE_RIGHT(arr + p, K, restk - K, K);
+  }
+
+  while (p > 0) {
+    p -= 2 * K;
+    SQRT_SORT_MERGE_RIGHT(arr + p, K, K, K);
+  }
+}
+
+
+static void SQRT_SORT_SORT_INS(SORT_TYPE *arr, int len) {
+  int i, j;
+
+  for (i = 1; i < len; i++) {
+    for (j = i - 1; j >= 0 && SORT_CMP_A(arr + (j + 1), arr + j) < 0; j--) {
+      SQRT_SORT_SWAP_1(arr + j, arr + (j + 1));
+    }
+  }
+}
+
+/*
+  keys are on the left of arr. Blocks of length LL combined. We'll combine them in pairs
+  LL and nkeys are powers of 2. (2*LL/lblock) keys are guarantied
+*/
+static void SQRT_SORT_COMBINE_BLOCKS(SORT_TYPE *arr, int len, int LL, int lblock, int *tags) {
+  int M, b, NBlk, midkey, lrest, u, i, p, v, kc, nbl2, llast;
+  SORT_TYPE *arr1;
+  M = len / (2 * LL);
+  lrest = len % (2 * LL);
+
+  if (lrest <= LL) {
+    len -= lrest;
+    lrest = 0;
+  }
+
+  for (b = 0; b <= M; b++) {
+    if (b == M && lrest == 0) {
+      break;
+    }
+
+    arr1 = arr + b * 2 * LL;
+    NBlk = (b == M ? lrest : 2 * LL) / lblock;
+    u = NBlk + (b == M ? 1 : 0);
+
+    for (i = 0; i <= u; i++) {
+      tags[i] = i;
+    }
+
+    midkey = LL / lblock;
+
+    for (u = 1; u < NBlk; u++) {
+      p = u - 1;
+
+      for (v = u; v < NBlk; v++) {
+        kc = SORT_CMP_A(arr1 + p * lblock, arr1 + v * lblock);
+
+        if (kc > 0 || (kc == 0 && tags[p] > tags[v])) {
+          p = v;
+        }
+      }
+
+      if (p != u - 1) {
+        SQRT_SORT_SWAP_N(arr1 + (u - 1)*lblock, arr1 + p * lblock, lblock);
+        i = tags[u - 1];
+        tags[u - 1] = tags[p];
+        tags[p] = i;
+      }
+    }
+
+    nbl2 = llast = 0;
+
+    if (b == M) {
+      llast = lrest % lblock;
+    }
+
+    if (llast != 0) {
+      while (nbl2 < NBlk && SORT_CMP_A(arr1 + NBlk * lblock, arr1 + (NBlk - nbl2 - 1)*lblock) < 0) {
+        nbl2++;
+      }
+    }
+
+    SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF(tags, midkey, arr1, NBlk - nbl2, lblock, nbl2, llast);
+  }
+
+  for (p = len; --p >= 0;) {
+    arr[p] = arr[p - lblock];
+  }
+}
+
+
+static void SQRT_SORT_COMMON_SORT(SORT_TYPE *arr, int Len, SORT_TYPE *extbuf, int *Tags) {
+  int lblock, cbuf;
+
+  if (Len < 16) {
+    SQRT_SORT_SORT_INS(arr, Len);
+    return;
+  }
+
+  lblock = 1;
+
+  while (lblock * lblock < Len) {
+    lblock *= 2;
+  }
+
+  memcpy(extbuf, arr, lblock * sizeof(SORT_TYPE));
+  SQRT_SORT_COMMON_SORT(extbuf, lblock, arr, Tags);
+  SQRT_SORT_BUILD_BLOCKS(arr + lblock, Len - lblock, lblock);
+  cbuf = lblock;
+
+  while (Len > (cbuf *= 2)) {
+    SQRT_SORT_COMBINE_BLOCKS(arr + lblock, Len - lblock, cbuf, lblock, Tags);
+  }
+
+  SQRT_SORT_MERGE_DOWN(arr + lblock, extbuf, Len - lblock, lblock);
+}
+
+static void SQRT_SORT(SORT_TYPE *arr, size_t Len) {
+  int L = 1;
+  SORT_TYPE *ExtBuf;
+  int *Tags;
+  int NK;
+
+  while (L * L < Len) {
+    L *= 2;
+  }
+
+  NK = (Len - 1) / L + 2;
+  ExtBuf = (SORT_TYPE*)malloc(L * sizeof(SORT_TYPE));
+
+  if (ExtBuf == NULL) {
+    return;  /* fail */
+  }
+
+  Tags = (int*)malloc(NK * sizeof(int));
+
+  if (Tags == NULL) {
+    return;
+  }
+
+  SQRT_SORT_COMMON_SORT(arr, Len, ExtBuf, Tags);
+  free(Tags);
+  free(ExtBuf);
+}
+
+/********* Grail sorting *********************************/
+/*                                                       */
+/* (c) 2013 by Andrey Astrelin                           */
+/*                                                       */
+/*                                                       */
+/* Stable sorting that works in O(N*log(N)) worst time   */
+/* and uses O(1) extra memory                            */
+/*                                                       */
+/* Define SORT_TYPE and SORT_CMP                         */
+/* and then call GrailSort() function                    */
+/*                                                       */
+/* For sorting with fixed external buffer (512 items)    */
+/* use GrailSortWithBuffer()                             */
+/*                                                       */
+/* For sorting with dynamic external buffer (O(sqrt(N)) items) */
+/* use GrailSortWithDynBuffer()                          */
+/*                                                       */
+/* Also classic in-place merge sort is implemented       */
+/* under the name of RecStableSort()                     */
+/*                                                       */
+/*********************************************************/
+
+#define GRAIL_EXT_BUFFER_LENGTH 512
+
+static __inline void GRAIL_SWAP1(SORT_TYPE *a, SORT_TYPE *b) {
+  SORT_TYPE c = *a;
+  *a = *b;
+  *b = c;
+}
+
+static __inline void GRAIL_SWAP_N(SORT_TYPE *a, SORT_TYPE *b, int n) {
+  while (n--) {
+    GRAIL_SWAP1(a++, b++);
+  }
+}
+
+static void GRAIL_ROTATE(SORT_TYPE *a, int l1, int l2) {
+  while (l1 && l2) {
+    if (l1 <= l2) {
+      GRAIL_SWAP_N(a, a + l1, l1);
+      a += l1;
+      l2 -= l1;
+    } else {
+      GRAIL_SWAP_N(a + (l1 - l2), a + l1, l2);
+      l1 -= l2;
+    }
+  }
+}
+
+static int GRAIL_BIN_SEARCH_LEFT(SORT_TYPE *arr, int len, SORT_TYPE *key) {
+  int a = -1, b = len, c;
+
+  while (a < b - 1) {
+    c = a + ((b - a) >> 1);
+
+    if (SORT_CMP_A(arr + c, key) >= 0) {
+      b = c;
+    } else {
+      a = c;
+    }
+  }
+
+  return b;
+}
+static int GRAIL_BIN_SEARCH_RIGHT(SORT_TYPE *arr, int len, SORT_TYPE *key) {
+  int a = -1, b = len, c;
+
+  while (a < b - 1) {
+    c = a + ((b - a) >> 1);
+
+    if (SORT_CMP_A(arr + c, key) > 0) {
+      b = c;
+    } else {
+      a = c;
+    }
+  }
+
+  return b;
+}
+
+/* cost: 2*len+nk^2/2 */
+static int GRAIL_FIND_KEYS(SORT_TYPE *arr, int len, int nkeys) {
+  int h = 1, h0 = 0; /* first key is always here */
+  int u = 1, r;
+
+  while (u < len && h < nkeys) {
+    r = GRAIL_BIN_SEARCH_LEFT(arr + h0, h, arr + u);
+
+    if (r == h || SORT_CMP_A(arr + u, arr + (h0 + r)) != 0) {
+      GRAIL_ROTATE(arr + h0, h, u - (h0 + h));
+      h0 = u - h;
+      GRAIL_ROTATE(arr + (h0 + r), h - r, 1);
+      h++;
+    }
+
+    u++;
+  }
+
+  GRAIL_ROTATE(arr, h0, h);
+  return h;
+}
+
+/* cost: min(L1,L2)^2+max(L1,L2) */
+static void GRAIL_MERGE_WITHOUT_BUFFER(SORT_TYPE *arr, int len1, int len2) {
+  int h;
+
+  if (len1 < len2) {
+    while (len1) {
+      h = GRAIL_BIN_SEARCH_LEFT(arr + len1, len2, arr);
+
+      if (h != 0) {
+        GRAIL_ROTATE(arr, len1, h);
+        arr += h;
+        len2 -= h;
+      }
+
+      if (len2 == 0) {
+        break;
+      }
+
+      do {
+        arr++;
+        len1--;
+      } while (len1 && SORT_CMP_A(arr, arr + len1) <= 0);
+    }
+  } else {
+    while (len2) {
+      h = GRAIL_BIN_SEARCH_RIGHT(arr, len1, arr + (len1 + len2 - 1));
+
+      if (h != len1) {
+        GRAIL_ROTATE(arr + h, len1 - h, len2);
+        len1 = h;
+      }
+
+      if (len1 == 0) {
+        break;
+      }
+
+      do {
+        len2--;
+      } while (len2 && SORT_CMP_A(arr + len1 - 1, arr + len1 + len2 - 1) <= 0);
+    }
+  }
+}
+
+/* arr[M..-1] - buffer, arr[0,L1-1]++arr[L1,L1+L2-1] -> arr[M,M+L1+L2-1] */
+static void GRAIL_MERGE_LEFT(SORT_TYPE *arr, int L1, int L2, int M) {
+  int p0 = 0, p1 = L1;
+  L2 += L1;
+
+  while (p1 < L2) {
+    if (p0 == L1 || SORT_CMP_A(arr + p0, arr + p1) > 0) {
+      GRAIL_SWAP1(arr + (M++), arr + (p1++));
+    } else {
+      GRAIL_SWAP1(arr + (M++), arr + (p0++));
+    }
+  }
+
+  if (M != p0) {
+    GRAIL_SWAP_N(arr + M, arr + p0, L1 - p0);
+  }
+}
+static void GRAIL_MERGE_RIGHT(SORT_TYPE *arr, int L1, int L2, int M) {
+  int p0 = L1 + L2 + M - 1, p2 = L1 + L2 - 1, p1 = L1 - 1;
+
+  while (p1 >= 0) {
+    if (p2 < L1 || SORT_CMP_A(arr + p1, arr + p2) > 0) {
+      GRAIL_SWAP1(arr + (p0--), arr + (p1--));
+    } else {
+      GRAIL_SWAP1(arr + (p0--), arr + (p2--));
+    }
+  }
+
+  if (p2 != p0) while (p2 >= L1) {
+      GRAIL_SWAP1(arr + (p0--), arr + (p2--));
+    }
+}
+
+static void GRAIL_SMART_MERGE_WITH_BUFFER(SORT_TYPE *arr, int *alen1, int *atype, int len2,
+    int lkeys) {
+  int p0 = -lkeys, p1 = 0, p2 = *alen1, q1 = p2, q2 = p2 + len2;
+  int ftype = 1 - *atype; /* 1 if inverted */
+
+  while (p1 < q1 && p2 < q2) {
+    if (SORT_CMP_A(arr + p1, arr + p2) - ftype < 0) {
+      GRAIL_SWAP1(arr + (p0++), arr + (p1++));
+    } else {
+      GRAIL_SWAP1(arr + (p0++), arr + (p2++));
+    }
+  }
+
+  if (p1 < q1) {
+    *alen1 = q1 - p1;
+
+    while (p1 < q1) {
+      GRAIL_SWAP1(arr + (--q1), arr + (--q2));
+    }
+  } else {
+    *alen1 = q2 - p2;
+    *atype = ftype;
+  }
+}
+static void GRAIL_SMART_MERGE_WITHOUT_BUFFER(SORT_TYPE *arr, int *alen1, int *atype, int _len2) {
+  int len1, len2, ftype, h;
+
+  if (!_len2) {
+    return;
+  }
+
+  len1 = *alen1;
+  len2 = _len2;
+  ftype = 1 - *atype;
+
+  if (len1 && SORT_CMP_A(arr + (len1 - 1), arr + len1) - ftype >= 0) {
+    while (len1) {
+      h = ftype ? GRAIL_BIN_SEARCH_LEFT(arr + len1, len2, arr) : GRAIL_BIN_SEARCH_RIGHT(arr + len1, len2,
+          arr);
+
+      if (h != 0) {
+        GRAIL_ROTATE(arr, len1, h);
+        arr += h;
+        len2 -= h;
+      }
+
+      if (len2 == 0) {
+        *alen1 = len1;
+        return;
+      }
+
+      do {
+        arr++;
+        len1--;
+      } while (len1 && SORT_CMP_A(arr, arr + len1) - ftype < 0);
+    }
+  }
+
+  *alen1 = len2;
+  *atype = ftype;
+}
+
+/***** Sort With Extra Buffer *****/
+
+/* arr[M..-1] - free, arr[0,L1-1]++arr[L1,L1+L2-1] -> arr[M,M+L1+L2-1] */
+static void GRAIL_MERGE_LEFT_WITH_X_BUF(SORT_TYPE *arr, int L1, int L2, int M) {
+  int p0 = 0, p1 = L1;
+  L2 += L1;
+
+  while (p1 < L2) {
+    if (p0 == L1 || SORT_CMP_A(arr + p0, arr + p1) > 0) {
+      arr[M++] = arr[p1++];
+    } else {
+      arr[M++] = arr[p0++];
+    }
+  }
+
+  if (M != p0) while (p0 < L1) {
+      arr[M++] = arr[p0++];
+    }
+}
+
+static void GRAIL_SMART_MERGE_WITH_X_BUF(SORT_TYPE *arr, int *alen1, int *atype, int len2,
+    int lkeys) {
+  int p0 = -lkeys, p1 = 0, p2 = *alen1, q1 = p2, q2 = p2 + len2;
+  int ftype = 1 - *atype; /* 1 if inverted */
+
+  while (p1 < q1 && p2 < q2) {
+    if (SORT_CMP_A(arr + p1, arr + p2) - ftype < 0) {
+      arr[p0++] = arr[p1++];
+    } else {
+      arr[p0++] = arr[p2++];
+    }
+  }
+
+  if (p1 < q1) {
+    *alen1 = q1 - p1;
+
+    while (p1 < q1) {
+      arr[--q2] = arr[--q1];
+    }
+  } else {
+    *alen1 = q2 - p2;
+    *atype = ftype;
+  }
+}
+
+/*
+  arr - starting array. arr[-lblock..-1] - buffer (if havebuf).
+  lblock - length of regular blocks. First nblocks are stable sorted by 1st elements and key-coded
+  keys - arrays of keys, in same order as blocks. key<midkey means stream A
+  nblock2 are regular blocks from stream A. llast is length of last (irregular) block from stream B, that should go before nblock2 blocks.
+  llast=0 requires nblock2=0 (no irregular blocks). llast>0, nblock2=0 is possible.
+*/
+static void GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF(SORT_TYPE *keys, SORT_TYPE *midkey, SORT_TYPE *arr,
+    int nblock, int lblock, int nblock2, int llast) {
+  int l, prest, lrest, frest, pidx, cidx, fnext;
+
+  if (nblock == 0) {
+    l = nblock2 * lblock;
+    GRAIL_MERGE_LEFT_WITH_X_BUF(arr, l, llast, -lblock);
+    return;
+  }
+
+  lrest = lblock;
+  frest = SORT_CMP_A(keys, midkey) < 0 ? 0 : 1;
+  pidx = lblock;
+
+  for (cidx = 1; cidx < nblock; cidx++, pidx += lblock) {
+    prest = pidx - lrest;
+    fnext = SORT_CMP_A(keys + cidx, midkey) < 0 ? 0 : 1;
+
+    if (fnext == frest) {
+      memcpy(arr + prest - lblock, arr + prest, lrest * sizeof(SORT_TYPE));
+      prest = pidx;
+      lrest = lblock;
+    } else {
+      GRAIL_SMART_MERGE_WITH_X_BUF(arr + prest, &lrest, &frest, lblock, lblock);
+    }
+  }
+
+  prest = pidx - lrest;
+
+  if (llast) {
+    if (frest) {
+      memcpy(arr + prest - lblock, arr + prest, lrest * sizeof(SORT_TYPE));
+      prest = pidx;
+      lrest = lblock * nblock2;
+      frest = 0;
+    } else {
+      lrest += lblock * nblock2;
+    }
+
+    GRAIL_MERGE_LEFT_WITH_X_BUF(arr + prest, lrest, llast, -lblock);
+  } else {
+    memcpy(arr + prest - lblock, arr + prest, lrest * sizeof(SORT_TYPE));
+  }
+}
+
+/***** End Sort With Extra Buffer *****/
+
+/*
+  build blocks of length K
+  input: [-K,-1] elements are buffer
+  output: first K elements are buffer, blocks 2*K and last subblock sorted
+*/
+static void GRAIL_BUILD_BLOCKS(SORT_TYPE *arr, int L, int K, SORT_TYPE *extbuf, int LExtBuf) {
+  int m, u, h, p0, p1, rest, restk, p, kbuf;
+  kbuf = K < LExtBuf ? K : LExtBuf;
+
+  while (kbuf & (kbuf - 1)) {
+    kbuf &= kbuf - 1;  /* max power or 2 - just in case */
+  }
+
+  if (kbuf) {
+    memcpy(extbuf, arr - kbuf, kbuf * sizeof(SORT_TYPE));
+
+    for (m = 1; m < L; m += 2) {
+      u = 0;
+
+      if (SORT_CMP_A(arr + (m - 1), arr + m) > 0) {
+        u = 1;
+      }
+
+      arr[m - 3] = arr[m - 1 + u];
+      arr[m - 2] = arr[m - u];
+    }
+
+    if (L % 2) {
+      arr[L - 3] = arr[L - 1];
+    }
+
+    arr -= 2;
+
+    for (h = 2; h < kbuf; h *= 2) {
+      p0 = 0;
+      p1 = L - 2 * h;
+
+      while (p0 <= p1) {
+        GRAIL_MERGE_LEFT_WITH_X_BUF(arr + p0, h, h, -h);
+        p0 += 2 * h;
+      }
+
+      rest = L - p0;
+
+      if (rest > h) {
+        GRAIL_MERGE_LEFT_WITH_X_BUF(arr + p0, h, rest - h, -h);
+      } else {
+        for (; p0 < L; p0++) {
+          arr[p0 - h] = arr[p0];
+        }
+      }
+
+      arr -= h;
+    }
+
+    memcpy(arr + L, extbuf, kbuf * sizeof(SORT_TYPE));
+  } else {
+    for (m = 1; m < L; m += 2) {
+      u = 0;
+
+      if (SORT_CMP_A(arr + (m - 1), arr + m) > 0) {
+        u = 1;
+      }
+
+      GRAIL_SWAP1(arr + (m - 3), arr + (m - 1 + u));
+      GRAIL_SWAP1(arr + (m - 2), arr + (m - u));
+    }
+
+    if (L % 2) {
+      GRAIL_SWAP1(arr + (L - 1), arr + (L - 3));
+    }
+
+    arr -= 2;
+    h = 2;
+  }
+
+  for (; h < K; h *= 2) {
+    p0 = 0;
+    p1 = L - 2 * h;
+
+    while (p0 <= p1) {
+      GRAIL_MERGE_LEFT(arr + p0, h, h, -h);
+      p0 += 2 * h;
+    }
+
+    rest = L - p0;
+
+    if (rest > h) {
+      GRAIL_MERGE_LEFT(arr + p0, h, rest - h, -h);
+    } else {
+      GRAIL_ROTATE(arr + p0 - h, h, rest);
+    }
+
+    arr -= h;
+  }
+
+  restk = L % (2 * K);
+  p = L - restk;
+
+  if (restk <= K) {
+    GRAIL_ROTATE(arr + p, restk, K);
+  } else {
+    GRAIL_MERGE_RIGHT(arr + p, K, restk - K, K);
+  }
+
+  while (p > 0) {
+    p -= 2 * K;
+    GRAIL_MERGE_RIGHT(arr + p, K, K, K);
+  }
+}
+
+/*
+  arr - starting array. arr[-lblock..-1] - buffer (if havebuf).
+  lblock - length of regular blocks. First nblocks are stable sorted by 1st elements and key-coded
+  keys - arrays of keys, in same order as blocks. key<midkey means stream A
+  nblock2 are regular blocks from stream A. llast is length of last (irregular) block from stream B, that should go before nblock2 blocks.
+  llast=0 requires nblock2=0 (no irregular blocks). llast>0, nblock2=0 is possible.
+*/
+static void GRAIL_MERGE_BUFFERS_LEFT(SORT_TYPE *keys, SORT_TYPE *midkey, SORT_TYPE *arr, int nblock,
+                                     int lblock, int havebuf, int nblock2, int llast) {
+  int l, prest, lrest, frest, pidx, cidx, fnext;
+
+  if (nblock == 0) {
+    l = nblock2 * lblock;
+
+    if (havebuf) {
+      GRAIL_MERGE_LEFT(arr, l, llast, -lblock);
+    } else {
+      GRAIL_MERGE_WITHOUT_BUFFER(arr, l, llast);
+    }
+
+    return;
+  }
+
+  lrest = lblock;
+  frest = SORT_CMP_A(keys, midkey) < 0 ? 0 : 1;
+  pidx = lblock;
+
+  for (cidx = 1; cidx < nblock; cidx++, pidx += lblock) {
+    prest = pidx - lrest;
+    fnext = SORT_CMP_A(keys + cidx, midkey) < 0 ? 0 : 1;
+
+    if (fnext == frest) {
+      if (havebuf) {
+        GRAIL_SWAP_N(arr + prest - lblock, arr + prest, lrest);
+      }
+
+      prest = pidx;
+      lrest = lblock;
+    } else {
+      if (havebuf) {
+        GRAIL_SMART_MERGE_WITH_BUFFER(arr + prest, &lrest, &frest, lblock, lblock);
+      } else {
+        GRAIL_SMART_MERGE_WITHOUT_BUFFER(arr + prest, &lrest, &frest, lblock);
+      }
+    }
+  }
+
+  prest = pidx - lrest;
+
+  if (llast) {
+    if (frest) {
+      if (havebuf) {
+        GRAIL_SWAP_N(arr + prest - lblock, arr + prest, lrest);
+      }
+
+      prest = pidx;
+      lrest = lblock * nblock2;
+      frest = 0;
+    } else {
+      lrest += lblock * nblock2;
+    }
+
+    if (havebuf) {
+      GRAIL_MERGE_LEFT(arr + prest, lrest, llast, -lblock);
+    } else {
+      GRAIL_MERGE_WITHOUT_BUFFER(arr + prest, lrest, llast);
+    }
+  } else {
+    if (havebuf) {
+      GRAIL_SWAP_N(arr + prest, arr + (prest - lblock), lrest);
+    }
+  }
+}
+
+static void GRAIL_LAZY_STABLE_SORT(SORT_TYPE *arr, int L) {
+  int m, h, p0, p1, rest;
+
+  for (m = 1; m < L; m += 2) {
+    if (SORT_CMP_A(arr + m - 1, arr + m) > 0) {
+      GRAIL_SWAP1(arr + (m - 1), arr + m);
+    }
+  }
+
+  for (h = 2; h < L; h *= 2) {
+    p0 = 0;
+    p1 = L - 2 * h;
+
+    while (p0 <= p1) {
+      GRAIL_MERGE_WITHOUT_BUFFER(arr + p0, h, h);
+      p0 += 2 * h;
+    }
+
+    rest = L - p0;
+
+    if (rest > h) {
+      GRAIL_MERGE_WITHOUT_BUFFER(arr + p0, h, rest - h);
+    }
+  }
+}
+
+/*
+  keys are on the left of arr. Blocks of length LL combined. We'll combine them in pairs
+  LL and nkeys are powers of 2. (2*LL/lblock) keys are guarantied
+*/
+static void GRAIL_COMBINE_BLOCKS(SORT_TYPE *keys, SORT_TYPE *arr, int len, int LL, int lblock,
+                                 int havebuf, SORT_TYPE *xbuf) {
+  int M, b, NBlk, midkey, lrest, u, p, v, kc, nbl2, llast;
+  SORT_TYPE *arr1;
+  M = len / (2 * LL);
+  lrest = len % (2 * LL);
+
+  if (lrest <= LL) {
+    len -= lrest;
+    lrest = 0;
+  }
+
+  if (xbuf) {
+    memcpy(xbuf, arr - lblock, lblock * sizeof(SORT_TYPE));
+  }
+
+  for (b = 0; b <= M; b++) {
+    if (b == M && lrest == 0) {
+      break;
+    }
+
+    arr1 = arr + b * 2 * LL;
+    NBlk = (b == M ? lrest : 2 * LL) / lblock;
+    SMALL_SORT(keys, NBlk + (b == M ? 1 : 0));
+    midkey = LL / lblock;
+
+    for (u = 1; u < NBlk; u++) {
+      p = u - 1;
+
+      for (v = u; v < NBlk; v++) {
+        kc = SORT_CMP_A(arr1 + p * lblock, arr1 + v * lblock);
+
+        if (kc > 0 || (kc == 0 && SORT_CMP_A(keys + p, keys + v) > 0)) {
+          p = v;
+        }
+      }
+
+      if (p != u - 1) {
+        GRAIL_SWAP_N(arr1 + (u - 1)*lblock, arr1 + p * lblock, lblock);
+        GRAIL_SWAP1(keys + (u - 1), keys + p);
+
+        if (midkey == u - 1 || midkey == p) {
+          midkey ^= (u - 1)^p;
+        }
+      }
+    }
+
+    nbl2 = llast = 0;
+
+    if (b == M) {
+      llast = lrest % lblock;
+    }
+
+    if (llast != 0) {
+      while (nbl2 < NBlk && SORT_CMP_A(arr1 + NBlk * lblock, arr1 + (NBlk - nbl2 - 1)*lblock) < 0) {
+        nbl2++;
+      }
+    }
+
+    if (xbuf) {
+      GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF(keys, keys + midkey, arr1, NBlk - nbl2, lblock, nbl2, llast);
+    } else {
+      GRAIL_MERGE_BUFFERS_LEFT(keys, keys + midkey, arr1, NBlk - nbl2, lblock, havebuf, nbl2, llast);
+    }
+  }
+
+  if (xbuf) {
+    for (p = len; --p >= 0;) {
+      arr[p] = arr[p - lblock];
+    }
+
+    memcpy(arr - lblock, xbuf, lblock * sizeof(SORT_TYPE));
+  } else if (havebuf) while (--len >= 0) {
+      GRAIL_SWAP1(arr + len, arr + len - lblock);
+    }
+}
+
+
+static void GRAIL_COMMON_SORT(SORT_TYPE *arr, int Len, SORT_TYPE *extbuf, int LExtBuf) {
+  int lblock, nkeys, findkeys, ptr, cbuf, lb, nk;
+  int havebuf, chavebuf;
+  long long s;
+
+  if (Len <= SMALL_SORT_BND) {
+    SMALL_SORT(arr, Len);
+    return;
+  }
+
+  lblock = 1;
+
+  while (lblock * lblock < Len) {
+    lblock *= 2;
+  }
+
+  nkeys = (Len - 1) / lblock + 1;
+  findkeys = GRAIL_FIND_KEYS(arr, Len, nkeys + lblock);
+  havebuf = 1;
+
+  if (findkeys < nkeys + lblock) {
+    if (findkeys < 4) {
+      GRAIL_LAZY_STABLE_SORT(arr, Len);
+      return;
+    }
+
+    nkeys = lblock;
+
+    while (nkeys > findkeys) {
+      nkeys /= 2;
+    }
+
+    havebuf = 0;
+    lblock = 0;
+  }
+
+  ptr = lblock + nkeys;
+  cbuf = havebuf ? lblock : nkeys;
+
+  if (havebuf) {
+    GRAIL_BUILD_BLOCKS(arr + ptr, Len - ptr, cbuf, extbuf, LExtBuf);
+  } else {
+    GRAIL_BUILD_BLOCKS(arr + ptr, Len - ptr, cbuf, NULL, 0);
+  }
+
+  /* 2*cbuf are built */
+  while (Len - ptr > (cbuf *= 2)) {
+    lb = lblock;
+    chavebuf = havebuf;
+
+    if (!havebuf) {
+      if (nkeys > 4 && nkeys / 8 * nkeys >= cbuf) {
+        lb = nkeys / 2;
+        chavebuf = 1;
+      } else {
+        nk = 1;
+        s = (long long)cbuf * findkeys / 2;
+
+        while (nk < nkeys && s != 0) {
+          nk *= 2;
+          s /= 8;
+        }
+
+        lb = (2 * cbuf) / nk;
+      }
+    }
+
+    GRAIL_COMBINE_BLOCKS(arr, arr + ptr, Len - ptr, cbuf, lb, chavebuf, chavebuf
+                         && lb <= LExtBuf ? extbuf : NULL);
+  }
+
+  SMALL_SORT(arr, ptr);
+  GRAIL_MERGE_WITHOUT_BUFFER(arr, ptr, Len - ptr);
+}
+
+static void GRAIL_SORT(SORT_TYPE *arr, size_t Len) {
+  GRAIL_COMMON_SORT(arr, Len, NULL, 0);
+}
+
+static void GRAIL_SORT_FIXED_BUFFER(SORT_TYPE *arr, size_t Len) {
+  SORT_TYPE ExtBuf[GRAIL_EXT_BUFFER_LENGTH];
+  GRAIL_COMMON_SORT(arr, Len, ExtBuf, GRAIL_EXT_BUFFER_LENGTH);
+}
+
+static void GRAIL_SORT_DYN_BUFFER(SORT_TYPE *arr, size_t Len) {
+  int L = 1;
+  SORT_TYPE *ExtBuf;
+
+  while (L * L < Len) {
+    L *= 2;
+  }
+
+  ExtBuf = (SORT_TYPE*)malloc(L * sizeof(SORT_TYPE));
+
+  if (ExtBuf == NULL) {
+    GRAIL_SORT_FIXED_BUFFER(arr, Len);
+  } else {
+    GRAIL_COMMON_SORT(arr, Len, ExtBuf, L);
+    free(ExtBuf);
+  }
+}
+
+/****** classic MergeInPlace *************/
+
+static void GRAIL_REC_MERGE(SORT_TYPE *A, int L1, int L2) {
+  int K, k1, k2, m1, m2;
+
+  if (L1 < 3 || L2 < 3) {
+    GRAIL_MERGE_WITHOUT_BUFFER(A, L1, L2);
+    return;
+  }
+
+  if (L1 < L2) {
+    K = L1 + L2 / 2;
+  } else {
+    K = L1 / 2;
+  }
+
+  k1 = k2 = GRAIL_BIN_SEARCH_LEFT(A, L1, A + K);
+
+  if (k2 < L1 && SORT_CMP_A(A + k2, A + K) == 0) {
+    k2 = GRAIL_BIN_SEARCH_RIGHT(A + k1, L1 - k1, A + K) + k1;
+  }
+
+  m1 = GRAIL_BIN_SEARCH_LEFT(A + L1, L2, A + K);
+  m2 = m1;
+
+  if (m2 < L2 && SORT_CMP_A(A + L1 + m2, A + K) == 0) {
+    m2 = GRAIL_BIN_SEARCH_RIGHT(A + L1 + m1, L2 - m1, A + K) + m1;
+  }
+
+  if (k1 == k2) {
+    GRAIL_ROTATE(A + k2, L1 - k2, m2);
+  } else {
+    GRAIL_ROTATE(A + k1, L1 - k1, m1);
+
+    if (m2 != m1) {
+      GRAIL_ROTATE(A + (k2 + m1), L1 - k2, m2 - m1);
+    }
+  }
+
+  GRAIL_REC_MERGE(A + (k2 + m2), L1 - k2, L2 - m2);
+  GRAIL_REC_MERGE(A, k1, m1);
+}
+
+static void REC_STABLE_SORT(SORT_TYPE *arr, size_t L) {
+  int m, h, p0, p1, rest;
+
+  for (m = 1; m < L; m += 2) {
+    if (SORT_CMP_A(arr + m - 1, arr + m) > 0) {
+      GRAIL_SWAP1(arr + (m - 1), arr + m);
+    }
+  }
+
+  for (h = 2; h < L; h *= 2) {
+    p0 = 0;
+    p1 = L - 2 * h;
+
+    while (p0 <= p1) {
+      GRAIL_REC_MERGE(arr + p0, h, h);
+      p0 += 2 * h;
+    }
+
+    rest = L - p0;
+
+    if (rest > h) {
+      GRAIL_REC_MERGE(arr + p0, h, rest - h);
+    }
+  }
+}
+
+/* Bubble sort implementation based on Wikipedia article
+   https://en.wikipedia.org/wiki/Bubble_sort
+*/
+void BUBBLE_SORT(SORT_TYPE *dst, const size_t size) {
+  size_t n = size;
+
+  while (n) {
+    size_t i, newn = 0U;
+
+    for (i = 1U; i < n; ++i) {
+      if (SORT_CMP(dst[i - 1U], dst[i]) > 0) {
+        SORT_SWAP(dst[i - 1U], dst[i]);
+        newn = i;
+      }
+    }
+
+    n = newn;
+  }
+}
+
+#undef QUICK_SORT
+#undef MEDIAN
+#undef SORT_CONCAT
+#undef SORT_MAKE_STR1
+#undef SORT_MAKE_STR
+#undef SORT_NAME
+#undef SORT_TYPE
+#undef SORT_CMP
+#undef TEMP_STORAGE_T
+#undef TIM_SORT_RUN_T
+#undef PUSH_NEXT
+#undef SORT_SWAP
+#undef SORT_CONCAT
+#undef SORT_MAKE_STR1
+#undef SORT_MAKE_STR
+#undef BINARY_INSERTION_FIND
+#undef BINARY_INSERTION_SORT_START
+#undef BINARY_INSERTION_SORT
+#undef REVERSE_ELEMENTS
+#undef COUNT_RUN
+#undef TIM_SORT
+#undef TIM_SORT_RESIZE
+#undef TIM_SORT_COLLAPSE
+#undef TIM_SORT_RUN_T
+#undef TEMP_STORAGE_T
+#undef MERGE_SORT
+#undef MERGE_SORT_RECURSIVE
+#undef MERGE_SORT_IN_PLACE
+#undef MERGE_SORT_IN_PLACE_RMERGE
+#undef MERGE_SORT_IN_PLACE_BACKMERGE
+#undef MERGE_SORT_IN_PLACE_FRONTMERGE
+#undef MERGE_SORT_IN_PLACE_ASWAP
+#undef GRAIL_SWAP1
+#undef REC_STABLE_SORT
+#undef GRAIL_REC_MERGE
+#undef GRAIL_SORT_DYN_BUFFER
+#undef GRAIL_SORT_FIXED_BUFFER
+#undef GRAIL_COMMON_SORT
+#undef GRAIL_SORT
+#undef GRAIL_COMBINE_BLOCKS
+#undef GRAIL_LAZY_STABLE_SORT
+#undef GRAIL_MERGE_WITHOUT_BUFFER
+#undef GRAIL_ROTATE
+#undef GRAIL_BIN_SEARCH_LEFT
+#undef GRAIL_BUILD_BLOCKS
+#undef GRAIL_FIND_KEYS
+#undef GRAIL_MERGE_BUFFERS_LEFT_WITH_X_BUF
+#undef GRAIL_BIN_SEARCH_RIGHT
+#undef GRAIL_MERGE_BUFFERS_LEFT
+#undef GRAIL_SMART_MERGE_WITH_X_BUF
+#undef GRAIL_MERGE_LEFT_WITH_X_BUF
+#undef GRAIL_SMART_MERGE_WITHOUT_BUFFER
+#undef GRAIL_SMART_MERGE_WITH_BUFFER
+#undef GRAIL_MERGE_RIGHT
+#undef GRAIL_MERGE_LEFT
+#undef GRAIL_SWAP_N
+#undef SQRT_SORT
+#undef SQRT_SORT_BUILD_BLOCKS
+#undef SQRT_SORT_MERGE_BUFFERS_LEFT_WITH_X_BUF
+#undef SQRT_SORT_MERGE_DOWN
+#undef SQRT_SORT_MERGE_LEFT_WITH_X_BUF
+#undef SQRT_SORT_MERGE_RIGHT
+#undef SQRT_SORT_SWAP_N
+#undef SQRT_SORT_SWAP_1
+#undef SQRT_SORT_SMART_MERGE_WITH_X_BUF
+#undef SQRT_SORT_SORT_INS
+#undef SQRT_SORT_COMBINE_BLOCKS
+#undef SQRT_SORT_COMMON_SORT
+#undef SORT_CMP_A
+#undef BUBBLE_SORT