/*
On an 8-core machine, this takes the top 3 bits
of random numbers and distributes them to the 8 cores
for sorting.    Then, because the buckets are already in order
relative to each other, it just concatenates the sorted buckets.
*/

//Number of I32 elements generated and sorted by each benchmark pass.
#define NUM             1000000

//Number of cores the demo uses: mp_count reduced to a power of 2
//(assumes Bsr() yields the index of the highest set bit -- confirm in kernel docs).
I64 my_mp_count = 1 << Bsr(mp_count);//Power of 2

//arg1 holds the random input; arg2 is the buffer each pass sorts into.
I32 *arg1, *arg2;
//b[c] is the bucket buffer sorted by core c; bn[c] is how many elements it holds.
I32 *b[my_mp_count], bn[my_mp_count];
//One bit per core; a worker clears its own bit once its bucket is sorted.
I64  mp_not_done_flags;

I64 Compare(I32 *e1, I32 *e2)
{//Ordering callback handed to the generic QuickSort():
//negative, zero or positive when *e1 is below, equal to or above *e2.
        I64 lhs = *e1, rhs = *e2;

        return lhs - rhs;
}

U0 QuickSortU32(I32 *base, I64 num)
{//Sorts the num signed 32-bit values at base into ascending order, in place.
//By customizing, we dramatically improve it!
//Cut and paste from QuickSortI64(), with the compare done inline.
//Recurses into the smaller side of each partition and loops on the other.
        I64  i;
        I32 *less, *greater, pivot;

        if (num > 1)
        {
                do
                {
                        //Partition so that [base,less) <= pivot and
                        //[greater,base+num) > pivot; the two meet when done.
                        less = base;
                        greater = base + num;
                        pivot = base[num / 2];
                        while (less < greater)
                        {
                                if (*less <= pivot)
                                        less++;
                                else
                                {
                                        greater--;
                                        SwapU32(less, greater);
                                }
                        }
                        i = less - base;
                        if (i == num)
                        {//All less or equ to pivot, so no swap has happened yet.

                                //Walk back over the trailing run of pivot-valued
                                //elements; stop on the first smaller one, or at base.
                                do greater--;
                                while (--i && *greater == pivot);

                                if (i)
                                {
                                        less = base + num / 2; //Pivot was not moved, point to it
                                        if (less < greater)
                                        {//Park the pivot just above the part still to sort.
                                                SwapU32(less, greater);
                                                num = i;
                                        }
                                        else
                                        {//Pivot slot is already inside the trailing run, so
                                        //*greater is a smaller, still unsorted element and has
                                        //to stay in range. With num = i here, {2,1,3,3,3}
                                        //came back unsorted. i < num / 2 on this path, so
                                        //the range still shrinks.
                                                num = i + 1;
                                        }
                                }
                                else //base[1..] all equ pivot and base[0] <= pivot: sorted
                                        break;
                        }
                        else if (i < num / 2)
                        {//Left side is the smaller one: recurse on it, loop on the right.
                                QuickSortU32(base, i);
                                num -= i;
                                base = greater;
                        }
                        else
                        {//Right side is the smaller one: recurse on it, loop on the left.
                                QuickSortU32(greater, num - i);
                                num = i;
                        }
                }
                while (num > 1);
        }
}

U0 MPSort(I64 dummy=0)
{//Worker body spawned once per core: sorts the bucket indexed by the
//current core number, then clears that core's bit in mp_not_done_flags
//so the waiting parent can see it has finished.
        I64 core = Gs->num;

        no_warn dummy;
        QuickSortU32(b[core], bn[core]);
        LBtr(&mp_not_done_flags, core);
}

U0 MPRadixSortDemo(I64 dummy=0)
{//Times three ways of sorting the same NUM random I32 values:
// 1) the generic QuickSort() with the Compare() callback,
// 2) QuickSortU32() on a single core,
// 3) bucketing by the top bits, then QuickSortU32() on every core at once.
//Prints the elapsed time of each pass and calls D() on part of the result.
//Throws 'MultCore' (after releasing its buffers) if fewer than 2 cores are used.
        no_warn dummy;
        I64 i, j, k1, k2;
        F64 t0;

        //Random input, kept untouched so every pass sorts identical data.
        arg1 = MAlloc(NUM * sizeof(I32));
        for (i = 0; i < NUM; i++)
                arg1[i] = RandI32;

        arg2 = MAlloc(NUM * sizeof(I32));

        "$GREEN$QuickSort$FG$\n";
        t0 = tS;
        MemCopy(arg2, arg1, sizeof(I32) * NUM);
        QuickSort(arg2, NUM, sizeof(I32), &Compare);
        "Time:%9.6f\n", tS - t0;
        D(arg2 + NUM / 4); //Spot check; presumably dumps memory from a quarter of the way in

        "$GREEN$QuickSortU32$FG$\n";
        t0 = tS;
        MemCopy(arg2, arg1, sizeof(I32) * NUM);
        QuickSortU32(arg2, NUM);
        "Time:%9.6f\n", tS - t0;
        D(arg2 + NUM / 4);

        for (i = 0; i < my_mp_count; i++)
        {
                //We must do full size, just in case.
                //There will be uneven split between cores
                //depending on the distribution of rand numbers.
                b[i] = MAlloc(NUM * sizeof(I32));
                bn[i] = 0;
        }

        if (my_mp_count < 2)
        {//Nothing below will run, so hand the buffers back before throwing.
                Free(arg1);
                Free(arg2);
                for (i = 0; i < my_mp_count; i++)
                        Free(b[i]);
                throw('MultCore');
        }

        "$GREEN$MP Radix QuickSortU32$FG$\n";
        t0 = tS;
        //Keep only the top Bsr(my_mp_count) bits of each value. The shift of a
        //signed I32 gives -my_mp_count/2 .. my_mp_count/2-1, and adding k2
        //rebases that to a bucket index 0 .. my_mp_count-1.
        k1 = 32 - Bsr(my_mp_count);
        k2 = my_mp_count / 2;
        for (i = 0; i < NUM; i++)
        {
                //HolyC already binds >> tighter than +; the parentheses only spell it out.
                j = (arg1[i] >> k1) + k2; //This is a preliminary radix sort.
                b[j][bn[j]++] = arg1[i];
        }

        //One pending bit per core; each MPSort() clears its own when done.
        mp_not_done_flags = (1 << my_mp_count) - 1;
        for (i = 0; i < my_mp_count; i++)
                Spawn(&MPSort, NULL, NULL, i);
        while (mp_not_done_flags)
                Yield;

        //Buckets are ordered relative to each other, so appending them
        //in index order yields the fully sorted array.
        j = 0;
        for (i = 0; i < my_mp_count; i++)
        {
                MemCopy(&arg2[j], b[i], bn[i] * sizeof(I32));
                j += bn[i];
        }
        "Time:%9.6f\n", tS - t0;
        D(arg2 + NUM / 4);

        Free(arg1);
        Free(arg2);
        for (i = 0; i < my_mp_count; i++)
                Free(b[i]);
}

//Run the demo when this file is executed (in HolyC a bare function name is a call).
MPRadixSortDemo;

/* Results on 8 Cores 3.397GHz Core i7:
QuickSort
Time: 0.759998
QuickSortU32
Time: 0.093684
MP Radix QuickSortU32
Time: 0.045450
*/