/////////////////////////////////////////////////////////////////////////////
//
//      SORT_T.CPP
//
//      Radix-sort versus Ordering Table [polygon sorting]
//
//      Coded especially for Measure#7 by Maciej Sinilo (Yarpen/Substance)
//
//      - basic info about the ordering table method found in article
//        by Nathan Whitaker of Reflection Software
//
//      For more information about the ordering table method, check out
//      the article this code accompanies (Measure#7, 3D Gems).
//      It is in Polish and contains some of my thoughts concerning
//      advantages/disadvantages of both approaches.
//      For English document -- check out the one by N. Whitaker.
//
//      The code is optimized for speed, not size or precision (for example
//      I sort only the integer part of the fixed-point number to decrease the
//      number of needed iterations -- this can be easily fixed with
//      radix-sort [I think that also sorting the high 8 bits of the fraction
//      would really be enough for everyone], but increasing the precision in
//      the OT method would slow things down really dramatically (bigger table)).
//
//      Compiles only with WATCOM because of specific assembly inlines.
//      Tested successfully with WATCOM 10.0a.
//
//      Contact: msinilo@kki.net.pl (KKI is free server, so some stupid
//                                   ads are appended to my mail...)
//
/////////////////////////////////////////////////////////////////////////////

#include <string.h>             // memset
#include <stdlib.h>             // rand
#include <iostream.h>

// --- Global stuff ----------------------------------------------------------

// Fixed-width aliases. Nothing checks the widths at compile time; the code
// below (16:16 fixed-point masks, 32-bit register inlines) relies on
// 'unsigned int' being 32 bits wide.
// NOTE(review): ushort16 is not referenced anywhere in the visible source.
typedef unsigned short ushort16;
typedef unsigned int   uint32;

// Number of polygons generated and sorted by each benchmark run.
// Change this to the number of polys you are dealing with in your engine
const int NR_ELEMENTS = 8000;

// only MINIMAL information
// Polygon record reduced to what the sorters need (only MINIMAL information).
typedef struct poly_s
{
        // Sort key. The sorters and the checker only look at the upper
        // 16 bits (the integer part of a 16:16 fixed-point depth value).
        uint32 sortedZ;
        // Intrusive singly-linked list pointer; the sorters use it to chain
        // polygons that fall into the same bucket / table slot.
        struct poly_s *next;
} poly_t;


// Radix data: 256 bucket heads per pass (one bucket per byte value).
// stack0 is filled by the first pass (bits 16..23 of sortedZ),
// stack1 by the second pass (bits 24..31). Both are zeroed by radixInit()
// and are left empty again once radixSort() has finished.
poly_t *stack0[256];
poly_t *stack1[256];

// Ordering Table: array of OT_SIZE list heads, allocated, cleared and
// released by otTest(); otSort() expects it to be allocated and all-null.
poly_t **ot = 0;



// -----------------------------------------------------------------------------

// Profiling stuff
//
// Cycle counting via the CPU time-stamp counter, implemented as WATCOM
// auxiliary pragmas (inline assembly). Usage:
//
//      uint32 c = 0;
//      RDTSC_Start(&c);
//      ...code to measure...
//      c = RDTSC_End(c);       // elapsed ticks
//
// Only the low 32 bits of the counter (EAX) are used; the high half returned
// in EDX is discarded, so the measured interval wraps at 2^32 ticks.
void RDTSC_Start(uint32 *counter);
uint32 RDTSC_End(uint32 counter);

// RDTSC_Start: the bytes 0x0F 0x31 are the RDTSC opcode, emitted with 'db'.
// The pointer argument arrives in EBX ('parm ... [ebx]') and the low dword of
// the time stamp is stored through it. 'modify [eax edx]' tells the compiler
// that both registers are clobbered.
// NOTE(review): the trailing cld/nop sequence appears to be the "filler"
// mentioned in the overhead note below -- confirm before changing it, since
// the constant subtracted in RDTSC_End was calibrated against this sequence.
#pragma aux RDTSC_Start =       \
  "     db      0xF, 0x31"      \
  "     mov     [ebx], eax"     \
  "     cld"                    \
  "     nop"                    \
  "     nop"                    \
  "     nop"                    \
  "     nop"                    \
  "     nop"                    \
  "     nop"                    \
  "     nop"                    \
  "     nop"                    \
  modify [eax edx]              \
  parm caller [ebx];

// RDTSC_End: reads the time stamp again, subtracts the start value passed in
// EBX and then subtracts a fixed measurement overhead of 17 ticks. The result
// is returned in EAX ('value [eax]').
// The overhead is about 17 ticks (filler instructions) as measured by the
// original author on a Pentium MMX; 15 on a plain Pentium. On other CPUs the
// constant is likely different, and an interval shorter than the constant
// wraps around to a huge unsigned value.
#pragma aux RDTSC_End =         \
  "     clc"                    \
  "     db      0xF, 0x31"      \
  "     sub     eax, ebx"       \
  "     sub     eax, 17"        \
  "     cld"                    \
  value [eax]                   \
  modify [eax edx]              \
  parm caller [ebx];


// -----------------------------------------------------------------------------
// Check if data was really sorted (ascending).
// Only the integer part (upper 16 bits) of the 16:16 fixed-point key is
// compared, because that is all the sorters in this file order by.
// IN:  data - array of 'len' polygon pointers
//      len  - number of entries in 'data'
// OUT: number of adjacent pairs that are out of order (0 = sorted).
//      Arrays with fewer than two elements are trivially sorted and yield 0.
static uint32 checkCorrectness(poly_t **data, int len)
{
        uint32 errors = 0;

        // Index-based loop: for len <= 1 the body never runs, so an empty
        // array can no longer send the counter negative and walk off the
        // end of 'data'.
        for (int i = 1; i < len; ++i)
                if ((data[i - 1]->sortedZ & 0xFFFF0000) > (data[i]->sortedZ & 0xFFFF0000))
                        errors++;

        return errors;
}

// Get memory for polygons and init their Z values with random stuff.
// The value simulates a 16:16 fixed-point depth: 15 random bits in the
// integer half (bits 16..30) and 15 random bits in the fraction half
// (bits 0..14).
// IN:  len - number of polygons to create
// OUT: newly allocated array of 'len' newly allocated polygons; release it
//      with deletePolys(). Terminates the program with exit(1) if an
//      allocation returns null.
static poly_t **initPolys(int len)
{
        poly_t **data = new poly_t*[len];
        if (!data)
                exit(1);

        for (int i = 0; i < len; i++)
        {
                data[i] = new poly_t;
                if (!data[i])
                        exit(1);

                // rand() is only guaranteed to deliver 15 bits (RAND_MAX >= 32767).
                // Masking to 15 bits and shifting as unsigned avoids signed
                // overflow when RAND_MAX is larger, and gives the same value
                // range as a 15-bit rand(): bit 31 stays clear, so the
                // integer part always fits the 15-bit ordering-table key.
                // Two statements also fix the order of the two rand() calls.
                const uint32 fraction = (uint32)rand() & 0x7FFF;
                const uint32 integer  = (uint32)rand() & 0x7FFF;

                data[i]->sortedZ = fraction | (integer << 16);
                data[i]->next = 0;
        }
        return data;
}

// Release everything obtained from initPolys(): first each polygon,
// then the pointer array itself.
// IN: data - array of 'len' polygon pointers (null entries are allowed)
//     len  - number of entries in 'data'
static void deletePolys(poly_t **data, int len)
{
        poly_t **cursor = data;

        // Deleting a null pointer is a no-op, so no per-entry check is needed.
        for (int remaining = len; remaining > 0; --remaining)
        {
                delete *cursor;
                ++cursor;
        }

        delete [] data;
}



// ---- Radix sort -----------------------------------------------------------

// Empty all radix buckets of both passes. Call once before radixSort().
static void radixInit()
{
        const int bucketCount = sizeof(stack0) / sizeof(stack0[0]);

        for (int bucket = 0; bucket < bucketCount; ++bucket)
        {
                stack0[bucket] = 0;
                stack1[bucket] = 0;
        }
}

////////
// Main sorting function
// Radix sort on the integer part (upper 16 bits) of sortedZ, one byte per
// pass: two bucketing passes plus one pass that writes the sorted pointers
// back into 'data' (ascending order).
// IN:  data - array of 'len' polygon pointers, reordered in place
//      len  - number of entries; values <= 0 leave everything untouched
// Requires stack0/stack1 to be empty on entry (see radixInit()); both are
// empty again on return. The polygons' 'next' fields are overwritten.
static void radixSort(poly_t **data, int len)
{
        if (len <= 0)
                return;

        // Pass 1: push every polygon onto the bucket selected by the low
        // byte of the integer part (bits 16..23).
        for (int n = 0; n < len; ++n)
        {
                poly_t *poly = data[n];
                const uint32 index = (poly->sortedZ >> 16) & 0xFF;  // lower byte

                poly->next = stack0[index];
                stack0[index] = poly;
        }

        // Pass 2: redistribute by the high byte (bits 24..31). The buckets
        // are pushed LIFO, so stack0 is drained from 255 down to 0: the
        // polygons with the smallest low byte are pushed last and therefore
        // come out first in pass 3.
        // An index loop is used so that no pointer is ever moved in front of
        // the first element of stack0.
        for (int bucket = 255; bucket >= 0; --bucket)
        {
                poly_t *poly = stack0[bucket];
                while (poly)
                {
                        poly_t *nxt = poly->next;
                        const uint32 index = (poly->sortedZ >> 24) & 0xFF;

                        poly->next = stack1[index];
                        stack1[index] = poly;
                        poly = nxt;
                }
                stack0[bucket] = 0;
        }

        // High word sorted, build new poly list: walk the high-byte buckets
        // in ascending order and emit each chain front to back.
        poly_t **out = data;
        for (int bucket = 0; bucket < 256; ++bucket)
        {
                poly_t *poly = stack1[bucket];
                while (poly)
                {
                        *out++ = poly;
                        poly = poly->next;
                }
                stack1[bucket] = 0;
        }
}


// Radix-sort benchmark: create random polygons, time a single radixSort()
// call, print the number of ordering errors found afterwards and clean up.
// IN: number of elements (polys in this case) to sort
// OUT: number of cycles taken by sorting
static uint32 radixTest(int len)
{
        poly_t **polys = initPolys(len);
        radixInit();

        // Only the sort itself sits between the two time-stamp reads.
        uint32 startStamp = 0;
        RDTSC_Start(&startStamp);
        radixSort(polys, len);
        const uint32 elapsed = RDTSC_End(startStamp);

        const uint32 errorCount = checkCorrectness(polys, len);
        cout << "Radix errors: " << errorCount << endl;

        deletePolys(polys, len);
        return elapsed;
}

// --- Ordering Table --------------------------------------------------------

// Size of ordering table (number of list heads). The bigger table -- the
// bigger precision and the smaller speed.
// Must be a power of two: otSort() derives its index mask from it.
// The key is the 16-bit integer part of sortedZ, so sizes above 65536 gain
// nothing; with smaller sizes, keys >= OT_SIZE wrap around and alias.
#define OT_SIZE (32768)

// Ordering-table sort: every polygon is pushed onto the list whose index is
// the integer part of its sortedZ, then the table is walked front to back
// and the polygon pointers are written back into 'data' in ascending order.
// IN:  data - array of 'len' polygon pointers, reordered in place
//      len  - number of entries; values <= 0 leave everything untouched
// Requires 'ot' to point to OT_SIZE null entries on entry; the table is
// all-null again on return. The polygons' 'next' fields are overwritten.
static void otSort(poly_t **data, int len)
{
        if (!ot || len <= 0)
                return;

        // Mask tied to the table size so an index can never leave the table,
        // whatever OT_SIZE is set to.
        const uint32 indexMask = OT_SIZE - 1;

        poly_t **pdata = data;
        for ( ; len > 0; --len, ++pdata)
        {
                const uint32 index = ((*pdata)->sortedZ >> 16) & indexMask;
                (*pdata)->next = ot[index];
                ot[index] = *pdata;
        }

        // Build sorted polylist... This is the most expensive moment of
        // whole sorting (!) -- every table slot is visited, used or not.
        poly_t **p_ot = &ot[0];
        pdata = data;
        for (uint32 i = 0; i < OT_SIZE; i++, ++p_ot)
        {
                // Pop the slot's chain until it is empty again.
                while (*p_ot)
                {
                        *pdata++ = *p_ot;
                        *p_ot = (*p_ot)->next;
                }
        }
}

// Ordering-table benchmark: create random polygons, allocate and clear the
// table, time a single otSort() call, print the number of ordering errors
// found afterwards and release everything.
// IN: number of elements (polys in this case) to sort
// OUT: number of cycles taken by sorting
// Terminates the program with exit(1) if the table allocation returns null.
static uint32 otTest(int len)
{
        poly_t **data = initPolys(len);

        ot = new poly_t* [OT_SIZE];
        if (!ot)
                exit(1);

        // otSort() expects every list head to start out empty.
        memset(ot, 0, OT_SIZE * sizeof(poly_t *));

        uint32 counter = 0;
        RDTSC_Start(&counter);
        otSort(data, len);
        counter = RDTSC_End(counter);

        cout << "OT errors: " << checkCorrectness(data, len) << endl;

        deletePolys(data, len);
        delete []ot;
        // 'ot' is a file-level global: do not leave it pointing at freed memory.
        ot = 0;

        return counter;
}



// =============================================================================


// Program entry: print a banner, run the radix-sort benchmark followed by
// the ordering-table benchmark, then report total ticks and ticks per
// polygon for both.
int main(void)
{
        // Banner
        cout << endl << endl;
        cout << "Radix sort VS Ordering Table [polygon sorting]. ";
        cout << "Speed comparison." << endl;
        cout << "Elements to sort: " << NR_ELEMENTS << endl;
        cout << "------------------------------------------------------\n\n";

        // Run both benchmarks (radix first, then OT)
        const uint32 radixTicks = radixTest(NR_ELEMENTS);
        const uint32 otTicks    = otTest(NR_ELEMENTS);

        const double radixPerPoly = (double)radixTicks / NR_ELEMENTS;
        const double otPerPoly    = (double)otTicks / NR_ELEMENTS;

        // Report
        cout << "Radix sort results: " << radixTicks << " ticks = about ";
        cout << radixPerPoly << " ticks/poly" << endl;

        cout << "OT sort results: " << otTicks << " ticks = about ";
        cout << otPerPoly << " ticks/poly" << endl << endl;

        cout << "Ticks result aren't very exact (I didn't take function ";
        cout << "calls and so on" << endl;
        cout << "into an account. The aim is only to show, which method is ";
        cout << "faster." << endl;

        return 0;
}