/*
"Fixed point" means you use ints
that are scaled by a value.  A common
example would be using number of pennies
instead of dollars with a float.

Fixed-point used to be much faster,
but modern processors do well with
floats.  It also depends on the compiler
and the CosmiC compiler is poor with floats.

Terry often used 64-bit ints with the upper 32 bits
as the integer part and the lower 32 bits as the fraction.

See ::/Demo/SubIntAccess.CC for how
to access upper or lower 32-bits.
*/

// Number of loop iterations performed by each benchmark in this file;
// also the divisor used to report cycles per iteration.
#define SAMPLE_SIZE         10000000

// Table of 65536 signed 32-bit sample values.  Main() fills it with
// random data, and every benchmark indexes it with the low 16 bits
// of its loop counter.
I32 coordinates[65536];

asm {
// _ASM_FIXED_POINT (bound to AsmFixedPt() below)
// Sums coordinates[counter & 0xFFFF] * Sin(1.0) over SAMPLE_SIZE
// iterations using 32.32 fixed-point integer math.
// Out:   RAX = sum of the integer parts of each product.
// Roles: RSI = base address of coordinates
//        RDI = Sin(1.0) scaled by 0x100000000 (2^32)
//        RBX = running sum, RCX = loop counter, RDX = table index
// RSI and RDI are pushed/popped here; RAX, RBX, RCX and RDX are
// overwritten without being saved.
_ASM_FIXED_POINT::
                PUSH        RBP
                MOV         RBP, RSP
                PUSH        RSI
                PUSH        RDI
                MOV         RSI, coordinates
                MOV         RDI, ToI64(Sin(1.0) * 0x100000000) //fixed-point multiplier
                XOR         RBX, RBX //RBX = running sum, starts at zero
                MOV         RCX, SAMPLE_SIZE-1 //count down from SAMPLE_SIZE-1 to 0
// Clear RDX, then copy the low 16 bits of the counter, so
// RDX = counter & 0xFFFF (an index into the 65536-entry table).
@@05:           XOR         RDX, RDX
                MOV         DX,  CX
                MOVSXD      RAX, U32 [RSI + RDX * 4] //sign-extend the 32-bit sample
                IMUL        RDI //RDX:RAX = RAX * RDI (signed)
                SAR         RAX, 32 //drop the 32 fraction bits, keeping the sign
                ADD         RBX, RAX
                DEC         RCX
                JGE         @@05 //loop until the counter goes negative
                MOV         RAX, RBX //return the sum
                POP         RDI
                POP         RSI
                POP         RBP
                RET

// Sin(1.0) stored as a 64-bit float constant for the FPU to load.
SINE_VAL:       DU64        Sin(1.0);
// Scratch slot used to move the integer result from the FPU into RAX.
// NOTE(review): this is a single shared memory slot, so concurrent
// callers would overwrite each other's result -- confirm single-task use.
RET_VAL:        DU64        0;

// _ASM_FLOAT (bound to AsmFloat() below)
// Same summation as _ASM_FIXED_POINT, but done with x87 floating-point
// instructions.
// Out:   RAX = the float sum converted to a 64-bit integer by FISTP.
// Roles: RSI = base address of coordinates, RCX = loop counter,
//        RDX = table index.
// Between iterations the x87 stack holds: ST0 = running sum, ST1 = Sin(1.0).
// RSI is pushed/popped here; RAX, RCX and RDX are overwritten.
_ASM_FLOAT::
                PUSH        RBP
                MOV         RBP, RSP
                PUSH        RSI
                MOV         RSI, coordinates
                FLD         U64 [SINE_VAL] //ST0 = Sin(1.0)
                FLDZ //ST0 = 0.0 (sum), ST1 = Sin(1.0)
                MOV         RCX, SAMPLE_SIZE - 1 //count down from SAMPLE_SIZE-1 to 0
// RDX = counter & 0xFFFF, as in the fixed-point routine.
@@05:           XOR         RDX, RDX
                MOV         DX,  CX
                FILD        U32 [RSI + RDX * 4] //ST0 = sample, ST1 = sum, ST2 = Sin(1.0)
                FMUL        ST0, ST2 //ST0 = sample * Sin(1.0)
                FADDP       ST1, ST0 //sum += product, then pop: ST0 = sum again
                DEC         RCX
                JGE         @@05 //loop until the counter goes negative
                FISTP       U64 [RET_VAL] //store sum as an integer and pop it
                MOV         RAX, U64 [RET_VAL] //return value
// Sin(1.0) is still on the x87 stack: mark it free and advance the
// stack pointer so both values loaded above have been removed.
                FFREE       ST0
                FINCSTP
                POP         RSI
                POP         RBP
                RET
}

// Bind the exported asm labels above to HolyC-callable functions.
// Both take no arguments and return an I64 sum.
_extern _ASM_FIXED_POINT I64 AsmFixedPt();
_extern _ASM_FLOAT       I64 AsmFloat();

// Benchmark driver.  Fills coordinates[] with random samples, then times
// (in TSC ticks) an empty loop, a compiled F64 loop, a compiled fixed-point
// loop, and the two hand-written asm routines.  For each test it prints the
// resulting sum and the average number of cycles per iteration.
U0 Main()
{
    I64 tsc_begin, tsc_end, overhead_cycles, elapsed_cycles;
    F64 sine_f64, float_sum;
    // 'product' is deliberately not a reg variable: its upper 32 bits
    // are read through the .i32[1] sub-int member below.
    I64 reg idx, product, reg sine_fixed, reg int_sum;

    CPURep;

    // Populate the sample table with random 32-bit values.
    for (idx = 0; idx < 65536; idx++)
        coordinates[idx] = RandU32;

    // Baseline: cost of the bare counting loop.
    tsc_begin = TSCGet;
    for (idx = SAMPLE_SIZE - 1; idx >= 0; idx--)
    {
    }
    tsc_end = TSCGet;
    overhead_cycles = tsc_end - tsc_begin;
    "$RED$Overhead Cycles       :%10.5f$FG$\n", ToF64(overhead_cycles) / SAMPLE_SIZE;

    // Compiled F64 multiply-accumulate.
    // (Per the original author, part of this cost comes from the
    // quality of the compiler's generated code.)
    sine_f64 = Sin(1.0);
    float_sum = 0;
    tsc_begin = TSCGet;
    for (idx = SAMPLE_SIZE - 1; idx >= 0; idx--)
        float_sum += sine_f64 * coordinates[idx & 65535];
    tsc_end = TSCGet;
    elapsed_cycles = tsc_end - tsc_begin;
    "Float Sum             :%X\n", ToI64(float_sum);
    "$RED$Float Cycles          :%10.5f$FG$\n", ToF64(elapsed_cycles) / SAMPLE_SIZE;

    // Compiled fixed-point multiply-accumulate: the multiplier is
    // Sin(1.0) scaled by 2^32, and only the upper 32 bits of each
    // product are added to the sum.
    sine_fixed = Sin(1.0) * 0x100000000;
    int_sum = 0;
    tsc_begin = TSCGet;
    for (idx = SAMPLE_SIZE - 1; idx >= 0; idx--)
    {
        product = sine_fixed * coordinates[idx & 65535];
        int_sum += product.i32[1];
    }
    tsc_end = TSCGet;
    elapsed_cycles = tsc_end - tsc_begin;
    "Fixed-Point Sum       :%X\n", int_sum;
    "$RED$Fixed-Point Cycles    :%10.5f$FG$\n", ToF64(elapsed_cycles) / SAMPLE_SIZE;

    // Hand-written asm fixed-point routine.
    tsc_begin = TSCGet;
    int_sum = AsmFixedPt;
    tsc_end = TSCGet;
    elapsed_cycles = tsc_end - tsc_begin;
    "Asm Fixed-Point Sum   :%X\n", int_sum;
    "$RED$Asm Fixed-Point Cycles:%10.5f$FG$\n", ToF64(elapsed_cycles) / SAMPLE_SIZE;

    // Hand-written asm x87 float routine.
    tsc_begin = TSCGet;
    int_sum = AsmFloat;
    tsc_end = TSCGet;
    elapsed_cycles = tsc_end - tsc_begin;
    "Asm Float Sum         :%X\n", int_sum;
    "$RED$Asm Float Cycles      :%10.5f$FG$\n", ToF64(elapsed_cycles) / SAMPLE_SIZE;

}

// Top-level statement: call Main (HolyC permits omitting the
// parentheses on a call with no arguments).
Main;

/*  Program Output

Machine 1:
8 Cores 2.660GHz
Overhead Cycles       :   2.00814
Float Sum             :FFFFE1D361BEED68
Float Cycles          :  10.16076
Fixed-Point Sum       :FFFFE1D361729914
Fixed-Point Cycles    :   5.29392
Asm Fixed-Point Sum   :FFFFE1D361729914
Asm Fixed-Point Cycles:   4.20464
Asm Float Sum         :FFFFE1D361BEED56
Asm Float Cycles      :   3.04635

Machine 2:
8 Cores 3.395GHz
Overhead Cycles       :   4.87040
Float Sum             :D20A01DB177
Float Cycles          :  10.11558
Fixed-Point Sum       :D209FD18CC7
Fixed-Point Cycles    :   4.50618
Asm Fixed-Point Sum   :D209FD18CC7
Asm Fixed-Point Cycles:   3.02426
Asm Float Sum         :D20A01DB17B
Asm Float Cycles      :   3.21070

*/