/*
The moral of this story is simple:
instruction-level optimizations
don't matter much on a modern Intel CPU,
because it converts complex instructions
into a stream of RISC instructions.

Terry learned this the hard way when he thought
he was greatly improving the compiler by
cutting code by a third.  No significant
speed-up.  Depressing.
*/

// Loop bound shared by all four asm routines: each one sums the integers
// 1 .. SAMPLES-1.  SAMPLES-1 is written as a multiple of 8 because the
// 8x-unrolled routine only tests for zero once every eight decrements.
#define SAMPLES (8 * 10000000 + 1)

// Four hand-written routines that all compute the same value,
// the sum of the integers 1 .. SAMPLES-1, and return it in RAX.
// They differ only in how the loop is coded so their run times can be
// compared.  Each routine takes no arguments and overwrites RAX, RCX and
// the flags; _BADLY_UNOPTIMIZED also overwrites RDX.
asm {

// 64-bit copy of SAMPLES kept in memory so that _BADLY_UNOPTIMIZED can
// compare against a memory operand instead of an immediate.
LIMIT::         DU64        SAMPLES;                //Memory reference should be bad, right?

// Deliberately wasteful version: counts RCX upward from 1, copies the
// counter through RDX before adding it, reloads the (loop-invariant)
// address of LIMIT every iteration and compares against memory.
// Loops while RCX is below the value stored at LIMIT (unsigned compare).
_BADLY_UNOPTIMIZED::
                MOV         RAX, 0
                MOV         RCX, 1
@@05:           MOV         RDX, RCX
                INC         RCX                         //if no dependencies, Free!
                ADD         RAX, RDX
                MOV         RDX, LIMIT - 16     //added 16 displacement to make it worse
                // 16[RDX] undoes the -16 above, so this reads the U64 at LIMIT.
                CMP         RCX, U64 16[RDX]
                JB          @@05
                RET

// Tight version: RCX counts down from SAMPLES-1 and is added to RAX each
// pass; the loop ends when DEC brings RCX to zero.
_WELL_OPTIMIZED1::
                XOR         RAX, RAX
                MOV         RCX, SAMPLES - 1
@@05:           ADD         RAX, RCX
                DEC         RCX
                JNZ         @@05
                RET

// Same count-down sum as _WELL_OPTIMIZED1 with the ADD/DEC pair repeated
// eight times per pass.  Only the last DEC is followed by JNZ, so the
// starting count (SAMPLES-1) must be a multiple of 8 for RCX to be exactly
// zero when the branch is tested.
_WELL_OPTIMIZED2:: //Unrolled
                XOR         RAX, RAX
                MOV         RCX, SAMPLES - 1
@@05:           ADD         RAX, RCX
                DEC         RCX
                ADD         RAX, RCX
                DEC         RCX
                ADD         RAX, RCX
                DEC         RCX
                ADD         RAX, RCX
                DEC         RCX
                ADD         RAX, RCX
                DEC         RCX
                ADD         RAX, RCX
                DEC         RCX
                ADD         RAX, RCX
                DEC         RCX
                ADD         RAX, RCX
                DEC         RCX
                JNZ         @@05
                RET

// Smallest version: the LOOP instruction decrements RCX and branches while
// it is non-zero, replacing the DEC/JNZ pair of _WELL_OPTIMIZED1.
_WELL_OPTIMIZED3::
                XOR         RAX, RAX
                MOV         RCX, SAMPLES - 1
@@05:           ADD         RAX, RCX
                LOOP        @@05        //Instruction has slow speed, but saves code size.
                RET
}

// Give each exported asm label a HolyC-callable name: every routine takes
// no arguments and its result is read back as an I64.
_extern _BADLY_UNOPTIMIZED  I64 Loop1();
_extern _WELL_OPTIMIZED1    I64 Loop2();
_extern _WELL_OPTIMIZED2    I64 Loop3();
_extern _WELL_OPTIMIZED3    I64 Loop4();

// Benchmark driver.  Each asm routine is run once; its result and the
// elapsed wall time (difference of two tS readings, in seconds) are printed.
I64 i;   // result returned by the routine under test
F64 t0;  // tS reading taken just before the routine is called

// Report the CPU configuration first so the timings have context.
CPURep();

// Deliberately wasteful loop.
Print("Bad Code\n");
t0 = tS();
i = Loop1();
Print("Res:%d Time:%9.6f\n", i, tS() - t0);

// Tight count-down loop.
Print("Good Code #1\n");
t0 = tS();
i = Loop2();
Print("Res:%d Time:%9.6f\n", i, tS() - t0);

// Count-down loop unrolled eight times.
Print("Good Code #2\n");
t0 = tS();
i = Loop3();
Print("Res:%d Time:%9.6f\n", i, tS() - t0);

// Count-down loop using the LOOP instruction.
Print("Good Code #3\n");
t0 = tS();
i = Loop4();
Print("Res:%d Time:%9.6f\n", i, tS() - t0);

/*  Program Output
8 Cores 2.660GHz
Bad Code
Res:3200000040000000 Time: 0.069966
Good Code #1
Res:3200000040000000 Time: 0.062567
Good Code #2
Res:3200000040000000 Time: 0.062907
Good Code #3
Res:3200000040000000 Time: 0.156359
*/