/*

The CosmiC assembler currently has partial SSE support.

SSE instructions with no prefix are fully supported,
but instructions prefixed with 0x66, 0xF2, or 0xF3
can only be assembled using the lower registers,
XMM0-XMM7 and RAX-RDI.


SSE instructions supporting XMM0-XMM15 RAX-R15:
_________________________________________________

MOVAPS  MOVUPS  MOVLPS  MOVHPS  MOVLHPS MOVHLPS
MOVNTI  MOVNTPS MOVMSKPS

RCPPS   ANDPS   ANDNPS  ADDPS   SUBPS   MULPS
DIVPS   MINPS   MAXPS   ORPS    XORPS   SQRTPS
RSQRTPS CMPPS   SHUFPS

CVTPS2PD CVTDQ2PS CVTPI2PS

COMISS UCOMISS UNPCKLPS UNPCKHPS 
_________________________________________________

SSE instructions not in the above list most likely
can only be assembled using XMM0-XMM7 RAX-RDI.

*/

I64 DemoAllSSE()
{// Assembler/unassembler test vector: one instance of each SSE-family
 // mnemonic, so the disassembly can be compared against this listing.
 //
 // Not meant to be run, just to
 // test Assembler and Unassembler against.
 // Will likely cause General Protection crash if run.
 // Some SSE ops require 16-byte aligned vals or else crash.
 //
 // Ops named in the file header as supporting the full register set are
 // exercised here with XMM15 (or R13); the rest use XMM0/XMM1 and
 // low general-purpose registers.
 //
 // Returns res, which is bound to RDX and initialised to 0; the asm block
 // below also uses RDX as a memory base and as a destination register.
    I64 reg RDX res = 0;
asm
{
    // --- Memory fences and MXCSR load/store ---
    LFENCE
    MFENCE
    SFENCE
    LDMXCSR     [RDX]
    STMXCSR     [RDX]

    // --- Data movement ---
    // MOVSD_SSE carries a suffix, presumably to keep it distinct from the
    // string instruction MOVSD -- TODO confirm against the opcode table.
    LDDQU       XMM0, [RDX]
    MOVAPS      XMM0, XMM15
    MOVAPD      XMM0, XMM1
    MOVUPS      XMM0, XMM15
    MOVUPD      XMM0, XMM1
    MOVSS       XMM0, XMM1
    MOVSD_SSE   XMM0, XMM1
    MOVD        XMM0, ESI
    MOVQ        XMM0, RDX
    MOVLPS      XMM15, [RDX]
    MOVLPD      XMM0, [RDX]
    MOVHPS      XMM0, [RDX]
    MOVHPD      XMM0, [RDX]
    MOVDQA      XMM0, XMM1
    MOVDQU      XMM0, XMM1
    MOVDDUP     XMM0, XMM1
    MOVSLDUP    XMM0, XMM1
    MOVSHDUP    XMM0, XMM1
    MOVLHPS     XMM0, XMM15
    MOVHLPS     XMM0, XMM15

    // --- Non-temporal stores and mask extraction ---
    // The three XMM non-temporal stores below are disabled in this listing;
    // the reason is not recorded here.
    MOVNTI      [RDX], R13
//  MOVNTPS     [RDX], XMM0
//  MOVNTPD     [RDX], XMM0
//  MOVNTDQ     [RDX], XMM1
    MOVMSKPS    RDX, XMM15
    MOVMSKPD    RDX, XMM0
    PMOVMSKB    RDX, XMM0

    // --- Packed sign-extending / zero-extending moves ---
    // The byte-to-quadword (BQ) forms are disabled in this listing.
    PMOVSXBW    XMM0, XMM1
    PMOVSXBD    XMM0, XMM1
//  PMOVSXBQ    XMM0, XMM1
    PMOVSXWD    XMM0, XMM1
    PMOVSXWQ    XMM0, XMM1
    PMOVSXDQ    XMM0, XMM1
    PMOVZXBW    XMM0, XMM1
    PMOVZXBD    XMM0, XMM1
//  PMOVZXBQ    XMM0, XMM1
    PMOVZXWD    XMM0, XMM1
    PMOVZXWQ    XMM0, XMM1
    PMOVZXDQ    XMM0, XMM1

    // --- Floating-point horizontal add/sub and ADDSUB ---
    HADDPS      XMM0, XMM1
    HADDPD      XMM0, XMM1
    HSUBPS      XMM0, XMM1
    HSUBPD      XMM0, XMM1
    ADDSUBPS    XMM0, XMM1
    ADDSUBPD    XMM0, XMM1

    // --- Floating-point arithmetic and bitwise logic ---
    // Each group covers the SS/SD/PS/PD variants that exist for the op.
    RCPSS       XMM0, XMM1
    RCPPS       XMM0, XMM15
    ANDPS       XMM0, XMM15
    ANDPD       XMM0, XMM1
    ANDNPS      XMM0, XMM15
    ANDNPD      XMM0, XMM1
    ADDSS       XMM0, XMM1
    ADDSD       XMM0, XMM1
    ADDPS       XMM0, XMM15
    ADDPD       XMM0, XMM1
    SUBSS       XMM0, XMM1
    SUBSD       XMM0, XMM1
    SUBPS       XMM0, XMM15
    SUBPD       XMM0, XMM1
    MULSS       XMM0, XMM1
    MULSD       XMM0, XMM1
    MULPS       XMM0, XMM15
    MULPD       XMM0, XMM1
    DIVSS       XMM0, XMM0
    DIVSD       XMM0, XMM1
    DIVPS       XMM0, XMM15
    DIVPD       XMM0, XMM1
    MINSS       XMM0, XMM1
    MINSD       XMM0, XMM1
    MINPS       XMM0, XMM15
    MINPD       XMM0, XMM1
    MAXSS       XMM0, XMM1
    MAXSD       XMM0, XMM1
    MAXPS       XMM0, XMM15
    MAXPD       XMM0, XMM1
    ORPS        XMM0, XMM15
    ORPD        XMM0, XMM1
    XORPS       XMM0, XMM15
    XORPD       XMM0, XMM1
    SQRTSS      XMM0, XMM1
    SQRTSD      XMM0, XMM1
    SQRTPS      XMM0, XMM15
    SQRTPD      XMM0, XMM1
    RSQRTSS     XMM0, XMM1
    RSQRTPS     XMM0, XMM15

    // --- Conversions ---
    // NOTE(review): CVTPD2PS below uses XMM9 although it is not in the file
    // header's list of ops supporting XMM8-XMM15 -- confirm it assembles
    // as intended.
    CVTSD2SS    XMM0, XMM1
    CVTSS2SD    XMM0, XMM1
    CVTSS2SI    RDX, XMM0
    CVTSI2SD    XMM0, RDX
    CVTSI2SS    XMM0, RDX
    CVTSD2SI    RDX, XMM1
    CVTPS2PD    XMM0, XMM15
    CVTDQ2PS    XMM0, XMM15
    CVTPS2DQ    XMM0, XMM1
    CVTPD2DQ    XMM0, XMM1
    CVTDQ2PD    XMM0, XMM1
    CVTPD2PS    XMM0, XMM9
    CVTPI2PS    XMM15, [RDX]
    CVTPI2PD    XMM0, [RDX]
    CVTTSS2SI   RDX, XMM1
    CVTTSD2SI   RDX, XMM0
    CVTTPS2DQ   XMM0, XMM1
    CVTTPD2DQ   XMM0, XMM1

    // --- Scalar compares setting flags, unpacks, masked store ---
    COMISS      XMM0, XMM15
    COMISD      XMM0, XMM1
    UCOMISS     XMM0, XMM15
    UCOMISD     XMM0, XMM1
    UNPCKLPS    XMM0, XMM15
    UNPCKLPD    XMM0, XMM1
    UNPCKHPS    XMM0, XMM15
    UNPCKHPD    XMM0, XMM1
    MASKMOVDQU  XMM0, XMM1

    // --- Three-operand forms taking an 8-bit immediate ---
    // The immediates are arbitrary byte patterns, there to be recognised
    // in the unassembler output.
    // CMPSD_SSE carries a suffix, presumably to keep it distinct from the
    // string instruction CMPSD -- TODO confirm against the opcode table.
    CMPSS       XMM0, XMM1, 0x74
    CMPSD_SSE   XMM0, XMM1, 0x63
    CMPPS       XMM0, XMM15, 0x52
    CMPPD       XMM0, XMM1, 0x41
    SHUFPS      XMM0, XMM15, 0x30
    SHUFPD      XMM0, XMM1, 0x29
    PSHUFB      XMM0, XMM1
    PSHUFD      XMM0, XMM1, 0x18
    PSHUFLW     XMM0, XMM1, 0x07
    PSHUFHW     XMM0, XMM1, 0x96
    ROUNDSS     XMM0, XMM1, 0x85
    ROUNDSD     XMM0, XMM1, 0x74
    ROUNDPS     XMM0, XMM1, 0x63
    ROUNDPD     XMM0, XMM1, 0x52
    BLENDVPS    XMM0, XMM1
    BLENDVPD    XMM0, XMM1
    BLENDPS     XMM0, XMM1, 0x99
    BLENDPD     XMM0, XMM1, 0x99
    PBLENDW     XMM0, XMM1, 0x99
    DPPS        XMM0, XMM1, 0x99
    DPPD        XMM0, XMM1, 0x99
    PALIGNR     XMM0, XMM1, 0x99
    PCLMULQDQ   XMM0, XMM1, 0x99

    // --- Element extract / insert between XMM and general registers ---
    PEXTRB      AH, XMM1, 0x99
    PEXTRW      RDX, XMM1, 0x99
    PEXTRD      ESI, XMM1, 0x99
    PEXTRQ      RDX, XMM1, 0x99
    EXTRACTPS   ESI, XMM1, 0x99
    PINSRB      XMM0, AH, 0x99
    PINSRW      XMM0, AX, 0x99
    PINSRD      XMM0, ESI, 0x99
    PINSRQ      XMM0, RDX, 0x99

    // --- Packed string compares ---
    PCMPESTRM   XMM0, XMM1, 0x99
    PCMPESTRI   XMM0, XMM1, 0x99
    PCMPISTRM   XMM0, XMM1, 0x99
    PCMPISTRI   XMM0, XMM1, 0x99

    // --- Packed integer compares ---
    PCMPGTB     XMM0, XMM1
    PCMPGTW     XMM0, XMM1
    PCMPGTD     XMM0, XMM1
    PCMPGTQ     XMM0, XMM1
    PCMPEQB     XMM0, XMM1
    PCMPEQW     XMM0, XMM1
    PCMPEQD     XMM0, XMM1
    PCMPEQQ     XMM0, XMM1

    // --- Packed integer shifts (count taken from an XMM register) ---
    PSRLW       XMM0, XMM1
    PSRLD       XMM0, XMM1
    PSRLQ       XMM0, XMM1
    PSLLW       XMM0, XMM1
    PSLLD       XMM0, XMM1
    PSLLQ       XMM0, XMM1
    PSRAW       XMM0, XMM1
    PSRAD       XMM0, XMM1

    // --- Packed integer average, absolute value, logic ---
    PAVGB       XMM0, XMM1
    PAVGW       XMM0, XMM1
    PABSB       XMM0, XMM1
    PABSW       XMM0, XMM1
    PABSD       XMM0, XMM1
    PAND        XMM0, XMM1
    PANDN       XMM0, XMM1

    // --- Packed integer add/sub: horizontal, saturating, wrapping ---
    PHADDW      XMM0, XMM1
    PHADDD      XMM0, XMM1
    PHADDSW     XMM0, XMM1
    PADDUSB     XMM0, XMM1
    PADDUSW     XMM0, XMM1
    PADDSB      XMM0, XMM1
    PADDSW      XMM0, XMM1
    PHSUBW      XMM0, XMM1
    PHSUBD      XMM0, XMM1
    PHSUBSW     XMM0, XMM1
    PSUBUSB     XMM0, XMM1
    PSUBUSW     XMM0, XMM1
    PSUBSB      XMM0, XMM1
    PSUBSW      XMM0, XMM1
    PADDB       XMM0, XMM1
    PADDW       XMM0, XMM1
    PADDD       XMM0, XMM1
    PADDQ       XMM0, XMM1
    PSUBB       XMM0, XMM1
    PSUBW       XMM0, XMM1
    PSUBD       XMM0, XMM1
    PSUBQ       XMM0, XMM1

    // --- Packed integer min / max ---
    PHMINPOSUW  XMM0, XMM1
    PMINUB      XMM0, XMM1
    PMINUW      XMM0, XMM1
    PMINUD      XMM0, XMM1
    PMINSB      XMM0, XMM1
    PMINSW      XMM0, XMM1
    PMINSD      XMM0, XMM1
    PMAXUB      XMM0, XMM1
    PMAXUW      XMM0, XMM1
    PMAXUD      XMM0, XMM1
    PMAXSB      XMM0, XMM1
    PMAXSW      XMM0, XMM1
    PMAXSD      XMM0, XMM1

    // --- Packed integer multiply and multiply-add ---
    PMULLW      XMM0, XMM1
    PMULLD      XMM0, XMM1
    PMULHRSW    XMM0, XMM1
    PMULHUW     XMM0, XMM1
    PMULHW      XMM0, XMM1
    PMULUDQ     XMM0, XMM1
    PMULDQ      XMM0, XMM1
    PMADDWD     XMM0, XMM1
    PMADDUBSW   XMM0, XMM1

    // --- Test, whole-register byte shifts, sign, XOR ---
    PTEST       XMM0, XMM1
    PSLLDQ      XMM0, 0x11
    PSRLDQ      XMM0, 0x22
    PSIGNB      XMM0, XMM1
    PSIGNW      XMM0, XMM1
    PSIGND      XMM0, XMM1
    PXOR        XMM0, XMM1

    // --- Pack / unpack ---
    PACKSSWB    XMM0, XMM1
    PACKUSWB    XMM0, XMM1
    PACKSSDW    XMM0, XMM1
    PACKUSDW    XMM0, XMM1
    PUNPCKLBW   XMM0, XMM1
    PUNPCKLWD   XMM0, XMM1
    PUNPCKLDQ   XMM0, XMM1
    PUNPCKLQDQ  XMM0, XMM1
    PUNPCKHBW   XMM0, XMM1
    PUNPCKHWD   XMM0, XMM1
    PUNPCKHDQ   XMM0, XMM1
    PUNPCKHQDQ  XMM0, XMM1

    // --- Sum of absolute differences, INSERTPS ---
    PSADBW      XMM0, XMM1
    MPSADBW     XMM0, XMM1, 0x21
    INSERTPS    XMM0, XMM1, 0x32

    // --- Prefetch hints ---
    PREFETCHT0  [RDX]
    PREFETCHT1  [RDX]
    PREFETCHT2  [RDX]
    PREFETCHNTA [RDX]

}
    return res;
}
// Print a heading, then unassemble DemoAllSSE so the output can be
// checked by eye against the listing in that function.
"\n$BK,1$Unassembling all SSE ops, note errors:\n$BK,0$";
Uf("DemoAllSSE");



U0 DumpXMM()
{// Print XMM0 through XMM7, one register per line, as a 128-bit hex value.
 //
 // Each register is read in two 64-bit halves with PEXTRQ: lane 1 (the
 // upper half) is printed first, followed directly by lane 0 (the lower
 // half) and a newline, so the two halves read as one 32-digit number.
 // XMM8-XMM15 are not dumped.
    I64 reg RAX lane;   // bound to RAX, the register PEXTRQ writes below

    // XMM0
    asm {
        PEXTRQ  RAX, XMM0, 1
    }
    "XMM0: 0x%016X", lane;
    asm {
        PEXTRQ  RAX, XMM0, 0
    }
    "%016X\n", lane;

    // XMM1
    asm {
        PEXTRQ  RAX, XMM1, 1
    }
    "XMM1: 0x%016X", lane;
    asm {
        PEXTRQ  RAX, XMM1, 0
    }
    "%016X\n", lane;

    // XMM2
    asm {
        PEXTRQ  RAX, XMM2, 1
    }
    "XMM2: 0x%016X", lane;
    asm {
        PEXTRQ  RAX, XMM2, 0
    }
    "%016X\n", lane;

    // XMM3
    asm {
        PEXTRQ  RAX, XMM3, 1
    }
    "XMM3: 0x%016X", lane;
    asm {
        PEXTRQ  RAX, XMM3, 0
    }
    "%016X\n", lane;

    // XMM4
    asm {
        PEXTRQ  RAX, XMM4, 1
    }
    "XMM4: 0x%016X", lane;
    asm {
        PEXTRQ  RAX, XMM4, 0
    }
    "%016X\n", lane;

    // XMM5
    asm {
        PEXTRQ  RAX, XMM5, 1
    }
    "XMM5: 0x%016X", lane;
    asm {
        PEXTRQ  RAX, XMM5, 0
    }
    "%016X\n", lane;

    // XMM6
    asm {
        PEXTRQ  RAX, XMM6, 1
    }
    "XMM6: 0x%016X", lane;
    asm {
        PEXTRQ  RAX, XMM6, 0
    }
    "%016X\n", lane;

    // XMM7
    asm {
        PEXTRQ  RAX, XMM7, 1
    }
    "XMM7: 0x%016X", lane;
    asm {
        PEXTRQ  RAX, XMM7, 0
    }
    "%016X\n", lane;

    // Blank lines separate consecutive dumps.
    "\n\n";
}

// Print a heading, unassemble DumpXMM, then leave blank lines before
// whatever is printed next.
"\n\nDump XMM Registers function definition:\n";
Uf("DumpXMM");
"\n\n";

I64 DemoSSE()
{// Runnable SSE demo: seed several XMM registers with known 64-bit
 // patterns, dump them, apply a byte shift and a horizontal add, then dump
 // again so the effect of each instruction is visible.
 //
 // XMM4 and XMM5 are never written here, so DumpXMM shows whatever they
 // already held.
 //
 // NOTE(review): result is bound to RDX, which the first asm block also
 // uses as scratch, so the returned value is whatever RDX holds at the
 // return; the call at the bottom of this file ignores it.
    I64 reg RDX result = 0;

    // MOVQ from a general register fills the low 64 bits of the XMM
    // register and zeroes the high 64 bits.
    asm {
        MOV         RDX, 0x3939393939393939
        MOVQ        XMM0, RDX

        MOV         RDX, 0x7777777777777777
        MOVQ        XMM1, RDX

        MOV         RDX, 0x2021202120212021
        MOVQ        XMM2, RDX

        MOV         RDX, 0x0123456789012345
        MOVQ        XMM3, RDX

        // PHADDD operands. As dwords, lowest first:
        //   XMM6 = 0x5000, 0x4000, 0, 0
        //   XMM7 = 2, 3, 0, 0
        MOV         RDX, 0x0000400000005000
        MOVQ        XMM6, RDX
        MOV         RDX, 0x0000000300000002
        MOVQ        XMM7, RDX
    }
    DumpXMM;    // "before" state

    asm {
        // Shift left by 8 bytes: the low qword moves into the high
        // qword and the low qword becomes zero.
        PSLLDQ  XMM0, 8
        PSLLDQ  XMM1, 8
        PSLLDQ  XMM2, 8
        PSLLDQ  XMM3, 8

        // Horizontal add of adjacent dwords. XMM6 becomes, lowest
        // dword first: 0x9000, 0, 5, 0.
        PHADDD  XMM6, XMM7
    }
    DumpXMM;    // "after" state

    return result;
}


// Print a heading, unassemble DemoSSE, then run it. Its return value
// is not used.
"\n$BK,1$Unassembling  and running SSE demo.\n$BK,0$";

Uf("DemoSSE"); "\n";

DemoSSE;