asm {
//************************************
//See ::/Doc/Credits.DD.
_MALLOC::
// U8 *MAlloc(I64 size, CTask *mem_task=NULL)
//   SF_ARG1 = requested byte count.
//   SF_ARG2 = CTask or CHeapCtrl to allocate from; zero selects the task at FS:CTask.addr.
//   Returns RAX = address of the new chunk's CMemUsed.start.
//   Both args are popped by RET1 16.  RSI and RDI are preserved;
//   RBX, RCX and RDX are used as scratch.
// Throws 'OutMem'
                PUSH        RBP
                MOV         RBP, RSP
                PUSH        RSI
                PUSH        RDI

//Resolve the heap ctrl into RDX.
                XOR         RBX, RBX
                MOV         RDX, U64 SF_ARG2[RBP]
                TEST        RDX, RDX
                JNZ         @@05
                MOV         RDX, U64 FS:CTask.addr[RBX]
@@05:           CMP         U32 CTask.task_signature[RDX], TASK_SIGNATURE_VAL

#assert CTask.task_signature == CHeapCtrl.hc_signature //location signature same

                JNE         @@10
                MOV         RDX, U64 CTask.data_heap[RDX]   //it was a CTask, use its data heap
@@10:           CMP         U32 CHeapCtrl.hc_signature[RDX], HEAP_CTRL_SIGNATURE_VAL
                JE          @@15
                PUSH        RDX                             //neither a CTask nor a CHeapCtrl
                CALL        &SysBadMAlloc
                JMP         I32 _SYS_HLT

@@15:           MOV         RAX, U64 SF_ARG1[RBP]
                PUSHFD                                      //saved flags, restored by POPFD after unlock
                ADD         RAX, CMemUsed.start + 7     //round-up to I64
                AND         AL,  0xF8
#assert CMemUsed.start >= sizeof(CMemUnused)
                CMP         RAX, CMemUsed.start
                JAE         @@20
                MOV         RAX, CMemUsed.start
@@20:

//Spin until the heap ctrl lock bit is ours.
                CLI
@@25:           LOCK
                BTS         U32 CHeapCtrl.locked_flags[RDX], HClf_LOCKED
                PAUSE   //don't know if this instruction helps
                JC          @@25

//Sizes below MEM_HEAP_HASH_SIZE first try heap_hash, indexed by the rounded byte size.
                CMP         RAX, MEM_HEAP_HASH_SIZE
                JAE         @@30
                MOV         RSI, U64 CHeapCtrl.heap_hash[RAX + RDX]
                TEST        RSI, RSI
                JZ          @@35
                MOV         RCX, U64 CMemUnused.next[RSI]   //unlink head of that hash list
                MOV         U64 CHeapCtrl.heap_hash[RAX + RDX], RCX
                JMP         I32 MALLOC_ALMOST_DONE

//Big allocation
@@30:           ADD         RAX, sizeof(CMemBlk) + MEM_PAG_SIZE - 1
                SHR         RAX, MEM_PAG_BITS

                PUSH        RDX //preserve HeapCtrl
                PUSH        RDX
                PUSH        RAX
                CALL        &MemPagTaskAlloc
                POP         RDX
                TEST        RAX, RAX
                JZ          @@45        //Out of memory
                MOV         RSI, RAX
                MOV         EAX, U32 CMemBlk.pags[RSI]

                SHL         RAX, MEM_PAG_BITS
                SUB         RAX, sizeof(CMemBlk)
                ADD         RSI, sizeof(CMemBlk)
                JMP         I32 MALLOC_ALMOST_DONE

//Little allocation, chunk-off piece from free list chunks
@@35:           LEA         RSI, U64 CHeapCtrl.malloc_free_list - CMemUnused.next[RDX]

@@40:           MOV         RBX, RSI                        //RBX = previous entry, RSI = current entry
                MOV         RSI, U64 CMemUnused.next[RBX]
                TEST        RSI, RSI
                JNZ         I32 @@60
//End of free list: get more pages and add them as a new free chunk.
                PUSH        RAX                         //-**** save byte size
                ADD         RAX, 16 * MEM_PAG_SIZE - 1
                SHR         RAX, MEM_PAG_BITS

                PUSH        RDX //preserve HeapCtrl
                PUSH        RDX
                PUSH        RAX
                CALL        &MemPagTaskAlloc
                POP         RDX
                TEST        RAX, RAX
                JNZ         @@50
                ADD         RSP, 8                      //+**** drop saved byte size so POPFD below pops the saved flags

//Out of memory
//Stack top must be the flags pushed by PUSHFD when we arrive here.
@@45:           LOCK
                BTR         U32 CHeapCtrl.locked_flags[RDX], HClf_LOCKED
                POPFD
                PUSH        TRUE
                MOV         RAX, 'OutMem'
                PUSH        RAX
                CALL        I32 &throw
                JMP         I32 MALLOC_FINAL_EXIT //Never gets here, hopefully.

@@50:           MOV         RSI, RAX
                MOV         EAX, U32 CMemBlk.pags[RSI]
                SHL         RAX, MEM_PAG_BITS               //RAX = byte size of the new block

//Can it be combined with last chunk? (Never Free these chunks.)
                MOV         RDI, U64 CHeapCtrl.last_mergable[RDX]
                LEA         RBX, U64 [RSI + RAX]
                CMP         RDI, RBX
                JNE         @@55

                PUSH        RAX
                MOV         EAX, U32 CMemBlk.pags[RDI]
                ADD         U32 CMemBlk.pags[RSI],EAX
//QueueRemove
                MOV         RAX, U64 CMemBlk.next[RDI]
                MOV         RBX, U64 CMemBlk.last[RDI]
                MOV         U64 CMemBlk.last[RAX], RBX
                MOV         U64 CMemBlk.next[RBX], RAX
                POP         RAX

//Push the new block (minus its CMemBlk header) onto the head of malloc_free_list.
@@55:           MOV         U64 CHeapCtrl.last_mergable[RDX], RSI
                LEA         RSI, U64 sizeof(CMemBlk)[RSI]
                SUB         RAX, sizeof(CMemBlk)
                LEA         RBX, U64 CHeapCtrl.malloc_free_list - CMemUnused.next[RDX]
                MOV         RDI, U64 CMemUnused.next[RBX]
                MOV         U64 CMemUnused.next[RSI], RDI
                MOV         U64 CMemUnused.size[RSI], RAX
                MOV         U64 CMemUnused.next[RBX], RSI
                POP         RAX     //+****
                JMP         @@70
@@60:           CMP         U64 CMemUnused.size[RSI], RAX
                JB          I32 @@40                        //too small, try next entry
                JNE         @@70                            //bigger, split it

//Exact fit: unlink the whole entry.
@@65:           MOV         RDI, U64 CMemUnused.next[RSI]
                MOV         U64 CMemUnused.next[RBX], RDI
                JMP         MALLOC_ALMOST_DONE

@@70:           SUB         U64 CMemUnused.size[RSI], RAX       //UPDATE FREE ENTRY
                CMP         U64 CMemUnused.size[RSI], sizeof(CMemUnused)
                JAE         @@75                                        //take from top of block
                ADD         U64 CMemUnused.size[RSI], RAX       //doesn't fit, undo
                JMP         I32 @@40

@@75:           ADD         RSI, U64 CMemUnused.size[RSI]

MALLOC_ALMOST_DONE:
//RSI = res  - CMemUsed.size
//RAX = size + CMemUsed.size
//RDX = HeapCtrl
                ADD         U64 CHeapCtrl.used_u8s[RDX], RAX

#if _CONFIG_HEAP_DEBUG
//QueueInsert
                MOV         RDI, U64 CHeapCtrl.last_um[RDX]
                MOV         U64 CMemUsed.next[RDI], RSI
                MOV         U64 CHeapCtrl.last_um[RDX], RSI
                MOV         U64 CMemUsed.last[RSI], RDI
                LEA         RDI, U64 CHeapCtrl.next_um - CMemUsed.next[RDX]
                MOV         U64 CMemUsed.next[RSI], RDI

//Caller1/Caller2
//Return addresses at or above [MEM_HEAP_LIMIT] are recorded as zero.
                PUSH        RDX
                MOV         RDX, U64 [MEM_HEAP_LIMIT]
                MOV         RDI, U64 SF_RIP[RBP]
                CMP         RDI, RDX
                JB          @@80
                XOR         RDI, RDI
                MOV         U64 CMemUsed.caller1[RSI], RDI
                JMP         @@90
@@80:           MOV         U64 CMemUsed.caller1[RSI], RDI
                MOV         RDI, U64 SF_RBP[RBP]
                CMP         RDI, RDX
                JB          @@85
                XOR         RDI, RDI
                JMP         @@90
@@85:           MOV         RDI, U64 SF_RIP[RDI]
                CMP         RDI, RDX
                JB          @@90
                XOR         RDI, RDI
@@90:           MOV         U64 CMemUsed.caller2[RSI], RDI
                POP         RDX

#endif
//Release the heap lock and restore the saved flags.
                LOCK
                BTR         U32 CHeapCtrl.locked_flags[RDX], HClf_LOCKED
                POPFD

                MOV         U64 CMemUsed.size[RSI], RAX
                MOV         U64 CMemUsed.hc[RSI], RDX
                LEA         RAX, U64 CMemUsed.start[RSI]    //RAX = result

//Optional heap-log hook: called with the result pointer when the semaphore bit is set.
                TEST        U8 [SYS_SEMAS + SEMA_HEAPLOG_ACTIVE * DEFAULT_CACHE_LINE_WIDTH], 1
                JZ          @@105
                PUSH        RAX                             //preserve result
                PUSH        RAX                             //hook arg
                MOV         RAX, U64 [SYS_EXTERN_TABLE]
                MOV         RAX, U64 EXT_HEAPLOG_MALLOC*8[RAX]
                TEST        RAX, RAX
                JZ          @@95
                CALL        RAX
                JMP         @@100
@@95:           ADD         RSP, 8                          //no hook installed, drop the arg
@@100:          POP         RAX

//Optional fill of the new chunk with the byte at [SYS_HEAP_INIT_VAL].
@@105:          TEST        U8 [SYS_HEAP_INIT_FLAG], 1
                JZ          MALLOC_FINAL_EXIT

                PUSH        RAX
                MOV         RCX, U64 CMemUsed.size - CMemUsed.start[RAX]
                SUB         RCX, CMemUsed.start
                MOV         RDI, RAX
                MOV         AL, U8 [SYS_HEAP_INIT_VAL]
                REP_STOSB
                POP         RAX

MALLOC_FINAL_EXIT:
                POP         RDI
                POP         RSI
                POP         RBP
                RET1        16

//************************************
_FREE::
// U0 Free(U8 *addr)
//   SF_ARG1 = pointer previously returned by MAlloc() (or an aligned variant); NULL is ignored.
//   The arg is popped by RET1 8.  RSI and RDI are preserved;
//   RAX, RBX and RDX are used as scratch.
//Be aware of heap_hash in MemPagTaskAlloc().
                PUSH        RBP
                MOV         RBP, RSP
                PUSH        RSI
                PUSH        RDI

//Optional heap-log hook: called with the chunk's base pointer when the semaphore bit is set.
                TEST        U8 [SYS_SEMAS + SEMA_HEAPLOG_ACTIVE * DEFAULT_CACHE_LINE_WIDTH], 1
                JZ          @@15
                MOV         RBX, U64 SF_ARG1[RBP]
                TEST        RBX, RBX
                JZ          @@05
                MOV         RAX, U64 CMemUsed.size - CMemUsed.start[RBX]
                TEST        RAX, RAX
                JGE         @@05        //Aligned alloced chunks have neg size
                ADD         RBX, RAX
@@05:           PUSH        RBX
                MOV         RAX, U64 [SYS_EXTERN_TABLE]
                MOV         RAX, U64 EXT_HEAPLOG_FREE*8[RAX]
                TEST        RAX, RAX
                JZ          @@10
                CALL        RAX
                JMP         @@15
@@10:           ADD         RSP, 8                          //no hook installed, drop the arg

@@15:           MOV         RSI, U64 SF_ARG1[RBP]
                TEST        RSI, RSI

#if _CONFIG_HEAP_DEBUG
                JZ          I32 FREE_DONE
#else
                JZ          FREE_DONE
#endif

                MOV         RAX, U64 CMemUsed.size - CMemUsed.start[RSI]
                TEST        RAX, RAX
                JGE         @@20        //Aligned alloced chunks have neg size.
                                        //The neg size is offset to start of CMemUsed struct.
                ADD         RSI, RAX

@@20:           PUSHFD                                      //saved flags, restored by POPFD after unlock
                SUB         RSI, CMemUsed.start             //RSI = chunk header
                MOV         RDX, U64 CMemUsed.hc[RSI]
                CMP         U32 CHeapCtrl.hc_signature[RDX], HEAP_CTRL_SIGNATURE_VAL
                JE          @@25
                ADD         RSI, CMemUsed.start             //header has no valid heap ctrl
                PUSH        RSI
                CALL        &SysBadFree
                JMP         I32 _SYS_HLT

@@25:           MOV         RAX, U64 CMemUsed.size[RSI]
                CLI
@@30:           LOCK
                BTS         U32 CHeapCtrl.locked_flags[RDX], HClf_LOCKED
                PAUSE
                JC          @@30
//Update the usage counter only while holding the heap lock, as _MALLOC does;
//doing it before the lock is an unlocked read-modify-write that can lose updates.
                SUB         U64 CHeapCtrl.used_u8s[RDX], RAX
#if _CONFIG_HEAP_DEBUG
//QueueRemove
                MOV         RDX, U64 CMemUsed.next[RSI]
                MOV         RDI, U64 CMemUsed.last[RSI]
                MOV         U64 CMemUsed.last[RDX], RDI
                MOV         U64 CMemUsed.next[RDI], RDX

//Caller1/Caller2
//Return addresses at or above [MEM_HEAP_LIMIT] are recorded as zero.
                MOV         RDX, U64 [MEM_HEAP_LIMIT]
                MOV         RDI, U64 SF_RIP[RBP]
                CMP         RDI, RDX
                JB          @@35
                XOR         RDI, RDI
                MOV         U64 CMemUnused.caller1[RSI], RDI
                JMP         @@45
@@35:           MOV         U64 CMemUnused.caller1[RSI], RDI
                MOV         RDI, U64 SF_RBP[RBP]
                CMP         RDI, RDX
                JB          @@40
                XOR         RDI, RDI
                JMP         @@45
@@40:           MOV         RDI, U64 SF_RIP[RDI]
                CMP         RDI, RDX
                JB          @@45
                XOR         RDI, RDI
@@45:           MOV         U64 CMemUnused.caller2[RSI], RDI

                MOV         RDX, U64 CMemUsed.hc[RSI]       //RDX was clobbered above, reload heap ctrl
#endif
                CMP         RAX, MEM_HEAP_HASH_SIZE
                JAE         @@50

#assert CMemUnused.size == CMemUsed.size
//              MOV         U64 CMemUnused.size[RSI], RAX

//Small chunk: push onto the heap_hash list for its byte size.
                MOV         RBX, U64 CHeapCtrl.heap_hash[RAX + RDX]
                MOV         U64 CMemUnused.next[RSI], RBX
                MOV         U64 CHeapCtrl.heap_hash[RAX + RDX], RSI
                JMP         @@55

//Big chunk: hand the enclosing CMemBlk back via MemPagTaskFree.
@@50:           SUB         RSI, sizeof(CMemBlk)
                PUSH        RDX                             //preserve HeapCtrl
                PUSH        RDX
                PUSH        RSI
                CALL        &MemPagTaskFree
                POP         RDX

@@55:           LOCK
                BTR         U32 CHeapCtrl.locked_flags[RDX], HClf_LOCKED
                POPFD
FREE_DONE:
                POP         RDI
                POP         RSI
                POP         RBP
                RET1        8

//************************************
_MSIZE::
// I64 MSize(U8 *src)
//   Returns RAX = the chunk's CMemUsed.size minus CMemUsed.start, or 0 when src is NULL.
//   RBX is used as scratch; the arg is popped by RET1 8.
                PUSH        RBP
                MOV         RBP, RSP
                XOR         RAX, RAX                        //result for a NULL src
                MOV         RBX, U64 SF_ARG1[RBP]
                TEST        RBX, RBX
                JZ          @@20
                MOV         RAX, U64 CMemUsed.size - CMemUsed.start[RBX]
                TEST        RAX, RAX
                JGE         @@10        //Aligned alloced chunks have neg size
                ADD         RBX, RAX                        //neg size is the offset back to the real chunk
                MOV         RAX, U64 CMemUsed.size - CMemUsed.start[RBX]
@@10:           SUB         RAX, CMemUsed.start
@@20:           POP         RBP
                RET1        8

//************************************
_MSIZE2::
// I64 MSize2(U8 *src)
//   Returns RAX = the chunk's stored CMemUsed.size (header not subtracted), or 0 when src is NULL.
//   RBX is used as scratch; the arg is popped by RET1 8.
                PUSH        RBP
                MOV         RBP, RSP
                XOR         RAX, RAX                        //result for a NULL src
                MOV         RBX, U64 SF_ARG1[RBP]
                TEST        RBX, RBX
                JZ          @@20
                MOV         RAX, U64 CMemUsed.size - CMemUsed.start[RBX]
                TEST        RAX, RAX
                JGE         @@20        //Non-negative: RAX already holds the stored size
                ADD         RBX, RAX    //Aligned alloced chunks have neg size
                MOV         RAX, U64 CMemUsed.size - CMemUsed.start[RBX]
@@20:           POP         RBP
                RET1        8

//************************************
_MHEAP_CTRL::
// CHeapCtrl *MHeapCtrl(U8 *src)
//   Returns RAX = the CMemUsed.hc field of the chunk holding src, or 0 when src is NULL.
//   RBX is used as scratch; the arg is popped by RET1 8.
                PUSH        RBP
                MOV         RBP, RSP
                XOR         RAX, RAX                        //result for a NULL src
                MOV         RBX, U64 SF_ARG1[RBP]
                TEST        RBX, RBX
                JZ          @@20
                MOV         RAX, U64 CMemUsed.size - CMemUsed.start[RBX]
                TEST        RAX, RAX
                JGE         @@10        //Aligned alloced chunks have neg size
                ADD         RBX, RAX                        //neg size is the offset back to the real chunk
@@10:           MOV         RAX, U64 CMemUsed.hc - CMemUsed.start[RBX]
@@20:           POP         RBP
                RET1        8
}

//Prototypes binding the assembly labels from the asm block above to callable functions.
//MSize() subtracts CMemUsed.start from the stored size; MSize2() returns the stored size as-is.
_extern _FREE       U0           Free(U8 *addr); //Free MAlloc()ed memory chunk.
_extern _MSIZE      I64          MSize(    U8 *src); //Size of heap object.
_extern _MSIZE2     I64          MSize2(   U8 *src); //Internal size of heap object.
_extern _MHEAP_CTRL CHeapCtrl   *MHeapCtrl(U8 *src); //CHeapCtrl of object.
_extern _MALLOC     U8          *MAlloc(I64 size, CTask *mem_task=NULL); //Alloc memory chunk.
//Accepts a CTask or CHeapCtrl. NULL allocs off current task's heap.

U8 *SysMAlloc(I64 size)
{//MAlloc() size bytes, passing sys_task as the heap owner.
    U8 *res = MAlloc(size, sys_task);

    return res;
}

U8 *CAlloc(I64 size, CTask *mem_task=NULL)
{//Accepts a CTask or CHeapCtrl.  NULL allocs off current task's heap.
//Like MAlloc(), but the first size bytes of the chunk are set to zero.
    U8 *chunk;

    chunk = MAlloc(size, mem_task);
    MemSet(chunk, 0, size);

    return chunk;
}

U8 *SysCAlloc(I64 size)
{//CAlloc() size bytes, passing sys_task as the heap owner.
    U8 *res = CAlloc(size, sys_task);

    return res;
}

U8 *MAllocIdent(U8 *src, CTask *mem_task=NULL)
{//Accepts a CTask or CHeapCtrl.  NULL allocs off current task's heap.
//Duplicates heap object src: allocs MSize(src) bytes and copies them.
//A NULL src yields NULL.
    U8 *dup = NULL;
    I64 count;

    if (src)
    {
        count = MSize(src);
        dup   = MAlloc(count, mem_task);
        MemCopy(dup, src, count);
    }

    return dup;
}

U8 *SysMAllocIdent(U8 *src)
{//MAllocIdent() of src, passing sys_task as the heap owner.
    U8 *res = MAllocIdent(src, sys_task);

    return res;
}

U8 *MAllocAligned(I64 size, I64 alignment, CTask *mem_task=NULL, I64 misalignment=0)
{//Only powers of two alignment. This is awful.
//Over-allocates by mask + sizeof(I64) + misalignment bytes, then stores
//(chunk start - result) in the I64 just below the returned address.
    I64  mask = alignment - 1;
    U8  *base,
        *res;

    base = MAlloc(size + mask + sizeof(I64) + misalignment, mem_task);
    res  = (base + sizeof(I64) + mask) & ~mask + misalignment;

    res(I64 *)[-1] = base - res;
#assert offset(CMemUsed.size) == offset(CMemUsed.start) - sizeof(I64)

    return res;
}

U8 *CAllocAligned(I64 size, I64 alignment, CTask *mem_task=NULL, I64 misalignment=0)
{//Only powers of two alignment. This is awful.
//Same layout as MAllocAligned(), with the first size bytes set to zero.
    U8 *res = MAllocAligned(size, alignment, mem_task, misalignment);

    MemSet(res, 0, size);

    return res;
}

U8 *ReAlloc(U8 *ptr, U64 new_size, CTask *mem_task=NULL)
{//Resize a previously MAlloc'ed chunk by allocating a new one and copying.
//new_size == 0: acts as Free(ptr) and returns NULL (ptr may be NULL too).
//ptr == NULL:   acts as MAlloc(new_size, mem_task).
//Otherwise copies the smaller of MSize(ptr) and new_size bytes, then frees ptr.
//Useless for changing chunk sizes smaller than 8 bytes because MAlloc allocs 8 bytes at a time.
    U8 *res = NULL;

    if (new_size)
    {
        res = MAlloc(new_size, mem_task);
        if (ptr)
        {
            MemCopy(res, ptr, MinI64(MSize(ptr), new_size));
            Free(ptr);
        }
    }
    else
        Free(ptr); //we can free NULL

    return res;
}

U8 *SysReAlloc(U8 *ptr, I64 new_size)
{//ReAlloc() of ptr to new_size, passing sys_task as the heap owner.
    U8 *res = ReAlloc(ptr, new_size, sys_task);

    return res;
}

U8 *StrNew(U8 *buf, CTask *mem_task=NULL)
{//Accepts a CTask or CHeapCtrl.  NULL allocs off current task's heap.
//Returns a heap copy of string buf including its terminator.
//A NULL buf yields a one-byte chunk holding just the terminator.
    U8 *copy;
    I64 count;

    if (!buf)
    {
        copy  = MAlloc(1, mem_task);
        *copy = 0;
        return copy;
    }

    count = StrLen(buf) + 1;
    copy  = MAlloc(count, mem_task);
    MemCopy(copy, buf, count);

    return copy;
}

U8 *SysStrNew(U8 *buf)
{//StrNew() copy of buf, passing sys_task as the heap owner.
    U8 *res = StrNew(buf, sys_task);

    return res;
}