asm {
                ALIGN   16, OC_NOP
USE16
//See ZealOS MultiCore.

//This code gets copied to MP_VECT_ADDR.
//See MemCopy(mp, COREAP_16BIT_INIT, ...) in Core0StartMP().
//Application processors start here in 16-bit real mode after the
//start-up IPI whose vector corresponds to MP_VECT_ADDR.
COREAP_16BIT_INIT::
                JMP         @@05

                ALIGN   4, OC_NOP
//Limit:base pair for LGDT; filled in from SYS_GDT_PTR by Core0StartMP()
//before the APs are started.  Accessed below as CAP16BitInit.ap_gdt_ptr.
AP_GDT_PTR:     DU16        sizeof(CGDT) - 1;
                DU64        0;

@@05:           CLI
                WBINVD
                MOV         AX, MP_VECT_ADDR / 16   //DS = segment of this copied code
                MOV         DS, AX
                LGDT        U32 [CAP16BitInit.ap_gdt_ptr]  //See mp->ap_gdt_ptr

                MOV         EAX, SYS_START_CR0      //enable protected-mode CR0 bits
                MOV_CR0_EAX

                MOV_EAX_CR4
                BTS         EAX, CR4f_OSFXSR // enable SSE
                MOV_CR4_EAX

//Far jump into 32-bit protected mode, hand-assembled because we are
//still assembling 16-bit code here: 0x66=operand-size prefix, 0xEA=JMP far.
                DU8         0x66, 0xEA;     //JMP CGDT.cs32:AP_32BIT_INIT
                DU32        AP_32BIT_INIT;
                DU16        CGDT.cs32;
COREAP_16BIT_INIT_END::

USE32
AP_32BIT_INIT:
//Load flat data selector into all segment registers.
                MOV         AX, CGDT.ds
                MOV         DS, AX
                MOV         ES, AX
                MOV         FS, AX
                MOV         GS, AX
                MOV         SS, AX

//Spin-lock guarding the core-count variables.
@@05:           LOCK
                BTS         U32 [SYS_MP_COUNT_LOCK], 0
                JC          @@05

                MOV         ESI, U32 [SYS_MP_COUNT_INITIAL] //ESI = this core's number
                LOCK
                INC         U32 [SYS_MP_COUNT_INITIAL]
                LOCK
                BTR         U32 [SYS_MP_COUNT_LOCK], 0      //release lock

//Cores beyond MP_PROCESSORS_NUM just halt.
                CMP         ESI, MP_PROCESSORS_NUM
                JAE         I32 _SYS_HLT

//ESI = &cpu_structs[core_num]
                IMUL2       ESI, sizeof(CCPU)
                ADD         ESI, U32 [SYS_CPU_STRUCTS]

//Switch onto this core's private start stack (top of CCPU.start_stack).
                LEA         ESP, U32 CCPU.start_stack + sizeof(CCPU.start_stack)[ESI]
                PUSH        U32 RFLAGG_START
                POPFD
                PUSH        U32 0   //Return from next call will be 64-bit
                CALL        SYS_ENTER_LONG_MODE
USE64
                FNINIT
                MOV         RAX, RSI                //GS base = this core's CCPU
                CALL        SET_GS_BASE
//Wait for Core0StartMP() to fill in this core's executive task, then
//point that task's gs at our CCPU and make it the current (FS) task.
@@10:           MOV         RAX, U64 CCPU.executive_task[RSI]
                TEST        RAX, RAX
                JZ          @@10
                MOV         U64 CTask.gs[RAX], RSI
                CALL        SET_FS_BASE

                JMP         I32 _TASK_CONTEXT_RESTORE
}

U0 TSSBusy(I64 tr, Bool val=OFF)
{//Set or clear the busy bit in the GDT TSS descriptor at offset tr.
//See ::/Demo/Lectures/Ring3.CC.
    U8 *entry = (&sys_gdt)(U8 *) + tr; //descriptor base in the GDT

    //Bit 9 past byte 4 = bit 1 of the access/type byte (TSS busy bit).
    LBEqual(entry + 4, 9, val);
}

U0 TSSGDTSet(I64 tr, CTSS *tss, U32 attr)
{//Install a 16-byte (64-bit) TSS descriptor into the GDT at offset tr.
//attr is the descriptor's second dword: limit 16:19, flags, access byte.
    U32 *d, *d1;

    d  = (&sys_gdt)(U8 *) + tr;
    d1 = d(U8 *) + 4;
    *d = 0x0000FFFF;            //limit 0:15 = 0xFFFF
    *d1 = attr;
    d(U8 *) += 2;               //now overlaps base 0:23 field
    *d |= tss & 0x00FFFFFF;     //base 0:23
    *d1++ |= tss & 0xFF000000;  //base 24:31 into top byte of 2nd dword
    *d1++ = tss >> 32;          //base 32:63 (upper half of 16-byte descriptor)
    *d1 = 0;                    //reserved dword
}

CTSS *TSSNew(I64 cpu_num)
{//Alloc+init a TSS for cpu_num and install its ring0 and ring3
//descriptors into the GDT.  Returns the new CTSS.
    CTSS *tss = CAlloc(sizeof(CTSS));

    tss->io_map_offset = offset(CTSS.io_map);
    MemSet(tss->io_map, 0xFF, 0x10000 / 8); //all I/O ports denied by default

    //Private interrupt stacks; rspN = top of each stack.
    tss->st0    = MAlloc(MEM_INTERRUPT_STACK);
    tss->rsp0   = tss->st0(U8 *) + MSize(tss->st0);
    tss->st1    = MAlloc(MEM_INTERRUPT_STACK);
    tss->rsp1   = tss->st1(U8 *) + MSize(tss->st1);
    tss->st2    = MAlloc(MEM_INTERRUPT_STACK);
    tss->rsp2   = tss->st2(U8 *) + MSize(tss->st2);

    //Each 64-bit TSS descriptor is 16 bytes; one pair per core.
    tss->tr      = offset(CGDT.tr) + cpu_num * 16;
    tss->tr_ring3= offset(CGDT.tr_ring3) + cpu_num * 16;

    TSSGDTSet(tss->tr,       tss, 0x008F8900); //available TSS, DPL0
    TSSGDTSet(tss->tr_ring3, tss, 0x008FE900); //available TSS, DPL3

    return tss;
}

CCPU *CPUStructInit(I64 num, CCPU *c, CTask *executive_task)
{//Executive is null when called by sys_task on CSysFixedArea.boot_cpu0
//Zero and fill in a CCPU struct for core num; returns c.
    MemSet(c, 0, sizeof(CCPU));
    c->addr         = c;
    c->num          = num;
    c->idle_factor  = 0.01;
    QueueInit(&c->next_dying);
    if (Bt(&sys_run_level, RLf_16MEG_SYSTEM_HEAP_CTRL))
    {//Heaps are up: give the core an idle task and a TSS.
        c->idle_task = Spawn(0, NULL, "Idle Task",, Fs,, 0);
        LBts(&c->idle_task->task_flags, TASKf_IDLE);
        c->tss = TSSNew(num);
    }
    //Must be set last: the AP spins in COREAP_16BIT_INIT's @@10 loop
    //until executive_task becomes nonzero.
    c->executive_task = executive_task;// It waits for this to be filled-in: executive_task

    return c;
}

U0 MPInt(U8 num, I64 cpu_num=1)
{//Generate interrupt for specified core.
//Sends an IPI with vector num through the local APIC interrupt
//command register (ICR).
    if (cpu_num >= mp_count)
    {
        if (!Bt(&sys_run_level, RLf_MP))
            return; //MP not up yet: silently ignore
        else
            throw('MultCore');
    }

    PUSHFD
    CLI //Multitasking safe because each core has a local apic and IRQs are off
    while (*(dev.uncached_alias + LAPIC_ICR_LOW)(U32 *) & 0x1000)
        PAUSE //wait while delivery-status (send pending, bit 12) is set
    *(dev.uncached_alias + LAPIC_ICR_HIGH)(U32 *) = dev.mp_apic_ids[cpu_num] << 24;
    *(dev.uncached_alias + LAPIC_ICR_LOW)(U32 *)  = 0x4000 + num; //assert, fixed delivery
    POPFD
}

U0 MPIntAll(U8 num)
{//Generate interrupt for all but own core.
//0xC4800 = all-excluding-self shorthand, assert, fixed delivery.
    PUSHFD
    CLI //Multitasking safe because each core has a local apic and IRQs are off
    while (*(dev.uncached_alias + LAPIC_ICR_LOW)(U32 *) & 0x1000)
        PAUSE //wait while delivery-status (send pending, bit 12) is set
    *(dev.uncached_alias + LAPIC_ICR_LOW)(U32 *) = 0xC4800 + num;
    POPFD
}

U0 MPNMInt()
{//Generate nonmaskable interrupt.
//0xC4400 = all-excluding-self shorthand, assert, NMI delivery mode.
    *(dev.uncached_alias + LAPIC_ICR_LOW)(U32 *) = 0xC4400;
}

U0 MPHalt()
{//Halt all other cores.
    mp_count = 1;   //stop further IPIs/job queueing to other cores
    MPNMInt;        //nonmaskable interrupt to all other cores
    Busy(10000);    //give them time to stop
}

U0 MPAPICInit()
{//Called by sys_task during start-up
//and other cores during initialization
    //after Core0StartMP().
//Enables this core's local APIC, records its APIC ID, loads the task
//register with this core's TSS, and on APs finishes interrupt setup.
    *(dev.uncached_alias + LAPIC_SVR)(U32 *) |= LAPICF_APIC_ENABLED;
    dev.mp_apic_ids[Gs->num] = *(dev.uncached_alias + LAPIC_APIC_ID)(U32 *) >> 24;
    *(dev.uncached_alias + LAPIC_LDR)(U32 *) = dev.mp_apic_ids[Gs->num] << 24; //logical dest ID
    *(dev.uncached_alias + LAPIC_DFR)(U32 *) = 0xF0000000; //flat destination model

    //  MemSet(dev.uncached_alias + LAPIC_IRR, 0, 0x20);
    //  MemSet(dev.uncached_alias + LAPIC_ISR, 0, 0x20);
    //  MemSet(dev.uncached_alias + LAPIC_TMR, 0, 0x20);

    RAXSet(Gs->tss->tr); //LTR takes the selector from AX
    LTR     AX
    if (Gs->num)
    {//APs only; core0's IDT and flags are set up elsewhere during boot
        IntInit1;
        RFlagsSet(RFLAGG_NORMAL);
    }
}

//NOTE(review): presumably required so the queue links of CJobCtrl and
//CJob line up at offset 0 for QueueInsert/QueueRemove -- TODO confirm.
#assert !offset(CJobCtrl.next_waiting)

U0 CoreAPExecutiveTask()
{//Main loop of each AP's executive task: run queued jobs, else idle
//until JobQueue() wakes us with an IPI.
    CJobCtrl *ctrl = &Fs->server_ctrl;

    while (TRUE)
    {
        STI
        do
        {
            TaskKillDying;
            do PAUSE
            while (LBts(&ctrl->flags, JOBCf_LOCKED)); //acquire queue lock
        }
        //Queue nonempty: run one job.  (Presumably JobRunOne releases
        //the lock while running the job -- TODO confirm.)
        while (ctrl->next_waiting != ctrl && JobRunOne(RFlagsGet, ctrl));

        CLI
        //JobQueue() checks AWAITING_MESSAGE to decide if a wake IPI is
        //needed; set it before releasing the lock to avoid a lost wake.
        LBts(&Fs->task_flags, TASKf_AWAITING_MESSAGE);
        LBtr(&ctrl->flags, JOBCf_LOCKED); //release queue lock
        LBts(&Fs->task_flags, TASKf_IDLE);
        Yield;
        LBtr(&Fs->task_flags, TASKf_IDLE);
    }
}

CJob *JobQueue(I64 (*fp_addr)(U8 *data), U8 *data=NULL, I64 target_cpu=1, I64 flags=1<<JOBf_FREE_ON_COMPLETE, 
               I64 job_code=JOBT_CALL, U8 *aux_str=NULL, I64 aux1=0, I64 aux2=0)
{//Queue multicore jobs, handled by Executive tasks.
//Set flags to zero if you wish to get the res.
    //See ::/Demo/MultiCore/Lock.CC
    CJobCtrl    *ctrl;
    CJob        *tmpc;
    CTask       *executive;

    if (!(0 <= target_cpu < mp_count))
        throw('MultCore');
    tmpc = SysCAlloc(sizeof(CJob)); //system heap: safe to free from any core
    if (aux_str)
        tmpc->aux_str = SysStrNew(aux_str);
    tmpc->job_code  = job_code;
    tmpc->addr      = fp_addr;
    tmpc->fun_arg   = data;
    tmpc->flags     = flags;
    tmpc->aux1      = aux1;
    tmpc->aux2      = aux2;
    executive = cpu_structs[target_cpu].executive_task;
    tmpc->ctrl = ctrl = &executive->server_ctrl;

    PUSHFD
    CLI //IRQs off while holding the queue lock
    while (LBts(&ctrl->flags, JOBCf_LOCKED))
        Yield;
    //Queue was empty and the executive is asleep: wake it with an IPI.
    if (ctrl->next_waiting == ctrl && LBtr(&executive->task_flags, TASKf_AWAITING_MESSAGE))
        MPInt(I_WAKE, target_cpu);
    QueueInsert(tmpc, ctrl->last_waiting); //append at tail
    LBtr(&ctrl->flags, JOBCf_LOCKED); //release queue lock
    POPFD

    return tmpc;
}

CTask *SpawnQueue(U0 (*fp_addr)(U8 *data), U8 *data=NULL, U8 *task_name=NULL, 
                  I64 target_cpu,  CTask *parent=NULL,  //NULL means sys_task
                  I64 stack_size=0, I64 flags=1 << JOBf_ADD_TO_QUE)
{//Spawn a task on another core by queueing a JOBT_SPAWN_TASK job;
//waits for the target core's executive to do it, returns the new task.
    CTask       *res;
    CJob        *tmpc = JobQueue(fp_addr, data, target_cpu, flags, JOBT_SPAWN_TASK, task_name, parent, stack_size);
    CJobCtrl    *ctrl;

    while (!Bt(&tmpc->flags, JOBf_DONE))
    {//idle until the executive completes the spawn
        LBts(&Fs->task_flags, TASKf_IDLE);
        Yield;
    }
    LBtr(&Fs->task_flags, TASKf_IDLE);

    res  = tmpc->spawned_task;
    ctrl = tmpc->ctrl;

    PUSHFD
    CLI
    while (LBts(&ctrl->flags, JOBCf_LOCKED))
        Yield;
    QueueRemove(tmpc); //unlink finished job from the ctrl queue
    LBtr(&ctrl->flags, JOBCf_LOCKED);
    POPFD

    JobDel(tmpc);
    return res;
}

U0 CoreAPExecutiveInit()
{//Called by multicore's executive task after Core0StartMP()
//as the first thing a CPU does before waiting for jobs.
    MPAPICInit;
    Fs->rip = &CoreAPExecutiveTask; //resume point: the executive main loop
    TaskContextRestore;
}

U0 Core0StartMP()
{//Called by sys_task during start-up.
//Copies the 16-bit AP boot code to MP_VECT_ADDR, sends INIT/start-up
//IPIs, counts the cores that come up, and spawns an executive task
//for each one.
    CTask           *task;
    U8               buf[STR_LEN];
    CAP16BitInit    *mp = MP_VECT_ADDR;
    CCPU            *c;
    I64              i, my_mp_count;

    PUSHFD
    CLI
    if (mp_count > 1)
    {//Restart case: halt the cores and discard their pending jobs.
        my_mp_count = mp_count;
        MPHalt; //sets mp_count to 1
        for (i = 1; i < my_mp_count; i++)
        {
            c = &cpu_structs[i];
            JobQueueDel(&c->executive_task->server_ctrl.next_waiting);
            JobQueueDel(&c->executive_task->server_ctrl.next_done);
        }
    }
    MemSet(&cpu_structs[1], 0, sizeof(CCPU) * (MP_PROCESSORS_NUM - 1));

    //When you start-up other cores, they jump to an address
    //specified by a byte vect number, MPN_VECT which corresponds
    //to a location 4096*vect number, MP_VECT_ADDR.
    MemCopy(mp, COREAP_16BIT_INIT, COREAP_16BIT_INIT_END-COREAP_16BIT_INIT);
    MemCopy(&mp->ap_gdt_ptr, SYS_GDT_PTR, sizeof(CSysLimitBase)); //fill AP_GDT_PTR
    mp_count_initial = mp_count = 1;
    mp_count_lock = 0;

    //Set LVT error vector.  (HolyC precedence: & binds tighter than +,
    //so this is (old & 0xFFFFFF00) + MPN_VECT -- unlike C.)
    *(dev.uncached_alias + LAPIC_LVT_ERR)(U32 *) = *(dev.uncached_alias + LAPIC_LVT_ERR)(U32 *) & 0xFFFFFF00 + MPN_VECT;
    WBINVD //Not sure why this is needed. Might just need delay. MemCopy above?

    *(dev.uncached_alias + LAPIC_ICR_LOW)(U32 *) = 0xC4500; //assert init IPI
    Busy(10000);

    //Start-up IPI (sent twice, with a delay) to all-excluding-self.
    *(dev.uncached_alias + LAPIC_ICR_LOW)(U32 *) = 0xC4600 + MPN_VECT; //start-up
    Busy(200);
    *(dev.uncached_alias + LAPIC_ICR_LOW)(U32 *) = 0xC4600 + MPN_VECT;

    Busy(100000); //let APs increment mp_count_initial
    for (i = 0; i < 10000; i++)
        LBts(&mp_count_lock, 0); //Don't let more through
    my_mp_count = mp_count_initial;

    if (my_mp_count > MP_PROCESSORS_NUM)
        my_mp_count = MP_PROCESSORS_NUM;

    for (i = 1; i < my_mp_count; i++)
    {//One executive task per AP; each AP waits for its CCPU.executive_task.
        StrPrint(buf, "Executive CPU%02X", i);
        task = Spawn(&CoreAPExecutiveInit, NULL, buf,,, MEM_EXECUTIVE_STACK, 0);
        task->rflags = RFLAGG_START;
//CTask alloced off this core's executive_task's heap (Which is System task)
        CPUStructInit(i, &cpu_structs[i], task);
        WBINVD //Not sure why this is needed.  Might just need delay.
    }

    //Make sure they're all up-and-running
    for (i = 1; i < my_mp_count; i++)
        while (!Bt(&cpu_structs[i].executive_task->task_flags, TASKf_AWAITING_MESSAGE))
            PAUSE;

    POPFD
    mp_count = my_mp_count; //Finalize count
}

U0 Core0Init()
{//Called by sys_task during start-up
//Sets up single-core counts, the cpu_structs array, core0's CCPU
//struct and GS base, and core0's local APIC.
    mp_count_initial = mp_count = 1;
    mp_count_lock = 0;

    debug.mp_crash = SysCAlloc(sizeof(CMPCrash));

    //Must be in code heap because init code uses 32 bit address of cpu_struct
    sys_task->gs = cpu_structs = CAlloc(sizeof(CCPU) * MP_PROCESSORS_NUM, Fs->code_heap);
    CPUStructInit(0, cpu_structs, sys_task);
    //RAX has GS
    //(RAX still holds CPUStructInit's return value, the CCPU pointer,
    //which becomes the GS base.)
    IMPORT  SET_GS_BASE;
    CALL    SET_GS_BASE
 
    MPAPICInit;
}

interrupt U0 IntMPCrash()
{//Entering the debugger from another core causes an interrupt on Core0
//Which calls this routine.
//Reports the crash details saved in debug.mp_crash, then panics.
    *(dev.uncached_alias + LAPIC_EOI)(U32 *) = 0; //acknowledge interrupt
    mp_count = 1; //drop to single-core
    Raw(ON);
    text.raw_flags |= RAWF_SHOW_DOLLAR;
    "MP Crash CPU%02X Task:%08X\n"
    "RIP:%P\n", debug.mp_crash->cpu_num, debug.mp_crash->task, debug.mp_crash->rip;
    Panic(debug.mp_crash->message, debug.mp_crash->message_num);
}