asm {
                                ALIGN   16, OC_NOP
USE16
//See ZealOS MultiCore.

//Start-up path for the non-boot cores.
//The bytes from COREAP_16BIT_INIT up to COREAP_16BIT_INIT_END are the only
//part that is relocated; everything from AP_32BIT_INIT on runs in place.
//This code gets copied to MP_VECT_ADDR.
//See MemCopy(MP_VECT_ADDR.
COREAP_16BIT_INIT::
//Skip over the embedded GDT pointer data.
                                JMP             @@05

                                ALIGN   4, OC_NOP
//Placeholder limit/base pair. Core0StartMP() copies SYS_GDT_PTR into
//mp->ap_gdt_ptr of the relocated copy before the cores are started
//(presumably this same location -- confirm against CAP16BitInit).
AP_GDT_PTR:     DU16            sizeof(CGDT) - 1;
                                DU64            0;

@@05:                   CLI
                                WBINVD
//DS = segment of the relocated copy so the LGDT operand below, an offset
//within CAP16BitInit, addresses the copy at MP_VECT_ADDR.
                                MOV             AX, MP_VECT_ADDR / 16
                                MOV             DS, AX
                                LGDT            U32 [CAP16BitInit.ap_gdt_ptr]  //See mp->ap_gdt_ptr

//Load CR0 with SYS_START_CR0, then far-jump through the 32-bit code selector.
                                MOV             EAX, SYS_START_CR0
                                MOV_CR0_EAX
//The far jump is hand-encoded: operand-size prefix 0x66, opcode 0xEA,
//then a 32-bit offset and a 16-bit selector.
                                DU8             0x66, 0xEA;             //JMP CGDT.cs32:AP_32BIT_INIT
                                DU32            AP_32BIT_INIT;
                                DU16            CGDT.cs32;
COREAP_16BIT_INIT_END::

USE32
AP_32BIT_INIT:
//Load every data segment register, and SS, with the CGDT.ds selector.
                                MOV             AX, CGDT.ds
                                MOV             DS, AX
                                MOV             ES, AX
                                MOV             FS, AX
                                MOV             GS, AX
                                MOV             SS, AX

//Spin until bit 0 of SYS_MP_COUNT_LOCK is acquired.
//Core0StartMP() later sets this bit itself to stop further cores here.
@@05:                   LOCK
                                BTS             U32 [SYS_MP_COUNT_LOCK], 0
                                JC                      @@05

//ESI = this core's index (the counter value before the increment).
                                MOV             ESI, U32 [SYS_MP_COUNT_INITIAL]
                                LOCK
                                INC             U32 [SYS_MP_COUNT_INITIAL]
//Release the lock bit.
                                LOCK
                                BTR             U32 [SYS_MP_COUNT_LOCK], 0

//Cores with an index of MP_PROCESSORS_NUM or more go to _SYS_HLT.
                                CMP             ESI, MP_PROCESSORS_NUM
                                JAE             I32 _SYS_HLT

//ESI = address of this core's CCPU: index * sizeof(CCPU) + [SYS_CPU_STRUCTS].
//Only 32 bits of the array address are used here.
                                IMUL2           ESI, sizeof(CCPU)
                                ADD             ESI, U32 [SYS_CPU_STRUCTS]

//ESP = one past the end of CCPU.start_stack, then load flags with RFLAGG_START.
                                LEA             ESP, U32 CCPU.start_stack + sizeof(CCPU.start_stack)[ESI]
                                PUSH            U32 RFLAGG_START
                                POPFD
//NOTE(review): the extra zero presumably becomes the upper half of the
//8-byte return address used once in 64-bit mode -- confirm in SYS_ENTER_LONG_MODE.
                                PUSH            U32 0   //Return from next call will be 64-bit
                                CALL            SYS_ENTER_LONG_MODE
USE64
                                FNINIT
//GS base = this core's CCPU. SET_GS_BASE takes its argument in RAX.
                                MOV             RAX, RSI
                                CALL            SET_GS_BASE
//Spin until CCPU.executive_task is non-zero; CPUStructInit() stores it as
//its last write to the struct.
@@10:                   MOV             RAX, U64 CCPU.executive_task[RSI]
                                TEST            RAX, RAX
                                JZ                      @@10
//Record the CCPU address in the task, then call SET_FS_BASE with RAX still
//holding the task pointer.
                                MOV             U64 CTask.gs[RAX], RSI
                                CALL            SET_FS_BASE

//Does not return here.
                                JMP             I32 _TASK_CONTEXT_RESTORE
}

U0 TSSBusy(I64 tr, Bool val=OFF)
{//Force bit 9 of the second U32 of a GDT descriptor to val.
//tr is the byte offset of the descriptor inside sys_gdt.
//See ::/Demo/Lectures/Ring3.CC.
        U8 *desc_upper = (&sys_gdt)(U8 *) + tr + 4; //second U32 of the descriptor

        LBEqual(desc_upper, 9, val);
}

CTSS *TSSNew(I64 cpu_num)
{//Allocate a CTSS for core cpu_num and write its two 16-byte descriptors
//(tr and tr_ring3) into sys_gdt. Returns the new CTSS.
        U32  *d, *d1;
        CTSS *tss = CAlloc(sizeof(CTSS));

        //Fill the whole I/O map (0x10000 bits) with ones.
        tss->io_map_offset = offset(CTSS.io_map);
        MemSet(tss->io_map, 0xFF, 0x10000 / 8);

        //Three stacks of MEM_INTERRUPT_STACK bytes; each rspN is set to the
        //end of its allocation as reported by MSize().
        tss->st0        = MAlloc(MEM_INTERRUPT_STACK);
        tss->rsp0       = tss->st0(U8 *) + MSize(tss->st0);
        tss->st1        = MAlloc(MEM_INTERRUPT_STACK);
        tss->rsp1       = tss->st1(U8 *) + MSize(tss->st1);
        tss->st2        = MAlloc(MEM_INTERRUPT_STACK);
        tss->rsp2       = tss->st2(U8 *) + MSize(tss->st2);

        //Each core owns one 16-byte slot in each of the two selector ranges.
        tss->tr          = offset(CGDT.tr) + cpu_num * 16;
        tss->tr_ring3= offset(CGDT.tr_ring3) + cpu_num * 16;

        //Build the tr descriptor. d is the first U32, d1 the second.
        d = (&sys_gdt)(U8 *) + tss->tr;
        d1 = d(U8 *) + 4;
        *d = 0x0000FFFF;
        *d1 = 0x008F8900;
        //Step d forward 2 bytes so the 32-bit OR lands on descriptor bytes 2..5;
        //the mask limits the address contribution to bytes 2..4.
        d(U8 *) += 2;
        *d |= tss & 0x00FFFFFF;
        //Address bits 24..31 go in the top byte of the second U32,
        //bits 32..63 in the third U32, and the fourth U32 is zero.
        *d1++ |= tss & 0xFF000000;
        *d1++ = tss >> 32;
        *d1 = 0;

        //Same layout for tr_ring3; only the second-U32 constant differs
        //(0x008FE900 instead of 0x008F8900).
        d = (&sys_gdt)(U8 *) + tss->tr_ring3;
        d1 = d(U8 *) + 4;
        *d = 0x0000FFFF;
        *d1 = 0x008FE900;
        d(U8 *) += 2;
        *d |= tss & 0x00FFFFFF;
        *d1++ |= tss & 0xFF000000;
        *d1++ = tss >> 32;
        *d1 = 0;

        return tss;
}

CCPU *CPUStructInit(I64 num, CCPU *c, CTask *executive_task)
{//Clear and fill in the per-core record c for core number num. Returns c.
//executive_task is null when called by system_task on CSysFixedArea.boot_cpu0.
        Bool heap_ready;

        MemSet(c, 0, sizeof(CCPU));
        c->num          = num;
        c->addr         = c;
        c->idle_factor  = 0.01;
        QueueInit(&c->next_dying);

        //Idle task and TSS are only created once this run-level bit is set.
        heap_ready = Bt(&sys_run_level, RLf_16MEG_SYSTEM_HEAP_CTRL);
        if (heap_ready)
        {
                c->idle_task = Spawn(0, NULL, "Idle Task",, Fs,, 0);
                LBts(&c->idle_task->task_flags, TASKf_IDLE);
                c->tss = TSSNew(num);
        }

        //Stored last: the core start-up code spins until this field is non-zero.
        c->executive_task = executive_task;

        return c;
}

U0 MPInt(U8 num, I64 cpu_num=1)
{//Generate interrupt for specified core.
//num     - interrupt vector to deliver.
//cpu_num - index of the target core; valid range is 0 to mp_count-1.
//An out-of-range cpu_num is ignored while RLf_MP is not yet set in
//sys_run_level, and throws 'MultCore' once it is set.
        //Reject negative indices as well as too-large ones, the same range
        //test JobQueue() uses, so dev.mp_apic_ids[] is never indexed out of bounds.
        if (!(0 <= cpu_num < mp_count))
        {
                if (!Bt(&sys_run_level, RLf_MP))
                        return;
                else
                        throw('MultCore');
        }

        PUSHFD
        CLI //Multitasking safe because each core has a local apic and IRQs are off
        //Wait while bit 12 of ICR low is set (a previous send is still pending).
        while (*(dev.uncached_alias + LAPIC_ICR_LOW)(U32 *) & 0x1000)
                PAUSE
        //Destination APIC id goes in the top byte of ICR high; the write to
        //ICR low then sends vector num with bit 14 set.
        *(dev.uncached_alias + LAPIC_ICR_HIGH)(U32 *) = dev.mp_apic_ids[cpu_num] << 24;
        *(dev.uncached_alias + LAPIC_ICR_LOW)(U32 *)  = 0x4000 + num;
        POPFD
}

U0 MPIntAll(U8 num)
{//Generate interrupt for all but own core.
//num is the interrupt vector, added into the low byte of the command word.
        U32 *icr_low = dev.uncached_alias + LAPIC_ICR_LOW;

        PUSHFD
        CLI //Multitasking safe because each core has a local apic and IRQs are off
        //Wait while bit 12 of the command register is still set.
        while (*icr_low & 0x1000)
                PAUSE
        *icr_low = 0xC4800 + num;
        POPFD
}

U0 MPNMInt()
{//Generate nonmaskable interrupt.
//Writes the command word directly; unlike MPInt() it does not wait for a
//pending send and does not touch the interrupt flag.
        U32 *icr_low = dev.uncached_alias + LAPIC_ICR_LOW;

        *icr_low = 0xC4400;
}

U0 MPHalt()
{//Halt all other cores.
        //Drop the core count first so MPInt() and JobQueue() reject other cores.
        mp_count = 1;

        //Send the nonmaskable interrupt, then busy-wait before returning.
        MPNMInt();
        Busy(10000);
}

U0 MPAPICInit()
{//Called by system_task during start-up
//and other cores during initialization
        //after Core0StartMP().
//Enables the local APIC of the calling core, records its APIC id under
//Gs->num, and loads the task register with this core's TSS selector.
        *(dev.uncached_alias + LAPIC_SVR)(U32 *) |= LAPICF_APIC_ENABLED;
        //The APIC id is read from the top byte of the id register, saved for
        //MPInt(), and written back into the top byte of LAPIC_LDR.
        dev.mp_apic_ids[Gs->num] = *(dev.uncached_alias + LAPIC_APIC_ID)(U32 *) >> 24;
        *(dev.uncached_alias + LAPIC_LDR)(U32 *) = dev.mp_apic_ids[Gs->num] << 24;
        *(dev.uncached_alias + LAPIC_DFR)(U32 *) = 0xF0000000;

        //      MemSet(dev.uncached_alias + LAPIC_IRR, 0, 0x20);
        //      MemSet(dev.uncached_alias + LAPIC_ISR, 0, 0x20);
        //      MemSet(dev.uncached_alias + LAPIC_TMR, 0, 0x20);

        //RAXSet() places the selector in RAX for the LTR instruction that
        //follows; nothing may be inserted between these two lines.
        RAXSet(Gs->tss->tr);
        LTR     AX
        //Only cores other than core 0 run IntInit1 and switch to RFLAGG_NORMAL here.
        if (Gs->num)
        {
                IntInit1;
                RFlagsSet(RFLAGG_NORMAL);
        }
}

//The code below uses the CJobCtrl address itself as the head of the waiting
//queue (ctrl->next_waiting is compared against ctrl to detect an empty queue),
//so next_waiting must be the first member.
#assert !offset(CJobCtrl.next_waiting)

U0 CoreAPExecutiveTask()
{//Main loop of a non-boot core's executive task: run queued jobs from
//Fs->server_ctrl, then mark the task idle and yield until woken. Never returns.
        CJobCtrl *ctrl = &Fs->server_ctrl;

        while (TRUE)
        {
                STI
                do
                {
                        TaskKillDying;
                        //Spin until this task sets JOBCf_LOCKED itself.
                        do PAUSE
                        while (LBts(&ctrl->flags, JOBCf_LOCKED));
                }
                //Continue while the waiting queue is non-empty and JobRunOne()
                //returns non-zero. NOTE(review): JobRunOne() presumably clears
                //JOBCf_LOCKED before returning, since the loop takes it again -- confirm.
                while (ctrl->next_waiting != ctrl && JobRunOne(RFlagsGet, ctrl));

                CLI
                //Set TASKf_AWAITING_MESSAGE before releasing the lock: JobQueue()
                //tests this flag under the same lock to decide whether to send I_WAKE.
                LBts(&Fs->task_flags, TASKf_AWAITING_MESSAGE);
                LBtr(&ctrl->flags, JOBCf_LOCKED);
                LBts(&Fs->task_flags, TASKf_IDLE);
                Yield;
                LBtr(&Fs->task_flags, TASKf_IDLE);
        }
}

CJob *JobQueue(I64 (*fp_addr)(U8 *data), U8 *data=NULL, I64 target_cpu=1, I64 flags=1<<JOBf_FREE_ON_COMPLETE, 
                           I64 job_code=JOBT_CALL, U8 *aux_str=NULL, I64 aux1=0, I64 aux2=0)
{//Queue multicore jobs, handled by Executive tasks.
//Set flags to zero if you wish to get the res.
//Throws 'MultCore' when target_cpu is outside 0..mp_count-1.
//Returns the newly queued CJob.
        //See ::/Demo/MultiCore/Lock.CC
        CTask           *exe;
        CJobCtrl        *queue;
        CJob            *job;

        if (target_cpu < 0 || target_cpu >= mp_count)
                throw('MultCore');

        //Build the job record.
        job = ZCAlloc(sizeof(CJob));
        if (aux_str)
                job->aux_str = ZStrNew(aux_str);
        job->job_code = job_code;
        job->addr     = fp_addr;
        job->fun_arg  = data;
        job->flags    = flags;
        job->aux1     = aux1;
        job->aux2     = aux2;

        //The job goes on the server queue of the target core's executive task.
        exe   = cpu_structs[target_cpu].executive_task;
        queue = &exe->server_ctrl;
        job->ctrl = queue;

        PUSHFD
        CLI
        while (LBts(&queue->flags, JOBCf_LOCKED))
                Yield;
        //Wake the executive only if its queue was empty and it had flagged
        //itself as awaiting a message; the flag is cleared in the same step.
        if (queue->next_waiting == queue && LBtr(&exe->task_flags, TASKf_AWAITING_MESSAGE))
                MPInt(I_WAKE, target_cpu);
        QueueInsert(job, queue->last_waiting);
        LBtr(&queue->flags, JOBCf_LOCKED);
        POPFD

        return job;
}

CTask *SpawnQueue(U0 (*fp_addr)(U8 *data), U8 *data=NULL, U8 *task_name=NULL, 
                                  I64 target_cpu,  CTask *parent=NULL,  //NULL means system_task
                                  I64 stack_size=0, I64 flags=1 << JOBf_ADD_TO_QUE)
{//Queue a JOBT_SPAWN_TASK job on target_cpu, wait for it to finish,
//then remove and delete the job. Returns the job's spawned_task.
        CJob            *job;
        CJobCtrl        *queue;
        CTask           *spawned;

        job = JobQueue(fp_addr, data, target_cpu, flags, JOBT_SPAWN_TASK, task_name, parent, stack_size);

        //Yield, flagged idle, until the executive marks the job done.
        while (TRUE)
        {
                if (Bt(&job->flags, JOBf_DONE))
                        break;
                LBts(&Fs->task_flags, TASKf_IDLE);
                Yield;
        }
        LBtr(&Fs->task_flags, TASKf_IDLE);

        spawned = job->spawned_task;
        queue   = job->ctrl;

        //Unlink the job under the queue lock with interrupts off.
        PUSHFD
        CLI
        while (LBts(&queue->flags, JOBCf_LOCKED))
                Yield;
        QueueRemove(job);
        LBtr(&queue->flags, JOBCf_LOCKED);
        POPFD

        JobDel(job);
        return spawned;
}

U0 CoreAPExecutiveInit()
{//Called by multicore's executive task after Core0StartMP()
//as the first thing a CPU does before waiting for jobs.
        CTask *self;

        MPAPICInit();

        //Point this task's saved rip at the job loop, then reload the task context.
        self = Fs;
        self->rip = &CoreAPExecutiveTask;
        TaskContextRestore();
}

U0 Core0StartMP()
{//Called by system_task during start-up.
//Halts any cores that are already running, installs the start-up code at
//MP_VECT_ADDR, starts the other cores, and gives each one an executive task.
//Runs with interrupts off until just before mp_count is finalized.
        CTask                   *task;
        U8                               buf[STR_LEN];
        CAP16BitInit    *mp = MP_VECT_ADDR;
        CCPU                    *c;
        I64                              i, my_mp_count;

        PUSHFD
        CLI
        //Restart case: stop the other cores and discard their queued jobs.
        if (mp_count > 1)
        {
                my_mp_count = mp_count;
                MPHalt; //sets mp_count to 1
                for (i = 1; i < my_mp_count; i++)
                {
                        c = &cpu_structs[i];
                        JobQueueDel(&c->executive_task->server_ctrl.next_waiting);
                        JobQueueDel(&c->executive_task->server_ctrl.next_done);
                }
        }
        //Clear every CCPU except core 0's. A zero executive_task makes a
        //starting core spin until CPUStructInit() below fills it in.
        MemSet(&cpu_structs[1], 0, sizeof(CCPU) * (MP_PROCESSORS_NUM - 1));

        //When you start-up other cores, they jump to an addr
        //specified by a byte vect number, MPN_VECT which corresponds
        //to a location 4096*vect number, MP_VECT_ADDR.
        MemCopy(mp, COREAP_16BIT_INIT, COREAP_16BIT_INIT_END-COREAP_16BIT_INIT);
        //Give the copied code the live GDT limit/base for its LGDT.
        MemCopy(&mp->ap_gdt_ptr, SYS_GDT_PTR, sizeof(CSysLimitBase));
        mp_count_initial = mp_count = 1;
        mp_count_lock = 0;

        //Replace the low byte of LAPIC_LVT_ERR with MPN_VECT.
        //NOTE(review): this relies on & binding tighter than + in this
        //language (unlike C) -- confirm before porting or reformatting.
        *(dev.uncached_alias + LAPIC_LVT_ERR)(U32 *) = *(dev.uncached_alias + LAPIC_LVT_ERR)(U32 *) & 0xFFFFFF00 + MPN_VECT;
        WBINVD //Not sure why this is needed. Might just need delay. MemCopy above?

        *(dev.uncached_alias + LAPIC_ICR_LOW)(U32 *) = 0xC4500; //assert init IPI
        Busy(10000);

        //The start-up command is sent twice, 200 Busy() units apart.
        *(dev.uncached_alias + LAPIC_ICR_LOW)(U32 *) = 0xC4600 + MPN_VECT; //start-up
        Busy(200);
        *(dev.uncached_alias + LAPIC_ICR_LOW)(U32 *) = 0xC4600 + MPN_VECT;

        //Give the cores time to count themselves, then take the counter lock
        //and keep it; the bit is left set so any late core spins forever.
        Busy(100000);
        for (i = 0; i < 10000; i++)
                LBts(&mp_count_lock, 0); //Don't let more through
        my_mp_count = mp_count_initial;

        if (my_mp_count > MP_PROCESSORS_NUM)
                my_mp_count = MP_PROCESSORS_NUM;

        //Create one executive task per started core. CPUStructInit() stores
        //the task pointer that the core's start-up code is spinning on.
        for (i = 1; i < my_mp_count; i++)
        {
                StrPrint(buf, "Executive Task CPU%02X", i);
                task = Spawn(&CoreAPExecutiveInit, NULL, buf,,, MEM_EXECUTIVE_STACK, 0);
                task->rflags = RFLAGG_START;
//CTask alloced off this core's executive_task's heap (Which is System task)
                CPUStructInit(i, &cpu_structs[i], task);
                WBINVD //Not sure why this is needed.  Might just need delay.
        }

        //Make sure they're all up-and-running
        //(CoreAPExecutiveTask() sets TASKf_AWAITING_MESSAGE when it first goes idle.)
        for (i = 1; i < my_mp_count; i++)
                while (!Bt(&cpu_structs[i].executive_task->task_flags, TASKf_AWAITING_MESSAGE))
                        PAUSE;

        POPFD
        mp_count = my_mp_count; //Finalize count
}

U0 Core0Init()
{//Called by system_task during start-up
//Sets up the core counters, the crash record, the CCPU array and core 0's
//own CCPU, GS base and local APIC.
        mp_count_initial = mp_count = 1;
        mp_count_lock = 0;

        debug.mp_crash = ZCAlloc(sizeof(CMPCrash));

        //Must be in code heap because init code uses 32 bit addr of cpu_struct
        sys_task->gs = cpu_structs = CAlloc(sizeof(CCPU) * MP_PROCESSORS_NUM, Fs->code_heap);
        CPUStructInit(0, cpu_structs, sys_task);
        //RAX has GS
        //(the value CPUStructInit() just returned, i.e. cpu_structs). Nothing
        //may be inserted between that call and the CALL below.
        IMPORT  SET_GS_BASE;
        CALL    SET_GS_BASE
 
        MPAPICInit;
}

interrupt U0 IntMPCrash()
{//Entering the debugger from another core causes an interrupt on Core0
//Which calls this routine.
//Prints the contents of debug.mp_crash in raw text mode, then calls Panic().
        CMPCrash *crash;

        *(dev.uncached_alias + LAPIC_EOI)(U32 *) = 0;
        mp_count = 1;

        Raw(ON);
        text.raw_flags |= RAWF_SHOW_DOLLAR;

        crash = debug.mp_crash;
        "MP Crash CPU%02X Task:%08X\n"
        "RIP:%P\n", crash->cpu_num, crash->task, crash->rip;
        Panic(crash->message, crash->message_num);
}