mirror of
https://github.com/Zeal-Operating-System/ZealOS.git
synced 2024-12-25 23:10:32 +00:00
fix: bare-metal optimizations progress
This commit is contained in:
parent
4eaf4f7205
commit
de2b050a6f
3 changed files with 69 additions and 16 deletions
|
@ -68,7 +68,20 @@ _MEMCOPY::
|
|||
MOV RDI, U64 SF_ARG1[RBP] // dst
|
||||
MOV RSI, U64 SF_ARG2[RBP] // src
|
||||
MOV RCX, U64 SF_ARG3[RBP] // count
|
||||
@@05: // SSE 128-bit memcopy (count >= 16)
|
||||
TEST RDI, 0xF // check if dst is 16byte aligned
|
||||
JNZ @@05 // if not 16byte aligned, jump down to MOVUPS unaligned checks
|
||||
TEST RSI, 0xF // check if src is 16byte aligned
|
||||
JNZ @@05 // if not 16byte aligned, jump down to MOVUPS unaligned checks
|
||||
@@03: // SSE 128-bit ALIGNED memcopy (count >= 16)
|
||||
CMP RCX, 16 // count <, >, == 16 bytes?
|
||||
JL @@10 // if count less than 16, jump down
|
||||
MOVAPS XMM15, [RSI]// move 128 bits, src-->xmm ALIGNED
|
||||
MOVAPS [RDI], XMM15// move 128 bits, xmm-->dst mem ALIGNED
|
||||
ADD RSI, 16 // increment src addr by 128 bits
|
||||
ADD RDI, 16 // increment dst addr by 128 bits
|
||||
ADD RCX, -16 // decrement count by 128 bits
|
||||
JMP @@03 // jump back to 16 byte check
|
||||
@@05: // SSE 128-bit UNALIGNED memcopy (count >= 16)
|
||||
CMP RCX, 16 // count <, >, == 16 bytes?
|
||||
JL @@10 // if count less than 16, jump down
|
||||
MOVUPS XMM15, [RSI]// move 128 bits, src-->xmm
|
||||
|
|
|
@ -187,7 +187,7 @@ U0 GrInit2()
|
|||
gr.to_8_bits = MAlloc(256 * sizeof(I64));
|
||||
gr.to_8_colors = MAlloc(256 * sizeof(I64));
|
||||
|
||||
gr.screen_cache = MAlloc(sys_framebuffer_width * sys_framebuffer_height);
|
||||
gr.screen_cache = MAllocAligned(sys_framebuffer_width * sys_framebuffer_height, 16);
|
||||
|
||||
gr.text_base = CAlloc(TEXT_ROWS * TEXT_COLS * sizeof(U32));
|
||||
gr.win_uncovered_bitmap = CAlloc(65536 / 8);
|
||||
|
|
|
@ -344,13 +344,12 @@ U0 GrUpdateTextFG()
|
|||
|
||||
U0 DCBlotColor8(CDC *dc, CDC *img)
|
||||
{
|
||||
U8 *src = img->body, *b0 = dc->body;
|
||||
I64 reg j, reg jj, v = 0, k, d0 = img->width_internal * img->height;
|
||||
U8 *b0 = dc->body;
|
||||
I64 *src = img->body, reg j, reg jj, v = 0, k, d0 = img->width_internal * img->height;
|
||||
|
||||
for (k = 0; k < d0; k += 8)
|
||||
{
|
||||
j = *(src(U64 *));
|
||||
src += 8;
|
||||
j = *src++;;
|
||||
do
|
||||
{
|
||||
jj = j & 0xFF;
|
||||
|
@ -364,10 +363,13 @@ U0 DCBlotColor8(CDC *dc, CDC *img)
|
|||
}
|
||||
}
|
||||
|
||||
U0 GrCalcScreenUpdates()
|
||||
//#define GR_CALC_SCREEN_UPDATES_SKIP 2
|
||||
//#define GR_CALC_SCREEN_UPDATES_CHUNKS 4
|
||||
$IV,0$U0 GrCalcScreenUpdates()
|
||||
{
|
||||
U64 *screen, *last_screen = gr.screen_cache;
|
||||
U64 i, *src = text.raw_screen, *dst = text.fb_alias, diffs_size = GR_WIDTH * GR_HEIGHT / 8;
|
||||
U64 reg *screen, reg *last_screen = gr.screen_cache, i, ii, *src = text.raw_screen, *dst = text.fb_alias, reg RDX diffs_size = GR_WIDTH * GR_HEIGHT / 8,
|
||||
// skip = gr.screen_cache[0], // use 1st U8 of cache as flag to skip cache MemCopy every-other call.
|
||||
/* reg R9 skip_size64, reg R8 skip_diff, skip_size8*/;
|
||||
|
||||
if (gr.screen_zoom == 1)
|
||||
screen = gr.dc2->body;
|
||||
|
@ -376,38 +378,76 @@ U0 GrCalcScreenUpdates()
|
|||
|
||||
for (i = 0; i < diffs_size; i++)
|
||||
if (screen[i] != last_screen[i])
|
||||
MemCopy(dst + i * 4, src + i * 4, 4 * 64);
|
||||
{
|
||||
ii = i * 4;
|
||||
dst[ii] = src[ii++];
|
||||
dst[ii] = src[ii++];
|
||||
dst[ii] = src[ii++];
|
||||
dst[ii] = src[ii];
|
||||
}
|
||||
|
||||
/*
|
||||
if (skip < GR_CALC_SCREEN_UPDATES_CHUNKS * GR_CALC_SCREEN_UPDATES_SKIP)
|
||||
{
|
||||
if (skip % GR_CALC_SCREEN_UPDATES_SKIP == 0)
|
||||
{
|
||||
skip_size64 = skip / GR_CALC_SCREEN_UPDATES_SKIP * text.buffer_size / GR_CALC_SCREEN_UPDATES_CHUNKS / 8;
|
||||
// skip_diff = diffs_size * 8 / GR_CALC_SCREEN_UPDATES_CHUNKS;
|
||||
// skip_size8 = skip / GR_CALC_SCREEN_UPDATES_SKIP * skip_diff;
|
||||
MemCopy(dst + skip_size64,
|
||||
src + skip_size64,
|
||||
text.buffer_size / GR_CALC_SCREEN_UPDATES_CHUNKS);
|
||||
// MemCopy(gr.screen_cache + skip_size8, screen(U8 *) + skip_size8, skip_diff);
|
||||
}
|
||||
}
|
||||
else
|
||||
skip = -1;
|
||||
|
||||
gr.screen_cache[0] = ++skip;
|
||||
*/
|
||||
MemCopy(gr.screen_cache, screen, diffs_size * 8);
|
||||
|
||||
}
|
||||
|
||||
U0 GrUpdateScreen32()
|
||||
{
|
||||
U64 size, *dst, reg src64val;
|
||||
U8 *src, reg a, reg b, c, reg d;
|
||||
U64 size, *dst, src64val, *src;
|
||||
U8 a, b, c, d;
|
||||
|
||||
PUSHFD
|
||||
CLI
|
||||
|
||||
if (gr.screen_zoom == 1)
|
||||
{
|
||||
src = gr.dc2->body;
|
||||
size = src + gr.dc2->height * gr.dc2->width_internal;
|
||||
size = src(U8 *) + gr.dc2->height * gr.dc2->width_internal;
|
||||
}
|
||||
else
|
||||
{
|
||||
GrZoomInScreen;
|
||||
src = gr.zoomed_dc->body;
|
||||
size = src + gr.zoomed_dc->height * gr.zoomed_dc->width_internal;
|
||||
size = src(U8 *) + gr.zoomed_dc->height * gr.zoomed_dc->width_internal;
|
||||
}
|
||||
POPFD
|
||||
|
||||
dst = text.raw_screen;
|
||||
while (src < size) // draw 4 pixels at a time
|
||||
|
||||
while (src < size) // draw 8 pixels at a time
|
||||
{
|
||||
src64val = *(src(U64 *));
|
||||
a = src64val & 0xFF;
|
||||
b = (src64val >>= 8) & 0xFF;
|
||||
c = (src64val >>= 8) & 0xFF;
|
||||
d = (src64val >>= 8) & 0xFF;
|
||||
src += 4;
|
||||
*dst++ = gr_palette[a] | gr_palette[b] << 32;
|
||||
*dst++ = gr_palette[c] | gr_palette[d] << 32;
|
||||
a = (src64val >>= 8) & 0xFF;
|
||||
b = (src64val >>= 8) & 0xFF;
|
||||
c = (src64val >>= 8) & 0xFF;
|
||||
d = (src64val >>= 8) & 0xFF;
|
||||
*dst++ = gr_palette[a] | gr_palette[b] << 32;
|
||||
*dst++ = gr_palette[c] | gr_palette[d] << 32;
|
||||
src++;
|
||||
}
|
||||
|
||||
GrCalcScreenUpdates;
|
||||
|
|
Loading…
Reference in a new issue