mirror of
https://github.com/Zeal-Operating-System/ZealOS.git
synced 2024-12-26 23:36:32 +00:00
fix: bare-metal optimizations progress
This commit is contained in:
parent
4eaf4f7205
commit
de2b050a6f
3 changed files with 69 additions and 16 deletions
|
@ -68,7 +68,20 @@ _MEMCOPY::
|
||||||
MOV RDI, U64 SF_ARG1[RBP] // dst
|
MOV RDI, U64 SF_ARG1[RBP] // dst
|
||||||
MOV RSI, U64 SF_ARG2[RBP] // src
|
MOV RSI, U64 SF_ARG2[RBP] // src
|
||||||
MOV RCX, U64 SF_ARG3[RBP] // count
|
MOV RCX, U64 SF_ARG3[RBP] // count
|
||||||
@@05: // SSE 128-bit memcopy (count >= 16)
|
TEST RDI, 0xF // check if dst is 16byte aligned
|
||||||
|
JNZ @@05 // if not 16byte aligned, jump down to MOVUPS unaligned checks
|
||||||
|
TEST RSI, 0xF // check if src is 16byte aligned
|
||||||
|
JNZ @@05 // if not 16byte aligned, jump down to MOVUPS unaligned checks
|
||||||
|
@@03: // SSE 128-bit ALIGNED memcopy (count >= 16)
|
||||||
|
CMP RCX, 16 // count <, >, == 16 bytes?
|
||||||
|
JL @@10 // if count less than 16, jump down
|
||||||
|
MOVAPS XMM15, [RSI]// move 128 bits, src-->xmm ALIGNED
|
||||||
|
MOVAPS [RDI], XMM15// move 128 bits, xmm-->dst mem ALIGNED
|
||||||
|
ADD RSI, 16 // increment src addr by 128 bits
|
||||||
|
ADD RDI, 16 // increment dst addr by 128 bits
|
||||||
|
ADD RCX, -16 // decrement count by 128 bits
|
||||||
|
JMP @@03 // jump back to 16 byte check
|
||||||
|
@@05: // SSE 128-bit UNALIGNED memcopy (count >= 16)
|
||||||
CMP RCX, 16 // count <, >, == 16 bytes?
|
CMP RCX, 16 // count <, >, == 16 bytes?
|
||||||
JL @@10 // if count less than 16, jump down
|
JL @@10 // if count less than 16, jump down
|
||||||
MOVUPS XMM15, [RSI]// move 128 bits, src-->xmm
|
MOVUPS XMM15, [RSI]// move 128 bits, src-->xmm
|
||||||
|
|
|
@ -187,7 +187,7 @@ U0 GrInit2()
|
||||||
gr.to_8_bits = MAlloc(256 * sizeof(I64));
|
gr.to_8_bits = MAlloc(256 * sizeof(I64));
|
||||||
gr.to_8_colors = MAlloc(256 * sizeof(I64));
|
gr.to_8_colors = MAlloc(256 * sizeof(I64));
|
||||||
|
|
||||||
gr.screen_cache = MAlloc(sys_framebuffer_width * sys_framebuffer_height);
|
gr.screen_cache = MAllocAligned(sys_framebuffer_width * sys_framebuffer_height, 16);
|
||||||
|
|
||||||
gr.text_base = CAlloc(TEXT_ROWS * TEXT_COLS * sizeof(U32));
|
gr.text_base = CAlloc(TEXT_ROWS * TEXT_COLS * sizeof(U32));
|
||||||
gr.win_uncovered_bitmap = CAlloc(65536 / 8);
|
gr.win_uncovered_bitmap = CAlloc(65536 / 8);
|
||||||
|
|
|
@ -344,13 +344,12 @@ U0 GrUpdateTextFG()
|
||||||
|
|
||||||
U0 DCBlotColor8(CDC *dc, CDC *img)
|
U0 DCBlotColor8(CDC *dc, CDC *img)
|
||||||
{
|
{
|
||||||
U8 *src = img->body, *b0 = dc->body;
|
U8 *b0 = dc->body;
|
||||||
I64 reg j, reg jj, v = 0, k, d0 = img->width_internal * img->height;
|
I64 *src = img->body, reg j, reg jj, v = 0, k, d0 = img->width_internal * img->height;
|
||||||
|
|
||||||
for (k = 0; k < d0; k += 8)
|
for (k = 0; k < d0; k += 8)
|
||||||
{
|
{
|
||||||
j = *(src(U64 *));
|
j = *src++;;
|
||||||
src += 8;
|
|
||||||
do
|
do
|
||||||
{
|
{
|
||||||
jj = j & 0xFF;
|
jj = j & 0xFF;
|
||||||
|
@ -364,10 +363,13 @@ U0 DCBlotColor8(CDC *dc, CDC *img)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
U0 GrCalcScreenUpdates()
|
//#define GR_CALC_SCREEN_UPDATES_SKIP 2
|
||||||
|
//#define GR_CALC_SCREEN_UPDATES_CHUNKS 4
|
||||||
|
$IV,0$U0 GrCalcScreenUpdates()
|
||||||
{
|
{
|
||||||
U64 *screen, *last_screen = gr.screen_cache;
|
U64 reg *screen, reg *last_screen = gr.screen_cache, i, ii, *src = text.raw_screen, *dst = text.fb_alias, reg RDX diffs_size = GR_WIDTH * GR_HEIGHT / 8,
|
||||||
U64 i, *src = text.raw_screen, *dst = text.fb_alias, diffs_size = GR_WIDTH * GR_HEIGHT / 8;
|
// skip = gr.screen_cache[0], // use 1st U8 of cache as flag to skip cache MemCopy every-other call.
|
||||||
|
/* reg R9 skip_size64, reg R8 skip_diff, skip_size8*/;
|
||||||
|
|
||||||
if (gr.screen_zoom == 1)
|
if (gr.screen_zoom == 1)
|
||||||
screen = gr.dc2->body;
|
screen = gr.dc2->body;
|
||||||
|
@ -376,38 +378,76 @@ U0 GrCalcScreenUpdates()
|
||||||
|
|
||||||
for (i = 0; i < diffs_size; i++)
|
for (i = 0; i < diffs_size; i++)
|
||||||
if (screen[i] != last_screen[i])
|
if (screen[i] != last_screen[i])
|
||||||
MemCopy(dst + i * 4, src + i * 4, 4 * 64);
|
{
|
||||||
|
ii = i * 4;
|
||||||
|
dst[ii] = src[ii++];
|
||||||
|
dst[ii] = src[ii++];
|
||||||
|
dst[ii] = src[ii++];
|
||||||
|
dst[ii] = src[ii];
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
if (skip < GR_CALC_SCREEN_UPDATES_CHUNKS * GR_CALC_SCREEN_UPDATES_SKIP)
|
||||||
|
{
|
||||||
|
if (skip % GR_CALC_SCREEN_UPDATES_SKIP == 0)
|
||||||
|
{
|
||||||
|
skip_size64 = skip / GR_CALC_SCREEN_UPDATES_SKIP * text.buffer_size / GR_CALC_SCREEN_UPDATES_CHUNKS / 8;
|
||||||
|
// skip_diff = diffs_size * 8 / GR_CALC_SCREEN_UPDATES_CHUNKS;
|
||||||
|
// skip_size8 = skip / GR_CALC_SCREEN_UPDATES_SKIP * skip_diff;
|
||||||
|
MemCopy(dst + skip_size64,
|
||||||
|
src + skip_size64,
|
||||||
|
text.buffer_size / GR_CALC_SCREEN_UPDATES_CHUNKS);
|
||||||
|
// MemCopy(gr.screen_cache + skip_size8, screen(U8 *) + skip_size8, skip_diff);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
skip = -1;
|
||||||
|
|
||||||
|
gr.screen_cache[0] = ++skip;
|
||||||
|
*/
|
||||||
MemCopy(gr.screen_cache, screen, diffs_size * 8);
|
MemCopy(gr.screen_cache, screen, diffs_size * 8);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
U0 GrUpdateScreen32()
|
U0 GrUpdateScreen32()
|
||||||
{
|
{
|
||||||
U64 size, *dst, reg src64val;
|
U64 size, *dst, src64val, *src;
|
||||||
U8 *src, reg a, reg b, c, reg d;
|
U8 a, b, c, d;
|
||||||
|
|
||||||
|
PUSHFD
|
||||||
|
CLI
|
||||||
|
|
||||||
if (gr.screen_zoom == 1)
|
if (gr.screen_zoom == 1)
|
||||||
{
|
{
|
||||||
src = gr.dc2->body;
|
src = gr.dc2->body;
|
||||||
size = src + gr.dc2->height * gr.dc2->width_internal;
|
size = src(U8 *) + gr.dc2->height * gr.dc2->width_internal;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
GrZoomInScreen;
|
GrZoomInScreen;
|
||||||
src = gr.zoomed_dc->body;
|
src = gr.zoomed_dc->body;
|
||||||
size = src + gr.zoomed_dc->height * gr.zoomed_dc->width_internal;
|
size = src(U8 *) + gr.zoomed_dc->height * gr.zoomed_dc->width_internal;
|
||||||
}
|
}
|
||||||
|
POPFD
|
||||||
|
|
||||||
dst = text.raw_screen;
|
dst = text.raw_screen;
|
||||||
while (src < size) // draw 4 pixels at a time
|
|
||||||
|
while (src < size) // draw 8 pixels at a time
|
||||||
{
|
{
|
||||||
src64val = *(src(U64 *));
|
src64val = *(src(U64 *));
|
||||||
a = src64val & 0xFF;
|
a = src64val & 0xFF;
|
||||||
b = (src64val >>= 8) & 0xFF;
|
b = (src64val >>= 8) & 0xFF;
|
||||||
c = (src64val >>= 8) & 0xFF;
|
c = (src64val >>= 8) & 0xFF;
|
||||||
d = (src64val >>= 8) & 0xFF;
|
d = (src64val >>= 8) & 0xFF;
|
||||||
src += 4;
|
|
||||||
*dst++ = gr_palette[a] | gr_palette[b] << 32;
|
*dst++ = gr_palette[a] | gr_palette[b] << 32;
|
||||||
*dst++ = gr_palette[c] | gr_palette[d] << 32;
|
*dst++ = gr_palette[c] | gr_palette[d] << 32;
|
||||||
|
a = (src64val >>= 8) & 0xFF;
|
||||||
|
b = (src64val >>= 8) & 0xFF;
|
||||||
|
c = (src64val >>= 8) & 0xFF;
|
||||||
|
d = (src64val >>= 8) & 0xFF;
|
||||||
|
*dst++ = gr_palette[a] | gr_palette[b] << 32;
|
||||||
|
*dst++ = gr_palette[c] | gr_palette[d] << 32;
|
||||||
|
src++;
|
||||||
}
|
}
|
||||||
|
|
||||||
GrCalcScreenUpdates;
|
GrCalcScreenUpdates;
|
||||||
|
|
Loading…
Reference in a new issue