/*
This file is a stand-alone program
which will regenerate processed dictionary
files from a raw Project Gutenberg
dictionary file.

See ::/Doc/Credits.DD.
*/

U0 ACDPreprocess(U8 *in_name, U8 *out_name)
{/*
Reads in_name as a document, decodes the \'XX hex escapes found in
every text entry, and writes the document back out as out_name.

<cr><nl> --> <nl>
$        --> $$
\'89     --> the single byte 0x89

Only the \'XX decoding is done by the loop below.  The first two
conversions are presumably performed by DocRead()/DocWrite() under
the flags passed here -- TODO confirm.

Nothing is written if the input document cannot be read.
*/
        I64                      ch, i, k;
        U8                      *src, *dst;
        CDoc            *doc;
        CDocEntry       *doc_e;

        if (doc = DocRead(in_name, DOCF_PLAIN_TEXT_TABS | DOCF_DBL_DOLLARS))
        {
                doc_e = doc->head.next;
                while (doc_e != doc)
                {
                        if (doc_e->type_u8 == DOCT_TEXT)
                        {
                                //Decode in place.  An escape shrinks four chars to one,
                                //so the write cursor never passes the read cursor.
                                src = dst = doc_e->tag;
                                while (ch = *src++)
                                {
                                        if (ch == '\\' && *src == '\'')
                                        {
                                                src++;
                                                i = 0;
                                                //Two nibbles, high one first.  A non-hex char
                                                //contributes zero.  Stop consuming at the
                                                //terminator so a truncated escape cannot run
                                                //the cursors off the end of the string.
                                                for (k = 0; k < 2; k++)
                                                {
                                                        i <<= 4;
                                                        if (*src)
                                                        {
                                                                ch = ToUpper(*src++);
                                                                if ('0' <= ch <= '9')
                                                                        i += ch - '0';
                                                                else if ('A' <= ch <= 'F')
                                                                        i += ch - 'A' + 10;
                                                        }
                                                }
                                                *dst++ = i;
                                        }
                                        else
                                                *dst++ = ch;
                                }
                                *dst = 0;
                        }
                        doc_e = doc_e->next;
                }
                //Retarget the document so DocWrite() saves under the new name.
                StrCopy(doc->filename.name, out_name);
                DocWrite(doc);
                DocDel(doc);
        }
}

I64 ACDNextCmd(U8 **_ptr)
{/*
Scans forward from *_ptr for the next <tag> whose name appears in
the list below and returns the value ListMatch() gives for it.
Tags that are not in the list are skipped.

On a match, *_ptr is left just past the closing '>' of the tag.
If the text ends first, a negative value is returned and *_ptr is
left on the terminating zero (or just past an unterminated '<'),
so calling again is harmless.

The text is modified temporarily: the '>' is overwritten with a
zero while the tag name is matched and is then restored.
*/
        U8 *ptr = *_ptr, *ptr2;
        I64 ch, res = -1;

        do
        {
                //Find the next '<'.
                do
                {
                        if (!(ch = *ptr++))
                        {
                                ptr--; //stay on the terminator, not past it
                                goto ncmd_done;
                        }
                }
                while (ch != '<');

                //Find the matching '>'.
                ptr2 = ptr;
                do
                {
                        if (!(ch = *ptr2++))
                                goto ncmd_done;
                }
                while (ch != '>');

                *--ptr2 = 0;
                res = ListMatch(ptr,    "h1\0/h1\0def\0/def\0hw\0/hw\0tt\0/tt\0"
                                                                "ety\0@fld\0@cd\0@blockquote\0@wordforms\0@note\0@altname\0@chform\0"
                                                                "@cref\0@syn\0/ety\0@/fld\0@/cd\0@/blockquote\0@/wordforms\0@/note\0"
                                                                "@/altname\0@/chform\0@/cref\0@/syn\0");
                *ptr2++ = '>';
                ptr = ptr2;
        }
        while (res < 0);

        ncmd_done:
        *_ptr = ptr;

        return res;
}

U8 *ACDNextEntry(U8 **_ptr)
{/*
Collects the text that starts at *_ptr into a newly allocated
string made with StrNew().  The caller must Free() it.

Every dollar sign is written twice.  Tags named in the ignore list
are dropped and collection continues past them.  Collection stops
at any other '<' or at the end of the text, and *_ptr is left
pointing at that '<' or at the terminating zero.

The staging buffer holds ACD_BLK_SIZE bytes.  Text that would not
fit is dropped instead of being written past the end of the buffer.
*/
        U8 *res, *ignore, *ptr = *_ptr, buf[ACD_BLK_SIZE], *out_ptr = buf, *out_limit;
        I64 ch, l;

        //Last byte of buf is reserved for the terminating zero.
        out_limit = buf + ACD_BLK_SIZE - 1;

        while (TRUE)
        {
                while (TRUE)
                {
                        if (!(ch = *ptr++))
                                goto nentry_done;
                        if (ch != '<')
                        {
                                if (ch == '$')
                                {
                                        //Doubled on output; keep the pair together.
                                        if (out_ptr + 1 < out_limit)
                                        {
                                                *out_ptr++ = ch;
                                                *out_ptr++ = ch;
                                        }
                                }
                                else if (out_ptr < out_limit)
                                        *out_ptr++ = ch;
                        }
                        else
                                break;
                }
                //ptr is just past a '<'.  See if it opens a tag to be dropped.
                ignore =        "b>\0i>\0ppp>\0/b>\0/i>\0/p>\0"
                                        "ets>\0col>\0spn>\0/ets>\0/col>\0/spn>\0er>\0as>\0cs>\0cd>\0ex>\0"
                                        "/er>\0/as>\0/cs>\0/cd>\0/ex>\0"
                                        "note>\0/note>\0blockquote>\0/blockquote>\0";
                while (*ignore)
                {
                        l = StrLen(ignore);
                        if (!StrNCompare(ptr, ignore, l))
                        {
                                ptr += l;
                                break;
                        }
                        else
                                ignore += l + 1;
                }
                //Ran off the end of the list: not an ignored tag, so the entry ends here.
                if (!*ignore)
                        break;
        }
nentry_done:
        *out_ptr++ = 0;
        res = StrNew(buf);
        //Back up one so the caller sees the '<' (or the terminating zero) again.
        *_ptr = ptr - 1;
        return res;
}

I64 ACDCompareWords(U8 *e1, U8 *e2)
{//Orders two word records case-insensitively; used as the sort callback.
        I64 order = StrICompare(e1, e2);

        return order;
}

U8 *ACDSortWords(U8 *start, I64 size, I64 word_count)
{/*
Returns a newly allocated copy of the word list with its records
in ACDCompareWords() order.  The caller must Free() the result.

start      Packed records, each a zero-terminated string followed
           by two block-number bytes (lo, hi).  A record whose
           first byte is zero ends the list.
size       Byte size of the list; the result buffer is this large.
word_count Number of records in the list.
*/
        U8 **ptr_array = MAlloc(sizeof(U8 *)*word_count),
                *out_start = MAlloc(size),
                *ptr = start,*ptr2;
        I64  i = 0;

        //Index the start of every record.
        while (*ptr)
        {
                ptr_array[i++] = ptr;
                ptr += StrLen(ptr) + 3; //string + zero + blk lo + blk hi
        }

        "Sorting...\n";
        Sleep(100);
        QuickSortI64(ptr_array, word_count, &ACDCompareWords);
        "Done...\n";
        Sleep(100);

        //Copy the records out in sorted order.
        ptr = out_start;
        for (i = 0; i < word_count; i++)
        {
                ptr2 = ptr_array[i];
                while (*ptr2)
                        *ptr++ = *ptr2++;
                *ptr++ = *ptr2++; //zero
                *ptr++ = *ptr2++; //blk lo
                *ptr++ = *ptr2++; //blk hi
        }
        *ptr++ = 0;

        //The index is only scratch space; release it before returning.
        Free(ptr_array);
        return out_start;
}

U0 ACDGen(U8 *in_file)
{/*
Builds the definition file (ACD_DEF_FILENAME) and the sorted word
file (ACD_WORD_FILENAME) from the preprocessed dictionary in_file,
and prints summary statistics.  Does nothing if in_file cannot be
read.

Definition file: for each new headword, ACD_WORD_CHAR + word + zero,
followed by one record per field (marker char + text + zero).  The
file ends with ACD_END_CHAR.

Word file: for each new headword, ACD_WORD_CHAR + word + zero + a U16
holding the headword's definition-file offset divided by
ACD_BLK_SIZE.  The file ends with ACD_END_CHAR.

A headword equal (ignoring case) to the previous one is not written
again; its fields are appended to the previous headword's records.

NOTE(review): both work buffers are sized to the input file.  This
assumes the output never grows past the input -- confirm, since
ACDNextEntry() doubles dollar signs.
*/
        I64  cmd, size, word_count = 0, largest_entry = 0;
        U8      *st,
                *in_ptr = FileRead(in_file, &size),
                *in_start = in_ptr,
                *out_ptr,
                *out_start,
                *word_ptr,
                *word_start,
                *last_word = "",
                *def_word_start,
                *sorted_word_start;
        U8   marker;
        Bool is_field;
        U16 *d;

        if (!in_ptr)
                return;

        //Allocate only after the read succeeded, so a failed read
        //neither leaks the buffers nor sizes them from an unset value.
        out_ptr = MAlloc(size);
        out_start = out_ptr;
        word_ptr = MAlloc(size);
        word_start = word_ptr;
        def_word_start = out_ptr;

        do
        {
                cmd = ACDNextCmd(&in_ptr);
                if (cmd == ACD_H1)
                {
next_word:
                        //Close out the previous entry's size before starting a new one.
                        if (out_ptr - def_word_start > largest_entry)
                                largest_entry = out_ptr - def_word_start;

                        def_word_start = out_ptr;
                        if (st = ACDNextEntry(&in_ptr))
                        {
                                if (*st)
                                {
                                        if (StrICompare(st, last_word))
                                        {
                                                word_count++;

                                                *word_ptr++ = ACD_WORD_CHAR;
                                                last_word = word_ptr;
                                                StrCopy(word_ptr, st);
                                                word_ptr += StrLen(st) + 1;

                                                //Block number of this headword in the definition file.
                                                d  = word_ptr;
                                                *d = (out_ptr - out_start) / ACD_BLK_SIZE;
                                                word_ptr += 2;

                                                *out_ptr++ = ACD_WORD_CHAR;
                                                StrCopy(out_ptr, st);
                                                out_ptr += StrLen(st) + 1;
                                        }
                                        Free(st);

                                        do
                                        {
                                                //Skip ahead to the next field tag.  A new headword
                                                //restarts the entry; end of text drops out.
                                                do
                                                {
                                                        cmd = ACDNextCmd(&in_ptr);
                                                        if (cmd == ACD_H1)
                                                                goto next_word;
                                                }
                                                while (cmd >= 0 &&
                                                           !(cmd == ACD_DEF || cmd == ACD_PRONUNCIATION || cmd == ACD_POS || cmd == ACD_EXTRA));

                                                //Pick the marker char for this field type.
                                                is_field = TRUE;
                                                if (cmd == ACD_DEF)
                                                        marker = ACD_DEF_CHAR;
                                                else if (cmd == ACD_PRONUNCIATION)
                                                        marker = ACD_PRONUNCIATION_CHAR;
                                                else if (cmd == ACD_POS)
                                                        marker = ACD_POS_CHAR;
                                                else if (cmd == ACD_EXTRA)
                                                        marker = ACD_EXTRA_CHAR;
                                                else
                                                        is_field = FALSE;

                                                if (is_field)
                                                {
                                                        if (st = ACDNextEntry(&in_ptr))
                                                        {
                                                                //Empty fields are not written.
                                                                if (*st)
                                                                {
                                                                        *out_ptr++ = marker;
                                                                        StrCopy(out_ptr, st);
                                                                        out_ptr += StrLen(st) + 1;
                                                                }
                                                                Free(st);
                                                        }
                                                }
                                        }
                                        while (is_field);
                                }
                                else
                                        Free(st);
                        }
                }
        }
        while (cmd >= 0);

        //The last entry has no following headword to trigger the size check.
        if (out_ptr - def_word_start > largest_entry)
                largest_entry = out_ptr - def_word_start;

        *out_ptr++  = ACD_END_CHAR;
        *word_ptr++ = ACD_END_CHAR;

        Free(in_start);

        "Blk Size      :%d\n",          ACD_BLK_SIZE;
        "Blk Count       :%04X\n",      (out_ptr - out_start + ACD_BLK_SIZE - 1) / ACD_BLK_SIZE;
        "Largest Entry :%d\n",          largest_entry;
        "Word Count    :%d\n",          word_count;

        FileWrite(ACD_DEF_FILENAME, out_start, out_ptr - out_start);
        "Def File Size :%d\n", out_ptr - out_start;

        sorted_word_start = ACDSortWords(word_start, word_ptr - word_start, word_count);
        FileWrite(ACD_WORD_FILENAME, sorted_word_start, word_ptr - word_start);
        "Word File Size:%d\n", word_ptr - word_start;

        Free(out_start);
        Free(word_start);
        Free(sorted_word_start);
}

//Run when this file is executed: work in this file's directory,
//preprocess the raw dictionary into DICTIONARY2.DD, then generate
//the definition and word files from that preprocessed copy.
Cd(__DIR__);
ACDPreprocess("DICTIONARY.DD", "DICTIONARY2.DD");
ACDGen("DICTIONARY2.DD");