4 // create unicode mappings
6 // Two kinds of mappings:
10 // For mapping to a number, we use the following strategy:
13 // 1. a table of numbers (for now we use uint16, so full Unicode table is 4MB)
14 // 2. a "don't care" value
15 // 3. define a 'fallback' value (typically 0)
16 // 4. define a fast-path range (typically 0..255 or 0..1023) [@TODO: automate detecting this]
19 // 1. Determine range of *end* of unicode codepoints (U+10FFFF and down) which
20 // all have the same value (or don't care). If large enough, emit this as a
21 // special case in the code.
22 // 2. Repeat above, limited to at most U+FFFF.
23 // 3. Cluster the data into intervals of 8,16,32,64,128,256 numeric values.
24 // 3a. If all the values in an interval are fallback/dont-care, no further processing
25 // 3b. Find the "trimmed range" outside which all the values are the fallback or don't care
26 // 3c. Find the "special trimmed range" outside which all the values are some constant or don't care
27 // 4. Pack the clusters into continuous memory, and find previous instances of
28 // the cluster. Repeat for trimmed & special-trimmed. In the first case, find
29 // previous instances of the cluster (allow don't-care to match in either
30 // direction), both aligned and mis-aligned; in the latter, starting where
31 // things start or mis-aligned. Build an index table specifiying the
32 // location of each cluster (and its length). Allow an extra indirection here;
33 // the full-sized index can index a smaller table which has the actual offset
35 // 5. Associate with each packed continuous memory above the amount of memory
36 // required to store the data w/ smallest datatype (of uint8, uint16, uint32).
37 // Discard the continuous memory. Recurse on each index table, but avoid the
40 // For mapping to a bit, we pack the results for 8 characters into a byte, and then apply
41 // the above strategy. Note that there may be more optimal approaches with e.g. packing
42 // 8 different bits into a single structure, though, which we should explore eventually.
45 // currently we limit *indices* to being 2^16, and we pack them as
46 // index + end_trim*2^16 + start_trim*2^24; specials have to go in a separate table
48 #define UVAL_DONT_CARE_DEFAULT 0xffffffff
60 int replace_fallback_with_codepoint
;
62 size_t inherited_storage
;
68 table result
; // index into not-returned table
92 uint16 overhead
; // add some forced overhead for each mode to avoid getting complex encoding when it doesn't save much
115 #define MODECOUNT (sizeof(modes)/sizeof(modes[0]))
116 #define CLUSTERSIZECOUNT 6 // 8,16, 32,64, 128,256
118 size_t size_for_max_number(uint32 number
)
120 if (number
== 0) return 0;
121 if (number
< 256) return 1;
122 if (number
< 256*256) return 2;
123 if (number
< 256*256*256) return 3;
127 size_t size_for_max_number_aligned(uint32 number
)
129 size_t n
= size_for_max_number(number
);
130 return n
== 3 ? 4 : n
;
133 uval
get_data(uval
*data
, int offset
, uval
*end
)
135 if (data
+ offset
>= end
)
141 int safe_len(uval
*data
, int len
, uval
*end
)
143 if (len
> end
- data
)
151 size_t find_packed(uval
**packed
, uval
*data
, int len
, int aligned
, int fastpath
, uval
*end
, int offset
, int replace
)
153 int packlen
= stb_arr_len(*packed
);
156 if (data
+len
> end
|| replace
) {
157 int safelen
= safe_len(data
, len
, end
);
158 memset(tempdata
, 0, dirty
*sizeof(tempdata
[0]));
159 memcpy(tempdata
, data
, safelen
* sizeof(data
[0]));
165 int safelen
= safe_len(data
, len
, end
);
166 for (i
=0; i
< safelen
; ++i
)
175 for (i
=0; i
< packlen
; i
+= len
)
176 if ((*packed
)[i
] == data
[0] && 0==memcmp(&(*packed
)[i
], data
, len
* sizeof(uval
)))
179 for (i
=0; i
< packlen
-len
+1; i
+= 1 )
180 if ((*packed
)[i
] == data
[0] && 0==memcmp(&(*packed
)[i
], data
, len
* sizeof(uval
)))
184 p
= stb_arr_len(*packed
);
185 for (i
=0; i
< len
; ++i
)
186 stb_arr_push(*packed
, data
[i
]);
190 void output_table(char *name1
, char *name2
, uval
*data
, int length
, int sign
, char **names
)
194 int bytes
, numlen
, at_newline
;
195 int linelen
= 79; // @TODO: make table more readable by choosing a length that's a multiple?
196 int i
,pos
, do_split
=0;
197 for (i
=0; i
< length
; ++i
)
199 maxv
= stb_max(maxv
, (uval
)abs((int)data
[i
]));
201 maxv
= stb_max(maxv
, data
[i
]);
202 bytes
= size_for_max_number_aligned(maxv
);
203 sprintf(temp
, "%d", maxv
);
211 printf("uint%d %s%s[%d] = {\n", bytes
*8, name1
, name2
, length
);
213 for (i
=0; i
< length
; ++i
) {
214 if (pos
+ numlen
+ 2 > linelen
) {
227 printf("%*d,", numlen
, data
[i
]);
230 if (!at_newline
) printf("\n");
234 void output_table_with_trims(char *name1
, char *name2
, uval
*data
, int length
)
238 // split the table into two pieces
244 for (i
=0; i
< stb_arr_len(data
); ++i
) {
245 stb_arr_push(trims
, data
[i
] >> 16);
247 maxt
= stb_max(maxt
, trims
[i
]);
248 maxp
= stb_max(maxp
, data
[i
]);
253 // need to output start & end values
255 // can pack into a single table
256 printf("struct { uint16 val; uint8 start, end; } %s%s[%d] = {\n", name1
, name2
, length
);
258 output_table(name1
, name2
, data
, length
, 0, 0);
260 printf("struct { uint8 start, end; } %s%s_trim[%d] = {\n", name1
, name2
, length
);
262 } else if (maxt
> 0) {
264 output_table(name1
, name2
, data
, length
, 0, 0);
265 output_table(name1
, stb_sprintf("%s_end", name2
), trims
, length
, 0, 0);
268 printf("struct { uint8 val, end; } %s%s[%d] = {\n", name1
, name2
, length
);
272 output_table(name1
, name2
, data
, length
, 0, 0);
275 // d or s can be zero (but not both), e is always present and last
277 assert(count
>= 2 && count
<= 3);
282 int numlen
, at_newline
, len
;
283 int linelen
= 79; // @TODO: make table more readable by choosing a length that's a multiple?
284 int i
,pos
, do_split
=0;
286 for (i
=0; i
< length
; ++i
) {
288 sprintf(temp
, "{%d,%d}", d
? data
[i
] : (trims
[i
]>>8), trims
[i
]&255);
290 sprintf(temp
, "{%d,%d,%d}", data
[i
], trims
[i
]>>8, trims
[i
]&255);
292 numlen
= stb_max(len
, numlen
);
296 for (i
=0; i
< length
; ++i
) {
297 if (pos
+ numlen
+ 2 > linelen
) {
311 sprintf(temp
, "{%d,%d}", d
? data
[i
] : (trims
[i
]>>8), trims
[i
]&255);
313 sprintf(temp
, "{%d,%d,%d}", data
[i
], trims
[i
]>>8, trims
[i
]&255);
314 printf("%*s,", numlen
, temp
);
317 if (!at_newline
) printf("\n");
324 table
pack_for_mode(table
*t
, int mode
, char *table_name
)
329 mode_info mi
= modes
[mode
% MODECOUNT
];
330 int size
= 8 << (mode
/ MODECOUNT
);
334 uval
*indirect
= NULL
;
335 uval
*specials
= NULL
;
336 newtab
.dont_care
= UVAL_DONT_CARE_DEFAULT
;
338 printf("// clusters of %d\n", size
);
339 for (i
=0; i
< t
->length
; i
+= size
) {
341 int fastpath
= (i
< t
->fastpath
);
343 int end_trim
= size
-1;
346 // @TODO: pick special from start or end instead of only end depending on which is longer
348 special
= t
->input
[i
+ end_trim
];
349 if (special
!= t
->dont_care
|| end_trim
== 0)
353 // at this point, special==inp[end_trim], and end_trim >= 0
354 if (special
== t
->dont_care
&& !fastpath
) {
355 // entire block is don't care, so OUTPUT don't care
356 stb_arr_push(index
, newtab
.dont_care
);
360 if (mi
.trim_end
&& !fastpath
) {
361 while (end_trim
>= 0) {
362 if (t
->input
[i
+ end_trim
] == special
|| t
->input
[i
+ end_trim
] == t
->dont_care
)
369 if (mi
.trim_start
&& !fastpath
) {
370 while (start_trim
< end_trim
) {
371 if (t
->input
[i
+ start_trim
] == special
|| t
->input
[i
+ start_trim
] == t
->dont_care
)
378 // end_trim points to the last character we have to output
380 // find the first match, or add it
381 pos
= find_packed(&packed
, &t
->input
[i
+start_trim
], end_trim
-start_trim
+1, mi
.aligned
, fastpath
, &t
->input
[t
->length
], i
+start_trim
, t
->replace_fallback_with_codepoint
);
388 pos
= pos
| 0x80000000;
390 assert(end_trim
< size
&& end_trim
>= -1);
391 if (!fastpath
) assert(end_trim
< size
-1); // special always matches last one
392 assert(end_trim
< size
&& end_trim
+1 >= 0);
393 if (!fastpath
) assert(end_trim
+1 < size
);
396 trim
= start_trim
*256 + (end_trim
+1);
400 assert(pos
< 65536); // @TODO: if this triggers, just bail on this search path
401 pos
= pos
+ (trim
<< 16);
406 stb_arr_push(specials
, special
);
408 } else if (mi
.trim_end
) {
409 int end_trim
= size
-1;
413 while (end_trim
>= 0 && !fastpath
)
414 if (t
->input
[i
+ end_trim
] == t
->fallback
|| t
->input
[i
+ end_trim
] == t
->dont_care
)
419 if (mi
.trim_start
&& !fastpath
) {
420 while (start_trim
< end_trim
) {
421 if (t
->input
[i
+ start_trim
] == t
->fallback
|| t
->input
[i
+ start_trim
] == t
->dont_care
)
428 // end_trim points to the last character we have to output, and can be -1
429 ++end_trim
; // make exclusive at end
431 if (end_trim
== 0 && size
== 256)
432 start_trim
= end_trim
= 1; // we can't make encode a length from 0..256 in 8 bits, so restrict end_trim to 1..256
434 // find the first match, or add it
435 pos
= find_packed(&packed
, &t
->input
[i
+start_trim
], end_trim
- start_trim
, mi
.aligned
, fastpath
, &t
->input
[t
->length
], i
+start_trim
, t
->replace_fallback_with_codepoint
);
437 assert(end_trim
<= size
&& end_trim
>= 0);
439 assert(end_trim
-1 < 256 && end_trim
-1 >= 0);
441 assert(end_trim
< 256 && end_trim
>= 0);
446 trim
= start_trim
*256 + end_trim
;
450 assert(pos
< 65536); // @TODO: if this triggers, just bail on this search path
451 pos
= pos
+ (trim
<< 16);
455 newval
= find_packed(&packed
, &t
->input
[i
], size
, mi
.aligned
, fastpath
, &t
->input
[t
->length
], i
, t
->replace_fallback_with_codepoint
);
460 for (j
=0; j
< stb_arr_len(indirect
); ++j
)
461 if (indirect
[j
] == newval
)
463 if (j
== stb_arr_len(indirect
))
464 stb_arr_push(indirect
, newval
);
465 stb_arr_push(index
, j
);
467 stb_arr_push(index
, newval
);
471 // total up the new size for everything but the index table
472 extra_size
= mi
.overhead
* weight
; // not the actual overhead cost; a penalty to avoid excessive complexity
473 extra_size
+= 150; // per indirection
478 // 'packed' contains two values, which should be packed positive & negative for size
480 for (i
=0; i
< stb_arr_len(packed
); ++i
)
481 if (packed
[i
] & 0x80000000)
482 maxv2
= stb_max(maxv2
, packed
[i
]);
484 maxv
= stb_max(maxv
, packed
[i
]);
485 maxv
= stb_max(maxv
, maxv2
) << 1;
488 for (i
=0; i
< stb_arr_len(packed
); ++i
)
489 if (packed
[i
] > maxv
&& packed
[i
] != t
->dont_care
)
492 extra_size
+= stb_arr_len(packed
) * (t
->splittable
? size_for_max_number(maxv
) : size_for_max_number_aligned(maxv
));
495 output_table_with_trims(table_name
, "", packed
, stb_arr_len(packed
));
497 output_table(table_name
, "", packed
, stb_arr_len(packed
), t
->has_sign
, NULL
);
501 for (i
=0; i
< stb_arr_len(specials
); ++i
)
502 if (specials
[i
] > maxv
)
504 extra_size
+= stb_arr_len(specials
) * size_for_max_number_aligned(maxv
);
506 output_table(table_name
, "_default", specials
, stb_arr_len(specials
), 0, NULL
);
509 for (i
=0; i
< stb_arr_len(indirect
); ++i
)
510 if (indirect
[i
] > maxv
)
512 extra_size
+= stb_arr_len(indirect
) * size_for_max_number(maxv
);
514 if (table_name
&& stb_arr_len(indirect
)) {
516 output_table_with_trims(table_name
, "_index", indirect
, stb_arr_len(indirect
));
518 assert(0); // this case should only trigger in very extreme circumstances
519 output_table(table_name
, "_index", indirect
, stb_arr_len(indirect
), 0, NULL
);
521 mi
.trim_end
= mi
.special
= 0;
525 printf("// above tables should be %d bytes\n", extra_size
);
528 for (i
=0; i
< stb_arr_len(index
); ++i
)
529 if (index
[i
] > maxv
&& index
[i
] != t
->dont_care
)
531 newtab
.splittable
= mi
.trim_end
;
532 newtab
.input_size
= newtab
.splittable
? size_for_max_number(maxv
) : size_for_max_number_aligned(maxv
);
533 newtab
.input
= index
;
534 newtab
.length
= stb_arr_len(index
);
535 newtab
.inherited_storage
= t
->inherited_storage
+ extra_size
;
537 newtab
.depth
= t
->depth
+1;
538 stb_arr_free(indirect
);
539 stb_arr_free(packed
);
540 stb_arr_free(specials
);
545 result
pack_table(table
*t
, size_t path
, int min_storage
)
549 best
.size
= t
->inherited_storage
+ t
->input_size
* t
->length
;
552 if ((int) t
->inherited_storage
> min_storage
) {
553 best
.size
= stb_max(best
.size
, t
->inherited_storage
);
557 if (t
->length
<= 256 || t
->depth
>= 4) {
558 //printf("%08x: %7d\n", best.path, best.size);
563 for (i
=0; i
< MODECOUNT
* CLUSTERSIZECOUNT
; ++i
) {
566 newtab
= pack_for_mode(t
, i
, 0);
567 r
= pack_table(&newtab
, path
+i
+1, min_storage
);
568 if (r
.size
< best
.size
)
570 stb_arr_free(newtab
.input
);
571 //printf("Size: %6d + %6d\n", newtab.inherited_storage, newtab.input_size * newtab.length);
576 int pack_table_by_modes(table
*t
, int *modes
)
579 while (*modes
> -1) {
581 newtab
= pack_for_mode(&s
, *modes
, 0);
582 if (s
.input
!= t
->input
)
583 stb_arr_free(s
.input
);
587 return s
.inherited_storage
+ s
.input_size
* s
.length
;
590 int strip_table(table
*t
, int exceptions
)
594 while (t
->input
[p
] == t
->dont_care
)
596 terminal_value
= t
->input
[p
];
598 while (p
>= 0x10000) {
599 if (t
->input
[p
] != terminal_value
&& t
->input
[p
] != t
->dont_care
) {
607 return p
+1; // p is a character we must output
610 void optimize_table(table
*t
, char *table_name
)
612 int modelist
[3] = { 85, -1 };
620 // strip tail end of table
621 int orig_length
= t
->length
;
622 int threshhold
= 0xffff;
623 int p
= strip_table(t
, 2);
624 int len_saved
= t
->length
- p
;
625 if (len_saved
>= threshhold
) {
627 while (p
> 0x10000) {
628 p
= strip_table(t
, 0);
629 len_saved
= t
->length
- p
;
630 if (len_saved
< 0x10000)
632 len_saved
= orig_length
- p
;
633 if (len_saved
< threshhold
)
642 // find size of table if we use path 86
643 decent_size
= pack_table_by_modes(t
, modelist
);
647 // find best packing of remainder of table by exploring tree of packings
648 r
= pack_table(t
, 0, decent_size
);
649 // use the computed 'path' to evaluate and output tree
652 path
= 86;//90;//132097;
656 modes
[num_modes
++] = (path
& 127) - 1;
660 printf("// modes: %d\n", r
.path
);
662 while (num_modes
> 0) {
664 sprintf(name
, "%s_%d", table_name
, num_modes
+1);
666 s
= pack_for_mode(&s
, modes
[num_modes
], name
);
668 // output the final table as-is
670 output_table_with_trims(table_name
, "_1", s
.input
, s
.length
);
672 output_table(table_name
, "_1", s
.input
, s
.length
, 0, NULL
);
675 uval unicode_table
[0x110000];
682 char_range
get_range(char *str
)
686 cr
.lo
= strtol(str
, &p
, 16);
687 p
= stb_skipwhite(p
);
689 cr
.hi
= strtol(p
+2, NULL
, 16);
695 char *skip_semi(char *s
, int count
)
706 int main(int argc
, char **argv
)
711 char **s
= stb_stringfile("../../data/UnicodeData.txt", &n
);
713 for (i
=0; i
< n
; ++i
) {
714 if (s
[i
][0] == '#' || s
[i
][0] == '\n' || s
[i
][0] == 0)
717 char_range cr
= get_range(s
[i
]);
718 char *t
= skip_semi(s
[i
], 13);
720 if (*t
== ';' || *t
== '\n' || *t
== 0)
723 v
= strtol(t
, NULL
, 16);
725 maxv
= stb_max(v
, maxv
);
726 for (j
=cr
.lo
; j
<= cr
.hi
; ++j
) {
727 unicode_table
[j
] = v
;
728 //printf("%06x => %06x\n", j, v);
736 t
.dont_care
= UVAL_DONT_CARE_DEFAULT
;
739 t
.inherited_storage
= 0;
742 t
.input
= unicode_table
;
743 t
.input_size
= size_for_max_number(maxv
);
745 t
.replace_fallback_with_codepoint
= 1;
747 optimize_table(&t
, "stbu_upppercase");