1 // stb_dxt.h - v1.04 - DXT1/DXT5 compressor - public domain
2 // original by fabian "ryg" giesen - ported to C by stb
3 // use '#define STB_DXT_IMPLEMENTATION' before including to create the implementation
6 // call stb_compress_dxt_block() for every block (you must pad)
7 // source should be a 4x4 block of RGBA data in row-major order;
8 // A is ignored if you specify alpha=0; you can turn on dithering
9 // and "high quality" using mode.
12 // v1.04 - (ryg) default to no rounding bias for lerped colors (as per S3TC/DX10 spec);
13 // single color match fix (allow for inexact color interpolation);
14 // optimal DXT5 index finder; "high quality" mode that runs multiple refinement steps.
15 // v1.03 - (stb) endianness support
16 // v1.02 - (stb) fix alpha encoding bug
17 // v1.01 - (stb) fix bug converting to RGB that messed up quality, thanks ryg & cbloom
18 // v1.00 - (stb) first release
22 // This software is dual-licensed to the public domain and under the following
23 // license: you are granted a perpetual, irrevocable license to copy, modify,
24 // publish, and distribute this file as you see fit.
26 #ifndef STB_INCLUDE_STB_DXT_H
27 #define STB_INCLUDE_STB_DXT_H
29 // compression mode (bitflags)
30 #define STB_DXT_NORMAL 0
31 #define STB_DXT_DITHER 1 // use dithering. dubious win. never use for normal maps and the like!
32 #define STB_DXT_HIGHQUAL 2 // high quality mode, does two refinement steps instead of 1. ~30-40% slower.
34 void stb_compress_dxt_block(unsigned char *dest
, const unsigned char *src
, int alpha
, int mode
);
35 #define STB_COMPRESS_DXT_BLOCK
37 #ifdef STB_DXT_IMPLEMENTATION
39 // configuration options for DXT encoder. set them in the project/makefile or just define
42 // STB_DXT_USE_ROUNDING_BIAS
43 // use a rounding bias during color interpolation. this is closer to what "ideal"
44 // interpolation would do but doesn't match the S3TC/DX10 spec. old versions (pre-1.03)
45 // implicitly had this turned on.
47 // in case you're targeting a specific type of hardware (e.g. console programmers):
48 // NVidia and Intel GPUs (as of 2010) as well as DX9 ref use DXT decoders that are closer
49 // to STB_DXT_USE_ROUNDING_BIAS. AMD/ATI, S3 and DX10 ref are closer to rounding with no bias.
50 // you also see "(a*5 + b*3) / 8" on some old GPU designs.
51 // #define STB_DXT_USE_ROUNDING_BIAS
55 #include <string.h> // memset
57 static unsigned char stb__Expand5
[32];
58 static unsigned char stb__Expand6
[64];
59 static unsigned char stb__OMatch5
[256][2];
60 static unsigned char stb__OMatch6
[256][2];
61 static unsigned char stb__QuantRBTab
[256+16];
62 static unsigned char stb__QuantGTab
[256+16];
64 static int stb__Mul8Bit(int a
, int b
)
67 return (t
+ (t
>> 8)) >> 8;
70 static void stb__From16Bit(unsigned char *out
, unsigned short v
)
72 int rv
= (v
& 0xf800) >> 11;
73 int gv
= (v
& 0x07e0) >> 5;
74 int bv
= (v
& 0x001f) >> 0;
76 out
[0] = stb__Expand5
[rv
];
77 out
[1] = stb__Expand6
[gv
];
78 out
[2] = stb__Expand5
[bv
];
82 static unsigned short stb__As16Bit(int r
, int g
, int b
)
84 return (stb__Mul8Bit(r
,31) << 11) + (stb__Mul8Bit(g
,63) << 5) + stb__Mul8Bit(b
,31);
87 // linear interpolation at 1/3 point between a and b, using desired rounding type
88 static int stb__Lerp13(int a
, int b
)
90 #ifdef STB_DXT_USE_ROUNDING_BIAS
92 return a
+ stb__Mul8Bit(b
-a
, 0x55);
94 // without rounding bias
95 // replace "/ 3" by "* 0xaaab) >> 17" if your compiler sucks or you really need every ounce of speed.
101 static void stb__Lerp13RGB(unsigned char *out
, unsigned char *p1
, unsigned char *p2
)
103 out
[0] = stb__Lerp13(p1
[0], p2
[0]);
104 out
[1] = stb__Lerp13(p1
[1], p2
[1]);
105 out
[2] = stb__Lerp13(p1
[2], p2
[2]);
108 /****************************************************************************/
110 // compute table to reproduce constant colors as accurately as possible
111 static void stb__PrepareOptTable(unsigned char *Table
,const unsigned char *expand
,int size
)
114 for (i
=0;i
<256;i
++) {
116 for (mn
=0;mn
<size
;mn
++) {
117 for (mx
=0;mx
<size
;mx
++) {
118 int mine
= expand
[mn
];
119 int maxe
= expand
[mx
];
120 int err
= abs(stb__Lerp13(maxe
, mine
) - i
);
122 // DX10 spec says that interpolation must be within 3% of "correct" result,
123 // add this as error term. (normally we'd expect a random distribution of
124 // +-1.5% error, but nowhere in the spec does it say that the error has to be
125 // unbiased - better safe than sorry).
126 err
+= abs(maxe
- mine
) * 3 / 100;
139 static void stb__EvalColors(unsigned char *color
,unsigned short c0
,unsigned short c1
)
141 stb__From16Bit(color
+ 0, c0
);
142 stb__From16Bit(color
+ 4, c1
);
143 stb__Lerp13RGB(color
+ 8, color
+0, color
+4);
144 stb__Lerp13RGB(color
+12, color
+4, color
+0);
147 // Block dithering function. Simply dithers a block to 565 RGB.
149 static void stb__DitherBlock(unsigned char *dest
, unsigned char *block
)
151 int err
[8],*ep1
= err
,*ep2
= err
+4, *et
;
154 // process channels seperately
155 for (ch
=0; ch
<3; ++ch
) {
156 unsigned char *bp
= block
+ch
, *dp
= dest
+ch
;
157 unsigned char *quant
= (ch
== 1) ? stb__QuantGTab
+8 : stb__QuantRBTab
+8;
158 memset(err
, 0, sizeof(err
));
160 dp
[ 0] = quant
[bp
[ 0] + ((3*ep2
[1] + 5*ep2
[0]) >> 4)];
161 ep1
[0] = bp
[ 0] - dp
[ 0];
162 dp
[ 4] = quant
[bp
[ 4] + ((7*ep1
[0] + 3*ep2
[2] + 5*ep2
[1] + ep2
[0]) >> 4)];
163 ep1
[1] = bp
[ 4] - dp
[ 4];
164 dp
[ 8] = quant
[bp
[ 8] + ((7*ep1
[1] + 3*ep2
[3] + 5*ep2
[2] + ep2
[1]) >> 4)];
165 ep1
[2] = bp
[ 8] - dp
[ 8];
166 dp
[12] = quant
[bp
[12] + ((7*ep1
[2] + 5*ep2
[3] + ep2
[2]) >> 4)];
167 ep1
[3] = bp
[12] - dp
[12];
170 et
= ep1
, ep1
= ep2
, ep2
= et
; // swap
175 // The color matching function
176 static unsigned int stb__MatchColorsBlock(unsigned char *block
, unsigned char *color
,int dither
)
178 unsigned int mask
= 0;
179 int dirr
= color
[0*4+0] - color
[1*4+0];
180 int dirg
= color
[0*4+1] - color
[1*4+1];
181 int dirb
= color
[0*4+2] - color
[1*4+2];
185 int c0Point
, halfPoint
, c3Point
;
188 dots
[i
] = block
[i
*4+0]*dirr
+ block
[i
*4+1]*dirg
+ block
[i
*4+2]*dirb
;
191 stops
[i
] = color
[i
*4+0]*dirr
+ color
[i
*4+1]*dirg
+ color
[i
*4+2]*dirb
;
193 // think of the colors as arranged on a line; project point onto that line, then choose
194 // next color out of available ones. we compute the crossover points for "best color in top
195 // half"/"best in bottom half" and then the same inside that subinterval.
197 // relying on this 1d approximation isn't always optimal in terms of euclidean distance,
198 // but it's very close and a lot faster.
199 // http://cbloomrants.blogspot.com/2008/12/12-08-08-dxtc-summary.html
201 c0Point
= (stops
[1] + stops
[3]) >> 1;
202 halfPoint
= (stops
[3] + stops
[2]) >> 1;
203 c3Point
= (stops
[2] + stops
[0]) >> 1;
206 // the version without dithering is straightforward
207 for (i
=15;i
>=0;i
--) {
212 mask
|= (dot
< c0Point
) ? 1 : 3;
214 mask
|= (dot
< c3Point
) ? 2 : 0;
217 // with floyd-steinberg dithering
218 int err
[8],*ep1
= err
,*ep2
= err
+4;
231 dot
= (dp
[0] << 4) + (3*ep2
[1] + 5*ep2
[0]);
233 step
= (dot
< c0Point
) ? 1 : 3;
235 step
= (dot
< c3Point
) ? 2 : 0;
236 ep1
[0] = dp
[0] - stops
[step
];
239 dot
= (dp
[1] << 4) + (7*ep1
[0] + 3*ep2
[2] + 5*ep2
[1] + ep2
[0]);
241 step
= (dot
< c0Point
) ? 1 : 3;
243 step
= (dot
< c3Point
) ? 2 : 0;
244 ep1
[1] = dp
[1] - stops
[step
];
247 dot
= (dp
[2] << 4) + (7*ep1
[1] + 3*ep2
[3] + 5*ep2
[2] + ep2
[1]);
249 step
= (dot
< c0Point
) ? 1 : 3;
251 step
= (dot
< c3Point
) ? 2 : 0;
252 ep1
[2] = dp
[2] - stops
[step
];
255 dot
= (dp
[3] << 4) + (7*ep1
[2] + 5*ep2
[3] + ep2
[2]);
257 step
= (dot
< c0Point
) ? 1 : 3;
259 step
= (dot
< c3Point
) ? 2 : 0;
260 ep1
[3] = dp
[3] - stops
[step
];
264 mask
|= lmask
<< (y
*8);
265 { int *et
= ep1
; ep1
= ep2
; ep2
= et
; } // swap
272 // The color optimization function. (Clever code, part 1)
273 static void stb__OptimizeColorsBlock(unsigned char *block
, unsigned short *pmax16
, unsigned short *pmin16
)
275 int mind
= 0x7fffffff,maxd
= -0x7fffffff;
276 unsigned char *minp
, *maxp
;
279 static const int nIterPower
= 4;
280 float covf
[6],vfr
,vfg
,vfb
;
282 // determine color distribution
284 int mu
[3],min
[3],max
[3];
289 const unsigned char *bp
= ((const unsigned char *) block
) + ch
;
292 muv
= minv
= maxv
= bp
[0];
296 if (bp
[i
] < minv
) minv
= bp
[i
];
297 else if (bp
[i
] > maxv
) maxv
= bp
[i
];
300 mu
[ch
] = (muv
+ 8) >> 4;
305 // determine covariance matrix
311 int r
= block
[i
*4+0] - mu
[0];
312 int g
= block
[i
*4+1] - mu
[1];
313 int b
= block
[i
*4+2] - mu
[2];
323 // convert covariance matrix to float, find principal axis via power iter
325 covf
[i
] = cov
[i
] / 255.0f
;
327 vfr
= (float) (max
[0] - min
[0]);
328 vfg
= (float) (max
[1] - min
[1]);
329 vfb
= (float) (max
[2] - min
[2]);
331 for(iter
=0;iter
<nIterPower
;iter
++)
333 float r
= vfr
*covf
[0] + vfg
*covf
[1] + vfb
*covf
[2];
334 float g
= vfr
*covf
[1] + vfg
*covf
[3] + vfb
*covf
[4];
335 float b
= vfr
*covf
[2] + vfg
*covf
[4] + vfb
*covf
[5];
343 if (fabs(vfg
) > magn
) magn
= fabs(vfg
);
344 if (fabs(vfb
) > magn
) magn
= fabs(vfb
);
346 if(magn
< 4.0f
) { // too small, default to luminance
347 v_r
= 299; // JPEG YCbCr luma coefs, scaled by 1000.
352 v_r
= (int) (vfr
* magn
);
353 v_g
= (int) (vfg
* magn
);
354 v_b
= (int) (vfb
* magn
);
357 // Pick colors at extreme points
360 int dot
= block
[i
*4+0]*v_r
+ block
[i
*4+1]*v_g
+ block
[i
*4+2]*v_b
;
373 *pmax16
= stb__As16Bit(maxp
[0],maxp
[1],maxp
[2]);
374 *pmin16
= stb__As16Bit(minp
[0],minp
[1],minp
[2]);
377 static int stb__sclamp(float y
, int p0
, int p1
)
380 if (x
< p0
) return p0
;
381 if (x
> p1
) return p1
;
385 // The refinement function. (Clever code, part 2)
386 // Tries to optimize colors to suit block contents better.
387 // (By solving a least squares system via normal equations+Cramer's rule)
388 static int stb__RefineBlock(unsigned char *block
, unsigned short *pmax16
, unsigned short *pmin16
, unsigned int mask
)
390 static const int w1Tab
[4] = { 3,0,2,1 };
391 static const int prods
[4] = { 0x090000,0x000900,0x040102,0x010402 };
392 // ^some magic to save a lot of multiplies in the accumulating loop...
393 // (precomputed products of weights for least squares system, accumulated inside one 32-bit register)
396 unsigned short oldMin
, oldMax
, min16
, max16
;
397 int i
, akku
= 0, xx
,xy
,yy
;
398 int At1_r
,At1_g
,At1_b
;
399 int At2_r
,At2_g
,At2_b
;
400 unsigned int cm
= mask
;
405 if((mask
^ (mask
<<2)) < 4) // all pixels have the same index?
407 // yes, linear system would be singular; solve using optimal
408 // single-color match on average color
409 int r
= 8, g
= 8, b
= 8;
416 r
>>= 4; g
>>= 4; b
>>= 4;
418 max16
= (stb__OMatch5
[r
][0]<<11) | (stb__OMatch6
[g
][0]<<5) | stb__OMatch5
[b
][0];
419 min16
= (stb__OMatch5
[r
][1]<<11) | (stb__OMatch6
[g
][1]<<5) | stb__OMatch5
[b
][1];
421 At1_r
= At1_g
= At1_b
= 0;
422 At2_r
= At2_g
= At2_b
= 0;
423 for (i
=0;i
<16;++i
,cm
>>=2) {
425 int w1
= w1Tab
[step
];
426 int r
= block
[i
*4+0];
427 int g
= block
[i
*4+1];
428 int b
= block
[i
*4+2];
439 At2_r
= 3*At2_r
- At1_r
;
440 At2_g
= 3*At2_g
- At1_g
;
441 At2_b
= 3*At2_b
- At1_b
;
443 // extract solutions and decide solvability
445 yy
= (akku
>> 8) & 0xff;
446 xy
= (akku
>> 0) & 0xff;
448 frb
= 3.0f
* 31.0f
/ 255.0f
/ (xx
*yy
- xy
*xy
);
449 fg
= frb
* 63.0f
/ 31.0f
;
452 max16
= stb__sclamp((At1_r
*yy
- At2_r
*xy
)*frb
+0.5f
,0,31) << 11;
453 max16
|= stb__sclamp((At1_g
*yy
- At2_g
*xy
)*fg
+0.5f
,0,63) << 5;
454 max16
|= stb__sclamp((At1_b
*yy
- At2_b
*xy
)*frb
+0.5f
,0,31) << 0;
456 min16
= stb__sclamp((At2_r
*xx
- At1_r
*xy
)*frb
+0.5f
,0,31) << 11;
457 min16
|= stb__sclamp((At2_g
*xx
- At1_g
*xy
)*fg
+0.5f
,0,63) << 5;
458 min16
|= stb__sclamp((At2_b
*xx
- At1_b
*xy
)*frb
+0.5f
,0,31) << 0;
463 return oldMin
!= min16
|| oldMax
!= max16
;
466 // Color block compression
467 static void stb__CompressColorBlock(unsigned char *dest
, unsigned char *block
, int mode
)
473 unsigned short max16
, min16
;
474 unsigned char dblock
[16*4],color
[4*4];
476 dither
= mode
& STB_DXT_DITHER
;
477 refinecount
= (mode
& STB_DXT_HIGHQUAL
) ? 2 : 1;
479 // check if block is constant
481 if (((unsigned int *) block
)[i
] != ((unsigned int *) block
)[0])
484 if(i
== 16) { // constant color
485 int r
= block
[0], g
= block
[1], b
= block
[2];
487 max16
= (stb__OMatch5
[r
][0]<<11) | (stb__OMatch6
[g
][0]<<5) | stb__OMatch5
[b
][0];
488 min16
= (stb__OMatch5
[r
][1]<<11) | (stb__OMatch6
[g
][1]<<5) | stb__OMatch5
[b
][1];
490 // first step: compute dithered version for PCA if desired
492 stb__DitherBlock(dblock
,block
);
494 // second step: pca+map along principal axis
495 stb__OptimizeColorsBlock(dither
? dblock
: block
,&max16
,&min16
);
496 if (max16
!= min16
) {
497 stb__EvalColors(color
,max16
,min16
);
498 mask
= stb__MatchColorsBlock(block
,color
,dither
);
502 // third step: refine (multiple times if requested)
503 for (i
=0;i
<refinecount
;i
++) {
504 unsigned int lastmask
= mask
;
506 if (stb__RefineBlock(dither
? dblock
: block
,&max16
,&min16
,mask
)) {
507 if (max16
!= min16
) {
508 stb__EvalColors(color
,max16
,min16
);
509 mask
= stb__MatchColorsBlock(block
,color
,dither
);
521 // write the color block
524 unsigned short t
= min16
;
530 dest
[0] = (unsigned char) (max16
);
531 dest
[1] = (unsigned char) (max16
>> 8);
532 dest
[2] = (unsigned char) (min16
);
533 dest
[3] = (unsigned char) (min16
>> 8);
534 dest
[4] = (unsigned char) (mask
);
535 dest
[5] = (unsigned char) (mask
>> 8);
536 dest
[6] = (unsigned char) (mask
>> 16);
537 dest
[7] = (unsigned char) (mask
>> 24);
540 // Alpha block compression (this is easy for a change)
541 static void stb__CompressAlphaBlock(unsigned char *dest
,unsigned char *src
,int mode
)
543 int i
,dist
,bias
,dist4
,dist2
,bits
,mask
;
545 // find min/max color
551 if (src
[i
*4+3] < mn
) mn
= src
[i
*4+3];
552 else if (src
[i
*4+3] > mx
) mx
= src
[i
*4+3];
556 ((unsigned char *)dest
)[0] = mx
;
557 ((unsigned char *)dest
)[1] = mn
;
560 // determine bias and emit color indices
561 // given the choice of mx/mn, these indices are optimal:
562 // http://fgiesen.wordpress.com/2009/12/15/dxt5-alpha-block-index-determination/
566 bias
= (dist
< 8) ? (dist
- 1) : (dist
/2 + 2);
571 int a
= src
[i
*4+3]*7 + bias
;
574 // select index. this is a "linear scale" lerp factor between 0 (val=min) and 7 (val=max).
575 t
= (a
>= dist4
) ? -1 : 0; ind
= t
& 4; a
-= dist4
& t
;
576 t
= (a
>= dist2
) ? -1 : 0; ind
+= t
& 2; a
-= dist2
& t
;
579 // turn linear scale into DXT index (0/1 are extremal pts)
585 if((bits
+= 3) >= 8) {
593 static void stb__InitDXT()
597 stb__Expand5
[i
] = (i
<<3)|(i
>>2);
600 stb__Expand6
[i
] = (i
<<2)|(i
>>4);
602 for(i
=0;i
<256+16;i
++)
604 int v
= i
-8 < 0 ? 0 : i
-8 > 255 ? 255 : i
-8;
605 stb__QuantRBTab
[i
] = stb__Expand5
[stb__Mul8Bit(v
,31)];
606 stb__QuantGTab
[i
] = stb__Expand6
[stb__Mul8Bit(v
,63)];
609 stb__PrepareOptTable(&stb__OMatch5
[0][0],stb__Expand5
,32);
610 stb__PrepareOptTable(&stb__OMatch6
[0][0],stb__Expand6
,64);
613 void stb_compress_dxt_block(unsigned char *dest
, const unsigned char *src
, int alpha
, int mode
)
622 stb__CompressAlphaBlock(dest
,(unsigned char*) src
,mode
);
626 stb__CompressColorBlock(dest
,(unsigned char*) src
,mode
);
628 #endif // STB_DXT_IMPLEMENTATION
630 #endif // STB_INCLUDE_STB_DXT_H