1 /// D translation of stb_image-1.33 (http://nothings.org/stb_image.c)
2 ///
3 /// This port only supports:
4 /// $(UL
5 ///   $(LI PNG 8-bit-per-channel only.)
6 ///   $(LI JPEG baseline (no JPEG progressive).)
7 ///   $(LI GIF.)
8 ///   $(LI BMP non-1bpp, non-RLE.)
9 /// )
10 ///
11 /// TODO:
12 /// $(UL
13 ///   $(LI Support a range as input.)
14 ///  )
15 
16 //============================    Contributors    =========================
17 //
18 // Image formats                                Optimizations & bugfixes
19 // Sean Barrett (jpeg, png, bmp)                Fabian "ryg" Giesen
20 // Nicolas Schulz (hdr, psd)
21 // Jonathan Dummer (tga)                     Bug fixes & warning fixes
22 // Jean-Marc Lienher (gif)                      Marc LeBlanc
23 // Tom Seddon (pic)                             Christpher Lloyd
24 // Thatcher Ulrich (psd)                        Dave Moore
25 // Won Chun
26 // the Horde3D community
27 // Extensions, features                            Janez Zemva
28 // Jetro Lauha (stbi_info)                      Jonathan Blow
29 // James "moose2000" Brown (iPhone PNG)         Laurent Gomila
30 // Ben "Disch" Wenger (io callbacks)            Aruelien Pocheville
31 // Martin "SpartanJ" Golini                     Ryamond Barbiero
32 // David Woo
33 
34 module gfm.image.stb_image;
35 
36 import core.stdc.stdlib;
37 import core.stdc.string;
38 
39 import ae.utils.graphics.image;
40 import ae.utils.graphics.color;
41 
/// Version constant exported by this module (value 1).
enum STBI_VERSION = 1;
43 
/// Exception raised whenever loading an image fails.
class STBImageException : Exception
{
    /// Builds the exception; every argument is forwarded unchanged to the
    /// `Exception` base constructor.
    public this(string message, string file = __FILE__, size_t line = __LINE__, Throwable next = null) @safe pure nothrow
    {
        super(message, file, line, next);
    }
}
55 
// Component counts accepted for the req_comp argument of the loaders.
// The numeric value of each non-default member is the number of channels.
enum : int
{
   STBI_default    = 0, // only used for req_comp
   STBI_grey       = 1, // one channel
   STBI_grey_alpha = 2, // two channels
   STBI_rgb        = 3, // three channels
   STBI_rgb_alpha  = 4  // four channels
};
64 
65 // define faster low-level operations (typically SIMD support)
66 
67 
// Rotates the 32-bit value x left by y bits.
// Both shift counts are masked to 0..31 so that y == 0 (or y == 32) never
// turns into a shift by the full width of the type, which is not a valid
// shift count; for y in 1..31 the result is the same as the plain formula.
uint stbi_lrot(uint x, uint y)
{
    return (x << (y & 31)) | (x >> ((32 - y) & 31));
}
72 
// stbi structure is our basic context used by all images, so it
// contains all the IO context, plus some basic image information
struct stbi
{
   uint img_x, img_y;      // image width and height as read from the file header
   int img_n, img_out_n;   // img_n: component count stored in the file;
                           // img_out_n: presumably the output component count -- TODO confirm

   int buflen;               // not referenced by the memory-only reader functions in this module section
   ubyte[128] buffer_start;  // likewise unused by the in-memory path (D-style array declaration)

   const(ubyte) *img_buffer;          // current read position
   const(ubyte) *img_buffer_end;      // one past the last readable byte
   const(ubyte) *img_buffer_original; // start of the buffer, restored by stbi_rewind
}
87 
88 
// Sets up the context to decode from the `len` bytes starting at `buffer`:
// the read cursor and the rewind position both point at the first byte,
// and the end marker points one past the last byte.
void start_mem(stbi *s, const(ubyte)*buffer, int len)
{
   const(ubyte)* first = buffer;
   s.img_buffer_original = first;
   s.img_buffer = first;
   s.img_buffer_end = first + len;
}
96 
// Moves the read cursor back to the start of the initial buffer.
// Conceptually a rewind should go back to the beginning of the stream;
// returning to the start of the buffer is enough because rewinding is only
// done after a format 'test', which looks at no more than 92 bytes.
void stbi_rewind(stbi *s)
{
   const(ubyte)* start = s.img_buffer_original;
   s.img_buffer = start;
}
104 
105 
// Detects the image format and decodes it.
//
// Each supported format (JPEG, PNG, BMP, GIF, in that order) is probed with
// its *_test function; the first format whose probe does not throw is
// decoded with the matching *_load function.
//
// Only the probe is guarded by a try/catch. An STBImageException raised by
// the loader itself propagates to the caller, so the real failure reason is
// reported instead of being replaced by the generic "not of any known type"
// message.
//
// Params:
//   s        = decoding context, already pointing at the data
//   x, y     = receive the image width and height
//   comp     = receives the component count reported by the loader
//   req_comp = requested component count (STBI_default .. STBI_rgb_alpha)
// Returns: the pixel buffer produced by the format loader.
// Throws: STBImageException when no probe accepts the data, or when the
//         selected loader fails.
ubyte *stbi_load_main(stbi *s, int *x, int *y, int *comp, int req_comp)
{
    // Runs one format probe, rewinds the stream, and reports whether the
    // probe accepted the data (i.e. did not throw).
    bool accepts(lazy void probe)
    {
        bool ok = true;
        try
        {
            probe();
        }
        catch (STBImageException)
        {
            ok = false;
        }
        stbi_rewind(s);
        return ok;
    }

    if (accepts(stbi_jpeg_test(s)))
        return stbi_jpeg_load(s,x,y,comp,req_comp);

    if (accepts(stbi_png_test(s)))
        return stbi_png_load(s,x,y,comp,req_comp);

    if (accepts(stbi_bmp_test(s)))
        return stbi_bmp_load(s,x,y,comp,req_comp);

    if (accepts(stbi_gif_test(s)))
        return stbi_gif_load(s,x,y,comp,req_comp);

    throw new STBImageException("Image not of any known type, or corrupt");
}
154 
/// Loads an image from memory.
///
/// Params:
///   buffer              = the encoded image file contents
///   width, height       = receive the decoded image dimensions
///   components          = receives the component count reported by the loader
///   requestedComponents = component count wanted in the output (0 for default)
/// Returns: the pixel buffer returned by the format loader; release it with
///          stbi_image_free.
/// Throws: STBImageException on error, including when the buffer is longer
///         than the int-sized length the decoding context can represent.
ubyte* stbi_load_from_memory(void[] buffer, out int width, out int height, out int components, int requestedComponents)
{
   // The context stores the length as an int; a silent truncation would make
   // the decoder see a wrong (possibly negative) buffer size.
   if (buffer.length > int.max)
      throw new STBImageException("Image buffer too large");

   stbi s;
   start_mem(&s, cast(ubyte*)buffer.ptr, cast(int)(buffer.length));
   return stbi_load_main(&s, &width, &height, &components, requestedComponents);
}
163 
/// Releases a pixel buffer previously returned by one of the stbi load
/// functions. The pointer is handed to the C `free`.
void stbi_image_free(void *retval_from_stbi_load)
{
    void* pixels = retval_from_stbi_load;
    free(pixels);
}
169 
/// Loads an image from memory and copies it into an
/// ae.utils.graphics.image.Image of RGBA pixels.
///
/// The image is decoded with 4 requested components; the temporary buffer
/// returned by the decoder is freed before this function returns.
/// Throws: STBImageException on error.
Image!RGBA stbiLoadImageAE(void[] buffer)
{
    int width, height, components;
    ubyte* data = stbi_load_from_memory(buffer, width, height, components, 4);
    scope(exit) stbi_image_free(data);

    if(components != 4)
        throw new STBImageException("Could't convert image to 4 components");

    auto result = Image!RGBA(width, height);
    // Widen before multiplying so the byte count is computed in size_t
    // rather than overflowing in int arithmetic.
    immutable size_t length = cast(size_t) width * height * RGBA.sizeof;
    result.pixels[] = cast(RGBA[])(data[0..length]);
    return result;
}
186 
187 //
188 // Common code used by all image loaders
189 //
190 
// Scan modes passed to the header parsers. process_frame_header stops right
// after validating the header fields for anything other than SCAN_load.
enum : int
{
   SCAN_load=0,   // full decode
   SCAN_type,     // presumably: only identify the file type -- TODO confirm against callers
   SCAN_header    // presumably: only read the header information -- TODO confirm against callers
};
197 
198 
// Reads the next byte and advances the cursor.
// Returns 0 without moving when the cursor is at or past the end.
int get8(stbi *s)
{
   if (s.img_buffer >= s.img_buffer_end)
      return 0;

   int value = *s.img_buffer;
   ++s.img_buffer;
   return value;
}
206 
// Returns 1 when the read cursor has reached (or passed) the end of the
// buffer, 0 while bytes remain.
int at_eof(stbi *s)
{
   return s.img_buffer < s.img_buffer_end ? 0 : 1;
}
211 
// Same as get8, with the result narrowed to an unsigned byte.
ubyte get8u(stbi *s)
{
   int raw = get8(s);
   return cast(ubyte) raw;
}
216 
// Advances the read cursor by n bytes.
// A negative n (which callers can produce from a corrupt length field) or an
// n larger than the remaining data moves the cursor to the end of the buffer
// instead of before its start or beyond its end; subsequent reads then
// return 0 and at_eof reports true.
void skip(stbi *s, int n)
{
   if (n < 0 || n > s.img_buffer_end - s.img_buffer) {
      s.img_buffer = s.img_buffer_end;
      return;
   }
   s.img_buffer += n;
}
221 
// Copies the next n bytes into `buffer` and advances the cursor.
// Returns 1 on success. Returns 0, copying nothing and leaving the cursor
// untouched, when n is negative or fewer than n bytes remain; the remaining
// length is compared instead of forming `img_buffer + n`, so a negative n
// can no longer reach memcpy as a huge size.
int getn(stbi *s, ubyte *buffer, int n)
{
   if (n < 0 || n > s.img_buffer_end - s.img_buffer)
      return 0;

   memcpy(buffer, s.img_buffer, n);
   s.img_buffer += n;
   return 1;
}
231 
// Reads a 16-bit big-endian value (first byte is the high byte).
int get16(stbi *s)
{
   int high = get8(s);
   int low = get8(s);
   return (high << 8) | low;
}
237 
// Reads a 32-bit big-endian value as two consecutive 16-bit big-endian halves.
uint get32(stbi *s)
{
   uint high = get16(s);
   uint low = get16(s);
   return (high << 16) | low;
}
243 
// Reads a 16-bit little-endian value (first byte is the low byte).
int get16le(stbi *s)
{
   int low = get8(s);
   int high = get8(s);
   return low | (high << 8);
}
249 
// Reads a 32-bit little-endian value as two consecutive 16-bit
// little-endian halves (low half first).
uint get32le(stbi *s)
{
   uint low = get16le(s);
   uint high = get16le(s);
   return low | (high << 16);
}
255 
256 //
257 //  generic converter from built-in img_n to req_comp
258 //    individual types do this automatically as much as possible (e.g. jpeg
259 //    does all cases internally since it needs to colorspace convert anyway,
260 //    and it never has alpha, so very few cases ). png can automatically
261 //    interleave an alpha=255 channel, but falls back to this for other cases
262 //
263 //  assume data buffer is malloced, so malloc a new one and free that one
264 //  only failure mode is malloc failing
265 
// Converts an RGB triple to a single grey value using the integer weights
// 77, 150 and 29 (which sum to 256) followed by a shift right by 8.
ubyte compute_y(int r, int g, int b)
{
   int weighted = 77 * r + 150 * g + 29 * b;
   return cast(ubyte) (weighted >> 8);
}
270 
// Converts an interleaved image with img_n components per pixel into one
// with req_comp components per pixel.
//
// Params:
//   data     = malloc'ed source pixels, x * y * img_n bytes
//   img_n    = components per pixel in `data` (1..4)
//   req_comp = components per pixel wanted (1..4)
//   x, y     = image width and height in pixels
// Returns: `data` itself when no conversion is needed, otherwise a newly
//          malloc'ed buffer; in that case `data` has been freed.
// Throws: STBImageException when the output size does not fit in size_t or
//         when the allocation fails (`data` is freed first in both cases).
//
// All sizes and row offsets are computed in size_t with an explicit overflow
// check, so a large image can no longer wrap the 32-bit product
// req_comp * x * y into a too-small allocation.
ubyte *convert_format(ubyte *data, int img_n, int req_comp, uint x, uint y)
{
    if (req_comp == img_n) return data;
    assert(req_comp >= 1 && req_comp <= 4);

    immutable size_t width = x;
    immutable size_t height = y;

    // width * height * req_comp must be representable in size_t
    if (width != 0 && height != 0 && width > size_t.max / cast(size_t) req_comp / height) {
        free(data);
        throw new STBImageException("Image too large to decode");
    }

    ubyte *good = cast(ubyte*) malloc(width * height * req_comp);
    if (good == null) {
        free(data);
        throw new STBImageException("Out of memory");
    }

    for (size_t row = 0; row < height; ++row) {
        ubyte *src  = data + row * width * img_n;
        ubyte *dest = good + row * width * req_comp;

        // the switch is evaluated once per scanline rather than once per pixel
        switch (img_n * 8 + req_comp)
        {
            case 1 * 8 + 2: // grey -> grey + opaque alpha
                for (size_t i = 0; i < width; ++i, src += 1, dest += 2) {
                    dest[0] = src[0];
                    dest[1] = 255;
                }
                break;
            case 1 * 8 + 3: // grey -> rgb
                for (size_t i = 0; i < width; ++i, src += 1, dest += 3)
                    dest[0] = dest[1] = dest[2] = src[0];
                break;
            case 1 * 8 + 4: // grey -> rgb + opaque alpha
                for (size_t i = 0; i < width; ++i, src += 1, dest += 4) {
                    dest[0] = dest[1] = dest[2] = src[0];
                    dest[3] = 255;
                }
                break;
            case 2 * 8 + 1: // grey+alpha -> grey
                for (size_t i = 0; i < width; ++i, src += 2, dest += 1)
                    dest[0] = src[0];
                break;
            case 2 * 8 + 3: // grey+alpha -> rgb
                for (size_t i = 0; i < width; ++i, src += 2, dest += 3)
                    dest[0] = dest[1] = dest[2] = src[0];
                break;
            case 2 * 8 + 4: // grey+alpha -> rgba
                for (size_t i = 0; i < width; ++i, src += 2, dest += 4) {
                    dest[0] = dest[1] = dest[2] = src[0];
                    dest[3] = src[1];
                }
                break;
            case 3 * 8 + 4: // rgb -> rgb + opaque alpha
                for (size_t i = 0; i < width; ++i, src += 3, dest += 4) {
                    dest[0] = src[0];
                    dest[1] = src[1];
                    dest[2] = src[2];
                    dest[3] = 255;
                }
                break;
            case 3 * 8 + 1: // rgb -> grey
                for (size_t i = 0; i < width; ++i, src += 3, dest += 1)
                    dest[0] = compute_y(src[0],src[1],src[2]);
                break;
            case 3 * 8 + 2: // rgb -> grey + opaque alpha
                for (size_t i = 0; i < width; ++i, src += 3, dest += 2) {
                    dest[0] = compute_y(src[0],src[1],src[2]);
                    dest[1] = 255;
                }
                break;
            case 4 * 8 + 1: // rgba -> grey
                for (size_t i = 0; i < width; ++i, src += 4, dest += 1)
                    dest[0] = compute_y(src[0],src[1],src[2]);
                break;
            case 4 * 8 + 2: // rgba -> grey + alpha
                for (size_t i = 0; i < width; ++i, src += 4, dest += 2) {
                    dest[0] = compute_y(src[0],src[1],src[2]);
                    dest[1] = src[3];
                }
                break;
            case 4 * 8 + 3: // rgba -> rgb
                for (size_t i = 0; i < width; ++i, src += 4, dest += 3) {
                    dest[0] = src[0];
                    dest[1] = src[1];
                    dest[2] = src[2];
                }
                break;
            default: assert(0);
        }
    }

    free(data);
    return good;
}
348 
349 //
350 //  "baseline" JPEG/JFIF decoder (not actually fully baseline implementation)
351 //
352 //    simple implementation
353 //      - channel subsampling of at most 2 in each dimension
354 //      - doesn't support delayed output of y-dimension
355 //      - simple interface (only one output format: 8-bit interleaved RGB)
356 //      - doesn't try to recover corrupt jpegs
357 //      - doesn't allow partial loading, loading multiple at once
358 //      - still fast on x86 (copying globals into locals doesn't help x86)
359 //      - allocates lots of intermediate memory (full size of all components)
360 //        - non-interleaved case requires this anyway
361 //        - allows good upsampling (see next)
362 //    high-quality
363 //      - upsampled channels are bilinearly interpolated, even across blocks
364 //      - quality integer IDCT derived from IJG's 'slow'
365 //    performance
366 //      - fast huffman; reasonable integer IDCT
367 //      - uses a lot of intermediate memory, could cache poorly
368 //      - load http://nothings.org/remote/anemones.jpg 3 times on 2.8Ghz P4
369 //          stb_jpeg:   1.34 seconds (MSVC6, default release build)
370 //          stb_jpeg:   1.06 seconds (MSVC6, processor = Pentium Pro)
371 //          IJL11.dll:  1.08 seconds (compiled by intel)
372 //          IJG 1998:   0.98 seconds (MSVC6, makefile provided by IJG)
373 //          IJG 1998:   0.95 seconds (MSVC6, makefile + proc=PPro)
374 
// huffman decoding acceleration: number of leading bits used to index the
// `fast` lookup table of each huffman struct (table size is 1 << FAST_BITS)
enum FAST_BITS = 9;  // larger handles more cases; smaller stomps less cache
377 
// One JPEG huffman table, as built by build_huffman and filled in by the
// DHT handling of process_marker.
struct huffman
{
   // indexed by the next FAST_BITS bits of the stream; holds the symbol index
   // for codes of at most FAST_BITS bits, or 255 for "not accelerated"
   ubyte[1 << FAST_BITS] fast;
   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
   ushort[256] code;    // huffman code of each symbol index
   ubyte[256] values;   // decoded value of each symbol index (read from the DHT segment)
   ubyte[257] size;     // code length of each symbol index, terminated by a 0 entry
   uint[18] maxcode;    // per code length: largest code + 1, preshifted to 16 bits; entry 17 is a 0xffffffff sentinel
   int[17] delta;   // old 'firstsymbol' - old 'firstcode'
}
388 
// Complete state of the JPEG decoder for one image.
struct jpeg
{
   stbi *s;                // I/O context the decoder reads from
   huffman[4] huff_dc;     // DC huffman tables, indexed by the table id from DHT
   huffman[4] huff_ac;     // AC huffman tables, indexed by the table id from DHT
   ubyte[64][4] dequant;   // quantization tables from DQT, stored in de-zigzagged order

// sizes for components, interleaved MCUs
   int img_h_max, img_v_max;   // presumably the largest h / v sampling factors -- TODO confirm (assignment not visible here)
   int img_mcu_x, img_mcu_y;   // number of MCUs per row / column iterated when decoding an interleaved scan
   int img_mcu_w, img_mcu_h;   // presumably MCU width / height in pixels -- TODO confirm

// definition of jpeg image component
   struct img_comp_
   {
      int id;         // component id read from the frame header
      int h,v;        // horizontal / vertical sampling factors (validated to 1..4)
      int tq;         // index of the quantization table used (0..3)
      int hd,ha;      // indices of the DC / AC huffman tables selected by the scan header
      int dc_pred;    // running DC predictor, cleared by reset()

      int x,y,w2,h2;  // x,y: used to derive the block count of a non-interleaved scan; w2: row stride of `data`; h2: presumably the padded height -- TODO confirm
      ubyte *data;    // decoded samples of this component
      void *raw_data; // NOTE(review): not used in the code visible here
      ubyte *linebuf; // NOTE(review): not used in the code visible here
   }

   img_comp_[4] img_comp;

   uint         code_buffer; // jpeg entropy-coded buffer
   int            code_bits;   // number of valid bits
   ubyte          marker;      // marker seen while filling entropy buffer
   int            nomore;      // flag if we saw a marker so must stop

   int scan_n;                  // number of components in the current scan
   int[4] order;                // img_comp index of each scan component, in scan order
   int restart_interval, todo;  // restart interval from DRI; MCUs left before the next restart check
}
427 
428 
// Builds the decoding tables of `h` from the 16 per-length symbol counts of
// a DHT segment.
//
// Params:
//   h     = table to fill (size, code, delta, maxcode and fast are written)
//   count = 16 entries; count[i] is the number of codes of length i+1
// Returns: always 1.
// Throws: STBImageException when the counts describe more than 256 symbols
//         (which would overrun the fixed-size tables) or when the code
//         lengths do not form a valid code.
int build_huffman(huffman *h, int *count)
{
   int i,j,k=0,code;
   // build size list for each symbol (from JPEG spec)
   for (i=0; i < 16; ++i)
      for (j=0; j < count[i]; ++j) {
         // `code` and `values` hold 256 entries and `size` needs one more
         // slot for its terminator, so at most 256 symbols are acceptable
         if (k >= 256)
            throw new STBImageException("Bad code lengths, corrupt JPEG");
         h.size[k++] = cast(ubyte) (i+1);
      }
   h.size[k] = 0;

   // compute actual symbols (from jpeg spec)
   code = 0;
   k = 0;
   for(j=1; j <= 16; ++j) {
      // compute delta to add to code to compute symbol id
      h.delta[j] = k - code;
      if (h.size[k] == j) {
         while (h.size[k] == j)
            h.code[k++] = cast(ushort) (code++);
         if (code-1 >= (1 << j))
             throw new STBImageException("Bad code lengths, corrupt JPEG");
      }
      // compute largest code + 1 for this size, preshifted as needed later
      h.maxcode[j] = code << (16-j);
      code <<= 1;
   }
   // j is 17 here: sentinel that terminates the search loop in decode()
   h.maxcode[j] = 0xffffffff;

   // build non-spec acceleration table; 255 is flag for not-accelerated
   memset(h.fast.ptr, 255, 1 << FAST_BITS);
   for (i=0; i < k; ++i) {
      int s = h.size[i];
      if (s <= FAST_BITS) {
         // every FAST_BITS-wide pattern that starts with this code maps to symbol i
         int c = h.code[i] << (FAST_BITS-s);
         int m = 1 << (FAST_BITS-s);
         for (j=0; j < m; ++j) {
            h.fast[c+j] = cast(ubyte) i;
         }
      }
   }
   return 1;
}
470 
// Refills the entropy bit buffer one byte at a time until it holds more
// than 24 valid bits. A 0xff byte followed by a non-zero byte is a marker:
// it is remembered in j.marker, j.nomore is set and filling stops; once
// nomore is set, zero bytes are fed instead of reading the stream.
void grow_buffer_unsafe(jpeg *j)
{
   while (true) {
      int next = 0;
      if (!j.nomore)
         next = get8(j.s);

      if (next == 0xff) {
         int follow = get8(j.s);
         if (follow != 0) {
            j.marker = cast(ubyte) follow;
            j.nomore = 1;
            return;
         }
      }

      j.code_buffer |= next << (24 - j.code_bits);
      j.code_bits += 8;
      if (j.code_bits > 24)
         break;
   }
}
487 
// bmask[n] == (1 << n) - 1, i.e. a mask of the n lowest bits, for n in 0..16
static immutable uint[17] bmask=[0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535];
490 
// decode a jpeg huffman value from the bitstream
//
// Returns the decoded value (an entry of h.values), or -1 when no valid code
// is found, when the code is longer than the number of valid bits in the
// buffer, or when the computed symbol index falls outside the table.
int decode(jpeg *j, huffman *h)
{
   uint temp;
   int c,k;

   if (j.code_bits < 16) grow_buffer_unsafe(j);

   // look at the top FAST_BITS and determine what symbol ID it is,
   // if the code is <= FAST_BITS
   c = (j.code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
   k = h.fast[c];
   if (k < 255) {
      int s = h.size[k];
      if (s > j.code_bits)
         return -1;
      j.code_buffer <<= s;
      j.code_bits -= s;
      return h.values[k];
   }

   // naive test is to shift the code_buffer down so k bits are
   // valid, then test against maxcode. To speed this up, we've
   // preshifted maxcode left so that it has (16-k) 0s at the
   // end; in other words, regardless of the number of bits, it
   // wants to be compared against something shifted to have 16;
   // that way we don't need to shift inside the loop.
   temp = j.code_buffer >> 16;
   for (k=FAST_BITS+1 ; ; ++k)
      if (temp < h.maxcode[k])
         break;
   if (k == 17) {
      // error! code not found
      j.code_bits -= 16;
      return -1;
   }

   if (k > j.code_bits)
      return -1;

   // convert the huffman code to the symbol id
   c = ((j.code_buffer >> (32 - k)) & bmask[k]) + h.delta[k];
   // a corrupt table can yield an index outside the 256-entry tables;
   // report it as a bad code instead of indexing out of range
   if (c < 0 || c >= 256)
      return -1;
   assert((((j.code_buffer) >> (32 - h.size[c])) & bmask[h.size[c]]) == h.code[c]);

   // convert the id to a symbol
   j.code_bits -= k;
   j.code_buffer <<= k;
   return h.values[c];
}
540 
// Combined JPEG 'receive' and 'extend' (baseline always extends what it
// receives): takes the next n bits from the entropy buffer and returns them
// sign-extended, i.e. values below 1 << (n-1) are mapped to negative numbers.
int extend_receive(jpeg *j, int n)
{
   uint signThreshold = 1 << (n-1);
   if (j.code_bits < n)
      grow_buffer_unsafe(j);

   // rotating brings the top n bits of the buffer down to the low n bits
   uint rotated = stbi_lrot(j.code_buffer, n);
   uint lowBits = bmask[n];
   j.code_buffer = rotated & ~lowBits;
   j.code_bits -= n;
   uint value = rotated & lowBits;

   if (value >= signThreshold)
      return value;
   return (-1 << n) + value + 1;
}
562 
// given a value that's at position X in the zigzag stream,
// where does it appear in the 8x8 matrix coded as row-major?
// The table has 15 extra entries so that an index pushed past 63 by corrupt
// input still lands inside the table (and maps to the last coefficient).
static immutable ubyte[64+15] dezigzag =
[
    0,  1,  8, 16,  9,  2,  3, 10,
   17, 24, 32, 25, 18, 11,  4,  5,
   12, 19, 26, 33, 40, 48, 41, 34,
   27, 20, 13,  6,  7, 14, 21, 28,
   35, 42, 49, 56, 57, 50, 43, 36,
   29, 22, 15, 23, 30, 37, 44, 51,
   58, 59, 52, 45, 38, 31, 39, 46,
   53, 60, 61, 54, 47, 55, 62, 63,
   // let corrupt input sample past end
   63, 63, 63, 63, 63, 63, 63, 63,
   63, 63, 63, 63, 63, 63, 63
];
579 
// decode one 64-entry block--
//
// Params:
//   j    = decoder state (entropy buffer and DC predictors are updated)
//   data = receives the 64 coefficients in row-major (de-zigzagged) order.
//          Taken by reference: a static array is a value type in D, so a
//          by-value parameter would be filled and then discarded, leaving
//          the caller's block untouched.
//   hdc  = huffman table for the DC coefficient
//   hac  = huffman table for the AC coefficients
//   b    = index of the component whose DC predictor is used
// Returns: always 1.
// Throws: STBImageException on an undecodable huffman code.
int decode_block(jpeg *j, ref short[64] data, huffman *hdc, huffman *hac, int b)
{
   int t = decode(j, hdc);
   // t is used as a bit count below; anything above 16 would index past the
   // end of bmask, so treat it as a corrupt code
   if (t < 0 || t > 16)
       throw new STBImageException("Bad huffman code, corrupt JPEG");

   // clear all coefficients up front; only non-zero AC values are written
   data[] = 0;

   int diff = t ? extend_receive(j, t) : 0;
   int dc = j.img_comp[b].dc_pred + diff;
   j.img_comp[b].dc_pred = dc;
   data[0] = cast(short) dc;

   // decode AC components, see JPEG spec
   int k = 1;
   do {
      int rs = decode(j, hac);
      if (rs < 0)
         throw new STBImageException("Bad huffman code, corrupt JPEG");
      int s = rs & 15;   // size in bits of the coefficient
      int r = rs >> 4;   // run of zero coefficients before it
      if (s == 0) {
         if (rs != 0xf0) break; // end block
         k += 16;
      } else {
         k += r;
         // decode into unzigzag'd location
         data[dezigzag[k++]] = cast(short) extend_receive(j,s);
      }
   } while (k < 64);
   return 1;
}
616 
// Saturates x to the range 0..255 and returns it as a byte:
// negative values become 0, values above 255 become 255.
ubyte clamp(int x)
{
   if (x < 0)
      return 0;
   if (x > 255)
      return 255;
   return cast(ubyte) x;
}
627 
// Converts a floating-point constant to fixed point with 12 fractional
// bits: scales by 4096, adds 0.5 and truncates to int.
int f2f(double x)
{
    return cast(int)(0.5 + 4096 * x);
}
632 
// Scales an integer to the same 12-fractional-bit fixed point used by f2f
// (multiplication by 4096, identical to a left shift by 12).
int fsh(int x)
{
    return x * 4096;
}
637 
// derived from jidctint -- DCT_ISLOW
//
// One-dimensional 8-point inverse DCT in 12-bit fixed point.
// Inputs s0..s7 are the eight coefficients. Outputs: x0..x3 receive the
// even-part sums and t0..t3 the odd-part terms; the caller combines them as
// x0+-t3, x1+-t2, x2+-t1, x3+-t0 to obtain the eight samples.
void IDCT_1D(int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7,
             out int t0, out int t1, out int t2, out int t3,
             out int x0, out int x1, out int x2, out int x3)
{
   // even part: coefficients 0, 2, 4, 6
   int evenRot = (s2 + s6) * f2f(0.5411961f);
   int even2 = evenRot + s6*f2f(-1.847759065f);
   int even3 = evenRot + s2*f2f( 0.765366865f);
   int even0 = fsh(s0 + s4);
   int even1 = fsh(s0 - s4);
   x0 = even0 + even3;
   x3 = even0 - even3;
   x1 = even1 + even2;
   x2 = even1 - even2;

   // odd part: coefficients 7, 5, 3, 1
   int sum73 = s7 + s3;
   int sum51 = s5 + s1;
   int sum71 = s7 + s1;
   int sum53 = s5 + s3;
   int oddRot = (sum73 + sum51)*f2f( 1.175875602f);
   sum71 = oddRot + sum71*f2f(-0.899976223f);
   sum53 = oddRot + sum53*f2f(-2.562915447f);
   sum73 = sum73*f2f(-1.961570560f);
   sum51 = sum51*f2f(-0.390180644f);
   t3 = s1*f2f( 1.501321110f) + (sum71 + sum51);
   t2 = s3*f2f( 3.072711026f) + (sum53 + sum73);
   t1 = s5*f2f( 2.053119869f) + (sum53 + sum51);
   t0 = s7*f2f( 0.298631336f) + (sum71 + sum73);
}
679 
// element type of a dequantization table as passed to idct_block
alias stbi_dequantize_t = ubyte;
681 
// .344 seconds on 3*anemones.jpg
//
// Dequantizes one 8x8 coefficient block, applies the 2D inverse DCT (columns
// first, then rows) and writes the resulting 8x8 samples, clamped to 0..255.
//
// Params:
//   out_       = destination of the top-left sample
//   out_stride = distance in bytes between consecutive output rows
//   data       = 64 coefficients in row-major order (only read)
//   dequantize = 64 dequantization factors in the same order
void idct_block(ubyte *out_, int out_stride, short[64] data, stbi_dequantize_t *dequantize)
{
   int i;
   int[64] val;
   int*v = val.ptr;
   stbi_dequantize_t *dq = dequantize;
   ubyte *o;
   short *d = data.ptr;

   // columns
   for (i=0; i < 8; ++i,++d,++dq, ++v) {
      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
           && d[40]==0 && d[48]==0 && d[56]==0) {
         //    no shortcut                 0     seconds
         //    (1|2|3|4|5|6|7)==0          0     seconds
         //    all separate               -0.047 seconds
         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
         int dcterm = d[0] * dq[0] << 2;
         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
      } else {
         int t0, t1, t2, t3, x0, x1, x2, x3;
         IDCT_1D(d[ 0]*dq[ 0],d[ 8]*dq[ 8],d[16]*dq[16],d[24]*dq[24],
                 d[32]*dq[32],d[40]*dq[40],d[48]*dq[48],d[56]*dq[56],
                 t0, t1, t2, t3, x0, x1, x2, x3);
         // constants scaled things up by 1<<12; let's bring them back
         // down, but keep 2 extra bits of precision
         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
         v[ 0] = (x0+t3) >> 10;
         v[56] = (x0-t3) >> 10;
         v[ 8] = (x1+t2) >> 10;
         v[48] = (x1-t2) >> 10;
         v[16] = (x2+t1) >> 10;
         v[40] = (x2-t1) >> 10;
         v[24] = (x3+t0) >> 10;
         v[32] = (x3-t0) >> 10;
      }
   }

   // rows
   for (i=0, v=val.ptr, o=out_; i < 8; ++i,v+=8,o+=out_stride) {

      // no fast case since the first 1D IDCT spread components out
      int t0, t1, t2, t3, x0, x1, x2, x3;
      IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7], t0, t1, t2, t3, x0, x1, x2, x3);
      // constants scaled things up by 1<<12, plus we had 1<<2 from first
      // loop, plus horizontal and vertical each scale by sqrt(8) so together
      // we've got an extra 1<<3, so 1<<17 total we need to remove.
      // so we want to round that, which means adding 0.5 * 1<<17,
      // aka 65536. Also, we'll end up with -128 to 127 that we want
      // to encode as 0..255 by adding 128, so we'll add that before the shift
      x0 += 65536 + (128<<17);
      x1 += 65536 + (128<<17);
      x2 += 65536 + (128<<17);
      x3 += 65536 + (128<<17);
      // tried computing the shifts into temps, or'ing the temps to see
      // if any were out of range, but that was slower
      o[0] = clamp((x0+t3) >> 17);
      o[7] = clamp((x0-t3) >> 17);
      o[1] = clamp((x1+t2) >> 17);
      o[6] = clamp((x1-t2) >> 17);
      o[2] = clamp((x2+t1) >> 17);
      o[5] = clamp((x2-t1) >> 17);
      o[3] = clamp((x3+t0) >> 17);
      o[4] = clamp((x3-t0) >> 17);
   }
}
749 
750 
// "no marker" sentinel; 0xff is never a valid marker value
enum MARKER_none = 0xff;
752 
// Returns the next marker. A marker already captured while filling the
// entropy buffer is returned first (and cleared). Otherwise one is read from
// the stream: if the next byte is not 0xff there is no marker and
// MARKER_none (0xff, never a valid marker value) is returned; any run of
// 0xff bytes is skipped and the byte following it is the marker.
ubyte get_marker(jpeg *j)
{
   if (j.marker != MARKER_none) {
      ubyte pending = j.marker;
      j.marker = MARKER_none;
      return pending;
   }

   ubyte current = get8u(j.s);
   if (current != 0xff)
      return MARKER_none;

   do {
      current = get8u(j.s);
   } while (current == 0xff);
   return current;
}
766 
767 // in each scan, we'll have scan_n components, and the order
768 // of the components is specified by order[]
// True when x lies in the restart marker range 0xd0..0xd7 (inclusive).
bool RESTART(int x)
{
    return !(x < 0xd0 || x > 0xd7);
}
773 
// After a restart interval (and at the start of a scan): clears the entropy
// decoder state, the pending marker and the DC predictors of the first
// three components, and reloads the MCU countdown.
void reset(jpeg *j)
{
   // entropy decoder
   j.code_buffer = 0;
   j.code_bits = 0;
   j.nomore = 0;
   j.marker = MARKER_none;

   // DC prediction
   foreach (ref component; j.img_comp[0 .. 3])
      component.dc_pred = 0;

   // without a restart interval the countdown is effectively unlimited:
   // no more than 1<<31 MCUs is plenty safe, since we don't even allow
   // 1<<30 pixels
   if (j.restart_interval)
      j.todo = j.restart_interval;
   else
      j.todo = 0x7fffffff;
}
787 
// Decodes the entropy-coded data of the current scan into the component
// planes, block by block (huffman decode followed by the inverse DCT).
//
// Returns 1 when the scan was processed, including the case where decoding
// stops early because the marker found at a restart point is not a restart
// marker; returns 0 if decode_block reports failure.
// Throws: whatever decode_block throws on corrupt data.
int parse_entropy_coded_data(jpeg *z)
{
   reset(z);
   if (z.scan_n == 1) {
      int i,j;
      short[64] data;
      int n = z.order[0];
      auto comp = &z.img_comp[n];
      // non-interleaved data, we just need to process one block at a time,
      // in trivial scanline order
      // number of blocks to do just depends on how many actual "pixels" this
      // component has, independent of interleaved MCU blocking and such
      int w = (comp.x+7) >> 3;
      int h = (comp.y+7) >> 3;
      for (j=0; j < h; ++j) {
         for (i=0; i < w; ++i) {
            if (!decode_block(z, data, z.huff_dc.ptr+comp.hd, z.huff_ac.ptr+comp.ha, n)) return 0;
            idct_block(comp.data+comp.w2*j*8+i*8, comp.w2, data, z.dequant[comp.tq].ptr);
            // every data block is an MCU, so countdown the restart interval
            if (--z.todo <= 0) {
               if (z.code_bits < 24) grow_buffer_unsafe(z);
               // if it's NOT a restart, then just bail, so we get corrupt data
               // rather than no data
               if (!RESTART(z.marker)) return 1;
               reset(z);
            }
         }
      }
   } else { // interleaved!
      int i,j,k,x,y;
      short[64] data;
      for (j=0; j < z.img_mcu_y; ++j) {
         for (i=0; i < z.img_mcu_x; ++i) {
            // scan an interleaved mcu... process scan_n components in order
            for (k=0; k < z.scan_n; ++k) {
               int n = z.order[k];
               auto comp = &z.img_comp[n];
               // scan out an mcu's worth of this component; that's just determined
               // by the basic H and V specified for the component
               for (y=0; y < comp.v; ++y) {
                  for (x=0; x < comp.h; ++x) {
                     int x2 = (i*comp.h + x)*8;
                     int y2 = (j*comp.v + y)*8;
                     if (!decode_block(z, data, z.huff_dc.ptr+comp.hd, z.huff_ac.ptr+comp.ha, n)) return 0;
                     idct_block(comp.data+comp.w2*y2+x2, comp.w2, data, z.dequant[comp.tq].ptr);
                  }
               }
            }
            // after all interleaved components, that's an interleaved MCU,
            // so now count down the restart interval
            if (--z.todo <= 0) {
               if (z.code_bits < 24) grow_buffer_unsafe(z);
               // if it's NOT a restart, then just bail, so we get corrupt data
               // rather than no data
               if (!RESTART(z.marker)) return 1;
               reset(z);
            }
         }
      }
   }
   return 1;
}
848 
// Handles one marker segment that may appear between the JPEG headers.
//
// Supported: DRI (restart interval), DQT (quantization tables), DHT
// (huffman tables); APPn and COM segments are skipped.
//
// Params:
//   z = decoder state; tables and restart_interval are updated
//   m = marker byte to process
// Returns: 1 when the segment was consumed; 0 for an unhandled marker or
//          when a DQT/DHT segment length does not match its contents.
// Throws: STBImageException for a missing marker, progressive JPEG, and
//         corrupt DRI/DQT/DHT/APP/COM segments.
int process_marker(jpeg *z, int m)
{
   int L;
   switch (m) {

      case MARKER_none: // no marker found
         throw new STBImageException("Expected marker, corrupt JPEG");

      case 0xC2: // SOF - progressive
          throw new STBImageException("JPEG format not supported (progressive)");

      case 0xDD: // DRI - specify restart interval
         if (get16(z.s) != 4)
             throw new STBImageException("Bad DRI len, corrupt JPEG");
         z.restart_interval = get16(z.s);
         return 1;

      case 0xDB: // DQT - define quantization table
         L = get16(z.s)-2;
         while (L > 0) {
            int q = get8(z.s);
            int p = q >> 4;      // precision; only 0 (8-bit entries) is accepted
            int t = q & 15,i;    // destination table index
            if (p != 0)
               throw new STBImageException("Bad DQT type, corrupt JPEG");
            if (t > 3)
               throw new STBImageException("Bad DQT table, corrupt JPEG");
            for (i=0; i < 64; ++i)
               z.dequant[t][dezigzag[i]] = get8u(z.s);
            L -= 65;
         }
         return L==0;

      case 0xC4: // DHT - define huffman table
         L = get16(z.s)-2;
         while (L > 0) {
            ubyte *v;
            int[16] sizes;
            int i;
            int m_ = 0;          // total number of symbols in this table
            int q = get8(z.s);
            int tc = q >> 4;     // table class: 0 = DC, 1 = AC
            int th = q & 15;     // destination table index
            if (tc > 1 || th > 3)
                throw new STBImageException("Bad DHT header, corrupt JPEG");
            for (i=0; i < 16; ++i) {
               sizes[i] = get8(z.s);
               m_ += sizes[i];
            }
            // the symbol values are stored through a raw pointer into a
            // 256-entry array below, so a larger total must be rejected here
            if (m_ > 256)
                throw new STBImageException("Bad DHT header, corrupt JPEG");
            L -= 17;
            if (tc == 0) {
               if (!build_huffman(z.huff_dc.ptr+th, sizes.ptr)) return 0;
               v = z.huff_dc[th].values.ptr;
            } else {
               if (!build_huffman(z.huff_ac.ptr+th, sizes.ptr)) return 0;
               v = z.huff_ac[th].values.ptr;
            }
            for (i=0; i < m_; ++i)
               v[i] = get8u(z.s);
            L -= m_;
         }
         return L==0;

      default:
         break;
   }
   // check for comment block or APP blocks
   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
      L = get16(z.s);
      // the length includes its own two bytes; anything smaller would make
      // the skip below move backwards in the stream
      if (L < 2)
         throw new STBImageException("Bad APP/COM len, corrupt JPEG");
      skip(z.s, L-2);
      return 1;
   }
   return 0;
}
922 
// Parses the scan header that follows an SOS marker: reads the component
// count of the scan, maps every scan component to its frame component and
// records the DC/AC huffman table indices it uses.
//
// Returns 1 on success, 0 when a scan component id matches no frame component.
// Throws: STBImageException for an invalid component count, segment length,
//         table index or trailing byte.
int process_scan_header(jpeg *z)
{
   stbi *s = z.s;
   int Ls = get16(s);
   z.scan_n = get8(s);

   bool countOk = z.scan_n >= 1 && z.scan_n <= 4 && z.scan_n <= cast(int) s.img_n;
   if (!countOk)
      throw new STBImageException("Bad SOS component count, Corrupt JPEG");

   if (Ls != 6+2*z.scan_n)
      throw new STBImageException("Bad SOS length, Corrupt JPEG");

   foreach (i; 0 .. z.scan_n) {
      int id = get8(s);
      int q = get8(s);

      // locate the frame component carrying this id
      int which = 0;
      while (which < s.img_n && z.img_comp[which].id != id)
         ++which;
      if (which == s.img_n)
         return 0;

      // high nibble: DC table index, low nibble: AC table index
      z.img_comp[which].hd = q >> 4;
      if (z.img_comp[which].hd > 3)
         throw new STBImageException("Bad DC huff, Corrupt JPEG");
      z.img_comp[which].ha = q & 15;
      if (z.img_comp[which].ha > 3)
         throw new STBImageException("Bad AC huff, Corrupt JPEG");
      z.order[i] = which;
   }

   if (get8(s) != 0)
      throw new STBImageException("Bad SOS, Corrupt JPEG");
   get8(s); // should be 63, but might be 0
   if (get8(s) != 0)
      throw new STBImageException("Bad SOS, Corrupt JPEG");

   return 1;
}
958 
// Parses a SOF frame header: sample precision, image size and, for each
// component, its id, sampling factors and quantization table selector.
//
// When scan is SCAN_load it also computes the interleaved MCU layout and
// allocates one decode buffer per component; for any other scan value it
// returns right after the header fields have been validated.
//
// Returns 1 on success. Unsupported or corrupt headers, oversized images and
// allocation failure throw STBImageException.
int process_frame_header(jpeg *z, int scan)
{
   stbi *s = z.s;

   int Lf = get16(s);
   if (Lf < 11)
      throw new STBImageException("Bad SOF len, Corrupt JPEG");

   int precision = get8(s);
   if (precision != 8)
      throw new STBImageException("JPEG format not supported: 8-bit only"); // JPEG baseline

   s.img_y = get16(s);
   if (s.img_y == 0)
      throw new STBImageException("No header height, JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG

   s.img_x = get16(s);
   if (s.img_x == 0)
      throw new STBImageException("0 width, corrupt JPEG"); // JPEG requires

   int componentCount = get8(s);
   if (componentCount != 3 && componentCount != 1)
      throw new STBImageException("Bad component count, corrupt JPEG");    // JFIF requires
   s.img_n = componentCount;

   // start from a clean slate so cleanup never frees stale pointers
   for (int i = 0; i < componentCount; ++i) {
      z.img_comp[i].data = null;
      z.img_comp[i].linebuf = null;
   }

   if (Lf != 8+3*s.img_n)
      throw new STBImageException("Bad SOF len, corrupt JPEG");

   for (int i = 0; i < s.img_n; ++i) {
      auto component = &z.img_comp[i];

      component.id = get8(s);
      // JFIF requires id == i+1, but some version of jpegtran outputs
      // non-JFIF-compliant files with id == i
      if (component.id != i+1 && component.id != i)
         throw new STBImageException("Bad component ID, corrupt JPEG");

      int sampling = get8(s);
      component.h = (sampling >> 4);
      if (!component.h || component.h > 4)
         throw new STBImageException("Bad H, corrupt JPEG");
      component.v = sampling & 15;
      if (!component.v || component.v > 4)
         throw new STBImageException("Bad V, corrupt JPEG");

      component.tq = get8(s);
      if (component.tq > 3)
         throw new STBImageException("Bad TQ, corrupt JPEG");
   }

   if (scan != SCAN_load) return 1;

   if ((1 << 30) / s.img_x / s.img_n < s.img_y)
      throw new STBImageException("Image too large to decode");

   // largest sampling factor in each direction
   int h_max = 1;
   int v_max = 1;
   for (int i = 0; i < s.img_n; ++i) {
      if (z.img_comp[i].h > h_max) h_max = z.img_comp[i].h;
      if (z.img_comp[i].v > v_max) v_max = z.img_comp[i].v;
   }

   // compute interleaved mcu info
   z.img_h_max = h_max;
   z.img_v_max = v_max;
   z.img_mcu_w = h_max * 8;
   z.img_mcu_h = v_max * 8;
   z.img_mcu_x = (s.img_x + z.img_mcu_w-1) / z.img_mcu_w;
   z.img_mcu_y = (s.img_y + z.img_mcu_h-1) / z.img_mcu_h;

   for (int i = 0; i < s.img_n; ++i) {
      auto component = &z.img_comp[i];

      // number of effective pixels (e.g. for non-interleaved MCU)
      component.x = (s.img_x * component.h + h_max-1) / h_max;
      component.y = (s.img_y * component.v + v_max-1) / v_max;

      // Allocate room for the oversized data produced by decoding whole
      // interleaved MCUs (e.g. a 16x16 iMCU on an image of width 33); the
      // surplus is only discarded during colorspace conversion.
      component.w2 = z.img_mcu_x * component.h * 8;
      component.h2 = z.img_mcu_y * component.v * 8;
      component.raw_data = malloc(component.w2 * component.h2+15);
      if (component.raw_data == null) {
         // release the buffers of the components allocated so far
         for (int k = i - 1; k >= 0; --k) {
            free(z.img_comp[k].raw_data);
            z.img_comp[k].data = null;
         }
         throw new STBImageException("Out of memory");
      }
      // align blocks for installable-idct using mmx/sse
      component.data = cast(ubyte*) (( cast(size_t) component.raw_data + 15) & ~15);
      component.linebuf = null;
   }

   return 1;
}
1030 
// Marker classification predicates. They are comparisons rather than plain
// constants because one predicate may accept several marker codes (e.g. SOF).

// True for the DNL marker (0xDC).
bool DNL(int x)
{
   return x == 0xDC;
}

// True for the SOI (start of image) marker (0xD8).
bool SOI(int x)
{
   return x == 0xD8;
}

// True for the EOI (end of image) marker (0xD9).
bool EOI(int x)
{
   return x == 0xD9;
}

// True for either accepted start-of-frame marker (0xC0 or 0xC1).
bool SOF(int x)
{
   switch (x) {
      case 0xC0:
      case 0xC1:
         return true;
      default:
         return false;
   }
}

// True for the SOS (start of scan) marker (0xDA).
bool SOS(int x)
{
   return x == 0xDA;
}
1037 
// Reads the JPEG stream up to and including the frame header.
//
// With scan == SCAN_type only the SOI marker is checked. Otherwise every
// marker segment before the SOF marker is handed to process_marker and the
// frame header is then parsed by process_frame_header.
//
// Returns 1 on success, 0 when process_marker or process_frame_header report
// failure by return value. Throws STBImageException when SOI is missing or the
// input ends before a SOF marker is found.
int decode_jpeg_header(jpeg *z, int scan)
{
   z.marker = MARKER_none; // initialize cached marker to empty

   if (!SOI(get_marker(z)))
      throw new STBImageException("No SOI, corrupt JPEG");
   if (scan == SCAN_type) return 1;

   for (int m = get_marker(z); !SOF(m); ) {
      if (!process_marker(z, m)) return 0;

      m = get_marker(z);
      // some files have extra padding after their blocks, so keep scanning
      // until a marker turns up or the input runs out
      while (m == MARKER_none) {
         if (at_eof(z.s))
            throw new STBImageException("No SOF, corrupt JPEG");
         m = get_marker(z);
      }
   }

   return process_frame_header(z, scan) ? 1 : 0;
}
1064 
// Decodes the whole JPEG: the header first, then every segment up to EOI.
// SOS segments are followed by their entropy-coded data; all other markers go
// to process_marker.
//
// Returns 1 once EOI is reached and 0 as soon as any stage reports failure by
// return value.
int decode_jpeg_image(jpeg *j)
{
   j.restart_interval = 0;
   if (!decode_jpeg_header(j, SCAN_load)) return 0;

   for (int m = get_marker(j); !EOI(m); m = get_marker(j)) {
      if (!SOS(m)) {
         if (!process_marker(j, m)) return 0;
         continue;
      }

      if (!process_scan_header(j)) return 0;
      if (!parse_entropy_coded_data(j)) return 0;
      if (j.marker != MARKER_none) continue;

      // handle 0s at the end of image data from IP Kamera 9060: skip zero
      // bytes until a 0xFF introduces the next marker
      while (!at_eof(j.s)) {
         int b = get8(j.s);
         if (b == 255) {
            j.marker = get8u(j.s);
            break;
         }
         if (b != 0) return 0;
      }
      // if we reach eof without hitting a marker, the get_marker() in the loop
      // header will fail and we'll eventually return 0
   }
   return 1;
}
1095 
// static jfif-centered resampling (across block boundaries)

// Common signature of the row resamplers below. A resampler receives an
// output buffer, two input rows, the number of input samples w and the
// horizontal expansion factor hs, and returns a pointer to the row to use
// (the output buffer, or an input row when nothing has to be computed).
alias resample_row_func = ubyte* function(ubyte *out_, ubyte *in0, ubyte *in1, int w, int hs);
1099 
// Divides x by four with a right shift and truncates the result to a byte.
ubyte div4(int x)
{
    int quarter = x >> 2;
    return cast(ubyte) quarter;
}
1104 
// Resampler for components that need no expansion: hands back the near input
// row untouched. out_, in_far, w and hs are ignored.
ubyte* resample_row_1(ubyte* out_, ubyte* in_near, ubyte* in_far, int w, int hs)
{
   ubyte* unchanged = in_near;
   return unchanged;
}
1109 
// Vertical 2x resampler: generates one of the two output rows for an input
// row by blending each sample 3:1 between the near and the far row (rounded).
// Writes w samples to out_ and returns out_; hs is ignored.
ubyte* resample_row_v_2(ubyte* out_, ubyte* in_near, ubyte* in_far, int w, int hs)
{
   foreach (col; 0 .. w)
      out_[col] = div4(3*in_near[col] + in_far[col] + 2);
   return out_;
}
1118 
// Horizontal 2x resampler: produces two output samples for every input
// sample of in_near, blending each sample 3:1 with its left or right
// neighbour. The two outermost output samples are copies of the first and
// last input samples. Writes 2*w samples to out_ and returns out_; in_far and
// hs are ignored.
ubyte* resample_row_h_2(ubyte* out_, ubyte* in_near, ubyte* in_far, int w, int hs)
{
   ubyte* src = in_near;

   if (w == 1) {
      // if only one sample, can't do any interpolation
      out_[1] = src[0];
      out_[0] = out_[1];
      return out_;
   }

   // left edge
   out_[0] = src[0];
   out_[1] = div4(src[0]*3 + src[1] + 2);

   // interior samples have a neighbour on both sides
   int i = 1;
   while (i < w-1) {
      int weighted = 3*src[i]+2;
      out_[i*2+0] = div4(weighted+src[i-1]);
      out_[i*2+1] = div4(weighted+src[i+1]);
      ++i;
   }

   // right edge
   out_[i*2+0] = div4(src[w-2]*3 + src[w-1] + 2);
   out_[i*2+1] = src[w-1];

   return out_;
}
1143 
// Divides x by sixteen with a right shift and truncates the result to a byte.
ubyte div16(int x)
{
    int sixteenth = x >> 4;
    return cast(ubyte) sixteenth;
}
1148 
1149 
// Combined 2x2 resampler: every input column is first blended 3:1 between the
// near and far rows, then neighbouring columns are blended 3:1 horizontally.
// Writes 2*w samples to out_ and returns out_; hs is ignored.
ubyte* resample_row_hv_2(ubyte* out_, ubyte* in_near, ubyte* in_far, int w, int hs)
{
   if (w == 1) {
      // a single column: only the vertical blend is possible
      out_[1] = div4(3*in_near[0] + in_far[0] + 2);
      out_[0] = out_[1];
      return out_;
   }

   // vertical blend of the current column (scaled by 4)
   int current = 3*in_near[0] + in_far[0];
   out_[0] = div4(current+2);

   foreach (i; 1 .. w) {
      int previous = current;
      current = 3*in_near[i]+in_far[i];
      out_[i*2-1] = div16(3*previous + current + 8);
      out_[i*2  ] = div16(3*current + previous + 8);
   }
   out_[w*2-1] = div4(current+2);

   return out_;
}
1171 
// Fallback resampler: nearest-neighbour expansion that repeats every sample
// of in_near hs times. Writes w*hs samples to out_ and returns out_; in_far is
// ignored.
ubyte* resample_row_generic(ubyte* out_, ubyte* in_near, ubyte* in_far, int w, int hs)
{
   foreach (col; 0 .. w) {
      ubyte* run = out_ + col*hs;
      foreach (rep; 0 .. hs)
         run[rep] = in_near[col];
   }
   return out_;
}
1182 
// Converts x to 16.16 fixed point: scales by 65536, adds 0.5 and truncates.
int float2fixed(double x)
{
    enum double FIXED_ONE = 65536;
    return cast(int)(x * FIXED_ONE + 0.5);
}
1187 
// Converts one row of YCbCr samples to RGB using 16.16 fixed-point arithmetic.
//
// For each of the count pixels, the clamped R, G and B bytes plus a constant
// 255 are written to out_[0..4]; out_ then advances by step bytes. With
// step == 3 the fourth byte lands on the next pixel's first byte and is
// overwritten by it (for the last pixel it lands one byte past the row).
void YCbCr_to_RGB_row(ubyte *out_, const ubyte *y, const ubyte *pcb, const ubyte *pcr, int count, int step)
{
   // conversion coefficients in 16.16 fixed point
   const int crToR = float2fixed(1.40200f);
   const int crToG = float2fixed(0.71414f);
   const int cbToG = float2fixed(0.34414f);
   const int cbToB = float2fixed(1.77200f);

   ubyte* pixel = out_;
   foreach (i; 0 .. count) {
      int luma = (y[i] << 16) + 32768; // rounding
      int cr = pcr[i] - 128;
      int cb = pcb[i] - 128;

      int r = (luma + cr*crToR) >> 16;
      int g = (luma - cr*crToG - cb*cbToG) >> 16;
      int b = (luma + cb*cbToB) >> 16;

      // clamp to the byte range
      r = r < 0 ? 0 : (r > 255 ? 255 : r);
      g = g < 0 ? 0 : (g > 255 ? 255 : g);
      b = b < 0 ? 0 : (b > 255 ? 255 : b);

      pixel[0] = cast(ubyte)r;
      pixel[1] = cast(ubyte)g;
      pixel[2] = cast(ubyte)b;
      pixel[3] = 255;
      pixel += step;
   }
}
1214 
// Releases the temporary per-component buffers: the decode buffer (freed via
// raw_data whenever data is set) and the resampling line buffer. The data and
// linebuf pointers are nulled afterwards, so calling this twice is harmless.
void cleanup_jpeg(jpeg *j)
{
   for (int i = 0; i < j.s.img_n; ++i) {
      auto component = &j.img_comp[i];

      if (component.data) {
         free(component.raw_data);
         component.data = null;
      }
      if (component.linebuf) {
         free(component.linebuf);
         component.linebuf = null;
      }
   }
}
1230 
// Per-component upsampling state used by load_jpeg_image while it expands a
// component to full image resolution one output row at a time.
struct stbi_resample
{
   resample_row_func resample; // row resampler chosen from the hs/vs pair
   ubyte* line0;               // previous pre-expansion row (takes line1's value when ypos advances)
   ubyte* line1;               // current pre-expansion row
   int hs,vs;   // expansion factor in each axis
   int w_lores; // horizontal pixels pre-expansion
   int ystep;   // how far through vertical expansion we are
   int ypos;    // which pre-expansion row we're on
} ;
1241 
// Decodes a complete JPEG from z.s and returns a malloc'ed buffer of
// interleaved 8-bit pixels, or null when decode_jpeg_image reports failure by
// return value. The caller owns the returned buffer.
//
// Params:
//   z        = decoder state; z.s must already point at the input
//   out_x    = receives the image width
//   out_y    = receives the image height
//   comp     = if non-null, receives the component count stored in the file
//   req_comp = number of components per output pixel (1..4), or 0 to use the
//              component count of the file
//
// Throws STBImageException for a bad req_comp, for errors raised by the
// decoding stages and on allocation failure. The temporary component buffers
// are released on every exit path, including when an exception propagates.
ubyte *load_jpeg_image(jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
{
   // validate req_comp
   if (req_comp < 0 || req_comp > 4)
       throw new STBImageException("Internal error: bad req_comp");
   z.s.img_n = 0;

   // The decoding stages report most errors by throwing, which would skip the
   // explicit cleanup below and leak the component buffers. cleanup_jpeg is
   // a no-op while img_n is 0 and nulls what it frees, so running it on any
   // exceptional exit is safe.
   scope(failure) cleanup_jpeg(z);

   // load a jpeg image from whichever source
   if (!decode_jpeg_image(z)) { cleanup_jpeg(z); return null; }

   // determine actual number of components to generate
   int n = req_comp ? req_comp : z.s.img_n;

   // a 3-component image reduced to grey only needs its first component
   int decode_n;
   if (z.s.img_n == 3 && n < 3)
      decode_n = 1;
   else
      decode_n = z.s.img_n;

   // resample and color-convert
   {
      int k;
      uint i,j;
      ubyte *output;
      ubyte*[4] coutput;

      stbi_resample[4] res_comp;

      for (k=0; k < decode_n; ++k) {
         stbi_resample *r = &res_comp[k];

         // allocate line buffer big enough for upsampling off the edges
         // with upsample factor of 4
         z.img_comp[k].linebuf = cast(ubyte*) malloc(z.s.img_x + 3);
         if (!z.img_comp[k].linebuf)
             throw new STBImageException("Out of memory");

         r.hs      = z.img_h_max / z.img_comp[k].h;
         r.vs      = z.img_v_max / z.img_comp[k].v;
         r.ystep   = r.vs >> 1;
         r.w_lores = (z.s.img_x + r.hs-1) / r.hs;
         r.ypos    = 0;
         r.line0   = r.line1 = z.img_comp[k].data;

         if      (r.hs == 1 && r.vs == 1) r.resample = &resample_row_1;
         else if (r.hs == 1 && r.vs == 2) r.resample = &resample_row_v_2;
         else if (r.hs == 2 && r.vs == 1) r.resample = &resample_row_h_2;
         else if (r.hs == 2 && r.vs == 2) r.resample = &resample_row_hv_2;
         else                               r.resample = &resample_row_generic;
      }

      // The extra byte absorbs the alpha byte YCbCr_to_RGB_row writes after
      // the last pixel when n == 3. Nothing after this allocation throws.
      output = cast(ubyte*) malloc(n * z.s.img_x * z.s.img_y + 1);
      if (!output)
         throw new STBImageException("Out of memory");

      // now go ahead and resample
      for (j=0; j < z.s.img_y; ++j) {
         ubyte *out_ = output + n * z.s.img_x * j;
         for (k=0; k < decode_n; ++k) {
            stbi_resample *r = &res_comp[k];
            // in the lower half of an expanded row pair the nearer input row
            // is line1, otherwise line0
            int y_bot = r.ystep >= (r.vs >> 1);
            coutput[k] = r.resample(z.img_comp[k].linebuf,
                                     y_bot ? r.line1 : r.line0,
                                     y_bot ? r.line0 : r.line1,
                                     r.w_lores, r.hs);
            if (++r.ystep >= r.vs) {
               r.ystep = 0;
               r.line0 = r.line1;
               if (++r.ypos < z.img_comp[k].y)
                  r.line1 += z.img_comp[k].w2;
            }
         }
         if (n >= 3) {
            ubyte *y = coutput[0];
            if (z.s.img_n == 3) {
               YCbCr_to_RGB_row(out_, y, coutput[1], coutput[2], z.s.img_x, n);
            } else
               for (i=0; i < z.s.img_x; ++i) {
                  out_[0] = out_[1] = out_[2] = y[i];
                  out_[3] = 255; // not used if n==3
                  out_ += n;
               }
         } else {
            ubyte *y = coutput[0];
            if (n == 1)
               for (i=0; i < z.s.img_x; ++i) out_[i] = y[i];
            else
               for (i=0; i < z.s.img_x; ++i) {
                  *out_++ = y[i];
                  *out_++ = 255;
               }
         }
      }
      cleanup_jpeg(z);
      *out_x = z.s.img_x;
      *out_y = z.s.img_y;
      if (comp) *comp  = z.s.img_n; // report original components, not output
      return output;
   }
}
1342 
// Decodes a JPEG from s with a fresh decoder state; see load_jpeg_image for
// the meaning of x, y, comp and req_comp and for the returned buffer.
ubyte* stbi_jpeg_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   jpeg decoder;
   decoder.s = s;
   return load_jpeg_image(&decoder, x, y, comp, req_comp);
}
1349 
// Checks whether s starts like a JPEG by running the header decoder in
// SCAN_type mode. Throws STBImageException when it does not.
void stbi_jpeg_test(stbi *s)
{
   jpeg probe;
   probe.s = s;
   if (!decode_jpeg_header(&probe, SCAN_type))
       throw new STBImageException("Couldn't decode JPEG header");
}
1358 
1359 
1360 // public domain zlib decode    v0.2  Sean Barrett 2006-11-18
1361 //    simple implementation
1362 //      - all input must be provided in an upfront buffer
1363 //      - all output is written to a single output buffer (can malloc/realloc)
1364 //    performance
1365 //      - fast huffman
1366 
// fast-way is faster to check than jpeg huffman, but slow way is slower
// ZFAST_BITS is the number of low bits of the bit buffer used to index the
// zhuffman.fast lookup table; ZFAST_MASK extracts exactly those bits.
enum ZFAST_BITS = 9; // accelerate all cases in default tables
enum ZFAST_MASK = ((1 << ZFAST_BITS) - 1);
1370 
// zlib-style huffman encoding
// (jpegs packs from left, zlib from right, so can't share code)
// Filled in by zbuild_huffman and read by zhuffman_decode.
struct zhuffman
{
   ushort[1 << ZFAST_BITS] fast; // slot index for codes of at most ZFAST_BITS bits, keyed by the bit-reversed code; 0xffff = not resolved here
   ushort[16] firstcode;         // first code of each code length
   int[17] maxcode;              // per length: one past the last code, preshifted to 16 bits; [16] is a sentinel
   ushort[16] firstsymbol;       // slot index of the first code of each length
   ubyte[288] size;              // code length stored per slot
   ushort[288] value;            // symbol stored per slot
} ;
1382 
// Reverses the order of the low 16 bits of n; higher bits are discarded.
// Works by swapping ever larger groups: single bits, pairs, nibbles, bytes.
int bitreverse16(int n)
{
  int r = n;
  r = ((r >> 1) & 0x5555) | ((r & 0x5555) << 1);
  r = ((r >> 2) & 0x3333) | ((r & 0x3333) << 2);
  r = ((r >> 4) & 0x0F0F) | ((r & 0x0F0F) << 4);
  r = ((r >> 8) & 0x00FF) | ((r & 0x00FF) << 8);
  return r;
}
1391 
// Reverses the lowest `bits` bits of v (bits must not exceed 16).
// The value is reversed as a 16-bit quantity and the surplus low bits are
// shifted away, e.g. for 11 bits: reverse 16 and drop 5.
int bit_reverse(int v, int bits)
{
   assert(bits <= 16);
   int reversed16 = bitreverse16(v);
   int surplus = 16 - bits;
   return reversed16 >> surplus;
}
1399 
// Builds the decoding tables of *z from a list of code lengths, following the
// code assignment procedure of the DEFLATE specification.
//
// Params:
//   z        = table set to fill in
//   sizelist = code length of every symbol; 0 means the symbol is unused
//   num      = number of entries in sizelist
//
// Returns 1 on success. Throws STBImageException when a code length exceeds
// 15 or when the lengths describe more codes than a length can hold.
int zbuild_huffman(zhuffman *z, ubyte *sizelist, int num)
{
   int[16] next_code;
   int[17] sizes;

   // DEFLATE spec for generating codes
   memset(sizes.ptr, 0, sizes.sizeof);
   memset(z.fast.ptr, 255, z.fast.sizeof); // every entry becomes 0xffff = "unresolved"

   // histogram of code lengths; lengths above 15 cannot be represented by the
   // tables and would index past next_code below
   for (int i = 0; i < num; ++i) {
      if (sizelist[i] > 15)
         throw new STBImageException("Bad codelength, corrupt PNG");
      ++sizes[sizelist[i]];
   }
   sizes[0] = 0;

   // Assign the first code of every length. The check inside the loop also
   // rejects sizes[i] > (1 << i), so corrupt input raises an exception
   // instead of tripping an assertion.
   int k = 0;
   int code = 0;
   for (int i = 1; i < 16; ++i) {
      next_code[i] = code;
      z.firstcode[i] = cast(ushort) code;
      z.firstsymbol[i] = cast(ushort) k;
      code = (code + sizes[i]);
      if (sizes[i])
         if (code-1 >= (1 << i))
            throw new STBImageException("Bad codelength, corrupt PNG");
      z.maxcode[i] = code << (16-i); // preshift for inner loop
      code <<= 1;
      k += sizes[i];
   }
   z.maxcode[16] = 0x10000; // sentinel

   // place every used symbol in its slot and fill the fast lookup table
   for (int i = 0; i < num; ++i) {
      int s = sizelist[i];
      if (s) {
         int c = next_code[s] - z.firstcode[s] + z.firstsymbol[s];
         z.size[c] = cast(ubyte)s;
         z.value[c] = cast(ushort)i;
         if (s <= ZFAST_BITS) {
            // the code occupies the low s bits (bit-reversed); replicate the
            // entry for every combination of the remaining index bits
            int k_ = bit_reverse(next_code[s],s);
            while (k_ < (1 << ZFAST_BITS)) {
               z.fast[k_] = cast(ushort) c;
               k_ += (1 << s);
            }
         }
         ++next_code[s];
      }
   }
   return 1;
}
1447 
1448 // zlib-from-memory implementation for PNG reading
1449 //    because PNG allows splitting the zlib stream arbitrarily,
1450 //    and it's annoying structurally to have PNG call ZLIB call PNG,
1451 //    we require PNG read all the IDATs and combine them into a single
1452 //    memory buffer
1453 
// State of one zlib decoding run: input cursor, bit buffer, output buffer and
// the two Huffman tables of the block being decoded.
struct zbuf
{
   const(ubyte) *zbuffer;      // next input byte to read
   const(ubyte) *zbuffer_end;  // one past the last input byte
   int num_bits;               // number of valid bits held in code_buffer
   uint code_buffer;           // bit buffer; bits are consumed from the least significant end

   ubyte *zout;                // next output byte to write
   ubyte *zout_start;          // start of the output buffer
   ubyte *zout_end;            // one past the end of the output buffer
   int   z_expandable;         // nonzero if expand() may realloc the output buffer

   zhuffman z_length, z_distance; // literal/length and distance code tables
} ;
1468 
// Returns the next input byte and advances the cursor, or 0 once the input
// is exhausted.
int zget8(zbuf *z)
{
   if (z.zbuffer < z.zbuffer_end) {
      ubyte next = *z.zbuffer;
      ++z.zbuffer;
      return next;
   }
   return 0;
}
1474 
// Tops up the bit buffer a byte at a time, placing each new byte above the
// bits already held, until more than 24 bits are available.
void fill_bits(zbuf *z)
{
   for (;;) {
      assert(z.code_buffer < (1U << z.num_bits));
      z.code_buffer |= zget8(z) << z.num_bits;
      z.num_bits += 8;
      if (z.num_bits > 24)
         break;
   }
}
1483 
// Removes the n lowest bits from the bit buffer and returns them, refilling
// the buffer first when it holds fewer than n bits.
uint zreceive(zbuf *z, int n)
{
   if (z.num_bits < n)
      fill_bits(z);

   uint mask = (1 << n) - 1;
   uint bits = z.code_buffer & mask;
   z.code_buffer >>= n;
   z.num_bits -= n;
   return bits;
}
1493 
// Decodes one Huffman symbol from the bit buffer of *a using the tables in *z.
//
// Returns the decoded symbol, or -1 when the upcoming bits do not form a valid
// code of the table. Corrupt input is reported through the -1 return instead
// of an assertion, so it is also caught in builds without assertions.
int zhuffman_decode(zbuf *a, zhuffman *z)
{
   if (a.num_bits < 16) fill_bits(a);

   // fast path: codes of at most ZFAST_BITS bits are resolved by table lookup
   int b = z.fast[a.code_buffer & ZFAST_MASK];
   if (b < 0xffff) {
      int len = z.size[b];
      a.code_buffer >>= len;
      a.num_bits -= len;
      return z.value[b];
   }

   // not resolved by fast table, so compute it the slow way
   // use jpeg approach, which requires MSbits at top
   int k = bit_reverse(a.code_buffer, 16);
   int s = ZFAST_BITS+1;
   while (k >= z.maxcode[s]) // terminates: maxcode[16] is a sentinel above any k
      ++s;
   if (s == 16) return -1; // invalid code!

   // code size is s, so:
   b = (k >> (16-s)) - z.firstcode[s] + z.firstsymbol[s];
   if (b < 0 || b >= cast(int) z.size.length) return -1; // slot outside the table
   if (z.size[b] != s) return -1;                        // slot belongs to another code length
   a.code_buffer >>= s;
   a.num_bits -= s;
   return z.value[b];
}
1520 
// Grows the output buffer of *z so that at least n more bytes fit behind the
// current write position. The capacity is doubled until it suffices.
//
// Returns 1 on success. Throws STBImageException when the buffer is not
// expandable, when the required capacity does not fit in an int, or when
// realloc fails (the old buffer stays valid in that case).
int expand(zbuf *z, int n)  // need to make room for n bytes
{
   if (!z.z_expandable)
      throw new STBImageException("Output buffer limit, corrupt PNG");

   // 64-bit arithmetic so that doubling cannot overflow
   long cur    = z.zout     - z.zout_start;
   long limit  = z.zout_end - z.zout_start;
   long needed = cur + n;

   // a zero capacity would never grow by doubling
   if (limit < 1)
      limit = 1;
   while (needed > limit)
      limit *= 2;

   // the rest of the decoder measures the buffer with int
   if (limit > int.max)
      throw new STBImageException("Out of memory");

   ubyte *q = cast(ubyte*) realloc(z.zout_start, cast(size_t) limit);
   if (q == null)
      throw new STBImageException("Out of memory");
   z.zout_start = q;
   z.zout       = q + cast(size_t) cur;
   z.zout_end   = q + cast(size_t) limit;
   return 1;
}
1539 
// DEFLATE length and distance tables, indexed by (length symbol - 257) and by
// distance symbol respectively. The *_base tables give the smallest value of
// a symbol, the *_extra tables the number of extra bits read and added to it.
// The trailing zero entries belong to symbols that must not occur in valid
// data; they only keep out-of-range lookups inside the arrays.
static immutable int[31] length_base = [
   3,4,5,6,7,8,9,10,11,13,
   15,17,19,23,27,31,35,43,51,59,
   67,83,99,115,131,163,195,227,258,0,0 ];

static immutable int[31] length_extra =
[ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 ];

static immutable int[32] dist_base = [ 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0];

static immutable int[32] dist_extra =
[ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,0,0];
1553 
// Decodes one Huffman-compressed DEFLATE block into the output buffer of *a,
// using the tables in a.z_length and a.z_distance, until the end-of-block
// symbol (256) is read.
//
// Returns 1 at end of block, 0 if expand reports failure by return value.
// Throws STBImageException on an invalid or reserved Huffman symbol and on a
// back-reference that reaches before the start of the output.
int parse_huffman_block(zbuf *a)
{
   for(;;) {
      int z = zhuffman_decode(a, &a.z_length);
      if (z < 256) {
         // literal byte
         if (z < 0)
             throw new STBImageException("Bad Huffman code, corrupt PNG");
         if (a.zout >= a.zout_end) if (!expand(a, 1)) return 0;
         *a.zout++ = cast(ubyte) z;
      } else {
         ubyte *p;
         int len,dist;
         if (z == 256) return 1;
         z -= 257;
         // length symbols 286 and 287 must not appear in compressed data
         if (z >= 29)
            throw new STBImageException("Bad Huffman code, corrupt PNG");
         len = length_base[z];
         if (length_extra[z]) len += zreceive(a, length_extra[z]);
         z = zhuffman_decode(a, &a.z_distance);
         // distance symbols 30 and 31 must not appear either; accepting them
         // would yield a distance of 0 and copy from unwritten output
         if (z < 0 || z >= 30)
            throw new STBImageException("Bad Huffman code, corrupt PNG");
         dist = dist_base[z];
         if (dist_extra[z]) dist += zreceive(a, dist_extra[z]);
         if (a.zout - a.zout_start < dist) throw new STBImageException("Bad dist, corrupt PNG");
         if (a.zout + len > a.zout_end) if (!expand(a, len)) return 0;
         // byte-by-byte on purpose: source and destination may overlap
         p = a.zout - dist;
         while (len--)
            *a.zout++ = *p++;
      }
   }
}
1582 
// Reads the header of a dynamic-Huffman DEFLATE block and builds the
// literal/length and distance tables of *a from it.
//
// Returns 1 on success, 0 if zbuild_huffman reports failure by return value.
// Throws STBImageException when the code length data is invalid: an
// undecodable symbol, a "repeat previous" code with nothing before it, or a
// run that extends past the announced number of code lengths.
int compute_huffman_codes(zbuf *a)
{
   static immutable ubyte[19] length_dezigzag = [ 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 ];
   zhuffman z_codelength;
   ubyte[286+32+137] lencodes; //padding for maximum single op
   ubyte[19] codelength_sizes; // zero-initialized; lengths not transmitted stay 0

   int hlit  = zreceive(a,5) + 257;
   int hdist = zreceive(a,5) + 1;
   int hclen = zreceive(a,4) + 4;
   int total = hlit + hdist;

   // lengths of the code-length alphabet arrive in a fixed permuted order
   for (int i = 0; i < hclen; ++i) {
      int s = zreceive(a,3);
      codelength_sizes[length_dezigzag[i]] = cast(ubyte) s;
   }
   if (!zbuild_huffman(&z_codelength, codelength_sizes.ptr, 19)) return 0;

   int n = 0;
   while (n < total) {
      int c = zhuffman_decode(a, &z_codelength);
      if (c < 0 || c >= 19)
         throw new STBImageException("Bad codelengths, corrupt PNG");
      if (c < 16) {
         lencodes[n++] = cast(ubyte) c;
         continue;
      }

      // run-length codes: 16 repeats the previous length, 17 and 18 emit zeros
      ubyte fill = 0;
      int repeat;
      if (c == 16) {
         repeat = zreceive(a,2)+3;
         if (n == 0)
            throw new STBImageException("Bad codelengths, corrupt PNG");
         fill = lencodes[n-1];
      } else if (c == 17) {
         repeat = zreceive(a,3)+3;
      } else {
         repeat = zreceive(a,7)+11;
      }
      // a run must not extend past the announced number of code lengths
      if (total - n < repeat)
         throw new STBImageException("Bad codelengths, corrupt PNG");
      lencodes[n .. n + repeat] = fill;
      n += repeat;
   }
   if (n != total) throw new STBImageException("Bad codelengths, corrupt PNG");
   if (!zbuild_huffman(&a.z_length, lencodes.ptr, hlit)) return 0;
   if (!zbuild_huffman(&a.z_distance, lencodes.ptr+hlit, hdist)) return 0;
   return 1;
}
1628 
// Copies one stored (uncompressed) DEFLATE block to the output buffer of *a.
//
// The block starts at the next byte boundary with a 4-byte header: LEN and
// its one's complement NLEN, both little-endian 16-bit values.
//
// Returns 1 on success, 0 if expand reports failure by return value.
// Throws STBImageException when LEN and NLEN disagree or when LEN exceeds the
// remaining input.
int parse_uncompressed_block(zbuf *a)
{
   ubyte[4] header;
   if (a.num_bits & 7)
      zreceive(a, a.num_bits & 7); // discard the bits up to the byte boundary

   // drain the bit-packed data into header
   int k = 0;
   while (a.num_bits > 0) {
      header[k++] = cast(ubyte) (a.code_buffer & 255);
      a.code_buffer >>= 8;
      a.num_bits -= 8;
   }
   assert(a.num_bits == 0);

   // now fill header the normal way
   while (k < 4)
      header[k++] = cast(ubyte) zget8(a);

   int len  = header[1] * 256 + header[0];
   int nlen = header[3] * 256 + header[2];
   if (nlen != (len ^ 0xffff)) throw new STBImageException("Zlib corrupt, corrupt PNG");
   if (a.zbuffer + len > a.zbuffer_end) throw new STBImageException("Read past buffer, corrupt PNG");
   if (a.zout + len > a.zout_end)
      if (!expand(a, len)) return 0;
   memcpy(a.zout, a.zbuffer, len);
   a.zbuffer += len;
   a.zout += len;
   return 1;
}
1657 
// Reads and validates the two-byte zlib stream header (CMF and FLG).
// Returns 1 when it is acceptable; throws STBImageException when the header
// check fails, a preset dictionary is requested, or the compression method is
// not DEFLATE (8).
int parse_zlib_header(zbuf *a)
{
   int cmf = zget8(a);
   int flg = zget8(a);
   int method = cmf & 15;
   // the upper nibble of cmf (cinfo) gives the window size, 1 << (8 + cinfo);
   // it is not needed because the output is fully buffered

   if ((cmf*256+flg) % 31 != 0)
      throw new STBImageException("Bad zlib header, corrupt PNG"); // zlib spec
   if (flg & 32)
      throw new STBImageException("No preset dict, corrupt PNG"); // preset dictionary not allowed in png
   if (method != 8)
      throw new STBImageException("Bad compression, corrupt PNG");  // DEFLATE required for png

   return 1;
}
1670 
// Code lengths of the fixed DEFLATE Huffman tables. They are computed at
// compile time, so the tables are complete before any thread can read them
// and no lazy initialization has to race.
__gshared ubyte[288] default_length = make_default_length();
__gshared ubyte[32] default_distance = make_default_distance();

// Code lengths of the fixed literal/length table.
private ubyte[288] make_default_length()
{
   ubyte[288] lengths;
   lengths[  0 .. 144] = 8;
   lengths[144 .. 256] = 9;
   lengths[256 .. 280] = 7;
   lengths[280 .. 288] = 8;
   return lengths;
}

// Code lengths of the fixed distance table: every code is 5 bits long.
private ubyte[32] make_default_distance()
{
   ubyte[32] lengths;
   lengths[] = 5;
   return lengths;
}

// Resets both tables to their fixed values. Kept for callers that still
// invoke it; the tables already hold these values at program start.
void init_defaults()
{
   default_length = make_default_length();
   default_distance = make_default_distance();
}
1685 
__gshared int stbi_png_partial; // a quick hack to only allow decoding some of a PNG... I should implement real streaming support instead

// Decodes a zlib/DEFLATE stream block by block into the output buffer of *a.
// The zlib header is parsed first when parse_header is nonzero.
//
// Returns 1 after the final block (or, when stbi_png_partial is set, once more
// than 65536 bytes have been produced) and 0 when a stage reports failure by
// return value or a block uses the reserved type 3.
int parse_zlib(zbuf *a, int parse_header)
{
   if (parse_header && !parse_zlib_header(a))
      return 0;

   a.num_bits = 0;
   a.code_buffer = 0;

   for (;;) {
      int isFinal = zreceive(a,1);
      int type = zreceive(a,2);

      switch (type) {
         case 0:
            if (!parse_uncompressed_block(a)) return 0;
            break;

         case 3:
            return 0;

         default:
            if (type == 1) {
               // use fixed code lengths
               if (!default_distance[31]) init_defaults();
               if (!zbuild_huffman(&a.z_length  , default_length.ptr  , 288)) return 0;
               if (!zbuild_huffman(&a.z_distance, default_distance.ptr,  32)) return 0;
            } else {
               if (!compute_huffman_codes(a)) return 0;
            }
            if (!parse_huffman_block(a)) return 0;
            break;
      }

      if (stbi_png_partial && a.zout - a.zout_start > 65536)
         break;
      if (isFinal)
         break;
   }
   return 1;
}
1717 
// Points *a at the output buffer obuf of olen bytes, records whether that
// buffer may be grown (exp), and runs the decoder. Returns parse_zlib's result.
int do_zlib(zbuf *a, ubyte *obuf, int olen, int exp, int parse_header)
{
   a.z_expandable = exp;
   a.zout_start   = obuf;
   a.zout_end     = obuf + olen;
   a.zout         = a.zout_start;

   return parse_zlib(a, parse_header);
}
1727 
// Decodes a zlib stream (with header) into a malloc'ed buffer that starts at
// initial_size bytes and grows as needed.
//
// Params:
//   buffer       = compressed input
//   len          = number of input bytes
//   initial_size = initial capacity of the output buffer
//   outlen       = if non-null, receives the number of decoded bytes
//
// Returns the decoded data, owned by the caller, or null when the initial
// allocation fails or the decoder reports failure by return value. When the
// decoder throws, the output buffer is freed before the exception propagates.
ubyte *stbi_zlib_decode_malloc_guesssize(const(ubyte) *buffer, int len, int initial_size, int *outlen)
{
   zbuf a;
   ubyte *p = cast(ubyte*) malloc(initial_size);
   if (p == null) return null;
   a.zbuffer = buffer;
   a.zbuffer_end = buffer + len;

   // a.zout_start always tracks the live buffer (expand may reallocate it),
   // so that is the pointer to release when decoding throws
   a.zout_start = p;
   scope(failure) free(a.zout_start);

   if (do_zlib(&a, p, initial_size, 1, 1)) {
      if (outlen) *outlen = cast(int) (a.zout - a.zout_start);
      return a.zout_start;
   } else {
      free(a.zout_start);
      return null;
   }
}
1743 
// Decodes a zlib stream into a malloc'ed buffer, starting from a 16384-byte
// output buffer. See stbi_zlib_decode_malloc_guesssize for the details.
ubyte* stbi_zlib_decode_malloc(const(ubyte) *buffer, int len, int *outlen)
{
   enum int INITIAL_GUESS = 16384;
   return stbi_zlib_decode_malloc_guesssize(buffer, len, INITIAL_GUESS, outlen);
}
1748 
// Decodes DEFLATE data into a malloc'ed buffer that starts at initial_size
// bytes and grows as needed. The zlib header is expected only when
// parse_header is nonzero.
//
// Params:
//   buffer       = compressed input
//   len          = number of input bytes
//   initial_size = initial capacity of the output buffer
//   outlen       = if non-null, receives the number of decoded bytes
//   parse_header = nonzero to parse the two-byte zlib header first
//
// Returns the decoded data, owned by the caller, or null when the initial
// allocation fails or the decoder reports failure by return value. When the
// decoder throws, the output buffer is freed before the exception propagates.
ubyte *stbi_zlib_decode_malloc_guesssize_headerflag(const(ubyte) *buffer, int len, int initial_size, int *outlen, int parse_header)
{
   zbuf a;
   ubyte *p = cast(ubyte*) malloc(initial_size);
   if (p == null) return null;
   a.zbuffer = buffer;
   a.zbuffer_end = buffer + len;

   // a.zout_start always tracks the live buffer (expand may reallocate it),
   // so that is the pointer to release when decoding throws
   a.zout_start = p;
   scope(failure) free(a.zout_start);

   if (do_zlib(&a, p, initial_size, 1, parse_header)) {
      if (outlen) *outlen = cast(int) (a.zout - a.zout_start);
      return a.zout_start;
   } else {
      free(a.zout_start);
      return null;
   }
}
1764 
// Decodes a zlib stream (with header) into the caller's fixed buffer of olen
// bytes, which is never grown. Returns the number of bytes written, or -1
// when the decoder reports failure by return value.
int stbi_zlib_decode_buffer(ubyte* obuffer, int olen, const(ubyte)* ibuffer, int ilen)
{
   zbuf a;
   a.zbuffer = ibuffer;
   a.zbuffer_end = ibuffer + ilen;

   int ok = do_zlib(&a, obuffer, olen, 0, 1);
   return ok ? cast(int) (a.zout - a.zout_start) : -1;
}
1775 
// Decodes raw DEFLATE data (no zlib header) into a malloc'ed buffer that
// starts at 16384 bytes and grows as needed.
//
// Params:
//   buffer = compressed input
//   len    = number of input bytes
//   outlen = if non-null, receives the number of decoded bytes
//
// Returns the decoded data, owned by the caller, or null when the initial
// allocation fails or the decoder reports failure by return value. When the
// decoder throws, the output buffer is freed before the exception propagates.
ubyte *stbi_zlib_decode_noheader_malloc(const(ubyte) *buffer, int len, int *outlen)
{
   zbuf a;
   ubyte *p = cast(ubyte*) malloc(16384);
   if (p == null) return null;
   a.zbuffer = buffer;
   a.zbuffer_end = buffer+len;

   // a.zout_start always tracks the live buffer (expand may reallocate it),
   // so that is the pointer to release when decoding throws
   a.zout_start = p;
   scope(failure) free(a.zout_start);

   if (do_zlib(&a, p, 16384, 1, 0)) {
      if (outlen) *outlen = cast(int) (a.zout - a.zout_start);
      return a.zout_start;
   } else {
      free(a.zout_start);
      return null;
   }
}
1791 
// Decodes raw DEFLATE data (no zlib header) into the caller's fixed buffer of
// olen bytes, which is never grown. Returns the number of bytes written, or
// -1 when the decoder reports failure by return value.
int stbi_zlib_decode_noheader_buffer(ubyte *obuffer, int olen, const(ubyte) *ibuffer, int ilen)
{
   zbuf a;
   a.zbuffer = ibuffer;
   a.zbuffer_end = ibuffer + ilen;

   int ok = do_zlib(&a, obuffer, olen, 0, 0);
   return ok ? cast(int) (a.zout - a.zout_start) : -1;
}
1802 
1803 // public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
1804 //    simple implementation
1805 //      - only 8-bit samples
1806 //      - no CRC checking
1807 //      - allocates lots of intermediate memory
1808 //        - avoids problem of streaming data between subsystems
1809 //        - avoids explicit window management
1810 //    performance
1811 //      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
1812 
1813 
// Header of a PNG chunk as read by get_chunk_header: two consecutive 32-bit
// values taken from the stream.
struct chunk
{
   uint length; // first 32-bit value of the header
   uint type;   // second 32-bit value; comparable with PNG_TYPE(...) codes
}
1819 
// Packs four bytes into one 32-bit code with a as the most significant byte.
uint PNG_TYPE(ubyte a, ubyte b, ubyte c, ubyte d)
{
   uint code = a;
   code = (code << 8) | b;
   code = (code << 8) | c;
   code = (code << 8) | d;
   return code;
}
1824 
// Reads a chunk header from s: the length first, then the type, each as one
// 32-bit value.
chunk get_chunk_header(stbi *s)
{
   chunk header;
   header.length = get32(s);
   header.type   = get32(s);
   return header;
}
1832 
// Consumes the first eight bytes of s and compares them with the PNG
// signature. Returns 1 on a match; throws STBImageException at the first
// byte that differs.
static int check_png_header(stbi *s)
{
   static immutable ubyte[8] png_sig = [ 137, 80, 78, 71, 13, 10, 26, 10 ];

   foreach (expected; png_sig)
   {
       ubyte actual = get8u(s);
       if (actual != expected)
           throw new STBImageException("Bad PNG sig, not a PNG");
   }
   return 1;
}
1845 
// Decoding state of one PNG image.
struct png
{
   stbi *s;          // input stream and image dimensions
   ubyte *idata;     // NOTE(review): presumably the concatenated IDAT data - confirm against the loader
   ubyte *expanded;  // NOTE(review): presumably the inflated image data - confirm against the loader
   ubyte *out_;      // defiltered pixels, allocated by create_png_image_raw
}
1853 
1854 
// PNG scanline filter types. Values 0..4 are the filter bytes stored in the
// file; the two *_first values are internal variants substituted on the first
// row (via first_row_filter), where no previous row exists to sample.
enum : int
{
   F_none=0, F_sub=1, F_up=2, F_avg=3, F_paeth=4,
   F_avg_first, F_paeth_first
}
1860 
// Maps a filter byte from the file (0..4) to the filter actually applied on
// the first row of an image, which must not sample a previous row.
static immutable ubyte[5] first_row_filter =
[
   F_none, F_sub, F_none, F_avg_first, F_paeth_first
];
1865 
// Paeth predictor: returns whichever of a, b and c is closest to a + b - c.
// Ties are resolved in the order a, b, c.
static int paeth(int a, int b, int c)
{
   int estimate = a + b - c;
   int distA = abs(estimate - a);
   int distB = abs(estimate - b);
   int distC = abs(estimate - c);

   if (distA <= distB && distA <= distC)
      return a;
   return distB <= distC ? b : c;
}
1876 
// create the png data from post-deflated data
//
// Reverses the per-scanline filters of `raw` and writes the pixels into a
// freshly malloc'ed buffer stored in a.out_ (x * y pixels, out_n bytes each).
//   raw, raw_len : filtered scanlines; each row is one filter byte followed by
//                  x * s.img_n data bytes
//   out_n        : bytes per output pixel; either s.img_n or s.img_n + 1, the
//                  extra byte being set to 255
//   x, y         : dimensions of the (sub-)image held in `raw`
// Returns 1; every failure is reported by throwing STBImageException.
static int create_png_image_raw(png *a, ubyte *raw, uint raw_len, int out_n, uint x, uint y)
{
   stbi *s = a.s;
   uint i,j,stride = x*out_n;
   int k;
   int img_n = s.img_n; // copy it into a local for later
   assert(out_n == s.img_n || out_n == s.img_n+1);
   // in partial mode only the first row is decoded
   if (stbi_png_partial) y = 1;
   a.out_ = cast(ubyte*) malloc(x * y * out_n);
   if (!a.out_) throw new STBImageException("Out of memory");
   if (!stbi_png_partial) {
      // a full-size image must match exactly; a smaller sub-image (one
      // interlace pass) only needs at least enough remaining data
      if (s.img_x == x && s.img_y == y) {
         if (raw_len != (img_n * x + 1) * y) throw new STBImageException("Not enough pixels, corrupt PNG");
      } else { // interlaced:
         if (raw_len < (img_n * x + 1) * y) throw new STBImageException("Not enough pixels, corrupt PNG");
      }
   }
   for (j=0; j < y; ++j) {
      ubyte *cur = a.out_ + stride*j;
      // for j == 0 this points before the buffer; it is never dereferenced
      // there because the first-row filters do not read the previous row
      ubyte *prior = cur - stride;
      int filter = *raw++;
      if (filter > 4) throw new STBImageException("Invalid filter, corrupt PNG");
      // if first row, use special filter that doesn't sample previous row
      if (j == 0) filter = first_row_filter[filter];
      // handle first pixel explicitly
      // (there is no pixel to the left, so "left" terms are treated as 0)
      for (k=0; k < img_n; ++k) {
         switch (filter) {
            case F_none       : cur[k] = raw[k]; break;
            case F_sub        : cur[k] = raw[k]; break;
            case F_up         : cur[k] = cast(ubyte)(raw[k] + prior[k]); break;
            case F_avg        : cur[k] = cast(ubyte)(raw[k] + (prior[k]>>1)); break;
            case F_paeth      : cur[k] = cast(ubyte) (raw[k] + paeth(0,prior[k],0)); break;
            case F_avg_first  : cur[k] = raw[k]; break;
            case F_paeth_first: cur[k] = raw[k]; break;
            default: break;
         }
      }
      // when an extra output component is added, fill it with 255
      if (img_n != out_n) cur[img_n] = 255;
      raw += img_n;
      cur += out_n;
      prior += out_n;
      // this is a little gross, so that we don't switch per-pixel or per-component
      if (img_n == out_n) {

         // input and output pixels have the same size: all pointers advance by img_n
         for (i=x-1; i >= 1; --i, raw+=img_n,cur+=img_n,prior+=img_n)
            for (k=0; k < img_n; ++k)
            {
               switch (filter) {
                  case F_none:  cur[k] = raw[k]; break;
                  case F_sub:   cur[k] = cast(ubyte)(raw[k] + cur[k-img_n]); break;
                  case F_up:    cur[k] = cast(ubyte)(raw[k] + prior[k]); break;
                  case F_avg:   cur[k] = cast(ubyte)(raw[k] + ((prior[k] + cur[k-img_n])>>1)); break;
                  case F_paeth:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-img_n],prior[k],prior[k-img_n])); break;
                  case F_avg_first:    cur[k] = cast(ubyte)(raw[k] + (cur[k-img_n] >> 1)); break;
                  case F_paeth_first:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-img_n],0,0)); break;
                  default: break;
               }
            }
      } else {
         assert(img_n+1 == out_n);

         // output pixels are one byte wider than input pixels: raw advances by
         // img_n, cur/prior by out_n, and the extra byte is set to 255
         for (i=x-1; i >= 1; --i, cur[img_n]=255,raw+=img_n,cur+=out_n,prior+=out_n)
            for (k=0; k < img_n; ++k)
            {
               switch (filter) {
                  case F_none:  cur[k] = raw[k]; break;
                  case F_sub:   cur[k] = cast(ubyte)(raw[k] + cur[k-out_n]); break;
                  case F_up:    cur[k] = cast(ubyte)(raw[k] + prior[k]); break;
                  case F_avg:   cur[k] = cast(ubyte)(raw[k] + ((prior[k] + cur[k-out_n])>>1)); break;
                  case F_paeth:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-out_n],prior[k],prior[k-out_n])); break;
                  case F_avg_first:    cur[k] = cast(ubyte)(raw[k] + (cur[k-out_n] >> 1)); break;
                  case F_paeth_first:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-out_n],0,0)); break;
                  default: break;
               }
            }
      }
   }
   return 1;
}
1957 
/// Builds the final pixel buffer `a.out_` from inflated PNG data.
///
/// Non-interlaced images are decoded directly by create_png_image_raw.
/// Interlaced images are decoded as seven sub-images whose pixels are
/// scattered into one full-size buffer.
///
/// Params:
///   a          = decoder state; `a.out_` receives the malloc'ed result
///   raw        = inflated, still filtered scanline data
///   raw_len    = number of bytes available at `raw`
///   out_n      = bytes per output pixel (s.img_n or s.img_n + 1)
///   interlaced = non-zero when the image is stored interlaced
/// Returns: 1 on success, 0 if create_png_image_raw reported failure.
/// Throws: STBImageException on allocation failure or corrupt data.
int create_png_image(png *a, ubyte *raw, uint raw_len, int out_n, int interlaced)
{
   if (!interlaced)
      return create_png_image_raw(a, raw, raw_len, out_n, a.s.img_x, a.s.img_y);

   // Origin and spacing of the pixel grid covered by each of the 7 passes.
   static immutable int[7] xorig = [ 0,4,0,2,0,1,0 ];
   static immutable int[7] yorig = [ 0,0,4,0,2,0,1 ];
   static immutable int[7] xspc  = [ 8,8,4,4,2,2,1 ];
   static immutable int[7] yspc  = [ 8,8,8,4,4,2,2 ];

   // Every pass must be decoded completely, so partial mode is switched off
   // for the duration of this call and restored on every exit path.
   int save = stbi_png_partial;
   stbi_png_partial = 0;
   scope(exit) stbi_png_partial = save;

   // de-interlacing
   ubyte *final_ = cast(ubyte*) malloc(a.s.img_x * a.s.img_y * out_n);
   if (final_ == null) throw new STBImageException("Out of memory");
   scope(failure) free(final_); // a pass that throws must not leak the buffer

   int img_n = a.s.img_n; // bytes per pixel in the raw stream

   for (int p = 0; p < 7; ++p) {
      int i,j,x,y;
      // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
      x = (a.s.img_x - xorig[p] + xspc[p]-1) / xspc[p];
      y = (a.s.img_y - yorig[p] + yspc[p]-1) / yspc[p];
      if (x && y) {
         if (!create_png_image_raw(a, raw, raw_len, out_n, x, y)) {
            free(final_);
            return 0;
         }
         for (j=0; j < y; ++j)
            for (i=0; i < x; ++i)
               memcpy(final_ + (j*yspc[p]+yorig[p])*a.s.img_x*out_n + (i*xspc[p]+xorig[p])*out_n,
                      a.out_ + (j*x+i)*out_n, out_n);
         free(a.out_);
         a.out_ = null;

         // Each raw row of this pass is one filter byte plus x pixels of
         // img_n bytes (that is what create_png_image_raw consumed), so the
         // input advances by img_n, not by the output pixel size out_n.
         int pass_len = (x*img_n+1)*y;
         raw += pass_len;
         raw_len -= pass_len;
      }
   }
   a.out_ = final_;

   return 1;
}
1998 
/// Applies colour-key transparency to the decoded image in `z.out_`.
///
/// The buffer must hold 2 or 4 bytes per pixel, the last one being alpha.
/// With 2 bytes per pixel, alpha becomes 0 when the grey value equals tc[0]
/// and 255 otherwise. With 4 bytes per pixel, alpha is set to 0 for pixels
/// whose first three bytes equal tc[0..3]; other pixels are left untouched.
/// Returns: always 1.
static int compute_transparency(png *z, ubyte tc[3], int out_n)
{
   stbi *s = z.s;
   uint pixel_count = s.img_x * s.img_y;
   ubyte *px = z.out_;

   // compute color-based transparency, assuming we've
   // already got 255 as the alpha value in the output
   assert(out_n == 2 || out_n == 4);

   if (out_n == 2) {
      for (uint n = 0; n < pixel_count; ++n, px += 2)
         px[1] = (px[0] == tc[0] ? 0 : 255);
   } else {
      for (uint n = 0; n < pixel_count; ++n, px += 4) {
         bool is_key = px[0] == tc[0] && px[1] == tc[1] && px[2] == tc[2];
         if (is_key)
            px[3] = 0;
      }
   }
   return 1;
}
2023 
/// Replaces the 1-byte-per-pixel index image in `a.out_` with a colour image.
///
/// Params:
///   a         = decoder state; `a.out_` is freed and replaced by the new buffer
///   palette   = palette stored as 4 bytes per entry
///   len       = not used by this function
///   pal_img_n = bytes per output pixel; 3 copies the first three bytes of an
///               entry, any other value copies four
/// Returns: always 1.
/// Throws: STBImageException when the new buffer cannot be allocated.
int expand_palette(png *a, ubyte *palette, int len, int pal_img_n)
{
   uint pixel_count = a.s.img_x * a.s.img_y;
   ubyte *indices = a.out_;

   ubyte *expanded = cast(ubyte*) malloc(pixel_count * pal_img_n);
   if (expanded == null)
      throw new STBImageException("Out of memory");

   int comps = (pal_img_n == 3) ? 3 : 4;
   ubyte *dst = expanded;
   for (uint px = 0; px < pixel_count; ++px) {
      ubyte *entry = palette + indices[px] * 4;
      for (int k = 0; k < comps; ++k)
         dst[k] = entry[k];
      dst += comps;
   }

   // the index image is no longer needed once every pixel has been expanded
   free(a.out_);
   a.out_ = expanded;

   return 1;
}
2061 
/// Walks the chunks of a PNG stream and, depending on `scan`, stops after the
/// signature, after the header information, or after decoding the pixels.
///
/// Params:
///   z        = decoder state; its buffers are reset to null on entry
///   scan     = SCAN_type (signature only), SCAN_header (fill in size and
///              component count) or SCAN_load (decode into z.out_)
///   req_comp = number of components requested by the caller (0 = as stored)
/// Returns: 1 on success, 0 when a helper reported failure without throwing.
/// Throws: STBImageException for malformed or unsupported files.
int parse_png_file(png *z, int scan, int req_comp)
{
   ubyte[1024] palette;       // 256 entries, 4 bytes each
   ubyte pal_img_n=0;         // 0 = not paletted, else 3 or 4 output components
   ubyte has_trans=0;         // set when a colour-key tRNS chunk was seen
   ubyte tc[3];               // colour key from tRNS
   uint ioff=0, idata_limit=0, i, pal_len=0;
   int first=1,k,interlace=0;
   stbi *s = z.s;

   z.expanded = null;
   z.idata = null;
   z.out_ = null;

   if (!check_png_header(s)) return 0;

   if (scan == SCAN_type) return 1;

   for (;;) {
      chunk c = get_chunk_header(s);
      switch (c.type) {
         case PNG_TYPE('I','H','D','R'): {
            int depth,color,comp,filter;
            if (!first) throw new STBImageException("Multiple IHDR, corrupt PNG");
            first = 0;
            if (c.length != 13) throw new STBImageException("Bad IHDR len, corrupt PNG");
            s.img_x = get32(s); if (s.img_x > (1 << 24)) throw new STBImageException("Very large image (corrupt?)");
            s.img_y = get32(s); if (s.img_y > (1 << 24)) throw new STBImageException("Very large image (corrupt?)");
            depth = get8(s);  if (depth != 8)        throw new STBImageException("8bit only, PNG not supported: 8-bit only");
            color = get8(s);  if (color > 6)         throw new STBImageException("Bad ctype, corrupt PNG");
            if (color == 3) pal_img_n = 3; else if (color & 1) throw new STBImageException("Bad ctype, corrupt PNG");
            comp  = get8(s);  if (comp) throw new STBImageException("Bad comp method, corrupt PNG");
            filter= get8(s);  if (filter) throw new STBImageException("Bad filter method, corrupt PNG");
            interlace = get8(s); if (interlace>1) throw new STBImageException("Bad interlace method, corrupt PNG");
            if (!s.img_x || !s.img_y) throw new STBImageException("0-pixel image, corrupt PNG");
            if (!pal_img_n) {
               s.img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
               if ((1 << 30) / s.img_x / s.img_n < s.img_y) throw new STBImageException("Image too large to decode");
               if (scan == SCAN_header) return 1;
            } else {
               // if paletted, then pal_n is our final components, and
               // img_n is # components to decompress/filter.
               s.img_n = 1;
               if ((1 << 30) / s.img_x / 4 < s.img_y) throw new STBImageException("Too large, corrupt PNG");
               // if SCAN_header, have to scan to see if we have a tRNS
            }
            break;
         }

         case PNG_TYPE('P','L','T','E'):  {
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if (c.length > 256*3) throw new STBImageException("invalid PLTE, corrupt PNG");
            pal_len = c.length / 3;
            if (pal_len * 3 != c.length) throw new STBImageException("invalid PLTE, corrupt PNG");
            // stored as RGBA with opaque alpha; tRNS may overwrite the alpha later
            for (i=0; i < pal_len; ++i) {
               palette[i*4+0] = get8u(s);
               palette[i*4+1] = get8u(s);
               palette[i*4+2] = get8u(s);
               palette[i*4+3] = 255;
            }
            break;
         }

         case PNG_TYPE('t','R','N','S'): {
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if (z.idata) throw new STBImageException("tRNS after IDAT, corrupt PNG");
            if (pal_img_n) {
               if (scan == SCAN_header) { s.img_n = 4; return 1; }
               if (pal_len == 0) throw new STBImageException("tRNS before PLTE, corrupt PNG");
               if (c.length > pal_len) throw new STBImageException("bad tRNS len, corrupt PNG");
               pal_img_n = 4;
               for (i=0; i < c.length; ++i)
                  palette[i*4+3] = get8u(s);
            } else {
               if (!(s.img_n & 1)) throw new STBImageException("tRNS with alpha, corrupt PNG");
               if (c.length != cast(uint) s.img_n*2) throw new STBImageException("bad tRNS len, corrupt PNG");
               has_trans = 1;
               for (k=0; k < s.img_n; ++k)
                  tc[k] = cast(ubyte) get16(s); // non 8-bit images will be larger
            }
            break;
         }

         case PNG_TYPE('I','D','A','T'): {
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if (pal_img_n && !pal_len) throw new STBImageException("no PLTE, corrupt PNG");
            if (scan == SCAN_header) { s.img_n = pal_img_n; return 1; }
            // Keep the accumulated IDAT size within int.max: a larger total would
            // wrap the 32-bit sum below (undersized buffer) or make the doubling
            // of idata_limit overflow to 0 and never terminate.
            if (c.length > cast(uint) int.max - ioff) throw new STBImageException("IDAT too large, corrupt PNG");
            if (ioff + c.length > idata_limit) {
               ubyte *p;
               if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
               while (ioff + c.length > idata_limit)
                  idata_limit *= 2;
               // on failure z.idata still owns the old buffer
               p = cast(ubyte*) realloc(z.idata, idata_limit); if (p == null) throw new STBImageException("Out of memory");
               z.idata = p;
            }
            if (!getn(s, z.idata+ioff,c.length)) throw new STBImageException("outofdata, corrupt PNG");
            ioff += c.length;
            break;
         }

         case PNG_TYPE('I','E','N','D'): {
            uint raw_len;
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if (scan != SCAN_load) return 1;
            if (z.idata == null) throw new STBImageException("no IDAT, corrupt PNG");
            z.expanded = stbi_zlib_decode_malloc_guesssize_headerflag(z.idata, ioff, 16384, cast(int *) &raw_len, 1);
            if (z.expanded == null) return 0; // zlib should set error
            free(z.idata); z.idata = null;
            // add an alpha byte while unfiltering when the caller asked for
            // exactly one more component, or when a colour key needs it
            if ((req_comp == s.img_n+1 && req_comp != 3 && !pal_img_n) || has_trans)
               s.img_out_n = s.img_n+1;
            else
               s.img_out_n = s.img_n;
            if (!create_png_image(z, z.expanded, raw_len, s.img_out_n, interlace)) return 0;
            if (has_trans)
               if (!compute_transparency(z, tc, s.img_out_n)) return 0;
            if (pal_img_n) {
               // pal_img_n == 3 or 4
               s.img_n = pal_img_n; // record the actual colors we had
               s.img_out_n = pal_img_n;
               if (req_comp >= 3) s.img_out_n = req_comp;
               if (!expand_palette(z, palette.ptr, pal_len, s.img_out_n))
                  return 0;
            }
            free(z.expanded); z.expanded = null;
            return 1;
         }

         default:
            // if critical, fail
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            // bit 29 of the type is clear for critical chunks
            if ((c.type & (1 << 29)) == 0) {

               throw new STBImageException("PNG not supported: unknown chunk type");
            }
            skip(s, c.length);
            break;
      }
      // end of chunk, read and skip CRC
      get32(s);
   }
}
2203 
/// Decodes a complete PNG and returns the pixel buffer.
///
/// Params:
///   p        = decoder state with `p.s` already set
///   x, y     = receive the image width and height
///   n        = if non-null, receives the number of components in the file
///   req_comp = requested components per pixel, 0..4 (0 = as stored)
/// Returns: a malloc'ed pixel buffer owned by the caller, or null on failure.
/// Throws: STBImageException for a bad `req_comp` or a malformed file.
ubyte *do_png(png *p, int *x, int *y, int *n, int req_comp)
{
   if (req_comp < 0 || req_comp > 4)
      throw new STBImageException("Internal error: bad req_comp");

   // The parser reports most errors by throwing, so the intermediate buffers
   // must be released on every exit path, not only on normal return.
   scope(exit)
   {
      free(p.out_);     p.out_     = null;
      free(p.expanded); p.expanded = null;
      free(p.idata);    p.idata    = null;
   }

   ubyte *result = null;
   if (parse_png_file(p, SCAN_load, req_comp)) {
      // take ownership of the decoded image so the cleanup above skips it
      result = p.out_;
      p.out_ = null;
      if (req_comp && req_comp != p.s.img_out_n) {
         result = convert_format(result, p.s.img_out_n, req_comp, p.s.img_x, p.s.img_y);
         p.s.img_out_n = req_comp;
         if (result == null) return result;
      }
      *x = p.s.img_x;
      *y = p.s.img_y;
      if (n) *n = p.s.img_n;
   }

   return result;
}
2227 
/// Decodes a PNG from `s` using a fresh decoder state; see do_png for the
/// meaning of the remaining parameters and of the return value.
ubyte *stbi_png_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   png decoder;
   decoder.s = s;
   ubyte *pixels = do_png(&decoder, x, y, comp, req_comp);
   return pixels;
}
2234 
/// Checks that `s` starts with a PNG signature (consuming those bytes).
/// Throws: STBImageException when it does not.
void stbi_png_test(stbi *s)
{
   if (check_png_header(s) != 0)
       return;
   throw new STBImageException("Couldn't decode PNG header");
}
2241 
2242 // Microsoft/Windows BMP image
2243 
/// Checks that `s` starts with a BMP file header followed by an info header
/// of a supported size (12, 40, 56 or 108 bytes). Consumes the bytes it reads.
/// Throws: STBImageException when the stream does not look like such a BMP.
void stbi_bmp_test(stbi *s)
{
    if (get8(s) != 'B' || get8(s) != 'M')
        throw new STBImageException("Couldn't decode BMP header");

    get32le(s); // discard filesize
    get16le(s); // discard reserved
    get16le(s); // discard reserved
    get32le(s); // discard data offset

    int headerSize = get32le(s);
    switch (headerSize)
    {
        case 12:
        case 40:
        case 56:
        case 108:
            return;
        default:
            throw new STBImageException("Couldn't decode BMP header");
    }
}
2258 
2259 
/// Returns the index (0..31) of the highest set bit of `z`, or -1 when `z`
/// is zero.
int high_bit(uint z)
{
   if (z == 0)
      return -1;

   // binary search: test the upper half of the remaining bits, then narrow
   int n = 0;
   for (int shift = 16; shift > 0; shift >>= 1) {
      if ((z >> shift) != 0) {
         n += shift;
         z >>= shift;
      }
   }
   return n;
}
2272 
/// Returns the number of bits set in `a` (0..32).
int bitcount(uint a)
{
   int count = 0;
   while (a != 0) {
      a &= a - 1; // clears the lowest set bit
      ++count;
   }
   return count;
}
2282 
/// Moves a masked channel value into the low 8 bits and, for channels
/// narrower than 8 bits, replicates the pattern downwards to fill the byte.
///
/// Params:
///   v     = masked pixel value (bit pattern; may have bit 31 set)
///   shift = right shift that brings the channel's top bit to bit 7;
///           negative values shift left instead
///   bits  = width of the channel in bits
/// Returns: the expanded channel value, 0..255 for well-formed input.
int shiftsigned(int v, int shift, int bits)
{
   // Work on the raw bit pattern. With a signed value, a mask that includes
   // bit 31 would be sign-extended by the right shift and corrupt the
   // replicated bits.
   uint u = cast(uint) v;

   if (shift < 0) u <<= -shift;
   else u >>= shift;

   uint result = u;
   // bits <= 0 cannot make progress; skip replication instead of looping forever
   if (bits > 0) {
      for (int z = bits; z < 8; z += bits)
         result += u >> z;
   }
   return cast(int) result;
}
2299 
/// Decodes a BMP image (non-1bpp, non-RLE) from `s`.
///
/// Params:
///   s        = input stream; `s.img_x`, `s.img_y` and `s.img_n` are filled in
///   x, y     = receive the image width and height
///   comp     = if non-null, receives the number of components in the file
///   req_comp = requested components per pixel (0 = as stored)
/// Returns: a malloc'ed pixel buffer owned by the caller, or null if the final
///          format conversion failed.
/// Throws: STBImageException for malformed or unsupported files.
ubyte *bmp_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   ubyte *out_;
   uint mr=0,mg=0,mb=0,ma=0, fake_a=0;
   ubyte pal[256][4];
   int psize=0,i,j,compress=0,width;
   int bpp, flip_vertically, pad, target, offset, hsz;
   if (get8(s) != 'B' || get8(s) != 'M') throw new STBImageException("not BMP, Corrupt BMP");
   get32le(s); // discard filesize
   get16le(s); // discard reserved
   get16le(s); // discard reserved
   offset = get32le(s);
   hsz = get32le(s);
   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108) throw new STBImageException("unknown BMP, BMP type not supported: unknown");
   if (hsz == 12) {
      s.img_x = get16le(s);
      s.img_y = get16le(s);
   } else {
      s.img_x = get32le(s);
      s.img_y = get32le(s);
   }
   if (get16le(s) != 1) throw new STBImageException("bad BMP");
   bpp = get16le(s);
   if (bpp == 1) throw new STBImageException("monochrome, BMP type not supported: 1-bit");
   // a positive height means the rows are stored bottom-up
   flip_vertically = (cast(int) s.img_y) > 0;
   s.img_y = abs(cast(int) s.img_y);
   if (hsz == 12) {
      if (bpp < 24)
         psize = (offset - 14 - 24) / 3;
   } else {
      compress = get32le(s);
      if (compress == 1 || compress == 2) throw new STBImageException("BMP RLE, BMP type not supported: RLE");
      get32le(s); // discard sizeof
      get32le(s); // discard hres
      get32le(s); // discard vres
      get32le(s); // discard colorsused
      get32le(s); // discard max important
      if (hsz == 40 || hsz == 56) {
         if (hsz == 56) {
            get32le(s);
            get32le(s);
            get32le(s);
            get32le(s);
         }
         if (bpp == 16 || bpp == 32) {
            mr = mg = mb = 0;
            if (compress == 0) {
               if (bpp == 32) {
                  mr = 0xffu << 16;
                  mg = 0xffu <<  8;
                  mb = 0xffu <<  0;
                  ma = 0xffu << 24;
                  fake_a = 1; // @TODO: check for cases like alpha value is all 0 and switch it to 255
               } else {
                  mr = 31u << 10;
                  mg = 31u <<  5;
                  mb = 31u <<  0;
               }
            } else if (compress == 3) {
               mr = get32le(s);
               mg = get32le(s);
               mb = get32le(s);
               // not documented, but generated by photoshop and handled by mspaint
               if (mr == mg && mg == mb) {
                  // ?!?!?
                  throw new STBImageException("bad BMP");
               }
            } else
               throw new STBImageException("bad BMP");
         }
      } else {
         assert(hsz == 108);
         mr = get32le(s);
         mg = get32le(s);
         mb = get32le(s);
         ma = get32le(s);
         get32le(s); // discard color space
         for (i=0; i < 12; ++i)
            get32le(s); // discard color space parameters
      }
      if (bpp < 16)
         psize = (offset - 14 - hsz) >> 2;
   }
   s.img_n = ma ? 4 : 3;
   if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
      target = req_comp;
   else
      target = s.img_n; // if they want monochrome, we'll post-convert
   out_ = cast(ubyte*) malloc(target * s.img_x * s.img_y);
   if (!out_) throw new STBImageException("Out of memory");
   if (bpp < 16) {
      // paletted image: read the colour table, then one index per pixel
      int z=0;
      if (psize == 0 || psize > 256) { free(out_); throw new STBImageException("invalid, Corrupt BMP"); }
      for (i=0; i < psize; ++i) {
         pal[i][2] = get8u(s);
         pal[i][1] = get8u(s);
         pal[i][0] = get8u(s);
         if (hsz != 12) get8(s);
         pal[i][3] = 255;
      }
      skip(s, offset - 14 - hsz - psize * (hsz == 12 ? 3 : 4));
      if (bpp == 4) width = (s.img_x + 1) >> 1;
      else if (bpp == 8) width = s.img_x;
      else { free(out_); throw new STBImageException("bad bpp, corrupt BMP"); }
      // rows are padded to a multiple of 4 bytes
      pad = (-width)&3;
      for (j=0; j < cast(int) s.img_y; ++j) {
         // two pixels per iteration: at 4 bpp both come from the same byte
         for (i=0; i < cast(int) s.img_x; i += 2) {
            int v=get8(s),v2=0;
            if (bpp == 4) {
               v2 = v & 15;
               v >>= 4;
            }
            out_[z++] = pal[v][0];
            out_[z++] = pal[v][1];
            out_[z++] = pal[v][2];
            if (target == 4) out_[z++] = 255;
            if (i+1 == cast(int) s.img_x) break;
            v = (bpp == 8) ? get8(s) : v2;
            out_[z++] = pal[v][0];
            out_[z++] = pal[v][1];
            out_[z++] = pal[v][2];
            if (target == 4) out_[z++] = 255;
         }
         skip(s, pad);
      }
   } else {
      int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
      int z = 0;
      int easy=0;
      skip(s, offset - 14 - hsz);
      if (bpp == 24) width = 3 * s.img_x;
      else if (bpp == 16) width = 2*s.img_x;
      else /* bpp = 32 and pad = 0 */ width=0;
      pad = (-width) & 3;
      // easy == 1: plain 24-bit BGR; easy == 2: 32-bit BGRA with byte-aligned masks
      if (bpp == 24) {
         easy = 1;
      } else if (bpp == 32) {
         if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
            easy = 2;
      }
      if (!easy) {
         if (!mr || !mg || !mb) { free(out_); throw new STBImageException("bad masks, corrupt BMP"); }
         // right shift amt to put high bit in position #7
         // each channel's width must come from its own mask (e.g. 5-6-5 layouts)
         rshift = high_bit(mr)-7; rcount = bitcount(mr);
         gshift = high_bit(mg)-7; gcount = bitcount(mg);
         bshift = high_bit(mb)-7; bcount = bitcount(mb);
         ashift = high_bit(ma)-7; acount = bitcount(ma);
      }
      for (j=0; j < cast(int) s.img_y; ++j) {
         if (easy) {
            for (i=0; i < cast(int) s.img_x; ++i) {
               int a;
               out_[z+2] = get8u(s);
               out_[z+1] = get8u(s);
               out_[z+0] = get8u(s);
               z += 3;
               a = (easy == 2 ? get8(s) : 255);
               if (target == 4) out_[z++] = cast(ubyte) a;
            }
         } else {
            for (i=0; i < cast(int) s.img_x; ++i) {
               uint v = (bpp == 16 ? get16le(s) : get32le(s));
               int a;
               out_[z++] = cast(ubyte) shiftsigned(v & mr, rshift, rcount);
               out_[z++] = cast(ubyte) shiftsigned(v & mg, gshift, gcount);
               out_[z++] = cast(ubyte) shiftsigned(v & mb, bshift, bcount);
               a = (ma ? shiftsigned(v & ma, ashift, acount) : 255);
               if (target == 4) out_[z++] = cast(ubyte) a;
            }
         }
         skip(s, pad);
      }
   }
   if (flip_vertically) {
      // swap rows top <-> bottom so the result is stored top-down
      ubyte t;
      for (j=0; j < cast(int) s.img_y>>1; ++j) {
         ubyte *p1 = out_ +      j     *s.img_x*target;
         ubyte *p2 = out_ + (s.img_y-1-j)*s.img_x*target;
         for (i=0; i < cast(int) s.img_x*target; ++i) {
            t = p1[i], p1[i] = p2[i], p2[i] = t;
         }
      }
   }

   if (req_comp && req_comp != target) {
      out_ = convert_format(out_, target, req_comp, s.img_x, s.img_y);
      if (out_ == null) return out_; // convert_format frees input on failure
   }

   *x = s.img_x;
   *y = s.img_y;
   if (comp) *comp = s.img_n;
   return out_;
}
2494 
/// Entry point for BMP decoding; forwards every argument to bmp_load and
/// returns its result unchanged.
ubyte *stbi_bmp_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   ubyte *pixels = bmp_load(s, x, y, comp, req_comp);
   return pixels;
}
2499 
2500 // *************************************************************************************************
2501 // GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
/// One entry of the LZW code table used by stbi_process_gif_raster.
struct stbi_gif_lzw
{
   short prefix; // code that precedes this one in the string, or -1 for a root code
   ubyte first;  // first byte of the string this code expands to
   ubyte suffix; // last byte of the string; the value emitted by stbi_out_gif_code
}
2508 
/// Decoder state for one GIF stream.
struct stbi_gif
{
   int w,h;                     // canvas size read by stbi_gif_header
   ubyte *out_;                 // output buffer (always 4 components)
   // flags/bgindex/ratio are read from the header; transparent starts at -1.
   // NOTE(review): eflags is presumably set from an extension block - confirm
   int flags, bgindex, ratio, transparent, eflags;
   ubyte  pal[256][4];          // colour table filled by stbi_gif_header
   ubyte lpal[256][4];          // NOTE(review): presumably the per-image (local) table - confirm
   stbi_gif_lzw codes[4096];    // LZW code table
   ubyte *color_table;          // table used by stbi_out_gif_code, 4 bytes per entry
   int parse, step;             // row stepping state used by stbi_out_gif_code
   int lflags;                  // NOTE(review): presumably the image descriptor flags - confirm
   // the positions below are byte offsets into out_: x values are column
   // offsets (pixel * 4), y values are row offsets (row * line_size)
   int start_x, start_y;
   int max_x, max_y;
   int cur_x, cur_y;
   int line_size;               // bytes per output row (w * 4)
}
2525 
/// Checks that `s` starts with "GIF87a" or "GIF89a", consuming bytes up to
/// and including the first one that does not match.
/// Throws: STBImageException when the signature is not recognised.
void stbi_gif_test(stbi *s)
{
    foreach (char expected; "GIF8")
    {
        if (get8(s) != expected)
            throw new STBImageException("Couldn't decode GIF header");
    }

    int versionDigit = get8(s);
    bool knownVersion = (versionDigit == '7' || versionDigit == '9');
    if (!knownVersion)
        throw new STBImageException("Couldn't decode GIF header");

    if (get8(s) != 'a')
        throw new STBImageException("Couldn't decode GIF header");
}
2537 
/// Reads `num_entries` colour-table entries from `s` into `pal`.
///
/// Each entry is three bytes in the stream, stored reversed into pal[i][2],
/// pal[i][1], pal[i][0]; pal[i][3] is the alpha value.
///
/// Params:
///   s           = input stream
///   pal         = table to fill; taken by reference because a D static array
///                 parameter is otherwise copied and the parsed colours would
///                 be lost when the function returns
///   num_entries = number of entries to read (at most 256)
///   transp      = index of the transparent entry, or -1 when there is none
void stbi_gif_parse_colortable(stbi *s, ref ubyte[4][256] pal, int num_entries, int transp)
{
   int i;
   for (i=0; i < num_entries; ++i) {
      pal[i][2] = get8u(s);
      pal[i][1] = get8u(s);
      pal[i][0] = get8u(s);
      // only the designated entry is transparent; with transp == -1 none is
      pal[i][3] = (transp == i) ? 0 : 255;
   }
}
2548 
/// Reads the GIF signature and the fields that follow it into `g`.
///
/// Params:
///   s       = input stream
///   g       = receives w, h, flags, bgindex and ratio; transparent is set to -1
///   comp    = if non-null, set to 4
///   is_info = when non-zero, stop before reading the colour table
/// Returns: always 1.
/// Throws: STBImageException when the signature is not "GIF87a"/"GIF89a".
int stbi_gif_header(stbi *s, stbi_gif *g, int *comp, int is_info)
{
   foreach (char expected; "GIF8")
   {
      if (get8(s) != expected)
         throw new STBImageException("not GIF, corrupt GIF");
   }

   ubyte versionDigit = get8u(s);
   if (versionDigit != '7' && versionDigit != '9')
      throw new STBImageException("not GIF, corrupt GIF");
   if (get8(s) != 'a')
      throw new STBImageException("not GIF, corrupt GIF");

   g.w = get16le(s);
   g.h = get16le(s);
   g.flags = get8(s);
   g.bgindex = get8(s);
   g.ratio = get8(s);
   g.transparent = -1;

   // can't actually tell whether it's 3 or 4 until we parse the comments
   if (comp != null)
      *comp = 4;

   // bit 7 of flags announces a colour table with 2 << (flags & 7) entries
   bool hasColorTable = (g.flags & 0x80) != 0;
   if (!is_info && hasColorTable)
      stbi_gif_parse_colortable(s,g.pal, 2 << (g.flags & 7), -1);

   return 1;
}
2575 
/// Writes the pixel string represented by LZW `code` into g.out_ at the
/// current position and advances that position.
void stbi_out_gif_code(stbi_gif *g, ushort code)
{
   stbi_gif_lzw *entry = &g.codes[code];

   // recurse to decode the prefixes, since the linked-list is backwards,
   // and working backwards through an interleaved image would be nasty
   if (entry.prefix >= 0)
      stbi_out_gif_code(g, entry.prefix);

   // past the last row: drop any further output
   if (g.cur_y >= g.max_y)
      return;

   ubyte *dst   = g.out_ + (g.cur_x + g.cur_y);
   ubyte *color = g.color_table + entry.suffix * 4;

   // entries with alpha below 128 leave the existing pixel untouched
   if (color[3] >= 128) {
      dst[0] = color[2];
      dst[1] = color[1];
      dst[2] = color[0];
      dst[3] = color[3];
   }
   g.cur_x += 4;

   if (g.cur_x < g.max_x)
      return;

   // end of row: go back to the left edge and step to the next row
   g.cur_x = g.start_x;
   g.cur_y += g.step;

   // ran off the bottom while passes remain: start the next pass with a
   // smaller step, beginning half a step below the top
   while (g.cur_y >= g.max_y && g.parse > 0) {
      g.step = (1 << g.parse) * g.line_size;
      g.cur_y = g.start_y + (g.step >> 1);
      --g.parse;
   }
}
2610 
/// Decodes the LZW-compressed raster data of one GIF image from `s` and
/// writes the pixels through stbi_out_gif_code.
///
/// Returns: g.out_ once the end-of-stream code or a zero-length data block is
///          reached.
/// Throws: STBImageException on malformed LZW data.
ubyte *stbi_process_gif_raster(stbi *s, stbi_gif *g)
{
   ubyte lzw_cs;
   int len;
   uint first;
   int codesize, codemask, avail, oldcode, bits, valid_bits, clear;
   stbi_gif_lzw *p;

   lzw_cs = get8u(s);
   // the code table has 4096 entries; a larger initial code size would index
   // past it while the root codes are set up
   if (lzw_cs > 12) throw new STBImageException("bad LZW code size, corrupt GIF");
   clear = 1 << lzw_cs;
   first = 1;
   codesize = lzw_cs + 1;
   codemask = (1 << codesize) - 1;
   bits = 0;
   valid_bits = 0;
   // root codes: each one stands for the single byte equal to its own value
   for (int root = 0; root < clear; root++) {
      g.codes[root].prefix = -1;
      g.codes[root].first = cast(ubyte) root;
      g.codes[root].suffix = cast(ubyte) root;
   }

   // support no starting clear code
   avail = clear+2;
   oldcode = -1;

   len = 0;
   for(;;) {
      if (valid_bits < codesize) {
         // not enough bits buffered for one code: pull in another byte
         if (len == 0) {
            len = get8(s); // start new block
            if (len == 0)
               return g.out_;
         }
         --len;
         bits |= cast(int) get8(s) << valid_bits;
         valid_bits += 8;
      } else {
         int code_ = bits & codemask;
         bits >>= codesize;
         valid_bits -= codesize;
         // @OPTIMIZE: is there some way we can accelerate the non-clear path?
         if (code_ == clear) {  // clear code
            codesize = lzw_cs + 1;
            codemask = (1 << codesize) - 1;
            avail = clear + 2;
            oldcode = -1;
            first = 0;
         } else if (code_ == clear + 1) { // end of stream code
            skip(s, len);
            while ((len = get8(s)) > 0)
               skip(s,len);
            return g.out_;
         } else if (code_ <= avail) {
            if (first) throw new STBImageException("no clear code, corrupt GIF");

            if (oldcode >= 0) {
               // check before indexing so a full table is reported as a
               // corrupt file rather than as an out-of-range array access
               if (avail >= 4096)       throw new STBImageException("too many codes, corrupt GIF");
               p = &g.codes[avail++];
               p.prefix = cast(short) oldcode;
               p.first = g.codes[oldcode].first;
               p.suffix = (code_ == avail) ? p.first : g.codes[code_].first;
            } else if (code_ == avail)
               throw new STBImageException("illegal code in raster, corrupt GIF");

            // emit the code that was just read from the bit stream
            stbi_out_gif_code(g, cast(ushort) code_);

            if ((avail & codemask) == 0 && avail <= 0x0FFF) {
               codesize++;
               codemask = (1 << codesize) - 1;
            }

            oldcode = code_;
         } else {
            throw new STBImageException("illegal code in raster, corrupt GIF");
         }
      }
   }
}
2689 
// Paints every 4-byte pixel of g.out_ with the palette entry selected by
// g.bgindex. Components 0 and 2 of the palette entry are swapped while
// copying; components 1 and 3 keep their positions.
void stbi_fill_gif_background(stbi_gif *g)
{
   ubyte *bg = g.pal[g.bgindex].ptr;
   int offset = 0;
   // @OPTIMIZE: write a dword at a time
   while (offset < g.w * g.h * 4) {
      ubyte *pixel = g.out_ + offset;
      pixel[0] = bg[2];
      pixel[1] = bg[1];
      pixel[2] = bg[0];
      pixel[3] = bg[3];
      offset += 4;
   }
}
2703 
// Decodes the next image of a GIF stream into g.out_.
// This function is designed to support animated gifs, although stb_image doesn't support it.
//
// Params:
//   s        = input stream to read from.
//   g        = decoder state; g.out_ == null marks the first call, in which
//              case the header is parsed and the output buffer is allocated.
//   comp     = forwarded to stbi_gif_header.
//   req_comp = requested number of channels; when non-zero and different
//              from 4 the decoded buffer is passed through convert_format.
//
// Returns: the decoded pixel buffer, null on failure, or cast(ubyte*) 1 when
//          the stream termination code (0x3B) is read.
// Throws:  STBImageException on corrupt data, on dimensions whose buffer size
//          does not fit in an int, or when allocation fails.
ubyte *stbi_gif_load_next(stbi *s, stbi_gif *g, int *comp, int req_comp)
{
   int i;
   ubyte *old_out = null;

   // Byte size of one 4-byte-per-pixel frame. 4 * w * h used to be evaluated
   // in int, where it can wrap around for large dimensions and yield an
   // undersized allocation; reject such images instead of allocating.
   size_t frame_bytes()
   {
      if (g.w < 0 || g.h < 0 || (g.h > 0 && g.w > int.max / 4 / g.h))
         throw new STBImageException("GIF image is too large");
      return cast(size_t) 4 * g.w * g.h;
   }

   if (g.out_ == null) {
      if (!stbi_gif_header(s, g, comp,0))     return null; // failure_reason set by stbi_gif_header
      g.out_ = cast(ubyte*) malloc(frame_bytes());
      if (g.out_ == null)                      throw new STBImageException("Out of memory");
      stbi_fill_gif_background(g);
   } else {
      // animated-gif-only path
      // Bits 2..4 of the Graphic Control Extension flags equal to 3: start
      // the new frame from a private copy of the previous output buffer.
      if (((g.eflags & 0x1C) >> 2) == 3) {
         immutable size_t nbytes = frame_bytes();
         old_out = g.out_;
         g.out_ = cast(ubyte*) malloc(nbytes);
         if (g.out_ == null)                   throw new STBImageException("Out of memory");
         memcpy(g.out_, old_out, nbytes);
      }
   }

   for (;;) {
      switch (get8(s)) {
         case 0x2C: /* Image Descriptor */
         {
            int x, y, w, h;
            ubyte *o;

            x = get16le(s);
            y = get16le(s);
            w = get16le(s);
            h = get16le(s);
            // The sub-image must lie entirely inside the logical screen.
            if (((x + w) > (g.w)) || ((y + h) > (g.h)))
               throw new STBImageException("bad Image Descriptor, corrupt GIF");

            // All positions below are byte offsets into g.out_ (4 bytes per pixel).
            g.line_size = g.w * 4;
            g.start_x = x * 4;
            g.start_y = y * g.line_size;
            g.max_x   = g.start_x + w * 4;
            g.max_y   = g.start_y + h * g.line_size;
            g.cur_x   = g.start_x;
            g.cur_y   = g.start_y;

            g.lflags = get8(s);

            if (g.lflags & 0x40) {
               g.step = 8 * g.line_size; // first interlaced spacing
               g.parse = 3;
            } else {
               g.step = g.line_size;
               g.parse = 0;
            }

            if (g.lflags & 0x80) {
               // Local color table present: it takes precedence over the global one.
               stbi_gif_parse_colortable(s,g.lpal, 2 << (g.lflags & 7), g.eflags & 0x01 ? g.transparent : -1);
               g.color_table = &g.lpal[0][0];
            } else if (g.flags & 0x80) {
               // Global color table: reset alpha, then punch out the transparent index.
               for (i=0; i < 256; ++i)  // @OPTIMIZE: reset only the previous transparent
                  g.pal[i][3] = 255;
               if (g.transparent >= 0 && (g.eflags & 0x01))
                  g.pal[g.transparent][3] = 0;
               g.color_table = &g.pal[0][0];
            } else
               throw new STBImageException("missing color table, corrupt GIF");

            o = stbi_process_gif_raster(s, g);
            if (o == null) return null;

            if (req_comp && req_comp != 4)
               o = convert_format(o, 4, req_comp, g.w, g.h);
            return o;
         }

         case 0x21: // Extension Introducer (Graphic Control, Comment, ...).
         {
            int len;
            if (get8(s) == 0xF9) { // Graphic Control Extension.
               len = get8(s);
               if (len == 4) {
                  g.eflags = get8(s);
                  get16le(s); // delay
                  g.transparent = get8(s);
               } else {
                  skip(s, len);
                  break;
               }
            }
            // Skip the remaining data sub-blocks up to the zero-length terminator.
            while ((len = get8(s)) != 0)
               skip(s, len);
            break;
         }

         case 0x3B: // gif stream termination code
            return cast(ubyte*) 1;

         default:
            throw new STBImageException("unknown code, corrupt GIF");
      }
   }
}
2804 
// Loads a single image from a GIF stream using a fresh decoder state.
// On success the image dimensions are stored through x and y and the pixel
// buffer returned by stbi_gif_load_next is handed back to the caller.
// Returns null when stbi_gif_load_next reports failure or returns the
// end-of-stream marker (pointer value 1); x and y are left untouched then.
ubyte *stbi_gif_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   stbi_gif g={0};

   ubyte *pixels = stbi_gif_load_next(s, &g, comp, req_comp);

   // end of animated gif marker
   if (pixels == cast(void *) 1)
      return null;

   if (pixels != null) {
      *x = g.w;
      *y = g.h;
   }

   return pixels;
}
2819 
2820