1 /// D translation of stb_image-1.33 (http://nothings.org/stb_image.c)
2 ///
3 /// This port only supports:
4 /// $(UL
5 ///   $(LI PNG 8-bit-per-channel only.)
6 ///   $(LI JPEG baseline (no JPEG progressive).)
7 ///   $(LI GIF.)
8 ///   $(LI BMP non-1bpp, non-RLE.)
9 /// )
10 ///
11 /// TODO:
12 /// $(UL
13 ///   $(LI Support a range as input.)
14 ///  )
15 
//============================    Contributors    =========================
//
// Image formats                                Optimizations & bugfixes
//    Sean Barrett (jpeg, png, bmp)                Fabian "ryg" Giesen
//    Nicolas Schulz (hdr, psd)
//    Jonathan Dummer (tga)                     Bug fixes & warning fixes
//    Jean-Marc Lienher (gif)                      Marc LeBlanc
//    Tom Seddon (pic)                             Christpher Lloyd
//    Thatcher Ulrich (psd)                        Dave Moore
//                                                 Won Chun
//                                                 the Horde3D community
// Extensions, features                            Janez Zemva
//    Jetro Lauha (stbi_info)                      Jonathan Blow
//    James "moose2000" Brown (iPhone PNG)         Laurent Gomila
//    Ben "Disch" Wenger (io callbacks)            Aruelien Pocheville
//    Martin "SpartanJ" Golini                     Ryamond Barbiero
//                                                 David Woo
33 
34 module gfm.image.stb_image;
35 
import core.stdc.stdlib;
import core.stdc.string;
38 
39 import ae.utils.graphics.image;
40 import ae.utils.graphics.color;
41 
42 
43 
/// The one function you probably want to use.
/// Loads an image from an in-memory byte array.
/// Because probing has been removed from stb_image, parsing is optimistic
/// and might throw internally before finding the right image format.
/// Params:
///     imageData = the encoded image file contents.
/// Returns: the decoded image with 4 components per pixel (RGBA).
/// Throws: $(D STBImageException) on error.
Image!RGBA loadImage(const(ubyte[]) imageData)
{
    import core.stdc.string : memcpy;

    // stbi_load_from_memory takes a mutable slice; the decoder stores the
    // pointer as const(ubyte)* and only reads through it.
    void[] data = cast(void[])imageData;
    int width, height, components;
    ubyte* decoded = stbi_load_from_memory(data, width, height, components, 4);
    scope(exit) stbi_image_free(decoded);

    // stb_image guarantees that output will always have 4 components when asked
    // Fortunately they are already RGBA

    // allocates result
    Image!RGBA loaded;
    loaded.size(width, height);

    // copy pixels (here they are contiguous in each case); the byte count is
    // computed in size_t so large images do not overflow int arithmetic
    immutable size_t byteCount = cast(size_t) width * cast(size_t) height * 4;
    memcpy(loaded.pixels.ptr, decoded, byteCount);
    return loaded; // this uses the GC to give up ownership
}
70 
/// Version constant exposed by this module.
enum STBI_VERSION = 1;
72 
/// Exception raised whenever loading an image fails.
class STBImageException : Exception
{
    /// Builds the exception; every argument is forwarded unchanged to the
    /// $(D Exception) constructor.
    public this(string message,
                string file = __FILE__,
                size_t line = __LINE__,
                Throwable next = null) @safe pure nothrow
    {
        super(message, file, line, next);
    }
}
84 
/// Component counts that can be requested from the loaders (req_comp).
enum : int
{
    /// only used for req_comp
    STBI_default = 0,
    /// 1 component
    STBI_grey = 1,
    /// 2 components
    STBI_grey_alpha = 2,
    /// 3 components
    STBI_rgb = 3,
    /// 4 components
    STBI_rgb_alpha = 4
}
93 
// define faster low-level operations (typically SIMD support)

// Rotates the 32-bit value x left by y bits.
// Both shift counts are reduced modulo 32 so that y == 0 (and y == 32)
// returns x unchanged instead of shifting by the full width of the type,
// which is not a valid shift. For y in 1..31 the result is the usual rotate.
uint stbi_lrot(uint x, uint y)
{
    return (x << (y & 31)) | (x >> ((32 - y) & 31));
}
101 
// Basic decoding context shared by all image loaders: it holds the input
// cursor over the memory buffer plus some basic image information.
struct stbi
{
   // image dimensions (the JPEG frame header stores width in img_x and
   // height in img_y)
   uint img_x;
   uint img_y;
   // img_n: number of components found in the file
   int img_n;
   int img_out_n;

   // not referenced by the memory-only reading helpers in this part of the file
   int buflen;
   ubyte[128] buffer_start;

   const(ubyte) *img_buffer;          // current read position
   const(ubyte) *img_buffer_end;      // one past the last readable byte
   const(ubyte) *img_buffer_original; // start of the buffer, used by stbi_rewind
}
116 
117 
// Points the context at an in-memory buffer of len bytes: the read cursor and
// the rewind position both start at buffer, the end marker is buffer + len.
void start_mem(stbi *s, const(ubyte)*buffer, int len)
{
   s.img_buffer_original = buffer;
   s.img_buffer          = buffer;
   s.img_buffer_end      = buffer + len;
}
125 
// Moves the read cursor back to the start of the buffer given to start_mem.
void stbi_rewind(stbi *s)
{
   // Conceptually a rewind should go back to the beginning of the stream;
   // going back to the start of the initial buffer is enough here because
   // rewinding is only used after a 'test', which looks at no more than
   // 92 bytes.
   s.img_buffer = s.img_buffer_original;
}
133 
134 
// Tries each supported decoder in turn (JPEG, PNG, BMP, GIF) and returns the
// pixels produced by the first one that succeeds.
//
// For every format the stream is tested, rewound and then loaded. Any
// STBImageException raised by the test or by the load itself is swallowed:
// the stream is rewound and the next format is attempted.
//
// Throws: STBImageException when no decoder accepted the data.
ubyte *stbi_load_main(stbi *s, int *x, int *y, int *comp, int req_comp)
{
    // JPEG
    try
    {
        stbi_jpeg_test(s);
        stbi_rewind(s);
        return stbi_jpeg_load(s, x, y, comp, req_comp);
    }
    catch (STBImageException)
    {
        stbi_rewind(s);
    }

    // PNG
    try
    {
        stbi_png_test(s);
        stbi_rewind(s);
        return stbi_png_load(s, x, y, comp, req_comp);
    }
    catch (STBImageException)
    {
        stbi_rewind(s);
    }

    // BMP
    try
    {
        stbi_bmp_test(s);
        stbi_rewind(s);
        return stbi_bmp_load(s, x, y, comp, req_comp);
    }
    catch (STBImageException)
    {
        stbi_rewind(s);
    }

    // GIF
    try
    {
        stbi_gif_test(s);
        stbi_rewind(s);
        return stbi_gif_load(s, x, y, comp, req_comp);
    }
    catch (STBImageException)
    {
        stbi_rewind(s);
    }

    throw new STBImageException("Image not of any known type, or corrupt");
}
183 
/// Loads an image from memory.
/// Params:
///     buffer = the encoded image file contents.
///     width = receives the image width.
///     height = receives the image height.
///     components = receives the component count reported by the decoder.
///     requestedComponents = number of components wanted in the output.
/// Returns: a pointer to the decoded pixels; release it with stbi_image_free.
/// Throws: STBImageException on error.
ubyte* stbi_load_from_memory(void[] buffer, out int width, out int height, out int components, int requestedComponents)
{
   // start_mem takes the length as an int: reject buffers whose length would
   // be truncated (and possibly turned negative) by the cast below.
   if (buffer.length > int.max)
      throw new STBImageException("Image data too large");

   stbi s;
   start_mem(&s, cast(ubyte*)buffer.ptr, cast(int)(buffer.length));
   return stbi_load_main(&s, &width, &height, &components, requestedComponents);
}
192 
/// Releases pixel data previously returned by one of the stbi load functions.
/// The pointer is handed to the C runtime's free.
void stbi_image_free(void *retval_from_stbi_load)
{
    free(retval_from_stbi_load);
}
198 
/// Loads an image from memory and puts it in an ae.utils.graphics.image.Image.
/// Params:
///     buffer = the encoded image file contents.
/// Returns: the decoded image, always with 4 components per pixel.
/// Throws: STBImageException on error.
Image!RGBA stbiLoadImageAE(void[] buffer)
{
    int width, height, components;
    ubyte* data = stbi_load_from_memory(buffer, width, height, components, 4);
    scope(exit) stbi_image_free(data);

    auto result = Image!RGBA(width, height);
    // widen before multiplying so large images do not overflow int arithmetic
    immutable size_t length = cast(size_t) width * cast(size_t) height * RGBA.sizeof;
    result.pixels[] = cast(RGBA[])(data[0..length]);
    return result;
}
213 
214 //
215 // Common code used by all image loaders
216 //
217 
// Scan modes handed to the decoders (the users of these values are outside
// this part of the file).
enum : int
{
   SCAN_load   = 0,
   SCAN_type   = 1,
   SCAN_header = 2
}
224 
225 
// Reads the next byte and advances the cursor; yields 0 once the end of the
// buffer has been reached.
int get8(stbi *s)
{
   if (s.img_buffer >= s.img_buffer_end)
      return 0;

   return *s.img_buffer++;
}
233 
// Returns 1 when the cursor is at or past the end of the buffer, 0 otherwise.
int at_eof(stbi *s)
{
   return (s.img_buffer < s.img_buffer_end) ? 0 : 1;
}
238 
// Same as get8, with the result narrowed to ubyte.
ubyte get8u(stbi *s)
{
   immutable int value = get8(s);
   return cast(ubyte) value;
}
243 
// Advances the cursor by n bytes without reading them.
// The cursor is clamped to the end of the buffer when n is negative (a
// corrupt length field) or larger than what remains, so it can never leave
// the [start, end] range of the buffer.
void skip(stbi *s, int n)
{
   if (n < 0
       || s.img_buffer >= s.img_buffer_end
       || n > s.img_buffer_end - s.img_buffer)
   {
      s.img_buffer = s.img_buffer_end;
      return;
   }
   s.img_buffer += n;
}
248 
// Copies the next n bytes into buffer and advances the cursor.
// Returns 1 on success. Returns 0, copying nothing and leaving the cursor
// untouched, when n is negative or fewer than n bytes remain.
int getn(stbi *s, ubyte *buffer, int n)
{
   // a negative count would reach memcpy as a huge unsigned size
   if (n < 0 || s.img_buffer > s.img_buffer_end)
      return 0;

   // compare against the remaining byte count instead of forming a pointer
   // that may lie beyond the buffer
   if (n > s.img_buffer_end - s.img_buffer)
      return 0;

   memcpy(buffer, s.img_buffer, n);
   s.img_buffer += n;
   return 1;
}
258 
// Reads a 16-bit value stored most-significant byte first.
int get16(stbi *s)
{
   immutable int hi = get8(s);
   immutable int lo = get8(s);
   return (hi << 8) + lo;
}
264 
// Reads a 32-bit value stored most-significant byte first.
uint get32(stbi *s)
{
   immutable uint hi = get16(s);
   immutable int lo = get16(s);
   return (hi << 16) + lo;
}
270 
// Reads a 16-bit value stored least-significant byte first.
int get16le(stbi *s)
{
   immutable int lo = get8(s);
   immutable int hi = get8(s);
   return lo + (hi << 8);
}
276 
// Reads a 32-bit value stored least-significant byte first.
uint get32le(stbi *s)
{
   immutable uint lo = get16le(s);
   immutable int hi = get16le(s);
   return lo + (hi << 16);
}
282 
283 //
284 //  generic converter from built-in img_n to req_comp
285 //    individual types do this automatically as much as possible (e.g. jpeg
286 //    does all cases internally since it needs to colorspace convert anyway,
287 //    and it never has alpha, so very few cases ). png can automatically
288 //    interleave an alpha=255 channel, but falls back to this for other cases
289 //
290 //  assume data buffer is malloced, so malloc a new one and free that one
291 //  only failure mode is malloc failing
292 
// Weighted sum of r, g and b (weights 77, 150 and 29 out of 256) used to turn
// a colour pixel into a single grey value.
ubyte compute_y(int r, int g, int b)
{
   immutable int weighted = r * 77 + g * 150 + b * 29;
   return cast(ubyte) (weighted >> 8);
}
297 
// Converts an x * y image with img_n interleaved components per pixel into
// one with req_comp components per pixel.
//
// data must have been allocated with malloc. When a conversion is needed a
// new buffer is malloc'ed, data is freed and the new buffer is returned; when
// req_comp == img_n, data itself is returned untouched.
//
// Throws: STBImageException when the output size cannot be represented or
//         the allocation fails (data is freed in that case).
ubyte *convert_format(ubyte *data, int img_n, int req_comp, uint x, uint y)
{
    if (req_comp == img_n) return data;
    assert(req_comp >= 1 && req_comp <= 4);

    // x * y always fits in 64 bits; make sure the total byte count fits in a
    // size_t before allocating, instead of letting 32-bit arithmetic wrap
    // around and produce an undersized buffer.
    immutable ulong pixels = cast(ulong) x * y;
    ubyte *good = null;
    if (pixels <= size_t.max / cast(uint) req_comp)
        good = cast(ubyte*) malloc(cast(size_t) (pixels * req_comp));
    if (good == null) {
        free(data);
        throw new STBImageException("Out of memory");
    }

    // row sizes in bytes, computed in size_t for the same reason
    immutable size_t srcStride  = cast(size_t) x * img_n;
    immutable size_t destStride = cast(size_t) x * req_comp;

    for (uint j = 0; j < y; ++j) {
        ubyte *src  = data + j * srcStride;
        ubyte *dest = good + j * destStride;

        // convert source image with img_n components to one with req_comp
        // components; the switch is done once per scanline rather than once
        // per pixel
        switch (img_n * 8 + req_comp)
        {
            case 1 * 8 + 2: // grey -> grey + opaque alpha
                for (uint i = 0; i < x; ++i, src += 1, dest += 2) {
                    dest[0] = src[0];
                    dest[1] = 255;
                }
                break;
            case 1 * 8 + 3: // grey -> rgb
                for (uint i = 0; i < x; ++i, src += 1, dest += 3) {
                    dest[0] = dest[1] = dest[2] = src[0];
                }
                break;
            case 1 * 8 + 4: // grey -> rgb + opaque alpha
                for (uint i = 0; i < x; ++i, src += 1, dest += 4) {
                    dest[0] = dest[1] = dest[2] = src[0];
                    dest[3] = 255;
                }
                break;
            case 2 * 8 + 1: // grey + alpha -> grey
                for (uint i = 0; i < x; ++i, src += 2, dest += 1) {
                    dest[0] = src[0];
                }
                break;
            case 2 * 8 + 3: // grey + alpha -> rgb
                for (uint i = 0; i < x; ++i, src += 2, dest += 3) {
                    dest[0] = dest[1] = dest[2] = src[0];
                }
                break;
            case 2 * 8 + 4: // grey + alpha -> rgb + alpha
                for (uint i = 0; i < x; ++i, src += 2, dest += 4) {
                    dest[0] = dest[1] = dest[2] = src[0];
                    dest[3] = src[1];
                }
                break;
            case 3 * 8 + 4: // rgb -> rgb + opaque alpha
                for (uint i = 0; i < x; ++i, src += 3, dest += 4) {
                    dest[0] = src[0];
                    dest[1] = src[1];
                    dest[2] = src[2];
                    dest[3] = 255;
                }
                break;
            case 3 * 8 + 1: // rgb -> grey
                for (uint i = 0; i < x; ++i, src += 3, dest += 1) {
                    dest[0] = compute_y(src[0], src[1], src[2]);
                }
                break;
            case 3 * 8 + 2: // rgb -> grey + opaque alpha
                for (uint i = 0; i < x; ++i, src += 3, dest += 2) {
                    dest[0] = compute_y(src[0], src[1], src[2]);
                    dest[1] = 255;
                }
                break;
            case 4 * 8 + 1: // rgb + alpha -> grey
                for (uint i = 0; i < x; ++i, src += 4, dest += 1) {
                    dest[0] = compute_y(src[0], src[1], src[2]);
                }
                break;
            case 4 * 8 + 2: // rgb + alpha -> grey + alpha
                for (uint i = 0; i < x; ++i, src += 4, dest += 2) {
                    dest[0] = compute_y(src[0], src[1], src[2]);
                    dest[1] = src[3];
                }
                break;
            case 4 * 8 + 3: // rgb + alpha -> rgb
                for (uint i = 0; i < x; ++i, src += 4, dest += 3) {
                    dest[0] = src[0];
                    dest[1] = src[1];
                    dest[2] = src[2];
                }
                break;
            default: assert(0);
        }
    }

    free(data);
    return good;
}
375 
376 //
377 //  "baseline" JPEG/JFIF decoder (not actually fully baseline implementation)
378 //
379 //    simple implementation
380 //      - channel subsampling of at most 2 in each dimension
381 //      - doesn't support delayed output of y-dimension
382 //      - simple interface (only one output format: 8-bit interleaved RGB)
383 //      - doesn't try to recover corrupt jpegs
384 //      - doesn't allow partial loading, loading multiple at once
385 //      - still fast on x86 (copying globals into locals doesn't help x86)
386 //      - allocates lots of intermediate memory (full size of all components)
387 //        - non-interleaved case requires this anyway
388 //        - allows good upsampling (see next)
389 //    high-quality
390 //      - upsampled channels are bilinearly interpolated, even across blocks
391 //      - quality integer IDCT derived from IJG's 'slow'
392 //    performance
393 //      - fast huffman; reasonable integer IDCT
394 //      - uses a lot of intermediate memory, could cache poorly
395 //      - load http://nothings.org/remote/anemones.jpg 3 times on 2.8Ghz P4
396 //          stb_jpeg:   1.34 seconds (MSVC6, default release build)
397 //          stb_jpeg:   1.06 seconds (MSVC6, processor = Pentium Pro)
398 //          IJL11.dll:  1.08 seconds (compiled by intel)
399 //          IJG 1998:   0.98 seconds (MSVC6, makefile provided by IJG)
400 //          IJG 1998:   0.95 seconds (MSVC6, makefile + proc=PPro)
401 
// huffman decoding acceleration
// FAST_BITS is the number of leading bits of the code buffer used to index
// the huffman.fast lookup table, which therefore has 1 << FAST_BITS entries.
enum FAST_BITS = 9;  // larger handles more cases; smaller stomps less cache
404 
// One JPEG huffman table, as filled in by build_huffman (and process_marker
// for the values array). Entries are indexed by symbol index, in the order
// the code lengths are listed in the DHT segment.
struct huffman
{
   // indexed by the top FAST_BITS bits of the code buffer: symbol index for
   // codes of at most FAST_BITS bits, 255 when not accelerated
   ubyte[1 << FAST_BITS] fast;
   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
   // huffman code assigned to each symbol index
   ushort[256] code;
   // decoded value for each symbol index
   ubyte[256] values;
   // code length in bits for each symbol index, terminated by a 0 entry
   ubyte[257] size;
   // per code length: largest code + 1, preshifted to 16 bits; the entry
   // after the last length is 0xffffffff so the search in decode terminates
   uint[18] maxcode;
   int[17] delta;   // old 'firstsymbol' - old 'firstcode'
}
415 
// Complete state of the JPEG decoder for one image.
struct jpeg
{
   stbi *s;                 // input context the bytes are read from
   huffman[4] huff_dc;      // DC huffman tables, selected by img_comp[].hd
   huffman[4] huff_ac;      // AC huffman tables, selected by img_comp[].ha
   ubyte[64][4] dequant;    // quantization tables, selected by img_comp[].tq

   // sizes for components, interleaved MCUs
   int img_h_max;
   int img_v_max;
   int img_mcu_x;
   int img_mcu_y;
   int img_mcu_w;
   int img_mcu_h;

   // definition of jpeg image component
   struct img_comp_
   {
      int id;               // component id matched against the scan header
      int h;                // blocks per MCU, horizontally
      int v;                // blocks per MCU, vertically
      int tq;               // index into dequant
      int hd;               // index into huff_dc
      int ha;               // index into huff_ac
      int dc_pred;          // running DC prediction

      int x;
      int y;
      int w2;               // row stride of data, in bytes
      int h2;
      ubyte *data;          // decoded samples of this component
      void *raw_data;
      ubyte *linebuf;
   }

   img_comp_[4] img_comp;

   uint code_buffer;        // jpeg entropy-coded buffer
   int code_bits;           // number of valid bits
   ubyte marker;            // marker seen while filling entropy buffer
   int nomore;              // flag if we saw a marker so must stop

   int scan_n;              // number of components in the current scan
   int[4] order;            // component index of each scan entry
   int restart_interval;
   int todo;                // MCUs left before the next restart check
}
454 
455 
// Fills a huffman table from the 16 per-length symbol counts of a DHT
// segment.
//
// Params:
//   h     = table to fill (size, code, maxcode, delta and fast are written).
//   count = 16 entries; count[i] is the number of codes of length i + 1.
// Returns: 1 on success.
// Throws: STBImageException when the counts describe more than 256 symbols
//         or code lengths that cannot form a valid code.
int build_huffman(huffman *h, int *count)
{
   int i,j,k=0,code;
   // build size list for each symbol (from JPEG spec)
   for (i=0; i < 16; ++i) {
      for (j=0; j < count[i]; ++j) {
         // size holds at most 256 lengths plus the terminating 0; a corrupt
         // table must not be allowed to run past it
         if (k >= 256)
            throw new STBImageException("Bad size list, corrupt JPEG");
         h.size[k++] = cast(ubyte) (i+1);
      }
   }
   h.size[k] = 0;

   // compute actual symbols (from jpeg spec)
   code = 0;
   k = 0;
   for(j=1; j <= 16; ++j) {
      // compute delta to add to code to compute symbol id
      h.delta[j] = k - code;
      if (h.size[k] == j) {
         while (h.size[k] == j)
            h.code[k++] = cast(ushort) (code++);
         if (code-1 >= (1 << j))
             throw new STBImageException("Bad code lengths, corrupt JPEG");
      }
      // compute largest code + 1 for this size, preshifted as needed later
      h.maxcode[j] = code << (16-j);
      code <<= 1;
   }
   // sentinel (j is 17 here): guarantees the search in decode terminates
   h.maxcode[j] = 0xffffffff;

   // build non-spec acceleration table; 255 is flag for not-accelerated
   memset(h.fast.ptr, 255, 1 << FAST_BITS);
   for (i=0; i < k; ++i) {
      int s = h.size[i];
      if (s <= FAST_BITS) {
         // every FAST_BITS-bit pattern that starts with this code maps to i
         int c = h.code[i] << (FAST_BITS-s);
         int m = 1 << (FAST_BITS-s);
         for (j=0; j < m; ++j) {
            h.fast[c+j] = cast(ubyte) i;
         }
      }
   }
   return 1;
}
497 
// Refills the entropy-coded bit buffer one byte at a time until it holds more
// than 24 valid bits.
// A 0xff byte followed by a non-zero byte is a marker: it is remembered in
// j.marker, j.nomore is set and the refill stops immediately. Once nomore is
// set, zero bytes are fed instead of reading from the stream.
void grow_buffer_unsafe(jpeg *j)
{
   do {
      int next = 0;
      if (!j.nomore)
         next = get8(j.s);

      if (next == 0xff) {
         immutable int follower = get8(j.s);
         if (follower != 0) {
            j.marker = cast(ubyte) follower;
            j.nomore = 1;
            return;
         }
      }

      // append the byte just below the bits that are already valid
      j.code_buffer |= next << (24 - j.code_bits);
      j.code_bits += 8;
   } while (j.code_bits <= 24);
}
514 
// bmask[n] == (1 << n) - 1, i.e. a mask selecting the low n bits, for n in 0..16
static immutable uint[17] bmask=[0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535];
517 
// Decodes one huffman-coded value from the bitstream using table h.
// Returns the decoded value, or -1 when no code matches or the buffer does
// not hold enough valid bits.
int decode(jpeg *j, huffman *h)
{
   if (j.code_bits < 16) grow_buffer_unsafe(j);

   // fast path: the top FAST_BITS bits index a table that directly yields
   // the symbol index when the code is at most FAST_BITS long
   int c = (j.code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
   int k = h.fast[c];
   if (k < 255) {
      immutable int len = h.size[k];
      if (len > j.code_bits)
         return -1;
      j.code_buffer <<= len;
      j.code_bits -= len;
      return h.values[k];
   }

   // slow path: maxcode was preshifted so that every entry can be compared
   // against the top 16 bits of the buffer, whatever the code length; this
   // keeps shifts out of the search loop. The 0xffffffff entry at index 17
   // stops the search.
   immutable uint top16 = j.code_buffer >> 16;
   k = FAST_BITS + 1;
   while (top16 >= h.maxcode[k])
      ++k;

   if (k == 17) {
      // error! code not found
      j.code_bits -= 16;
      return -1;
   }

   if (k > j.code_bits)
      return -1;

   // convert the huffman code to the symbol id
   c = ((j.code_buffer >> (32 - k)) & bmask[k]) + h.delta[k];
   assert((((j.code_buffer) >> (32 - h.size[c])) & bmask[h.size[c]]) == h.code[c]);

   // consume the code, then convert the id to a symbol
   j.code_bits -= k;
   j.code_buffer <<= k;
   return h.values[c];
}
567 
// combined JPEG 'receive' and JPEG 'extend', since baseline
// always extends everything it receives.
//
// Reads n bits from the bitstream and sign-extends them following the JPEG
// convention: values whose top bit is clear map to negative numbers.
//
// Throws: STBImageException when n is outside 1..16. Such a bit count can
//         only come from a corrupt stream and would otherwise index past the
//         bmask table or produce an invalid shift.
int extend_receive(jpeg *j, int n)
{
   if (n < 1 || n > 16)
      throw new STBImageException("Bad huffman code, corrupt JPEG");

   uint m = 1 << (n-1);
   uint k;
   if (j.code_bits < n) grow_buffer_unsafe(j);

   // rotate the top n bits down to the bottom, then split them off
   k = stbi_lrot(j.code_buffer, n);
   j.code_buffer = k & ~bmask[n];
   k &= bmask[n];
   j.code_bits -= n;

   // the following test is probably a random branch that won't
   // predict well. I tried to table accelerate it but failed.
   // maybe it's compiling as a conditional move?
   if (k < m)
      return (-1 << n) + k + 1;
   else
      return k;
}
589 
// given a value that's at position X in the zigzag stream,
// where does it appear in the 8x8 matrix coded as row-major?
// The table has 15 extra entries (all 63) because decode_block can advance
// its position by up to 15 past the last valid index on corrupt input.
static immutable ubyte[64+15] dezigzag =
[
    0,  1,  8, 16,  9,  2,  3, 10,
   17, 24, 32, 25, 18, 11,  4,  5,
   12, 19, 26, 33, 40, 48, 41, 34,
   27, 20, 13,  6,  7, 14, 21, 28,
   35, 42, 49, 56, 57, 50, 43, 36,
   29, 22, 15, 23, 30, 37, 44, 51,
   58, 59, 52, 45, 38, 31, 39, 46,
   53, 60, 61, 54, 47, 55, 62, 63,
   // let corrupt input sample past end
   63, 63, 63, 63, 63, 63, 63, 63,
   63, 63, 63, 63, 63, 63, 63
];
606 
// decode one 64-entry block--
//
// Params:
//   j    = decoder state.
//   data = receives the 64 coefficients in row-major order. It is taken by
//          reference: a D static array parameter is otherwise copied, and the
//          decoded coefficients would never reach the caller.
//   hdc  = huffman table for the DC coefficient.
//   hac  = huffman table for the AC coefficients.
//   b    = index of the component whose DC predictor is updated.
// Returns: 1 on success.
// Throws: STBImageException on an invalid huffman code.
int decode_block(jpeg *j, ref short[64] data, huffman *hdc, huffman *hac, int b)
{
   int diff,dc,k;
   int t = decode(j, hdc);
   // t is the bit count of the DC difference; anything above 15 can only
   // come from a corrupt table and must not reach extend_receive
   if (t < 0 || t > 15)
       throw new STBImageException("Bad huffman code, corrupt JPEG");

   // clear all coefficients: AC entries that are not coded stay at zero
   data[] = 0;

   diff = t ? extend_receive(j, t) : 0;
   dc = j.img_comp[b].dc_pred + diff;
   j.img_comp[b].dc_pred = dc;
   data[0] = cast(short) dc;

   // decode AC components, see JPEG spec
   k = 1;
   do {
      int r,s;
      int rs = decode(j, hac);
      if (rs < 0)
         throw new STBImageException("Bad huffman code, corrupt JPEG");
      s = rs & 15;  // bit count of the coefficient
      r = rs >> 4;  // run of zero coefficients preceding it
      if (s == 0) {
         if (rs != 0xf0) break; // end block
         k += 16;
      } else {
         k += r;
         // decode into unzigzag'd location
         data[dezigzag[k++]] = cast(short) extend_receive(j,s);
      }
   } while (k < 64);
   return 1;
}
643 
// Saturates x to the 0..255 range and returns it as a ubyte.
ubyte clamp(int x)
{
   if (x < 0)
      return 0;
   if (x > 255)
      return 255;
   return cast(ubyte) x;
}
654 
// Converts a floating-point constant to fixed point with 12 fractional bits
// (scale by 4096, add 0.5, truncate).
int f2f(double x)
{
    return cast(int)(0.5 + 4096 * x);
}
659 
// Scales an integer to the same 12-bit fixed-point format that f2f produces.
int fsh(int x)
{
    return x * (1 << 12);
}
664 
// derived from jidctint -- DCT_ISLOW
//
// One-dimensional 8-point inverse DCT kernel working in fixed point.
// s0..s7 are the input coefficients. The results are not written directly:
// x0..x3 receive the even part and t0..t3 the odd part, both scaled by
// 1 << 12 through f2f/fsh. The caller forms the eight outputs as
// x0+t3, x1+t2, x2+t1, x3+t0, x3-t0, x2-t1, x1-t2, x0-t3 (see idct_block).
// Note that t0..t3 are also used as scratch for the even part before being
// overwritten with the odd-part results, so statement order matters.
void IDCT_1D(int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7,
             out int t0, out int t1, out int t2, out int t3,
             out int x0, out int x1, out int x2, out int x3)
{
   int p1,p2,p3,p4,p5;
   // even part: inputs 2 and 6, then 0 and 4
   p2 = s2;
   p3 = s6;
   p1 = (p2+p3) * f2f(0.5411961f);
   t2 = p1 + p3*f2f(-1.847759065f);
   t3 = p1 + p2*f2f( 0.765366865f);
   p2 = s0;
   p3 = s4;
   t0 = fsh(p2+p3);
   t1 = fsh(p2-p3);
   x0 = t0+t3;
   x3 = t0-t3;
   x1 = t1+t2;
   x2 = t1-t2;
   // odd part: inputs 7, 5, 3 and 1
   t0 = s7;
   t1 = s5;
   t2 = s3;
   t3 = s1;
   p3 = t0+t2;
   p4 = t1+t3;
   p1 = t0+t3;
   p2 = t1+t2;
   p5 = (p3+p4)*f2f( 1.175875602f);
   t0 = t0*f2f( 0.298631336f);
   t1 = t1*f2f( 2.053119869f);
   t2 = t2*f2f( 3.072711026f);
   t3 = t3*f2f( 1.501321110f);
   p1 = p5 + p1*f2f(-0.899976223f);
   p2 = p5 + p2*f2f(-2.562915447f);
   p3 = p3*f2f(-1.961570560f);
   p4 = p4*f2f(-0.390180644f);
   t3 += p1+p4;
   t2 += p2+p3;
   t1 += p2+p4;
   t0 += p1+p3;
 }
706 
// Element type of the dequantization tables handed to idct_block.
alias stbi_dequantize_t = ubyte;
708 
// .344 seconds on 3*anemones.jpg
//
// Dequantizes one 8x8 block of coefficients, applies the 2D inverse DCT
// (columns first, then rows) and writes the resulting 8x8 samples.
//
// Params:
//   out_       = destination of the top-left sample of the block.
//   out_stride = distance in bytes between two output rows.
//   data       = the 64 coefficients, row-major.
//   dequantize = the 64 dequantization factors, same layout as data.
void idct_block(ubyte *out_, int out_stride, short[64] data, stbi_dequantize_t *dequantize)
{
   int i;
   int[64] val;
   int*v = val.ptr;
   stbi_dequantize_t *dq = dequantize;
   ubyte *o;
   short *d = data.ptr;

   // columns
   for (i=0; i < 8; ++i,++d,++dq, ++v) {
      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
           && d[40]==0 && d[48]==0 && d[56]==0) {
         //    no shortcut                 0     seconds
         //    (1|2|3|4|5|6|7)==0          0     seconds
         //    all separate               -0.047 seconds
         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
         int dcterm = d[0] * dq[0] << 2;
         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
      } else {
         int t0, t1, t2, t3, x0, x1, x2, x3;
         IDCT_1D(d[ 0]*dq[ 0],d[ 8]*dq[ 8],d[16]*dq[16],d[24]*dq[24],
                 d[32]*dq[32],d[40]*dq[40],d[48]*dq[48],d[56]*dq[56],
                 t0, t1, t2, t3, x0, x1, x2, x3);
         // constants scaled things up by 1<<12; let's bring them back
         // down, but keep 2 extra bits of precision
         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
         v[ 0] = (x0+t3) >> 10;
         v[56] = (x0-t3) >> 10;
         v[ 8] = (x1+t2) >> 10;
         v[48] = (x1-t2) >> 10;
         v[16] = (x2+t1) >> 10;
         v[40] = (x2-t1) >> 10;
         v[24] = (x3+t0) >> 10;
         v[32] = (x3-t0) >> 10;
      }
   }

   // rows
   for (i=0, v=val.ptr, o=out_; i < 8; ++i,v+=8,o+=out_stride) {

      // no fast case since the first 1D IDCT spread components out
      int t0, t1, t2, t3, x0, x1, x2, x3;
      IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7], t0, t1, t2, t3, x0, x1, x2, x3);
      // constants scaled things up by 1<<12, plus we had 1<<2 from first
      // loop, plus horizontal and vertical each scale by sqrt(8) so together
      // we've got an extra 1<<3, so 1<<17 total we need to remove.
      // so we want to round that, which means adding 0.5 * 1<<17,
      // aka 65536. Also, we'll end up with -128 to 127 that we want
      // to encode as 0..255 by adding 128, so we'll add that before the shift
      x0 += 65536 + (128<<17);
      x1 += 65536 + (128<<17);
      x2 += 65536 + (128<<17);
      x3 += 65536 + (128<<17);
      // tried computing the shifts into temps, or'ing the temps to see
      // if any were out of range, but that was slower
      o[0] = clamp((x0+t3) >> 17);
      o[7] = clamp((x0-t3) >> 17);
      o[1] = clamp((x1+t2) >> 17);
      o[6] = clamp((x1-t2) >> 17);
      o[2] = clamp((x2+t1) >> 17);
      o[5] = clamp((x2-t1) >> 17);
      o[3] = clamp((x3+t0) >> 17);
      o[4] = clamp((x3-t0) >> 17);
   }
}
776 
777 
// Value meaning "no marker"; 0xff is never a valid marker value.
enum MARKER_none = 0xff;
779 
// Returns the next marker.
// A marker left pending by the entropy decoder is returned first (and
// cleared). Otherwise one is fetched from the stream: if the next byte is not
// 0xff there is no marker and MARKER_none (0xff, never a valid marker value)
// is returned; else any further 0xff bytes are skipped and the first other
// byte is the marker.
ubyte get_marker(jpeg *j)
{
   if (j.marker != MARKER_none) {
      immutable ubyte pending = j.marker;
      j.marker = MARKER_none;
      return pending;
   }

   ubyte x = get8u(j.s);
   if (x != 0xff)
      return MARKER_none;

   do {
      x = get8u(j.s);
   } while (x == 0xff);
   return x;
}
793 
// in each scan, we'll have scan_n components, and the order
// of the components is specified by order[]

// Tells whether x is one of the eight restart markers 0xd0..0xd7.
bool RESTART(int x)
{
    return !(x < 0xd0 || x > 0xd7);
}
800 
// Puts the entropy decoder and the DC predictions back to their initial
// state; used at the start of a scan and after every restart interval.
void reset(jpeg *j)
{
   // entropy decoder
   j.code_buffer = 0;
   j.code_bits = 0;
   j.nomore = 0;
   j.marker = MARKER_none;

   // DC prediction of the first three components
   foreach (ref component; j.img_comp[0 .. 3])
      component.dc_pred = 0;

   // number of MCUs to decode before the next restart check.
   // no more than 1<<31 MCUs if no restart_interval? that's plenty safe,
   // since we don't even allow 1<<30 pixels
   if (j.restart_interval)
      j.todo = j.restart_interval;
   else
      j.todo = 0x7fffffff;
}
814 
// Decodes the entropy-coded data of one scan into the per-component sample
// buffers (img_comp[].data).
//
// Returns 1 when the scan has been processed, including the case where
// decoding stops early because an expected restart marker is missing, and 0
// if decode_block reports a failure.
int parse_entropy_coded_data(jpeg *z)
{
   reset(z);

   short[64] data;

   if (z.scan_n == 1) {
      // non-interleaved data, we just need to process one block at a time,
      // in trivial scanline order
      immutable int n = z.order[0];
      auto comp = &z.img_comp[n];

      // number of blocks to do just depends on how many actual "pixels" this
      // component has, independent of interleaved MCU blocking and such
      immutable int w = (comp.x+7) >> 3;
      immutable int h = (comp.y+7) >> 3;

      for (int j = 0; j < h; ++j) {
         for (int i = 0; i < w; ++i) {
            if (!decode_block(z, data, z.huff_dc.ptr+comp.hd, z.huff_ac.ptr+comp.ha, n)) return 0;
            idct_block(comp.data+comp.w2*j*8+i*8, comp.w2, data, z.dequant[comp.tq].ptr);

            // every data block is an MCU, so countdown the restart interval
            if (--z.todo <= 0) {
               if (z.code_bits < 24) grow_buffer_unsafe(z);
               // if it's NOT a restart, then just bail, so we get corrupt data
               // rather than no data
               if (!RESTART(z.marker)) return 1;
               reset(z);
            }
         }
      }
   } else {
      // interleaved!
      for (int j = 0; j < z.img_mcu_y; ++j) {
         for (int i = 0; i < z.img_mcu_x; ++i) {
            // scan an interleaved mcu... process scan_n components in order
            for (int k = 0; k < z.scan_n; ++k) {
               immutable int n = z.order[k];
               auto comp = &z.img_comp[n];

               // scan out an mcu's worth of this component; that's just determined
               // by the basic H and V specified for the component
               for (int y = 0; y < comp.v; ++y) {
                  for (int x = 0; x < comp.h; ++x) {
                     immutable int x2 = (i*comp.h + x)*8;
                     immutable int y2 = (j*comp.v + y)*8;
                     if (!decode_block(z, data, z.huff_dc.ptr+comp.hd, z.huff_ac.ptr+comp.ha, n)) return 0;
                     idct_block(comp.data+comp.w2*y2+x2, comp.w2, data, z.dequant[comp.tq].ptr);
                  }
               }
            }

            // after all interleaved components, that's an interleaved MCU,
            // so now count down the restart interval
            if (--z.todo <= 0) {
               if (z.code_bits < 24) grow_buffer_unsafe(z);
               // if it's NOT a restart, then just bail, so we get corrupt data
               // rather than no data
               if (!RESTART(z.marker)) return 1;
               reset(z);
            }
         }
      }
   }
   return 1;
}
875 
// Handles one marker segment found between the frame header and the scans.
//
// Params:
//   z = decoder state.
//   m = marker value, as returned by get_marker.
// Returns: 1 when the segment was consumed successfully; 0 for an unknown
//          marker or a DQT/DHT segment whose length does not match its
//          contents.
// Throws: STBImageException on a missing marker, on progressive JPEG and on
//         malformed DRI, DQT, DHT, APPn or COM segments.
int process_marker(jpeg *z, int m)
{
   int L;
   switch (m) {

      case MARKER_none: // no marker found
         throw new STBImageException("Expected marker, corrupt JPEG");

      case 0xC2: // SOF - progressive
          throw new STBImageException("JPEG format not supported (progressive)");

      case 0xDD: // DRI - specify restart interval
         if (get16(z.s) != 4)
             throw new STBImageException("Bad DRI len, corrupt JPEG");
         z.restart_interval = get16(z.s);
         return 1;

      case 0xDB: // DQT - define quantization table
         L = get16(z.s)-2;
         while (L > 0) {
            int q = get8(z.s);
            int p = q >> 4;     // precision, only 0 is accepted
            int t = q & 15,i;   // table index
            if (p != 0)
               throw new STBImageException("Bad DQT type, corrupt JPEG");
            if (t > 3)
               throw new STBImageException("Bad DQT table, corrupt JPEG");
            // the table is stored in zigzag order
            for (i=0; i < 64; ++i)
               z.dequant[t][dezigzag[i]] = get8u(z.s);
            L -= 65;
         }
         return L==0;

      case 0xC4: // DHT - define huffman table
         L = get16(z.s)-2;
         while (L > 0) {
            ubyte *v;
            int[16] sizes;
            int i;
            int m_ = 0;         // total number of symbols in this table
            int q = get8(z.s);
            int tc = q >> 4;    // table class: 0 = DC, 1 = AC
            int th = q & 15;    // table index
            if (tc > 1 || th > 3)
                throw new STBImageException("Bad DHT header, corrupt JPEG");
            for (i=0; i < 16; ++i) {
               sizes[i] = get8(z.s);
               m_ += sizes[i];
            }
            // a table holds at most 256 symbols; more would overrun the
            // values array written below through a raw pointer
            if (m_ > 256)
                throw new STBImageException("Bad DHT header, corrupt JPEG");
            L -= 17;
            if (tc == 0) {
               if (!build_huffman(z.huff_dc.ptr+th, sizes.ptr)) return 0;
               v = z.huff_dc[th].values.ptr;
            } else {
               if (!build_huffman(z.huff_ac.ptr+th, sizes.ptr)) return 0;
               v = z.huff_ac[th].values.ptr;
            }
            for (i=0; i < m_; ++i)
               v[i] = get8u(z.s);
            L -= m_;
         }
         return L==0;

      default:
         break;
   }
   // check for comment block or APP blocks
   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
      // the length includes its own two bytes; anything smaller is corrupt
      // and would turn into a negative skip
      L = get16(z.s);
      if (L < 2)
         throw new STBImageException(m == 0xFE ? "Bad COM len, corrupt JPEG"
                                               : "Bad APP len, corrupt JPEG");
      skip(z.s, L-2);
      return 1;
   }
   return 0;
}
949 
// Parses the scan header that follows an SOS marker: the number of components
// in the scan, and for each of them which frame component it refers to and
// which DC/AC huffman tables it uses.
//
// Returns 1 on success, 0 when a component id of the scan is not part of the
// frame.
// Throws: STBImageException on malformed header fields.
int process_scan_header(jpeg *z)
{
   immutable int Ls = get16(z.s);
   z.scan_n = get8(z.s);
   if (z.scan_n < 1 || z.scan_n > 4 || z.scan_n > cast(int) z.s.img_n)
      throw new STBImageException("Bad SOS component count, Corrupt JPEG");

   if (Ls != 6+2*z.scan_n)
      throw new STBImageException("Bad SOS length, Corrupt JPEG");

   for (int i = 0; i < z.scan_n; ++i) {
      immutable int id = get8(z.s);
      immutable int q = get8(z.s);

      // look up the frame component carrying this id
      int which = 0;
      while (which < z.s.img_n && z.img_comp[which].id != id)
         ++which;
      if (which == z.s.img_n)
         return 0;

      // high nibble: DC table, low nibble: AC table
      z.img_comp[which].hd = q >> 4;
      if (z.img_comp[which].hd > 3)
         throw new STBImageException("Bad DC huff, Corrupt JPEG");
      z.img_comp[which].ha = q & 15;
      if (z.img_comp[which].ha > 3)
         throw new STBImageException("Bad AC huff, Corrupt JPEG");
      z.order[i] = which;
   }

   // three trailing bytes: the first and the last must be 0
   if (get8(z.s) != 0)
      throw new STBImageException("Bad SOS, Corrupt JPEG");
   get8(z.s); // should be 63, but might be 0
   if (get8(z.s) != 0)
      throw new STBImageException("Bad SOS, Corrupt JPEG");

   return 1;
}
985 
// Parses the frame header that follows a SOF marker and, when scan is
// SCAN_load, allocates the per-component decode buffers.
//
// Fills s.img_x / s.img_y / s.img_n and, per component, id, h, v and tq.
// For SCAN_load it also computes the MCU geometry (img_h_max, img_v_max,
// img_mcu_w/h, img_mcu_x/y) and each component's x, y, w2, h2, raw_data
// and 16-byte aligned data pointer.
//
// Returns: 1 on success.
// Throws: STBImageException on a malformed or unsupported header, on an
//         image that is too large, or when an allocation fails.
int process_frame_header(jpeg *z, int scan)
{
   stbi *s = z.s;

   int Lf = get16(s);
   if (Lf < 11) throw new STBImageException("Bad SOF len, Corrupt JPEG");

   int p = get8(s);
   if (p != 8) throw new STBImageException("JPEG format not supported: 8-bit only"); // JPEG baseline

   s.img_y = get16(s);
   if (s.img_y == 0) throw new STBImageException("No header height, JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG

   s.img_x = get16(s);
   if (s.img_x == 0) throw new STBImageException("0 width, corrupt JPEG"); // JPEG requires

   int c = get8(s);
   if (c != 3 && c != 1) throw new STBImageException("Bad component count, corrupt JPEG");    // JFIF requires
   s.img_n = c;

   // start every component without buffers so cleanup code can test for null
   for (int k = 0; k < c; ++k) {
      z.img_comp[k].data = null;
      z.img_comp[k].linebuf = null;
   }

   if (Lf != 8+3*s.img_n) throw new STBImageException("Bad SOF len, corrupt JPEG");

   for (int k = 0; k < s.img_n; ++k) {
      auto component = &z.img_comp[k];
      component.id = get8(s);
      // JFIF requires ids 1..n, but some version of jpegtran outputs
      // non-JFIF-compliant files numbered 0..n-1
      if (component.id != k+1 && component.id != k)
         throw new STBImageException("Bad component ID, corrupt JPEG");

      int q = get8(s);
      component.h = (q >> 4);
      if (component.h == 0 || component.h > 4) throw new STBImageException("Bad H, corrupt JPEG");
      component.v = q & 15;
      if (component.v == 0 || component.v > 4) throw new STBImageException("Bad V, corrupt JPEG");
      component.tq = get8(s);
      if (component.tq > 3) throw new STBImageException("Bad TQ, corrupt JPEG");
   }

   if (scan != SCAN_load) return 1;

   if ((1 << 30) / s.img_x / s.img_n < s.img_y) throw new STBImageException("Image too large to decode");

   // largest sampling factors over all components
   int h_max = 1, v_max = 1;
   for (int k = 0; k < s.img_n; ++k) {
      if (z.img_comp[k].h > h_max) h_max = z.img_comp[k].h;
      if (z.img_comp[k].v > v_max) v_max = z.img_comp[k].v;
   }

   // compute interleaved mcu info
   z.img_h_max = h_max;
   z.img_v_max = v_max;
   z.img_mcu_w = h_max * 8;
   z.img_mcu_h = v_max * 8;
   z.img_mcu_x = (s.img_x + z.img_mcu_w-1) / z.img_mcu_w;
   z.img_mcu_y = (s.img_y + z.img_mcu_h-1) / z.img_mcu_h;

   for (int k = 0; k < s.img_n; ++k) {
      // number of effective pixels (e.g. for non-interleaved MCU)
      z.img_comp[k].x = (s.img_x * z.img_comp[k].h + h_max-1) / h_max;
      z.img_comp[k].y = (s.img_y * z.img_comp[k].v + v_max-1) / v_max;
      // to simplify generation, we'll allocate enough memory to decode
      // the bogus oversized data from using interleaved MCUs and their
      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
      // discard the extra data until colorspace conversion
      z.img_comp[k].w2 = z.img_mcu_x * z.img_comp[k].h * 8;
      z.img_comp[k].h2 = z.img_mcu_y * z.img_comp[k].v * 8;
      z.img_comp[k].raw_data = malloc(z.img_comp[k].w2 * z.img_comp[k].h2+15);
      if (z.img_comp[k].raw_data is null) {
         // undo the allocations made for the earlier components
         for (--k; k >= 0; --k) {
            free(z.img_comp[k].raw_data);
            z.img_comp[k].data = null;
         }
         throw new STBImageException("Out of memory");
      }
      // align blocks for installable-idct using mmx/sse
      z.img_comp[k].data = cast(ubyte*) (( cast(size_t) z.img_comp[k].raw_data + 15) & ~15);
      z.img_comp[k].linebuf = null;
   }

   return 1;
}
1057 
1058 // use comparisons since in some cases we handle more than one case (e.g. SOF)
/// True when x is the DNL marker code (0xDC).
bool DNL(int x)
{
   return x == 0xDC;
}
/// True when x is the SOI marker code (0xD8).
bool SOI(int x)
{
   return x == 0xD8;
}
/// True when x is the EOI marker code (0xD9).
bool EOI(int x)
{
   return x == 0xD9;
}
/// True when x is one of the two accepted SOF marker codes (0xC0 or 0xC1).
bool SOF(int x)
{
   switch (x)
   {
      case 0xC0:
      case 0xC1:
         return true;
      default:
         return false;
   }
}
/// True when x is the SOS marker code (0xDA).
bool SOS(int x)
{
   return x == 0xDA;
}
1064 
// Reads the start of a JPEG stream up to and including the frame header.
//
// With scan == SCAN_type only the SOI marker is checked. Otherwise every
// marker before the SOF is handed to process_marker, then the frame header
// is parsed by process_frame_header.
//
// Returns: 1 on success, 0 when process_marker or process_frame_header
//          reports failure.
// Throws: STBImageException when SOI is missing or the data ends before a
//         SOF marker is found.
int decode_jpeg_header(jpeg *z, int scan)
{
   z.marker = MARKER_none; // initialize cached marker to empty

   if (!SOI(get_marker(z)))
      throw new STBImageException("No SOI, corrupt JPEG");
   if (scan == SCAN_type)
      return 1;

   for (int m = get_marker(z); !SOF(m); )
   {
      if (!process_marker(z, m))
         return 0;

      // some files have extra padding after their blocks, so ok, we'll scan
      for (m = get_marker(z); m == MARKER_none; m = get_marker(z))
      {
         if (at_eof(z.s))
            throw new STBImageException("No SOF, corrupt JPEG");
      }
   }

   return process_frame_header(z, scan) ? 1 : 0;
}
1091 
// Decodes the whole JPEG stream: the header, then every segment up to the
// EOI marker. SOS segments are decoded as scans, all other markers go to
// process_marker.
//
// Returns: 1 when EOI is reached, 0 as soon as any stage reports failure or
//          a non-zero, non-0xFF byte follows the entropy-coded data.
int decode_jpeg_image(jpeg *j)
{
   j.restart_interval = 0;
   if (!decode_jpeg_header(j, SCAN_load))
      return 0;

   for (int m = get_marker(j); !EOI(m); m = get_marker(j))
   {
      if (!SOS(m))
      {
         if (!process_marker(j, m)) return 0;
         continue;
      }

      if (!process_scan_header(j)) return 0;
      if (!parse_entropy_coded_data(j)) return 0;
      if (j.marker != MARKER_none)
         continue;

      // handle 0s at the end of image data from IP Kamera 9060
      while (!at_eof(j.s))
      {
         int x = get8(j.s);
         if (x == 255)
         {
            j.marker = get8u(j.s);
            break;
         }
         if (x != 0)
            return 0;
      }
      // if we reach eof without hitting a marker, the get_marker() call of
      // the loop will fail and we'll eventually return 0
   }
   return 1;
}
1122 
1123 // static jfif-centered resampling (across block boundaries)
1124 
/// Function pointer type of a row resampler: takes an output row, two input
/// rows, the input width w and the horizontal expansion factor hs, and
/// returns the row holding the result.
alias resample_row_func =
   ubyte* function(ubyte* out_, ubyte* in0, ubyte* in1, int w, int hs);
1126 
/// Shifts x right by two bits (divide by four) and truncates to a byte.
ubyte div4(int x)
{
   immutable int quarter = x >> 2;
   return cast(ubyte) quarter;
}
1131 
/// Resampler for components that need no expansion: hands back in_near
/// untouched. out_, in_far, w and hs are not used.
ubyte* resample_row_1(ubyte *out_, ubyte *in_near, ubyte *in_far, int w, int hs)
{
   return in_near;
}
1136 
/// Vertical 2x resampler: each output sample is the rounded blend
/// (3*in_near + in_far + 2) / 4 of the two input rows. Writes w samples to
/// out_ and returns out_; hs is not used.
ubyte* resample_row_v_2(ubyte *out_, ubyte *in_near, ubyte *in_far, int w, int hs)
{
   // need to generate two samples vertically for every one in input
   foreach (x; 0 .. w)
      out_[x] = div4(3*in_near[x] + in_far[x] + 2);
   return out_;
}
1145 
/// Horizontal 2x resampler: writes 2*w samples to out_, each interior pair
/// blending an input sample 3:1 with its left and right neighbour. The two
/// outermost outputs copy the first and last input sample. Only in_near is
/// read; in_far and hs are not used. Returns out_.
ubyte*  resample_row_h_2(ubyte *out_, ubyte *in_near, ubyte *in_far, int w, int hs)
{
   // need to generate two samples horizontally for every one in input
   ubyte *src = in_near;

   if (w == 1) {
      // if only one sample, can't do any interpolation
      out_[0] = out_[1] = src[0];
      return out_;
   }

   // left edge
   out_[0] = src[0];
   out_[1] = div4(src[0]*3 + src[1] + 2);

   // interior samples
   int x = 1;
   while (x < w-1) {
      immutable int weighted = 3*src[x]+2;
      out_[x*2+0] = div4(weighted+src[x-1]);
      out_[x*2+1] = div4(weighted+src[x+1]);
      ++x;
   }

   // right edge (x now indexes the last input sample)
   out_[x*2+0] = div4(src[w-2]*3 + src[w-1] + 2);
   out_[x*2+1] = src[w-1];

   return out_;
}
1170 
/// Shifts x right by four bits (divide by sixteen) and truncates to a byte.
ubyte div16(int x)
{
   immutable int sixteenth = x >> 4;
   return cast(ubyte) sixteenth;
}
1175 
1176 
/// Combined horizontal and vertical 2x resampler: first blends the two input
/// rows 3:1 per column, then blends neighbouring columns 3:1 to produce 2*w
/// output samples. Returns out_; hs is not used.
ubyte *resample_row_hv_2(ubyte *out_, ubyte *in_near, ubyte *in_far, int w, int hs)
{
   // need to generate 2x2 samples for every one in input
   if (w == 1) {
      out_[0] = out_[1] = div4(3*in_near[0] + in_far[0] + 2);
      return out_;
   }

   // vertical blend of the current column, kept unscaled (sum of weights 4)
   int current = 3*in_near[0] + in_far[0];
   out_[0] = div4(current+2);
   foreach (x; 1 .. w) {
      immutable int previous = current;
      current = 3*in_near[x]+in_far[x];
      out_[x*2-1] = div16(3*previous + current + 8);
      out_[x*2  ] = div16(3*current + previous + 8);
   }
   out_[w*2-1] = div4(current+2);

   return out_;
}
1198 
/// Nearest-neighbour resampler for arbitrary horizontal factors: repeats each
/// of the w samples of in_near hs times into out_ and returns out_.
/// in_far is not used.
ubyte *resample_row_generic(ubyte *out_, ubyte *in_near, ubyte *in_far, int w, int hs)
{
   // resample with nearest-neighbor
   // (the C original's `in_far = in_far;` only silenced an unused-parameter
   // warning and has been dropped)
   foreach (i; 0 .. w)
      foreach (j; 0 .. hs)
         out_[i*hs+j] = in_near[i];
   return out_;
}
1209 
/// Converts x to fixed point with 16 fractional bits: scales by 65536, adds
/// 0.5 and truncates to int.
int float2fixed(double x)
{
   return cast(int)(x * 65536 + 0.5);
}
1214 
// 0.38 seconds on 3*anemones.jpg   (0.25 with processor = Pro)
// VC6 without processor=Pro is generating multiple LEAs per multiply!
//
// Converts count pixels from the y / pcb / pcr planes to RGB using 16-bit
// fixed-point arithmetic. For every pixel four bytes are written at out_
// (R, G, B and 255), after which the write position advances by step bytes.
void YCbCr_to_RGB_row(ubyte *out_, const ubyte *y, const ubyte *pcb, const ubyte *pcr, int count, int step)
{
   // conversion coefficients in fixed point; they do not depend on the pixel
   immutable int crToRed   = float2fixed(1.40200f);
   immutable int crToGreen = float2fixed(0.71414f);
   immutable int cbToGreen = float2fixed(0.34414f);
   immutable int cbToBlue  = float2fixed(1.77200f);

   ubyte *pixel = out_;
   foreach (n; 0 .. count) {
      immutable int luma = (y[n] << 16) + 32768; // rounding
      immutable int cr = pcr[n] - 128;
      immutable int cb = pcb[n] - 128;

      int r = (luma + cr*crToRed) >> 16;
      int g = (luma - cr*crToGreen - cb*cbToGreen) >> 16;
      int b = (luma + cb*cbToBlue) >> 16;

      // clamp each channel to 0..255
      if (r < 0) r = 0; else if (r > 255) r = 255;
      if (g < 0) g = 0; else if (g > 255) g = 255;
      if (b < 0) b = 0; else if (b > 255) b = 255;

      pixel[0] = cast(ubyte)r;
      pixel[1] = cast(ubyte)g;
      pixel[2] = cast(ubyte)b;
      pixel[3] = 255;
      pixel += step;
   }
}
1241 
// clean up the temporary component buffers
//
// For each of the first j.s.img_n components: frees raw_data when data is
// set, frees linebuf when it is set, and nulls data / linebuf afterwards so
// a second call is harmless.
void cleanup_jpeg(jpeg *j)
{
   for (int c = 0; c < j.s.img_n; ++c) {
      auto component = &j.img_comp[c];
      if (component.data) {
         // data points into raw_data, which is the block that was allocated
         free(component.raw_data);
         component.data = null;
      }
      if (component.linebuf) {
         free(component.linebuf);
         component.linebuf = null;
      }
   }
}
1257 
/// Per-component upsampling state used by load_jpeg_image.
struct stbi_resample
{
   resample_row_func resample; // row resampler chosen from hs and vs
   ubyte* line0;               // the two input rows handed to resample;
   ubyte* line1;               // line1 is advanced one row ahead of line0
   int hs;                     // horizontal expansion factor
   int vs;                     // vertical expansion factor
   int w_lores;                // horizontal pixels pre-expansion
   int ystep;                  // how far through vertical expansion we are
   int ypos;                   // which pre-expansion row we're on
}
1268 
// Decodes the JPEG attached to z and returns it as a malloc'ed pixel buffer.
//
// Params:
//   z        = decoder state; z.s must already point at the input
//   out_x    = receives the image width
//   out_y    = receives the image height
//   comp     = if non-null, receives the number of components in the file
//              (not the number of components in the returned buffer)
//   req_comp = components per output pixel, 1..4, or 0 to use the file's own
//
// Returns: the decoded image (caller frees it), or null when
//          decode_jpeg_image reports failure.
// Throws: STBImageException on a bad req_comp, a corrupt stream, an image
//         too large to address, or when an allocation fails.
ubyte *load_jpeg_image(jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
{
   // validate req_comp
   if (req_comp < 0 || req_comp > 4)
       throw new STBImageException("Internal error: bad req_comp");
   z.s.img_n = 0;

   // Release the temporary component buffers on every way out of this
   // function. The decoder reports most errors by throwing, so cleaning up
   // only on the "returned 0" path would leak the buffers allocated while
   // parsing the frame header. cleanup_jpeg only visits the first z.s.img_n
   // components and skips buffers that are null.
   scope(exit) cleanup_jpeg(z);

   // load a jpeg image from whichever source
   if (!decode_jpeg_image(z)) return null;

   // determine actual number of components to generate
   immutable int n = req_comp ? req_comp : z.s.img_n;

   // a 1- or 2-channel result from a 3-component image only needs plane 0
   int decode_n;
   if (z.s.img_n == 3 && n < 3)
      decode_n = 1;
   else
      decode_n = z.s.img_n;

   // resample and color-convert
   ubyte*[4] coutput;
   stbi_resample[4] res_comp;

   for (int k = 0; k < decode_n; ++k) {
      stbi_resample *r = &res_comp[k];

      // allocate line buffer big enough for upsampling off the edges
      // with upsample factor of 4
      z.img_comp[k].linebuf = cast(ubyte*) malloc(z.s.img_x + 3);
      if (!z.img_comp[k].linebuf)
          throw new STBImageException("Out of memory");

      r.hs      = z.img_h_max / z.img_comp[k].h;
      r.vs      = z.img_v_max / z.img_comp[k].v;
      r.ystep   = r.vs >> 1;
      r.w_lores = (z.s.img_x + r.hs-1) / r.hs;
      r.ypos    = 0;
      r.line0   = r.line1 = z.img_comp[k].data;

      if      (r.hs == 1 && r.vs == 1) r.resample = &resample_row_1;
      else if (r.hs == 1 && r.vs == 2) r.resample = &resample_row_v_2;
      else if (r.hs == 2 && r.vs == 1) r.resample = &resample_row_h_2;
      else if (r.hs == 2 && r.vs == 2) r.resample = &resample_row_hv_2;
      else                             r.resample = &resample_row_generic;
   }

   // Size arithmetic is done in size_t: the frame-header size limit is
   // expressed in the file's component count, so with req_comp larger than
   // that a 32-bit product could wrap and under-allocate the buffer.
   immutable size_t rowBytes = cast(size_t) n * z.s.img_x;
   if (z.s.img_y != 0 && rowBytes > (size_t.max - 1) / z.s.img_y)
      throw new STBImageException("Image too large to decode");

   // can't error after this so, this is safe
   // (+1 because YCbCr_to_RGB_row always writes a 4th byte per pixel)
   ubyte *output = cast(ubyte*) malloc(rowBytes * z.s.img_y + 1);
   if (!output) throw new STBImageException("Out of memory");

   // now go ahead and resample
   for (uint j = 0; j < z.s.img_y; ++j) {
      ubyte *out_ = output + rowBytes * j;
      for (int k = 0; k < decode_n; ++k) {
         stbi_resample *r = &res_comp[k];
         int y_bot = r.ystep >= (r.vs >> 1);
         coutput[k] = r.resample(z.img_comp[k].linebuf,
                                 y_bot ? r.line1 : r.line0,
                                 y_bot ? r.line0 : r.line1,
                                 r.w_lores, r.hs);
         if (++r.ystep >= r.vs) {
            r.ystep = 0;
            r.line0 = r.line1;
            if (++r.ypos < z.img_comp[k].y)
               r.line1 += z.img_comp[k].w2;
         }
      }

      ubyte *y = coutput[0];
      if (n >= 3) {
         if (z.s.img_n == 3) {
            YCbCr_to_RGB_row(out_, y, coutput[1], coutput[2], z.s.img_x, n);
         } else {
            for (uint i = 0; i < z.s.img_x; ++i) {
               out_[0] = out_[1] = out_[2] = y[i];
               out_[3] = 255; // not used if n==3
               out_ += n;
            }
         }
      } else if (n == 1) {
         for (uint i = 0; i < z.s.img_x; ++i)
            out_[i] = y[i];
      } else {
         // grey + alpha
         for (uint i = 0; i < z.s.img_x; ++i) {
            *out_++ = y[i];
            *out_++ = 255;
         }
      }
   }

   *out_x = z.s.img_x;
   *out_y = z.s.img_y;
   if (comp) *comp  = z.s.img_n; // report original components, not output
   return output;
}
1369 
/// Decodes a JPEG read from s by running load_jpeg_image on a fresh decoder
/// state; x, y, comp and req_comp are forwarded unchanged.
ubyte* stbi_jpeg_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   jpeg decoder;
   decoder.s = s;
   return load_jpeg_image(&decoder, x, y, comp, req_comp);
}
1376 
/// Checks that s starts like a JPEG by running decode_jpeg_header in
/// SCAN_type mode.
/// Throws: STBImageException when the header is rejected.
void stbi_jpeg_test(stbi *s)
{
   jpeg probe;
   probe.s = s;
   if (decode_jpeg_header(&probe, SCAN_type) == 0)
       throw new STBImageException("Couldn't decode JPEG header");
}
1385 
1386 
1387 // public domain zlib decode    v0.2  Sean Barrett 2006-11-18
1388 //    simple implementation
1389 //      - all input must be provided in an upfront buffer
1390 //      - all output is written to a single output buffer (can malloc/realloc)
1391 //    performance
1392 //      - fast huffman
1393 
// fast-way is faster to check than jpeg huffman, but slow way is slower

/// Number of low code bits used to index the zhuffman fast table.
enum ZFAST_BITS = 9; // accelerate all cases in default tables

/// Mask selecting the low ZFAST_BITS bits of the bit buffer.
enum ZFAST_MASK = (1 << ZFAST_BITS) - 1;
1397 
// zlib-style huffman encoding
// (JPEG packs bits from the left, zlib from the right, so the code can't be shared)
struct zhuffman
{
   ushort[1 << ZFAST_BITS] fast; // low ZFAST_BITS bits of the stream -> symbol index, 0xffff if not resolved
   ushort[16] firstcode;         // first code of each code length
   int[17] maxcode;              // per length: end of the code range, preshifted to 16 bits; [16] is a sentinel
   ushort[16] firstsymbol;       // symbol index of the first code of each length
   ubyte[288] size;              // code length per symbol index
   ushort[288] value;            // decoded value per symbol index
}
1409 
/// Reverses the order of the low 16 bits of n; higher bits are discarded.
int bitreverse16(int n)
{
   int v = n;
   // swap neighbouring bits, then bit pairs, then nibbles, then bytes
   v = ((v >> 1) & 0x5555) | ((v & 0x5555) << 1);
   v = ((v >> 2) & 0x3333) | ((v & 0x3333) << 2);
   v = ((v >> 4) & 0x0F0F) | ((v & 0x0F0F) << 4);
   v = ((v >> 8) & 0x00FF) | ((v & 0x00FF) << 8);
   return v;
}
1418 
/// Reverses the low `bits` bits of v (bits must not exceed 16).
int bit_reverse(int v, int bits)
{
   assert(bits <= 16);
   // to bit reverse n bits, reverse 16 and shift
   // e.g. 11 bits, bit reverse and shift away 5
   immutable int reversed16 = bitreverse16(v);
   return reversed16 >> (16 - bits);
}
1426 
// Builds the decoding tables of z from a list of code lengths.
//
// Params:
//   z        = table set to fill (fast, firstcode, firstsymbol, maxcode,
//              size and value are all written)
//   sizelist = code length of each symbol, 0 meaning "symbol unused"
//   num      = number of entries in sizelist
//
// Returns: 1 on success.
// Throws: STBImageException when the code lengths cannot form a valid code.
int zbuild_huffman(zhuffman *z, ubyte *sizelist, int num)
{
   int k = 0;
   int code = 0;
   int[16] next_code;
   int[17] sizes; // D zero-initializes the counters

   // DEFLATE spec for generating codes
   z.fast[] = 0xFFFF; // 0xffff marks "not resolved by the fast table"
   for (int i = 0; i < num; ++i) {
      // lengths above 15 would index past next_code further down
      if (sizelist[i] > 15)
         throw new STBImageException("Bad codelength, corrupt PNG");
      ++sizes[sizelist[i]];
   }
   sizes[0] = 0;

   // The counts come from the input stream, so reject impossible ones with
   // an exception instead of an assert that vanishes in release builds.
   for (int i = 1; i < 16; ++i)
      if (sizes[i] > (1 << i))
         throw new STBImageException("Bad codelength, corrupt PNG");

   for (int i = 1; i < 16; ++i) {
      next_code[i] = code;
      z.firstcode[i] = cast(ushort) code;
      z.firstsymbol[i] = cast(ushort) k;
      code = (code + sizes[i]);
      if (sizes[i])
         if (code-1 >= (1 << i))
            throw new STBImageException("Bad codelength, corrupt PNG");
      z.maxcode[i] = code << (16-i); // preshift for inner loop
      code <<= 1;
      k += sizes[i];
   }
   z.maxcode[16] = 0x10000; // sentinel

   for (int i = 0; i < num; ++i) {
      int s = sizelist[i];
      if (s) {
         int c = next_code[s] - z.firstcode[s] + z.firstsymbol[s];
         z.size[c] = cast(ubyte)s;
         z.value[c] = cast(ushort)i;
         if (s <= ZFAST_BITS) {
            // fill every fast-table slot whose low s bits equal this code
            int k_ = bit_reverse(next_code[s],s);
            while (k_ < (1 << ZFAST_BITS)) {
               z.fast[k_] = cast(ushort) c;
               k_ += (1 << s);
            }
         }
         ++next_code[s];
      }
   }
   return 1;
}
1474 
1475 // zlib-from-memory implementation for PNG reading
1476 //    because PNG allows splitting the zlib stream arbitrarily,
1477 //    and it's annoying structurally to have PNG call ZLIB call PNG,
1478 //    we require PNG read all the IDATs and combine them into a single
1479 //    memory buffer
1480 
/// State of one inflate run: input cursor, bit buffer, output buffer and the
/// two huffman tables of the current block.
struct zbuf
{
   const(ubyte) *zbuffer;     // next input byte
   const(ubyte) *zbuffer_end; // one past the last input byte
   int num_bits;              // number of valid bits in code_buffer
   uint code_buffer;          // bit buffer, filled from the low end

   ubyte *zout;               // next output byte
   ubyte *zout_start;         // start of the output buffer
   ubyte *zout_end;           // one past the end of the output buffer
   int   z_expandable;        // non-zero if expand() may realloc the output

   zhuffman z_length;         // literal/length codes
   zhuffman z_distance;       // distance codes
}
1495 
/// Returns the next input byte and advances the cursor, or 0 once the input
/// is exhausted.
int zget8(zbuf *z)
{
   return (z.zbuffer < z.zbuffer_end) ? *z.zbuffer++ : 0;
}
1501 
/// Appends input bytes to the top of the bit buffer, at least one and until
/// it holds more than 24 bits.
void fill_bits(zbuf *z)
{
   for (;;) {
      assert(z.code_buffer < (1U << z.num_bits));
      z.code_buffer |= zget8(z) << z.num_bits;
      z.num_bits += 8;
      if (z.num_bits > 24)
         break;
   }
}
1510 
/// Removes the low n bits from the bit buffer and returns them, refilling
/// the buffer first when it holds fewer than n bits.
uint zreceive(zbuf *z, int n)
{
   if (z.num_bits < n)
      fill_bits(z);

   immutable uint mask = (1 << n) - 1;
   immutable uint bits = z.code_buffer & mask;
   z.code_buffer >>= n;
   z.num_bits -= n;
   return bits;
}
1520 
// Decodes one symbol from the bit buffer of a using the tables in z.
//
// Codes of up to ZFAST_BITS bits are resolved through z.fast; longer codes
// are found by comparing the bit-reversed buffer against z.maxcode.
//
// Returns: the decoded value, or -1 when the bits do not form a valid code.
int zhuffman_decode(zbuf *a, zhuffman *z)
{
   if (a.num_bits < 16) fill_bits(a);

   int b = z.fast[a.code_buffer & ZFAST_MASK];
   if (b < 0xffff) {
      immutable int fastSize = z.size[b];
      a.code_buffer >>= fastSize;
      a.num_bits -= fastSize;
      return z.value[b];
   }

   // not resolved by fast table, so compute it the slow way
   // use jpeg approach, which requires MSbits at top
   immutable int k = bit_reverse(a.code_buffer, 16);
   int s = ZFAST_BITS+1;
   while (k >= z.maxcode[s]) // maxcode[16] is a sentinel, so this terminates
      ++s;
   if (s == 16) return -1; // invalid code!

   // code size is s, so:
   b = (k >> (16-s)) - z.firstcode[s] + z.firstsymbol[s];
   // The stream is untrusted: an incomplete code set can yield an index that
   // is out of range or belongs to a different length. Report that as an
   // invalid code rather than asserting or reading outside the table.
   if (b < 0 || b >= cast(int) z.size.length || z.size[b] != s)
      return -1;
   a.code_buffer >>= s;
   a.num_bits -= s;
   return z.value[b];
}
1547 
// Grows the output buffer of z so that at least n more bytes fit, doubling
// the capacity as often as needed and preserving what was written so far.
//
// Returns: 1 on success.
// Throws: STBImageException when the buffer is not expandable, the required
//         size does not fit the int-based size bookkeeping, or realloc fails.
int expand(zbuf *z, int n)  // need to make room for n bytes
{
   if (!z.z_expandable)
      throw new STBImageException("Output buffer limit, corrupt PNG");

   // 64-bit arithmetic so neither the sum nor the doubling can wrap
   immutable long cur    = z.zout - z.zout_start;
   immutable long needed = cur + n;
   long limit = z.zout_end - z.zout_start;
   if (limit < 1)
      limit = 1; // doubling an empty buffer would never terminate
   while (limit < needed)
      limit *= 2;
   if (limit > int.max)
      throw new STBImageException("Out of memory");

   // keep the old pointer until realloc has succeeded
   ubyte *q = cast(ubyte*) realloc(z.zout_start, cast(size_t) limit);
   if (q == null)
      throw new STBImageException("Out of memory");
   z.zout_start = q;
   z.zout       = q + cast(size_t) cur;
   z.zout_end   = q + cast(size_t) limit;
   return 1;
}
1566 
// Lookup tables used by parse_huffman_block.

/// Base match length for length symbols 257.. (indexed by symbol - 257).
static immutable int[31] length_base = [
   3,4,5,6,7,8,9,10,11,13,
   15,17,19,23,27,31,35,43,51,59,
   67,83,99,115,131,163,195,227,258,0,0 ];

/// Number of extra bits to read for each entry of length_base.
static immutable int[31] length_extra = [
   0,0,0,0,0,0,0,0,1,1,
   1,1,2,2,2,2,3,3,3,3,
   4,4,4,4,5,5,5,5,0,0,0 ];

/// Base match distance per distance symbol.
static immutable int[32] dist_base = [
   1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
   257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0 ];

/// Number of extra bits to read for each entry of dist_base. The two
/// trailing entries are spelled out so the initializer matches the declared
/// length, like dist_base.
static immutable int[32] dist_extra = [
   0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,
   7,7,8,8,9,9,10,10,11,11,12,12,13,13,0,0 ];
1580 
// Decodes one huffman-coded block into the output buffer of a, using
// a.z_length and a.z_distance, until the end-of-block symbol (256).
//
// Returns: 1 at end of block, 0 if expand() reports failure.
// Throws: STBImageException on an invalid code or a distance that reaches
//         before the start of the output.
int parse_huffman_block(zbuf *a)
{
   while (true) {
      int sym = zhuffman_decode(a, &a.z_length);
      if (sym < 0)
          throw new STBImageException("Bad Huffman code, corrupt PNG");

      if (sym < 256) {
         // literal byte
         if (a.zout >= a.zout_end) if (!expand(a, 1)) return 0;
         *a.zout++ = cast(ubyte) sym;
         continue;
      }

      if (sym == 256) return 1;

      // length/distance pair
      sym -= 257;
      int len = length_base[sym];
      if (length_extra[sym]) len += zreceive(a, length_extra[sym]);

      int distSym = zhuffman_decode(a, &a.z_distance);
      if (distSym < 0) throw new STBImageException("Bad Huffman code, corrupt PNG");
      int dist = dist_base[distSym];
      if (dist_extra[distSym]) dist += zreceive(a, dist_extra[distSym]);

      if (a.zout - a.zout_start < dist) throw new STBImageException("Bad dist, corrupt PNG");
      if (a.zout + len > a.zout_end) if (!expand(a, len)) return 0;

      // byte-wise copy: source and destination may overlap; taken after
      // expand() because the buffer may have moved
      ubyte *from = a.zout - dist;
      foreach (_; 0 .. len)
         *a.zout++ = *from++;
   }
}
1609 
// Reads the code-length description at the start of a dynamic huffman block
// and builds a.z_length and a.z_distance from it.
//
// Returns: 1 on success, 0 if zbuild_huffman reports failure.
// Throws: STBImageException when the code lengths are malformed.
int compute_huffman_codes(zbuf *a)
{
   static immutable ubyte[19] length_dezigzag = [ 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 ];
   zhuffman z_codelength;
   ubyte[286+32+137] lencodes;//padding for maximum single op
   ubyte[19] codelength_sizes; // zero-initialized by D

   immutable int hlit  = zreceive(a,5) + 257;
   immutable int hdist = zreceive(a,5) + 1;
   immutable int hclen = zreceive(a,4) + 4;
   immutable int total = hlit + hdist;

   for (int i = 0; i < hclen; ++i) {
      int s = zreceive(a,3);
      codelength_sizes[length_dezigzag[i]] = cast(ubyte) s;
   }
   if (!zbuild_huffman(&z_codelength, codelength_sizes.ptr, 19)) return 0;

   int n = 0;
   while (n < total) {
      int c = zhuffman_decode(a, &z_codelength);
      // the symbol comes from untrusted input: fail with an exception, not
      // an assert
      if (c < 0 || c >= 19)
         throw new STBImageException("Bad codelengths, corrupt PNG");

      if (c < 16) {
         lencodes[n++] = cast(ubyte) c;
         continue;
      }

      // 16: repeat previous length, 17/18: run of zeros
      ubyte fill = 0;
      int run;
      if (c == 16) {
         // there is no previous length to repeat at the very start
         if (n == 0)
            throw new STBImageException("Bad codelengths, corrupt PNG");
         run = zreceive(a,2)+3;
         fill = lencodes[n-1];
      } else if (c == 17) {
         run = zreceive(a,3)+3;
      } else {
         run = zreceive(a,7)+11;
      }

      // A run crossing the announced total is an error either way; checking
      // before the memset keeps the write inside lencodes.
      if (run > total - n)
         throw new STBImageException("Bad codelengths, corrupt PNG");
      memset(lencodes.ptr+n, fill, run);
      n += run;
   }
   if (n != total) throw new STBImageException("Bad codelengths, corrupt PNG");
   if (!zbuild_huffman(&a.z_length, lencodes.ptr, hlit)) return 0;
   if (!zbuild_huffman(&a.z_distance, lencodes.ptr+hlit, hdist)) return 0;
   return 1;
}
1655 
// Copies one stored (uncompressed) block from the input to the output of a.
//
// Returns: 1 on success, 0 if expand() reports failure.
// Throws: STBImageException when the length fields disagree or the block
//         claims more data than the input holds.
int parse_uncompressed_block(zbuf *a)
{
   ubyte[4] header;

   // skip to the next byte boundary of the bit stream
   immutable int partial = a.num_bits & 7;
   if (partial)
      zreceive(a, partial); // discard

   // drain the bit-packed data into header
   int filled = 0;
   for (; a.num_bits > 0; a.num_bits -= 8) {
      header[filled++] = cast(ubyte) (a.code_buffer & 255);
      a.code_buffer >>= 8;
   }
   assert(a.num_bits == 0);

   // now fill header the normal way
   for (; filled < 4; ++filled)
      header[filled] = cast(ubyte) zget8(a);

   // two little-endian 16-bit fields: length and its one's complement
   immutable int len  = header[0] | (header[1] << 8);
   immutable int nlen = header[2] | (header[3] << 8);
   if (nlen != (len ^ 0xffff)) throw new STBImageException("Zlib corrupt, corrupt PNG");
   if (a.zbuffer + len > a.zbuffer_end) throw new STBImageException("Read past buffer, corrupt PNG");
   if (a.zout + len > a.zout_end)
      if (!expand(a, len)) return 0;

   memcpy(a.zout, a.zbuffer, len);
   a.zbuffer += len;
   a.zout += len;
   return 1;
}
1684 
// Reads and validates the two-byte zlib stream header.
//
// Returns: 1 on success.
// Throws: STBImageException when the check value is wrong, a preset
//         dictionary is requested or the compression method is not 8.
int parse_zlib_header(zbuf *a)
{
   immutable int cmf = zget8(a);
   immutable int flg = zget8(a);
   immutable int method = cmf & 15;

   if ((cmf*256+flg) % 31 != 0) throw new STBImageException("Bad zlib header, corrupt PNG"); // zlib spec
   if (flg & 32) throw new STBImageException("No preset dict, corrupt PNG"); // preset dictionary not allowed in png
   if (method != 8) throw new STBImageException("Bad compression, corrupt PNG");  // DEFLATE required for png
   // window = 1 << (8 + (cmf >> 4))... but who cares, we fully buffer output
   return 1;
}
1697 
// @TODO: should statically initialize these for optimal thread safety
__gshared ubyte[288] default_length;
__gshared ubyte[32] default_distance;

/// Fills default_length and default_distance with the code lengths used for
/// fixed-huffman blocks. default_distance is written last.
void init_defaults()
{
   default_length[  0 .. 144] = 8; // symbols   0..143
   default_length[144 .. 256] = 9; // symbols 144..255
   default_length[256 .. 280] = 7; // symbols 256..279
   default_length[280 .. 288] = 8; // symbols 280..287

   default_distance[] = 5;
}
1712 
__gshared int stbi_png_partial; // a quick hack to only allow decoding some of a PNG... I should implement real streaming support instead

// Inflates the stream in a block by block until the final block has been
// decoded, or, when stbi_png_partial is set, until more than 65536 bytes
// have been produced.
//
// Params:
//   a            = inflate state with input and output already set up
//   parse_header = non-zero to read the zlib header first
//
// Returns: 1 on success, 0 when a stage reports failure or a block has the
//          reserved type 3.
int parse_zlib(zbuf *a, int parse_header)
{
   if (parse_header && !parse_zlib_header(a))
      return 0;

   a.num_bits = 0;
   a.code_buffer = 0;

   int final_;
   do {
      final_ = zreceive(a,1);
      immutable int type = zreceive(a,2);

      if (type == 3)
         return 0;

      if (type == 0) {
         if (!parse_uncompressed_block(a)) return 0;
      } else {
         if (type == 2) {
            if (!compute_huffman_codes(a)) return 0;
         } else {
            // use fixed code lengths
            if (!default_distance[31]) init_defaults();
            if (!zbuild_huffman(&a.z_length  , default_length.ptr  , 288)) return 0;
            if (!zbuild_huffman(&a.z_distance, default_distance.ptr,  32)) return 0;
         }
         if (!parse_huffman_block(a)) return 0;
      }

      if (stbi_png_partial && a.zout - a.zout_start > 65536)
         break;
   } while (!final_);

   return 1;
}
1744 
/// Points the output side of a at obuf (olen bytes, growable when exp is
/// non-zero) and runs parse_zlib; returns what parse_zlib returns.
int do_zlib(zbuf *a, ubyte *obuf, int olen, int exp, int parse_header)
{
   a.z_expandable = exp;
   a.zout_start   = obuf;
   a.zout_end     = obuf + olen;
   a.zout         = obuf;

   return parse_zlib(a, parse_header);
}
1754 
// Inflates a zlib stream (with header) into a malloc'ed buffer that starts
// at initial_size bytes and grows as needed.
//
// Params:
//   buffer       = compressed input
//   len          = number of input bytes
//   initial_size = first size of the output buffer
//   outlen       = if non-null, receives the number of bytes produced
//
// Returns: the output buffer (caller frees it), or null when the initial
//          allocation fails or inflating reports failure.
// Throws: STBImageException on corrupt input; the output buffer is freed
//         before the exception leaves this function.
ubyte *stbi_zlib_decode_malloc_guesssize(const(ubyte) *buffer, int len, int initial_size, int *outlen)
{
   ubyte *p = cast(ubyte*) malloc(initial_size);
   if (p == null) return null;

   zbuf a;
   a.zbuffer = buffer;
   a.zbuffer_end = buffer + len;

   // The inflate code signals most errors by throwing. a.zout_start always
   // holds the live buffer (expand() updates it after a realloc), so free it
   // when an exception passes through.
   a.zout_start = p;
   scope(failure) free(a.zout_start);

   if (!do_zlib(&a, p, initial_size, 1, 1)) {
      free(a.zout_start);
      return null;
   }
   if (outlen) *outlen = cast(int) (a.zout - a.zout_start);
   return a.zout_start;
}
1770 
/// Same as stbi_zlib_decode_malloc_guesssize with an initial output size of
/// 16384 bytes.
ubyte *stbi_zlib_decode_malloc(const(ubyte) *buffer, int len, int *outlen)
{
   enum int initialGuess = 16384;
   return stbi_zlib_decode_malloc_guesssize(buffer, len, initialGuess, outlen);
}
1775 
// Inflates a stream into a malloc'ed buffer that starts at initial_size
// bytes and grows as needed; parse_header selects whether a zlib header is
// read first.
//
// Params:
//   buffer       = compressed input
//   len          = number of input bytes
//   initial_size = first size of the output buffer
//   outlen       = if non-null, receives the number of bytes produced
//   parse_header = non-zero to expect a zlib header
//
// Returns: the output buffer (caller frees it), or null when the initial
//          allocation fails or inflating reports failure.
// Throws: STBImageException on corrupt input; the output buffer is freed
//         before the exception leaves this function.
ubyte *stbi_zlib_decode_malloc_guesssize_headerflag(const(ubyte) *buffer, int len, int initial_size, int *outlen, int parse_header)
{
   ubyte *p = cast(ubyte*) malloc(initial_size);
   if (p == null) return null;

   zbuf a;
   a.zbuffer = buffer;
   a.zbuffer_end = buffer + len;

   // The inflate code signals most errors by throwing. a.zout_start always
   // holds the live buffer (expand() updates it after a realloc), so free it
   // when an exception passes through.
   a.zout_start = p;
   scope(failure) free(a.zout_start);

   if (!do_zlib(&a, p, initial_size, 1, parse_header)) {
      free(a.zout_start);
      return null;
   }
   if (outlen) *outlen = cast(int) (a.zout - a.zout_start);
   return a.zout_start;
}
1791 
/// Inflates a zlib stream (with header) from ibuffer into the fixed-size
/// buffer obuffer. Returns the number of bytes written, or -1 when inflating
/// reports failure.
int stbi_zlib_decode_buffer(ubyte* obuffer, int olen, const(ubyte)* ibuffer, int ilen)
{
   zbuf a;
   a.zbuffer = ibuffer;
   a.zbuffer_end = ibuffer + ilen;

   immutable int ok = do_zlib(&a, obuffer, olen, 0, 1);
   return ok ? cast(int) (a.zout - a.zout_start) : -1;
}
1802 
// Inflates a headerless stream into a malloc'ed buffer that starts at 16384
// bytes and grows as needed.
//
// Params:
//   buffer = compressed input
//   len    = number of input bytes
//   outlen = if non-null, receives the number of bytes produced
//
// Returns: the output buffer (caller frees it), or null when the initial
//          allocation fails or inflating reports failure.
// Throws: STBImageException on corrupt input; the output buffer is freed
//         before the exception leaves this function.
ubyte *stbi_zlib_decode_noheader_malloc(const(ubyte) *buffer, int len, int *outlen)
{
   enum int initialSize = 16384;

   ubyte *p = cast(ubyte*) malloc(initialSize);
   if (p == null) return null;

   zbuf a;
   a.zbuffer = buffer;
   a.zbuffer_end = buffer+len;

   // The inflate code signals most errors by throwing. a.zout_start always
   // holds the live buffer (expand() updates it after a realloc), so free it
   // when an exception passes through.
   a.zout_start = p;
   scope(failure) free(a.zout_start);

   if (!do_zlib(&a, p, initialSize, 1, 0)) {
      free(a.zout_start);
      return null;
   }
   if (outlen) *outlen = cast(int) (a.zout - a.zout_start);
   return a.zout_start;
}
1818 
/// Inflates a headerless stream from ibuffer into the fixed-size buffer
/// obuffer. Returns the number of bytes written, or -1 when inflating
/// reports failure.
int stbi_zlib_decode_noheader_buffer(ubyte *obuffer, int olen, const(ubyte) *ibuffer, int ilen)
{
   zbuf a;
   a.zbuffer = ibuffer;
   a.zbuffer_end = ibuffer + ilen;

   immutable int ok = do_zlib(&a, obuffer, olen, 0, 0);
   return ok ? cast(int) (a.zout - a.zout_start) : -1;
}
1829 
1830 // public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
1831 //    simple implementation
1832 //      - only 8-bit samples
1833 //      - no CRC checking
1834 //      - allocates lots of intermediate memory
1835 //        - avoids problem of streaming data between subsystems
1836 //        - avoids explicit window management
1837 //    performance
1838 //      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
1839 
1840 
/// The two 32-bit header fields of a PNG chunk, as read by get_chunk_header.
struct chunk
{
   uint length; // first 32-bit field of the header
   uint type;   // second 32-bit field of the header
}
1846 
/// Packs four bytes into one 32-bit value, a being the most significant
/// byte and d the least significant.
uint PNG_TYPE(ubyte a, ubyte b, ubyte c, ubyte d)
{
   immutable uint hi = (cast(uint) a << 24) | (cast(uint) b << 16);
   immutable uint lo = (cast(uint) c << 8) | d;
   return hi | lo;
}
1851 
/// Reads two consecutive 32-bit values from s: first the chunk length, then
/// the chunk type.
chunk get_chunk_header(stbi *s)
{
   immutable uint length = get32(s);
   immutable uint type   = get32(s);
   return chunk(length, type);
}
1859 
/// Reads the next eight bytes of s and compares them with the PNG signature,
/// stopping at the first mismatch.
/// Returns: 1 when all eight bytes match.
/// Throws: STBImageException on the first byte that differs.
static int check_png_header(stbi *s)
{
   static immutable ubyte[8] png_sig = [ 137, 80, 78, 71, 13, 10, 26, 10 ];
   foreach (expected; png_sig)
   {
       if (get8u(s) != expected)
           throw new STBImageException("Bad PNG sig, not a PNG");
   }
   return 1;
}
1872 
/// Working state of the PNG decoder.
///
/// s        : input stream / image description being decoded
/// idata    : concatenated IDAT payloads (grown with realloc in parse_png_file)
/// expanded : zlib-inflated scanline data
/// out_     : decoded pixel buffer
struct png
{
   stbi* s;
   ubyte* idata, expanded, out_;
}
1880 
1881 
/// PNG scanline filter identifiers. Values 0..4 are the on-disk filter
/// bytes; the two `_first` variants are internal substitutes used for
/// the first row, which has no previous row to sample.
enum : int
{
   F_none        = 0,
   F_sub         = 1,
   F_up          = 2,
   F_avg         = 3,
   F_paeth       = 4,
   F_avg_first   = 5,
   F_paeth_first = 6
}
1887 
/// Maps an on-disk filter byte (0..4) to the filter actually applied on
/// the first scanline, where the previous row must not be read.
static immutable ubyte[5] first_row_filter =
[
   F_none,        // none  -> none
   F_sub,         // sub   -> sub
   F_none,        // up    -> none
   F_avg_first,   // avg   -> avg without the row above
   F_paeth_first  // paeth -> paeth without the row above
];
1892 
/// Paeth predictor: returns whichever of a, b, c is closest to
/// a + b - c, preferring a, then b, then c on ties.
static int paeth(int a, int b, int c)
{
   const int estimate = a + b - c;
   const int distA = abs(estimate - a);
   const int distB = abs(estimate - b);
   const int distC = abs(estimate - c);

   const bool aWins = (distA <= distB) && (distA <= distC);
   if (!aWins)
      return (distB <= distC) ? b : c;
   return a;
}
1903 
// create the png data from post-deflated data
//
// Undoes the per-scanline PNG filters of `raw` and stores the result in a
// newly malloc'ed a.out_ of x * y * out_n bytes.
//
//   a       : decoder state; a.s.img_n gives the components present in `raw`
//   raw     : inflated data, one filter byte followed by x*img_n bytes per row
//   raw_len : number of bytes available in `raw`
//   out_n   : components per output pixel; either img_n or img_n+1. In the
//             latter case the extra (last) component is filled with 255.
//   x, y    : dimensions of the image (or interlace pass) to produce
//
// Returns 1. Throws STBImageException on allocation failure, on a raw_len
// that does not match the expected size, or on an unknown filter byte.
// When the module-level stbi_png_partial flag is set only one row is decoded
// and the length check is skipped.
static int create_png_image_raw(png *a, ubyte *raw, uint raw_len, int out_n, uint x, uint y)
{
   stbi *s = a.s;
   uint i,j,stride = x*out_n;
   int k;
   int img_n = s.img_n; // copy it into a local for later
   assert(out_n == s.img_n || out_n == s.img_n+1);
   if (stbi_png_partial) y = 1;
   a.out_ = cast(ubyte*) malloc(x * y * out_n);
   if (!a.out_) throw new STBImageException("Out of memory");
   if (!stbi_png_partial) {
      // a full (non-interlaced) image must match exactly; an interlace pass
      // only needs at least this much data left in the buffer
      if (s.img_x == x && s.img_y == y) {
         if (raw_len != (img_n * x + 1) * y) throw new STBImageException("Not enough pixels, corrupt PNG");
      } else { // interlaced:
         if (raw_len < (img_n * x + 1) * y) throw new STBImageException("Not enough pixels, corrupt PNG");
      }
   }
   for (j=0; j < y; ++j) {
      ubyte *cur = a.out_ + stride*j;
      // for j == 0 this points before the buffer; it is never dereferenced
      // there because the first row is remapped through first_row_filter
      ubyte *prior = cur - stride;
      int filter = *raw++;
      if (filter > 4) throw new STBImageException("Invalid filter, corrupt PNG");
      // if first row, use special filter that doesn't sample previous row
      if (j == 0) filter = first_row_filter[filter];
      // handle first pixel explicitly
      // (it has no left neighbour, so "left" contributions are treated as 0)
      for (k=0; k < img_n; ++k) {
         switch (filter) {
            case F_none       : cur[k] = raw[k]; break;
            case F_sub        : cur[k] = raw[k]; break;
            case F_up         : cur[k] = cast(ubyte)(raw[k] + prior[k]); break;
            case F_avg        : cur[k] = cast(ubyte)(raw[k] + (prior[k]>>1)); break;
            case F_paeth      : cur[k] = cast(ubyte) (raw[k] + paeth(0,prior[k],0)); break;
            case F_avg_first  : cur[k] = raw[k]; break;
            case F_paeth_first: cur[k] = raw[k]; break;
            default: break;
         }
      }
      if (img_n != out_n) cur[img_n] = 255;
      raw += img_n;
      cur += out_n;
      prior += out_n;
      // this is a little gross, so that we don't switch per-pixel or per-component
      if (img_n == out_n) {

         // input and output pixels have the same size: all three cursors
         // advance by img_n per pixel
         for (i=x-1; i >= 1; --i, raw+=img_n,cur+=img_n,prior+=img_n)
            for (k=0; k < img_n; ++k)
            {
               switch (filter) {
                  case F_none:  cur[k] = raw[k]; break;
                  case F_sub:   cur[k] = cast(ubyte)(raw[k] + cur[k-img_n]); break;
                  case F_up:    cur[k] = cast(ubyte)(raw[k] + prior[k]); break;
                  case F_avg:   cur[k] = cast(ubyte)(raw[k] + ((prior[k] + cur[k-img_n])>>1)); break;
                  case F_paeth:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-img_n],prior[k],prior[k-img_n])); break;
                  case F_avg_first:    cur[k] = cast(ubyte)(raw[k] + (cur[k-img_n] >> 1)); break;
                  case F_paeth_first:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-img_n],0,0)); break;
                  default: break;
               }
            }
      } else {
         assert(img_n+1 == out_n);

         // output pixels carry one extra component: raw advances by img_n,
         // cur/prior by out_n, and the extra component is set to 255 after
         // each pixel's components have been written
         for (i=x-1; i >= 1; --i, cur[img_n]=255,raw+=img_n,cur+=out_n,prior+=out_n)
            for (k=0; k < img_n; ++k)
            {
               switch (filter) {
                  case F_none:  cur[k] = raw[k]; break;
                  case F_sub:   cur[k] = cast(ubyte)(raw[k] + cur[k-out_n]); break;
                  case F_up:    cur[k] = cast(ubyte)(raw[k] + prior[k]); break;
                  case F_avg:   cur[k] = cast(ubyte)(raw[k] + ((prior[k] + cur[k-out_n])>>1)); break;
                  case F_paeth:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-out_n],prior[k],prior[k-out_n])); break;
                  case F_avg_first:    cur[k] = cast(ubyte)(raw[k] + (cur[k-out_n] >> 1)); break;
                  case F_paeth_first:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-out_n],0,0)); break;
                  default: break;
               }
            }
      }
   }
   return 1;
}
1984 
/// Builds the final pixel buffer a.out_ from inflated PNG data.
///
/// For non-interlaced images this simply forwards to create_png_image_raw.
/// For interlaced images each of the seven passes is defiltered on its
/// own and its pixels are scattered into the full-size image.
///
/// Params:
///   a          = decoder state; a.out_ receives the malloc'ed result
///   raw        = inflated scanline data
///   raw_len    = number of bytes available in raw
///   out_n      = components per output pixel (img_n or img_n+1)
///   interlaced = non-zero when the image is interlaced
/// Returns: 1 on success, 0 if create_png_image_raw reports failure.
/// Throws: STBImageException on allocation failure or corrupt data.
int create_png_image(png *a, ubyte *raw, uint raw_len, int out_n, int interlaced)
{
   if (!interlaced)
      return create_png_image_raw(a, raw, raw_len, out_n, a.s.img_x, a.s.img_y);

   // Passes must always be decoded in full, whatever the caller's partial
   // setting is; restore that setting on every exit path, exceptions included.
   const int save = stbi_png_partial;
   stbi_png_partial = 0;
   scope(exit) stbi_png_partial = save;

   // de-interlacing
   ubyte* final_ = cast(ubyte*) malloc(a.s.img_x * a.s.img_y * out_n);
   if (final_ is null) throw new STBImageException("Out of memory");
   // create_png_image_raw signals corrupt data by throwing; do not leak then.
   scope(failure) free(final_);

   for (int p=0; p < 7; ++p) {
      static immutable int[7] xorig = [ 0,4,0,2,0,1,0 ];
      static immutable int[7] yorig = [ 0,0,4,0,2,0,1 ];
      static immutable int[7] xspc = [ 8,8,4,4,2,2,1 ];
      static immutable int[7] yspc = [ 8,8,8,4,4,2,2 ];
      int i,j,x,y;
      // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
      x = (a.s.img_x - xorig[p] + xspc[p]-1) / xspc[p];
      y = (a.s.img_y - yorig[p] + yspc[p]-1) / yspc[p];
      if (x && y) {
         if (!create_png_image_raw(a, raw, raw_len, out_n, x, y)) {
            free(final_);
            return 0;
         }
         for (j=0; j < y; ++j)
            for (i=0; i < x; ++i)
               memcpy(final_ + (j*yspc[p]+yorig[p])*a.s.img_x*out_n + (i*xspc[p]+xorig[p])*out_n,
                      a.out_ + (j*x+i)*out_n, out_n);
         free(a.out_);
         a.out_ = null; // never leave a dangling pointer in the decoder state

         // The filtered input holds img_n components per pixel (plus one
         // filter byte per row), independent of how many components are
         // written to the output, so the pass size must be based on img_n.
         const int pass_len = (a.s.img_n * x + 1) * y;
         raw += pass_len;
         raw_len -= pass_len;
      }
   }
   a.out_ = final_;

   return 1;
}
2025 
/// Applies colour-key transparency (from a tRNS chunk) to z.out_.
///
/// The buffer is expected to already carry an alpha channel set to 255.
/// With two components the alpha is rewritten for every pixel; with four
/// components only pixels matching the key colour get alpha 0.
///
/// Params:
///   z     = decoder state whose out_ buffer is modified in place
///   tc    = key colour (only tc[0] is used for grey images)
///   out_n = components per pixel, 2 or 4
/// Returns: always 1.
static int compute_transparency(png *z, ubyte[3] tc, int out_n)
{
   stbi* img = z.s;
   const uint total = img.img_x * img.img_y;
   ubyte* px = z.out_;

   // compute color-based transparency, assuming we've
   // already got 255 as the alpha value in the output
   assert(out_n == 2 || out_n == 4);

   const bool greyAlpha = (out_n == 2);
   const int pixelSize = greyAlpha ? 2 : 4;

   for (uint n = 0; n < total; ++n, px += pixelSize) {
      if (greyAlpha)
         px[1] = (px[0] != tc[0]) ? 255 : 0;
      else if (px[0] == tc[0] && px[1] == tc[1] && px[2] == tc[2])
         px[3] = 0;
   }
   return 1;
}
2050 
/// Replaces the 1-byte-per-pixel index image in a.out_ with an expanded
/// RGB (pal_img_n == 3) or RGBA (otherwise) image looked up in `palette`.
///
/// Params:
///   a         = decoder state; a.out_ is freed and replaced
///   palette   = table of 4-byte entries (r, g, b, a)
///   len       = number of palette entries (not used by this function)
///   pal_img_n = components per output pixel
/// Returns: always 1.
/// Throws: STBImageException if the new buffer cannot be allocated.
int expand_palette(png *a, ubyte *palette, int len, int pal_img_n)
{
   uint total = a.s.img_x * a.s.img_y;
   ubyte* indices = a.out_;

   ubyte* expanded = cast(ubyte*) malloc(total * pal_img_n);
   if (expanded is null)
      throw new STBImageException("Out of memory");

   // nothing below can throw, so `expanded` cannot leak before it is
   // stored in a.out_
   const bool withAlpha = (pal_img_n != 3);
   const int pixelSize = withAlpha ? 4 : 3;
   ubyte* dst = expanded;

   for (uint n = 0; n < total; ++n, dst += pixelSize) {
      const(ubyte)* entry = palette + indices[n] * 4;
      dst[0] = entry[0];
      dst[1] = entry[1];
      dst[2] = entry[2];
      if (withAlpha)
         dst[3] = entry[3];
   }

   free(a.out_);
   a.out_ = expanded;

   return 1;
}
2088 
/// Walks the chunks of a PNG stream and, depending on `scan`, stops after
/// the signature (SCAN_type), after enough has been read to know the
/// dimensions and component count (SCAN_header), or decodes the whole
/// image into z.out_ (SCAN_load).
///
/// Params:
///   z        = decoder state; idata/expanded/out_ are reset to null first
///   scan     = one of SCAN_type, SCAN_header, SCAN_load
///   req_comp = number of components the caller wants (0 = as stored)
/// Returns: 1 on success, 0 when a helper reports failure by return value.
/// Throws: STBImageException on malformed or unsupported input.
int parse_png_file(png *z, int scan, int req_comp)
{
   ubyte[1024] palette;      // up to 256 entries of 4 bytes (r, g, b, a)
   ubyte pal_img_n=0;        // 0 if not paletted, else 3 (or 4 once a tRNS was seen)
   ubyte has_trans=0;        // set when a colour-key tRNS was read
   ubyte[3] tc;              // colour key from tRNS
   uint ioff=0, idata_limit=0, i, pal_len=0;
   int first=1,k,interlace=0;
   stbi *s = z.s;

   z.expanded = null;
   z.idata = null;
   z.out_ = null;

   if (!check_png_header(s)) return 0;

   if (scan == SCAN_type) return 1;

   for (;;) {
      chunk c = get_chunk_header(s);
      switch (c.type) {
         case PNG_TYPE('I','H','D','R'): {
            int depth,color,comp,filter;
            if (!first) throw new STBImageException("Multiple IHDR, corrupt PNG");
            first = 0;
            if (c.length != 13) throw new STBImageException("Bad IHDR len, corrupt PNG");
            s.img_x = get32(s); if (s.img_x > (1 << 24)) throw new STBImageException("Very large image (corrupt?)");
            s.img_y = get32(s); if (s.img_y > (1 << 24)) throw new STBImageException("Very large image (corrupt?)");
            depth = get8(s);  if (depth != 8)        throw new STBImageException("8bit only, PNG not supported: 8-bit only");
            color = get8(s);  if (color > 6)         throw new STBImageException("Bad ctype, corrupt PNG");
            if (color == 3) pal_img_n = 3; else if (color & 1) throw new STBImageException("Bad ctype, corrupt PNG");
            comp  = get8(s);  if (comp) throw new STBImageException("Bad comp method, corrupt PNG");
            filter= get8(s);  if (filter) throw new STBImageException("Bad filter method, corrupt PNG");
            interlace = get8(s); if (interlace>1) throw new STBImageException("Bad interlace method, corrupt PNG");
            if (!s.img_x || !s.img_y) throw new STBImageException("0-pixel image, corrupt PNG");
            if (!pal_img_n) {
               // colour bit 2 selects RGB vs grey, bit 4 adds an alpha channel
               s.img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
               if ((1 << 30) / s.img_x / s.img_n < s.img_y) throw new STBImageException("Image too large to decode");
               if (scan == SCAN_header) return 1;
            } else {
               // if paletted, then pal_n is our final components, and
               // img_n is # components to decompress/filter.
               s.img_n = 1;
               if ((1 << 30) / s.img_x / 4 < s.img_y) throw new STBImageException("Too large, corrupt PNG");
               // if SCAN_header, have to scan to see if we have a tRNS
            }
            break;
         }

         case PNG_TYPE('P','L','T','E'):  {
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if (c.length > 256*3) throw new STBImageException("invalid PLTE, corrupt PNG");
            pal_len = c.length / 3;
            if (pal_len * 3 != c.length) throw new STBImageException("invalid PLTE, corrupt PNG");
            // entries are stored as RGB triples; widen to RGBA with opaque alpha
            for (i=0; i < pal_len; ++i) {
               palette[i*4+0] = get8u(s);
               palette[i*4+1] = get8u(s);
               palette[i*4+2] = get8u(s);
               palette[i*4+3] = 255;
            }
            break;
         }

         case PNG_TYPE('t','R','N','S'): {
            if (first) throw new STBImageException("first not IHDR, cCorrupt PNG");
            if (z.idata) throw new STBImageException("tRNS after IDAT, corrupt PNG");
            if (pal_img_n) {
               // paletted: tRNS supplies one alpha byte per palette entry
               if (scan == SCAN_header) { s.img_n = 4; return 1; }
               if (pal_len == 0) throw new STBImageException("tRNS before PLTE, corrupt PNG");
               if (c.length > pal_len) throw new STBImageException("bad tRNS len, corrupt PNG");
               pal_img_n = 4;
               for (i=0; i < c.length; ++i)
                  palette[i*4+3] = get8u(s);
            } else {
               // not paletted: tRNS supplies a single colour key
               if (!(s.img_n & 1)) throw new STBImageException("tRNS with alpha, corrupt PNG");
               if (c.length != cast(uint) s.img_n*2) throw new STBImageException("bad tRNS len, corrupt PNG");
               has_trans = 1;
               for (k=0; k < s.img_n; ++k)
                  tc[k] = cast(ubyte) get16(s); // non 8-bit images will be larger
            }
            break;
         }

         case PNG_TYPE('I','D','A','T'): {
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if (pal_img_n && !pal_len) throw new STBImageException("no PLTE, corrupt PNG");
            if (scan == SCAN_header) { s.img_n = pal_img_n; return 1; }
            // grow z.idata (doubling) until this chunk's payload fits
            if (ioff + c.length > idata_limit) {
               ubyte *p;
               if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
               while (ioff + c.length > idata_limit)
                  idata_limit *= 2;
               p = cast(ubyte*) realloc(z.idata, idata_limit); if (p == null) throw new STBImageException("outofmem, cOut of memory");
               z.idata = p;
            }
            if (!getn(s, z.idata+ioff,c.length)) throw new STBImageException("outofdata, corrupt PNG");
            ioff += c.length;
            break;
         }

         case PNG_TYPE('I','E','N','D'): {
            uint raw_len;
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if (scan != SCAN_load) return 1;
            if (z.idata == null) throw new STBImageException("no IDAT, corrupt PNG");
            z.expanded = stbi_zlib_decode_malloc_guesssize_headerflag(z.idata, ioff, 16384, cast(int *) &raw_len, 1);
            if (z.expanded == null) return 0; // zlib should set error
            free(z.idata); z.idata = null;
            // add an alpha channel while defiltering when the caller asked
            // for exactly one more component (2 or 4) on a non-paletted
            // image, or when a colour key has to be applied afterwards
            if ((req_comp == s.img_n+1 && req_comp != 3 && !pal_img_n) || has_trans)
               s.img_out_n = s.img_n+1;
            else
               s.img_out_n = s.img_n;
            if (!create_png_image(z, z.expanded, raw_len, s.img_out_n, interlace)) return 0;
            if (has_trans)
               if (!compute_transparency(z, tc, s.img_out_n)) return 0;
            if (pal_img_n) {
               // pal_img_n == 3 or 4
               s.img_n = pal_img_n; // record the actual colors we had
               s.img_out_n = pal_img_n;
               if (req_comp >= 3) s.img_out_n = req_comp;
               if (!expand_palette(z, palette.ptr, pal_len, s.img_out_n))
                  return 0;
            }
            free(z.expanded); z.expanded = null;
            return 1;
         }

         default:
            // if critical, fail
            // (bit 29 of the type is the lower-case bit of its first letter;
            // chunks with it set are skipped, the others are rejected)
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if ((c.type & (1 << 29)) == 0) {

               throw new STBImageException("PNG not supported: unknown chunk type");
            }
            skip(s, c.length);
            break;
      }
      // end of chunk, read and skip CRC
      get32(s);
   }
}
2230 
/// Decodes a complete PNG through `p` and hands the pixel buffer to the
/// caller.
///
/// Params:
///   p        = decoder state (p.s must be set)
///   x, y     = receive the image dimensions on success
///   n        = if non-null, receives the number of components in the file
///   req_comp = requested components per pixel, 0..4 (0 = as stored)
/// Returns: malloc'ed pixel data owned by the caller, or null on failure.
/// Throws: STBImageException on bad arguments or corrupt input.
ubyte *do_png(png *p, int *x, int *y, int *n, int req_comp)
{
   ubyte *result=null;
   if (req_comp < 0 || req_comp > 4)
      throw new STBImageException("Internal error: bad req_comp");

   // parse_png_file reports most errors by throwing, so the intermediate
   // buffers must be released on every exit path, not just on normal
   // return. free(null) is a no-op, so this is safe once ownership of
   // out_ has been transferred to `result`.
   scope(exit)
   {
      free(p.out_);     p.out_     = null;
      free(p.expanded); p.expanded = null;
      free(p.idata);    p.idata    = null;
   }

   if (parse_png_file(p, SCAN_load, req_comp)) {
      // take ownership of the decoded pixels
      result = p.out_;
      p.out_ = null;
      if (req_comp && req_comp != p.s.img_out_n) {
         result = convert_format(result, p.s.img_out_n, req_comp, p.s.img_x, p.s.img_y);
         p.s.img_out_n = req_comp;
         if (result == null) return result;
      }
      *x = p.s.img_x;
      *y = p.s.img_y;
      if (n) *n = p.s.img_n;
   }

   return result;
}
2254 
/// Loads a PNG from `s` by running do_png on a fresh decoder state.
/// Parameters and result are passed through to / from do_png unchanged.
ubyte *stbi_png_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   png decoder;
   decoder.s = s;
   return do_png(&decoder, x, y, comp, req_comp);
}
2261 
/// Checks that `s` starts with a PNG signature (consuming it).
/// Throws: STBImageException if it does not.
void stbi_png_test(stbi *s)
{
   const int headerOk = check_png_header(s);
   if (headerOk != 0)
       return;
   throw new STBImageException("Couldn't decode PNG header");
}
2268 
2269 // Microsoft/Windows BMP image
2270 
/// Checks that `s` starts with a BMP file header followed by an info
/// header of a supported size (12, 40, 56 or 108 bytes). Consumes the
/// bytes it inspects.
/// Throws: STBImageException if the stream does not look like such a BMP.
void stbi_bmp_test(stbi *s)
{
    enum failure = "Couldn't decode BMP header";

    if (get8(s) != 'B') throw new STBImageException(failure);
    if (get8(s) != 'M') throw new STBImageException(failure);

    get32le(s); // discard filesize
    get16le(s); // discard reserved
    get16le(s); // discard reserved
    get32le(s); // discard data offset

    const int headerSize = get32le(s);
    switch (headerSize)
    {
        case 12, 40, 56, 108:
            return;
        default:
            throw new STBImageException(failure);
    }
}
2285 
2286 
// returns the index (0..31) of the highest set bit of z, or -1 if z is 0
int high_bit(uint z)
{
   if (z == 0) return -1;

   // binary search: test the upper half of the remaining width each time
   static immutable int[5] halves = [ 16, 8, 4, 2, 1 ];
   int index = 0;
   foreach (width; halves) {
      if ((z >> width) != 0) {
         index += width;
         z >>= width;
      }
   }
   return index;
}
2299 
/// Returns the number of set bits in `a` (0..32).
int bitcount(uint a)
{
   int ones = 0;
   // each iteration clears the lowest set bit
   while (a != 0) {
      a &= a - 1;
      ++ones;
   }
   return ones;
}
2309 
/// Moves a masked channel value so that its top bit lands on bit 7 and
/// replicates the pattern downwards until 8 bits are covered.
///
/// Params:
///   v     = masked channel value
///   shift = right shift to apply (a negative value shifts left instead)
///   bits  = width of the channel in bits; must be positive
/// Returns: the widened value (callers truncate it to a byte).
int shiftsigned(int v, int shift, int bits)
{
   const int aligned = (shift < 0) ? (v << -shift) : (v >> shift);

   int widened = aligned;
   for (int offset = bits; offset < 8; offset += bits)
      widened += aligned >> offset;
   return widened;
}
2326 
/// Decodes a Windows BMP image from `s`.
///
/// Supports info headers of 12, 40, 56 and 108 bytes with 4/8-bit
/// paletted, 16-bit, 24-bit and 32-bit pixels. 1-bit and RLE-compressed
/// files are rejected.
///
/// Params:
///   s        = input stream; s.img_x / s.img_y / s.img_n are filled in
///   x, y     = receive the image dimensions
///   comp     = if non-null, receives the component count of the file (3 or 4)
///   req_comp = requested components per pixel (0 = as stored)
/// Returns: malloc'ed pixel data owned by the caller; null if convert_format fails.
/// Throws: STBImageException on unsupported or corrupt input, or out of memory.
ubyte *bmp_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   ubyte *out_;
   uint mr=0,mg=0,mb=0,ma=0, fake_a=0;
   ubyte[4][256] pal;
   int psize=0,i,j,compress=0,width;
   int bpp, flip_vertically, pad, target, offset, hsz;
   if (get8(s) != 'B' || get8(s) != 'M') throw new STBImageException("not BMP, Corrupt BMP");
   get32le(s); // discard filesize
   get16le(s); // discard reserved
   get16le(s); // discard reserved
   offset = get32le(s);
   hsz = get32le(s);
   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108) throw new STBImageException("unknown BMP, BMP type not supported: unknown");
   if (hsz == 12) {
      s.img_x = get16le(s);
      s.img_y = get16le(s);
   } else {
      s.img_x = get32le(s);
      s.img_y = get32le(s);
   }
   if (get16le(s) != 1) throw new STBImageException("bad BMP");
   bpp = get16le(s);
   if (bpp == 1) throw new STBImageException("monochrome, BMP type not supported: 1-bit");
   // a positive height means the rows are stored bottom-up
   flip_vertically = (cast(int) s.img_y) > 0;
   s.img_y = abs(cast(int) s.img_y);
   if (hsz == 12) {
      if (bpp < 24)
         psize = (offset - 14 - 24) / 3;
   } else {
      compress = get32le(s);
      if (compress == 1 || compress == 2) throw new STBImageException("BMP RLE, BMP type not supported: RLE");
      get32le(s); // discard sizeof
      get32le(s); // discard hres
      get32le(s); // discard vres
      get32le(s); // discard colorsused
      get32le(s); // discard max important
      if (hsz == 40 || hsz == 56) {
         if (hsz == 56) {
            get32le(s);
            get32le(s);
            get32le(s);
            get32le(s);
         }
         if (bpp == 16 || bpp == 32) {
            mr = mg = mb = 0;
            if (compress == 0) {
               if (bpp == 32) {
                  mr = 0xffu << 16;
                  mg = 0xffu <<  8;
                  mb = 0xffu <<  0;
                  ma = 0xffu << 24;
                  fake_a = 1; // @TODO: check for cases like alpha value is all 0 and switch it to 255
               } else {
                  mr = 31u << 10;
                  mg = 31u <<  5;
                  mb = 31u <<  0;
               }
            } else if (compress == 3) {
               mr = get32le(s);
               mg = get32le(s);
               mb = get32le(s);
               // not documented, but generated by photoshop and handled by mspaint
               if (mr == mg && mg == mb) {
                  // ?!?!?
                  throw new STBImageException("bad BMP");
               }
            } else
               throw new STBImageException("bad BMP");
         }
      } else {
         assert(hsz == 108);
         mr = get32le(s);
         mg = get32le(s);
         mb = get32le(s);
         ma = get32le(s);
         get32le(s); // discard color space
         for (i=0; i < 12; ++i)
            get32le(s); // discard color space parameters
      }
      if (bpp < 16)
         psize = (offset - 14 - hsz) >> 2;
   }
   s.img_n = ma ? 4 : 3;
   if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
      target = req_comp;
   else
      target = s.img_n; // if they want monochrome, we'll post-convert
   out_ = cast(ubyte*) malloc(target * s.img_x * s.img_y);
   if (!out_) throw new STBImageException("Out of memory");
   if (bpp < 16) {
      // paletted image: read the colour table, then the indices
      int z=0;
      if (psize == 0 || psize > 256) { free(out_); throw new STBImageException("invalid, Corrupt BMP"); }
      for (i=0; i < psize; ++i) {
         pal[i][2] = get8u(s);
         pal[i][1] = get8u(s);
         pal[i][0] = get8u(s);
         if (hsz != 12) get8(s);
         pal[i][3] = 255;
      }
      skip(s, offset - 14 - hsz - psize * (hsz == 12 ? 3 : 4));
      if (bpp == 4) width = (s.img_x + 1) >> 1;
      else if (bpp == 8) width = s.img_x;
      else { free(out_); throw new STBImageException("bad bpp, corrupt BMP"); }
      pad = (-width)&3; // rows are padded to a multiple of 4 bytes
      for (j=0; j < cast(int) s.img_y; ++j) {
         // two pixels per iteration so that a 4-bit byte is consumed whole
         for (i=0; i < cast(int) s.img_x; i += 2) {
            int v=get8(s),v2=0;
            if (bpp == 4) {
               v2 = v & 15;
               v >>= 4;
            }
            out_[z++] = pal[v][0];
            out_[z++] = pal[v][1];
            out_[z++] = pal[v][2];
            if (target == 4) out_[z++] = 255;
            if (i+1 == cast(int) s.img_x) break;
            v = (bpp == 8) ? get8(s) : v2;
            out_[z++] = pal[v][0];
            out_[z++] = pal[v][1];
            out_[z++] = pal[v][2];
            if (target == 4) out_[z++] = 255;
         }
         skip(s, pad);
      }
   } else {
      int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
      int z = 0;
      int easy=0;
      skip(s, offset - 14 - hsz);
      if (bpp == 24) width = 3 * s.img_x;
      else if (bpp == 16) width = 2*s.img_x;
      else /* bpp = 32 and pad = 0 */ width=0;
      pad = (-width) & 3;
      // easy == 1: plain 24-bit BGR; easy == 2: 32-bit BGRA with byte masks
      if (bpp == 24) {
         easy = 1;
      } else if (bpp == 32) {
         if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
            easy = 2;
      }
      if (!easy) {
         if (!mr || !mg || !mb) { free(out_); throw new STBImageException("bad masks, corrupt BMP"); }
         // right shift amt to put high bit in position #7
         // Each channel's width comes from its own mask, so that masks of
         // different widths (e.g. 5-6-5) are expanded correctly. The alpha
         // values are only used when ma is non-zero (see below).
         rshift = high_bit(mr)-7; rcount = bitcount(mr);
         gshift = high_bit(mg)-7; gcount = bitcount(mg);
         bshift = high_bit(mb)-7; bcount = bitcount(mb);
         ashift = high_bit(ma)-7; acount = bitcount(ma);
      }
      for (j=0; j < cast(int) s.img_y; ++j) {
         if (easy) {
            for (i=0; i < cast(int) s.img_x; ++i) {
               int a;
               out_[z+2] = get8u(s);
               out_[z+1] = get8u(s);
               out_[z+0] = get8u(s);
               z += 3;
               // the alpha byte is always consumed from a 32-bit pixel,
               // even when it is not stored
               a = (easy == 2 ? get8(s) : 255);
               if (target == 4) out_[z++] = cast(ubyte) a;
            }
         } else {
            for (i=0; i < cast(int) s.img_x; ++i) {
               uint v = (bpp == 16 ? get16le(s) : get32le(s));
               int a;
               out_[z++] = cast(ubyte) shiftsigned(v & mr, rshift, rcount);
               out_[z++] = cast(ubyte) shiftsigned(v & mg, gshift, gcount);
               out_[z++] = cast(ubyte) shiftsigned(v & mb, bshift, bcount);
               a = (ma ? shiftsigned(v & ma, ashift, acount) : 255);
               if (target == 4) out_[z++] = cast(ubyte) a;
            }
         }
         skip(s, pad);
      }
   }
   if (flip_vertically) {
      // swap rows top <-> bottom in place
      ubyte t;
      for (j=0; j < cast(int) s.img_y>>1; ++j) {
         ubyte *p1 = out_ +      j     *s.img_x*target;
         ubyte *p2 = out_ + (s.img_y-1-j)*s.img_x*target;
         for (i=0; i < cast(int) s.img_x*target; ++i) {
            t = p1[i], p1[i] = p2[i], p2[i] = t;
         }
      }
   }

   if (req_comp && req_comp != target) {
      out_ = convert_format(out_, target, req_comp, s.img_x, s.img_y);
      if (out_ == null) return out_; // convert_format frees input on failure
   }

   *x = s.img_x;
   *y = s.img_y;
   if (comp) *comp = s.img_n;
   return out_;
}
2521 
/// Public entry point for BMP decoding; forwards every argument to
/// bmp_load and returns its result unchanged.
ubyte *stbi_bmp_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   ubyte* pixels = bmp_load(s, x, y, comp, req_comp);
   return pixels;
}
2526 
2527 // *************************************************************************************************
2528 // GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
/// One entry of the GIF LZW dictionary.
///
/// prefix : index of the entry this one extends, or negative for a root
/// first  : first byte of the string this entry stands for
/// suffix : last byte of the string (the one emitted for this entry)
struct stbi_gif_lzw
{
   short prefix;
   ubyte first, suffix;
}
2535 
/// State of the GIF decoder.
struct stbi_gif
{
   int w;                       // logical screen width read from the header
   int h;                       // logical screen height read from the header
   ubyte* out_;                 // output buffer (always 4 components)
   int flags;                   // header flags; bit 0x80 = global color table present
   int bgindex;                 // background color index read from the header
   int ratio;                   // aspect ratio byte read from the header
   int transparent;             // transparent color index, -1 after header parsing
   int eflags;                  // NOTE(review): presumably graphic-control-extension flags - set outside this view
   ubyte[4][256] pal;           // global color table
   ubyte[4][256] lpal;          // NOTE(review): presumably the local color table - filled outside this view
   stbi_gif_lzw[4096] codes;    // LZW dictionary
   ubyte* color_table;          // table stbi_out_gif_code looks colors up in (4 bytes per entry)
   int parse;                   // remaining interlace passes, counted down in stbi_out_gif_code
   int step;                    // byte distance added to cur_y at the end of each row
   int lflags;                  // NOTE(review): presumably the image-descriptor flags - set outside this view
   int start_x;                 // value cur_x is reset to at the end of a row
   int start_y;                 // base offset used when an interlace pass restarts
   int max_x;                   // cur_x limit; reaching it ends the row
   int max_y;                   // cur_y limit; pixels at or beyond it are dropped
   int cur_x;                   // current byte offset within the row (advances by 4)
   int cur_y;                   // current row byte offset; out_[cur_x + cur_y] is written
   int line_size;               // size of one row, used to derive `step`
}
2552 
/// Checks that `s` starts with "GIF87a" or "GIF89a", consuming the bytes
/// it inspects.
/// Throws: STBImageException if the signature does not match.
void stbi_gif_test(stbi *s)
{
    enum failure = "Couldn't decode GIF header";

    const bool prefixOk =
        get8(s) == 'G' && get8(s) == 'I' && get8(s) == 'F' && get8(s) == '8';
    if (!prefixOk)
        throw new STBImageException(failure);

    const int ver = get8(s);
    if (!(ver == '9' || ver == '7'))
        throw new STBImageException(failure);

    if (get8(s) != 'a')
        throw new STBImageException(failure);
}
2564 
/// Reads `num_entries` RGB triples from `s` into `pal`.
///
/// Each entry is stored as (b, g, r, a): the three bytes read from the
/// stream go to indices 2, 1, 0 in that order, and alpha is 255 except
/// for the entry whose index equals `transp`, which gets alpha 0.
///
/// Params:
///   s           = input stream
///   pal         = table to fill. Taken by `ref`: static arrays are value
///                 types in D, so without it the caller's table would
///                 never be modified.
///   num_entries = number of entries to read (at most 256)
///   transp      = index of the transparent entry, or -1 for none
void stbi_gif_parse_colortable(stbi *s, ref ubyte[4][256] pal, int num_entries, int transp)
{
   for (int i = 0; i < num_entries; ++i) {
      pal[i][2] = get8u(s);
      pal[i][1] = get8u(s);
      pal[i][0] = get8u(s);
      // only the designated index is transparent; -1 never matches
      pal[i][3] = (transp == i) ? 0 : 255;
   }
}
2575 
/// Parses the GIF signature and logical screen descriptor into `g`.
///
/// Params:
///   s       = input stream
///   g       = decoder state; w, h, flags, bgindex, ratio and transparent
///             are filled in
///   comp    = if non-null, receives 4
///   is_info = when non-zero, stop before reading the global color table
/// Returns: always 1.
/// Throws: STBImageException if the signature is not GIF87a / GIF89a.
int stbi_gif_header(stbi *s, stbi_gif *g, int *comp, int is_info)
{
   const bool prefixOk =
      get8(s) == 'G' && get8(s) == 'I' && get8(s) == 'F' && get8(s) == '8';
   if (!prefixOk)
      throw new STBImageException("not GIF, corrupt GIF");

   const ubyte ver = get8u(s);
   if (!(ver == '7' || ver == '9'))
      throw new STBImageException("not GIF, corrupt GIF");
   if (get8(s) != 'a')
      throw new STBImageException("not GIF, corrupt GIF");

   g.w = get16le(s);
   g.h = get16le(s);
   g.flags = get8(s);
   g.bgindex = get8(s);
   g.ratio = get8(s);
   g.transparent = -1;

   // can't actually tell whether it's 3 or 4 until we parse the comments
   if (comp !is null)
      *comp = 4;

   // flag 0x80: a global color table of 2 << (flags & 7) entries follows
   const bool hasGlobalTable = (g.flags & 0x80) != 0;
   if (!is_info && hasGlobalTable)
      stbi_gif_parse_colortable(s, g.pal, 2 << (g.flags & 7), -1);

   return 1;
}
2602 
/// Writes the pixel string represented by LZW entry `code` to g.out_ at
/// the current position and advances that position, handling row ends
/// and interlace passes.
void stbi_out_gif_code(stbi_gif *g, ushort code)
{
   // The dictionary chains entries back to front, so emit the prefix
   // first by recursing; that yields the pixels in forward order.
   const short parent = g.codes[code].prefix;
   if (parent >= 0)
      stbi_out_gif_code(g, cast(ushort) parent);

   // past the bottom of the image: drop the pixel
   if (g.cur_y >= g.max_y) return;

   ubyte* dst = g.out_ + (g.cur_x + g.cur_y);
   ubyte* color = g.color_table + g.codes[code].suffix * 4;

   // entries with alpha below 128 leave the existing pixel untouched
   if (color[3] >= 128) {
      dst[0] = color[2];
      dst[1] = color[1];
      dst[2] = color[0];
      dst[3] = color[3];
   }
   g.cur_x += 4;

   if (g.cur_x < g.max_x)
      return;

   // end of row: go back to the left edge and move down by one step
   g.cur_x = g.start_x;
   g.cur_y += g.step;

   // ran off the bottom while passes remain: start the next pass with
   // its own step and starting row
   while (g.cur_y >= g.max_y && g.parse > 0) {
      g.step = (1 << g.parse) * g.line_size;
      g.cur_y = g.start_y + (g.step >> 1);
      --g.parse;
   }
}
2637 
/// Decodes the LZW-compressed raster data of one GIF image from `s`,
/// writing pixels through stbi_out_gif_code.
///
/// Params:
///   s = input stream, positioned at the LZW minimum code size byte
///   g = decoder state (output buffer, position and color table set up
///       by the caller)
/// Returns: g.out_ once the end-of-stream code or a zero-length block is
///          reached.
/// Throws: STBImageException on corrupt LZW data.
ubyte *stbi_process_gif_raster(stbi *s, stbi_gif *g)
{
   ubyte lzw_cs;
   int len;
   uint first;
   int codesize, codemask, avail, oldcode, bits, valid_bits, clear;
   stbi_gif_lzw *p;

   lzw_cs = get8u(s);
   // A larger value would make the root codes alone overflow the
   // 4096-entry dictionary.
   if (lzw_cs > 12) throw new STBImageException("bad code size, corrupt GIF");
   clear = 1 << lzw_cs;
   first = 1;
   codesize = lzw_cs + 1;
   codemask = (1 << codesize) - 1;
   bits = 0;
   valid_bits = 0;
   // root entries: every literal value stands for itself
   for (int root = 0; root < clear; root++) {
      g.codes[root].prefix = -1;
      g.codes[root].first = cast(ubyte) root;
      g.codes[root].suffix = cast(ubyte) root;
   }

   // support no starting clear code
   avail = clear+2;
   oldcode = -1;

   len = 0;
   for(;;) {
      if (valid_bits < codesize) {
         // not enough bits buffered for a code: pull in one more byte
         if (len == 0) {
            len = get8(s); // start new block
            if (len == 0)
               return g.out_;
         }
         --len;
         bits |= cast(int) get8(s) << valid_bits;
         valid_bits += 8;
      } else {
         int code = bits & codemask;
         bits >>= codesize;
         valid_bits -= codesize;
         // @OPTIMIZE: is there some way we can accelerate the non-clear path?
         if (code == clear) {  // clear code
            codesize = lzw_cs + 1;
            codemask = (1 << codesize) - 1;
            avail = clear + 2;
            oldcode = -1;
            first = 0;
         } else if (code == clear + 1) { // end of stream code
            skip(s, len);
            while ((len = get8(s)) > 0)
               skip(s,len);
            return g.out_;
         } else if (code <= avail) {
            if (first) throw new STBImageException("no clear code, corrupt GIF");

            if (oldcode >= 0) {
               // check before indexing so a full dictionary is reported as
               // corrupt data instead of touching codes[4096]
               if (avail >= 4096)       throw new STBImageException("too many codes, corrupt GIF");
               p = &g.codes[avail++];
               p.prefix = cast(short) oldcode;
               p.first = g.codes[oldcode].first;
               p.suffix = (code == avail) ? p.first : g.codes[code].first;
            } else if (code == avail)
               throw new STBImageException("illegal code in raster, corrupt GIF");

            // emit the code that was just decoded
            stbi_out_gif_code(g, cast(ushort) code);

            if ((avail & codemask) == 0 && avail <= 0x0FFF) {
               codesize++;
               codemask = (1 << codesize) - 1;
            }

            oldcode = code;
         } else {
            throw new STBImageException("illegal code in raster, corrupt GIF");
         }
      }
   }
}
2716 
/// Paints every pixel of the canvas g.out_ with the palette entry selected
/// by g.bgindex. Palette components 0 and 2 are swapped while copying;
/// component 3 is copied unchanged.
void stbi_fill_gif_background(stbi_gif *g)
{
   ubyte *bg = g.pal[g.bgindex].ptr;
   int total = g.w * g.h * 4; // canvas size in bytes, 4 bytes per pixel

   // @OPTIMIZE: write a dword at a time
   for (int off = 0; off < total; off += 4) {
      ubyte *px = g.out_ + off;
      px[0] = bg[2];
      px[1] = bg[1];
      px[2] = bg[0];
      px[3] = bg[3];
   }
}
2730 
// this function is designed to support animated gifs, although stb_image doesn't support it
///
/// Reads blocks from the stream until one image has been decoded.
///
/// On the first call (g.out_ is null) the GIF header is parsed, the canvas is
/// allocated and filled with the background colour. On later calls, when bits
/// 2..4 of g.eflags equal 3, a fresh canvas is allocated and the previous one
/// is copied into it.
///
/// Params:
///   s        = input stream.
///   g        = decoder state, updated in place.
///   comp     = forwarded to stbi_gif_header.
///   req_comp = requested number of components; when non-zero and not 4 the
///              decoded RGBA canvas is passed through convert_format.
/// Returns: the decoded pixels; null if stbi_gif_header reports failure;
///          cast(ubyte*) 1 as a sentinel when the 0x3B trailer is reached.
/// Throws: $(D STBImageException) on corrupt data, oversized dimensions or
///         allocation failure.
ubyte *stbi_gif_load_next(stbi *s, stbi_gif *g, int *comp, int req_comp)
{
   int i;
   ubyte *old_out = null;

   if (g.out_ == null) {
      if (!stbi_gif_header(s, g, comp,0))     return null; // header was rejected
      g.out_ = cast(ubyte*) malloc(stbi_gif_canvas_bytes(g));
      if (g.out_ == null)                      throw new STBImageException("Out of memory");
      stbi_fill_gif_background(g);
   } else {
      // animated-gif-only path
      if (((g.eflags & 0x1C) >> 2) == 3) {
         size_t canvas_bytes = stbi_gif_canvas_bytes(g);
         old_out = g.out_;
         g.out_ = cast(ubyte*) malloc(canvas_bytes);
         if (g.out_ == null)                   throw new STBImageException("Out of memory");
         memcpy(g.out_, old_out, canvas_bytes);
      }
   }

   for (;;) {
      switch (get8(s)) {
         case 0x2C: /* Image Descriptor */
         {
            int x, y, w, h;
            ubyte *o;

            x = get16le(s);
            y = get16le(s);
            w = get16le(s);
            h = get16le(s);
            // The sub-image must lie entirely inside the canvas.
            if (((x + w) > (g.w)) || ((y + h) > (g.h)))
               throw new STBImageException("bad Image Descriptor, corrupt GIF");

            // All positions below are byte offsets into the RGBA canvas.
            g.line_size = g.w * 4;
            g.start_x = x * 4;
            g.start_y = y * g.line_size;
            g.max_x   = g.start_x + w * 4;
            g.max_y   = g.start_y + h * g.line_size;
            g.cur_x   = g.start_x;
            g.cur_y   = g.start_y;

            g.lflags = get8(s);

            if (g.lflags & 0x40) {
               g.step = 8 * g.line_size; // first interlaced spacing
               g.parse = 3;
            } else {
               g.step = g.line_size;
               g.parse = 0;
            }

            if (g.lflags & 0x80) {
               // A colour table follows the descriptor: read it into g.lpal.
               stbi_gif_parse_colortable(s,g.lpal, 2 << (g.lflags & 7), g.eflags & 0x01 ? g.transparent : -1);
               g.color_table = &g.lpal[0][0];
            } else if (g.flags & 0x80) {
               // Use g.pal, re-deriving the alpha channel for this frame.
               for (i=0; i < 256; ++i)  // @OPTIMIZE: reset only the previous transparent
                  g.pal[i][3] = 255;
               if (g.transparent >= 0 && (g.eflags & 0x01))
                  g.pal[g.transparent][3] = 0;
               g.color_table = &g.pal[0][0];
            } else
               throw new STBImageException("missing color table, corrupt GIF");

            o = stbi_process_gif_raster(s, g);
            if (o == null) return null;

            if (req_comp && req_comp != 4)
               o = convert_format(o, 4, req_comp, g.w, g.h);
            return o;
         }

         case 0x21: // Extension Introducer (any extension block).
         {
            int len;
            if (get8(s) == 0xF9) { // Graphic Control Extension.
               len = get8(s);
               if (len == 4) {
                  g.eflags = get8(s);
                  get16le(s); // delay
                  g.transparent = get8(s);
               } else {
                  skip(s, len);
                  break;
               }
            }
            // Skip the remaining sub-blocks up to the zero-length terminator.
            while ((len = get8(s)) != 0)
               skip(s, len);
            break;
         }

         case 0x3B: // gif stream termination code
            return cast(ubyte*) 1;

         default:
            throw new STBImageException("unknown code, corrupt GIF");
      }
   }
}

/// Returns the size in bytes of the RGBA canvas (4 * g.w * g.h).
///
/// The product is validated against int.max before it is formed, because the
/// decoder addresses the canvas with int byte offsets; hostile dimensions
/// would otherwise wrap and yield an undersized allocation.
/// Throws: $(D STBImageException) if the size does not fit in an int.
size_t stbi_gif_canvas_bytes(stbi_gif *g)
{
   if (g.w < 0 || g.h < 0 || (g.w > 0 && g.h > (int.max / 4) / g.w))
      throw new STBImageException("image too large, corrupt GIF");
   return cast(size_t) 4 * g.w * g.h;
}
2831 
/// Loads the first image of a GIF stream.
///
/// Params:
///   s        = input stream.
///   x        = receives the canvas width on success.
///   y        = receives the canvas height on success.
///   comp     = forwarded to stbi_gif_load_next.
///   req_comp = requested number of components, forwarded to stbi_gif_load_next.
/// Returns: the decoded pixels, or null when no image could be produced
///          (*x and *y are left untouched in that case).
ubyte *stbi_gif_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   ubyte *u = null;
   stbi_gif g={0};

   u = stbi_gif_load_next(s, &g, comp, req_comp);
   if (u == cast(void *) 1) {
      // end of animated gif marker: the stream ended before any image.
      // The canvas allocated by stbi_gif_load_next is not handed to anyone
      // on this path, so release it here instead of leaking it.
      free(g.out_);
      g.out_ = null;
      u = null;
   }
   if (u) {
      *x = g.w;
      *y = g.h;
   }

   return u;
}
2846 
2847