1 
2 /// D translation of stb_image-1.33 (http://nothings.org/stb_image.c)
3 ///
4 /// This port only supports:
5 /// $(UL
6 ///   $(LI PNG 8-bit-per-channel only.)
7 ///   $(LI JPEG baseline (no JPEG progressive).)
8 ///   $(LI GIF.)
9 ///   $(LI BMP non-1bpp, non-RLE.)
10 /// )
11 ///
12 /// TODO:
13 /// $(UL
14 ///   $(LI Support a range as input.)
15 ///  )
16 
17 //============================    Contributors    =========================
18 //
19 // Image formats                                Optimizations & bugfixes
20 // Sean Barrett (jpeg, png, bmp)                Fabian "ryg" Giesen
21 // Nicolas Schulz (hdr, psd)                                                 
22 // Jonathan Dummer (tga)                     Bug fixes & warning fixes           
23 // Jean-Marc Lienher (gif)                      Marc LeBlanc               
24 // Tom Seddon (pic)                             Christpher Lloyd           
25 // Thatcher Ulrich (psd)                        Dave Moore                 
26 // Won Chun                   
27 // the Horde3D community      
28 // Extensions, features                            Janez Zemva                
29 // Jetro Lauha (stbi_info)                      Jonathan Blow              
30 // James "moose2000" Brown (iPhone PNG)         Laurent Gomila                             
31 // Ben "Disch" Wenger (io callbacks)            Aruelien Pocheville
32 // Martin "SpartanJ" Golini                     Ryamond Barbiero
33 // David Woo
34 
35 module gfm.image.stb_image;
36 
37 import core.stdc.stdlib;
38 import core.stdc.string;
39 
40 import gfm.math.vector,
41        gfm.image.bitmap;
42 
/// stb_image version constant exposed by this module.
enum STBI_VERSION = 1;
44 
45 /// The exception type thrown when loading an image failed.
/// The exception type thrown when loading an image failed.
class STBImageException : Exception
{
    /// Builds the exception.
    ///
    /// Params:
    ///   msg  = description of the failure.
    ///   file = source file recorded in the exception; defaults to the file
    ///          of the `new STBImageException(...)` expression.
    ///   line = source line recorded in the exception; defaults to the line
    ///          of the `new STBImageException(...)` expression.
    ///   next = optional exception to chain.
    public this(string msg, string file = __FILE__, size_t line = __LINE__, Throwable next = null)
    {
        // Forwarding file/line makes the exception point at the throw site
        // rather than at this constructor.
        super(msg, file, line, next);
    }
}
56 
/// Component-count constants (number of channels per pixel).
enum : int
{
    STBI_default = 0,   // only used for req_comp
    STBI_grey,          // = 1
    STBI_grey_alpha,    // = 2
    STBI_rgb,           // = 3
    STBI_rgb_alpha      // = 4
}
65 
66 // define faster low-level operations (typically SIMD support)
67 
68 
/// Rotates the 32-bit value `x` left by `y` bits.
///
/// Both shift counts are reduced modulo 32, so `y == 0` (or any multiple of
/// 32) never turns into a shift by the full width of `uint`, which D does not
/// define. For `y` in 1 .. 31 the result is the plain rotate.
uint stbi_lrot(uint x, uint y)
{
    immutable uint n = y & 31;
    return (x << n) | (x >> ((32 - n) & 31));
}
73 
74 // stbi structure is our basic context used by all images, so it
75 // contains all the IO context, plus some basic image information
struct stbi
{
   // image width (img_x) and height (img_y) as read from the file header
   uint img_x, img_y;
   // img_n: number of components in the file.
   // img_out_n: not referenced in this part of the file -- presumably the
   // component count of the produced output; confirm against the loaders.
   int img_n, img_out_n;

   // buflen / buffer_start are not referenced in this part of the file;
   // NOTE(review): they look like leftovers of the callback-based I/O path
   // of the C original -- confirm before removing.
   int buflen;
   ubyte[128] buffer_start;   // D-style declaration (C-style `ubyte x[128]` is no longer accepted)

   const(ubyte) *img_buffer;            // current read position
   const(ubyte) *img_buffer_end;        // one past the last readable byte
   const(ubyte) *img_buffer_original;   // start of the buffer, used by stbi_rewind
}
88 
89 
90 // initialize a memory-decode context
// Points the context `s` at the in-memory image `buffer` of `len` bytes:
// the read cursor and the rewind position both start at `buffer`, and the
// end marker is placed `len` bytes further.
void start_mem(stbi *s, const(ubyte)*buffer, int len)
{
    const(ubyte)* first = buffer;
    const(ubyte)* past  = buffer + len;

    s.img_buffer_original = first;
    s.img_buffer          = first;
    s.img_buffer_end      = past;
}
97 
// Moves the read cursor of `s` back to the start of its buffer.
void stbi_rewind(stbi *s)
{
    // Conceptually a rewind should go back to the beginning of the stream;
    // going back to the beginning of the initial buffer is enough here,
    // because it is only used after a format 'test', which never looks at
    // more than 92 bytes.
    auto start = s.img_buffer_original;
    s.img_buffer = start;
}
105 
106 
// Detects the format of the image held by `s` and decodes it.
//
// The formats are tried in a fixed order: JPEG, PNG, BMP, GIF. For each one
// the format test is run, the stream is rewound, and the matching loader is
// called with `x`, `y`, `comp` and `req_comp` forwarded unchanged. When an
// STBImageException escapes the test or the loader, the stream is rewound
// and the next format is tried.
//
// Returns: whatever the first successful loader returns.
// Throws: STBImageException when no format produced a result.
//
// NOTE(review): each loader call sits in the same try block as its format
// test, so an STBImageException raised while decoding a correctly identified
// image is swallowed too; the caller then only sees the generic message
// thrown at the end. Confirm whether that is intended.
ubyte *stbi_load_main(stbi *s, int *x, int *y, int *comp, int req_comp)
{
    // JPEG
    try
    {
        stbi_jpeg_test(s);
        stbi_rewind(s);
        return stbi_jpeg_load(s,x,y,comp,req_comp);
    }
    catch(STBImageException e)
    {
        stbi_rewind(s);
    }

    // PNG
    try
    {
        stbi_png_test(s);
        stbi_rewind(s);
        return stbi_png_load(s,x,y,comp,req_comp);
    }
    catch(STBImageException e)
    {
        stbi_rewind(s);
    }

    // BMP
    try
    {
        stbi_bmp_test(s);
        stbi_rewind(s);
        return stbi_bmp_load(s,x,y,comp,req_comp);
    }
    catch(STBImageException e)
    {
        stbi_rewind(s);
    }

    // GIF
    try
    {
        stbi_gif_test(s);
        stbi_rewind(s);
        return stbi_gif_load(s,x,y,comp,req_comp);
    }
    catch(STBImageException e)
    {
        stbi_rewind(s);
    }

    throw new STBImageException("Image not of any known type, or corrupt");
}
155 
156 /// Loads an image from memory.
157 /// Throws: STBImageException on error.
/// Loads an image from memory.
///
/// Params:
///   buffer = the encoded image file contents.
///   width = receives the value the loader stores for the image width.
///   height = receives the value the loader stores for the image height.
///   components = receives the component count reported by the loader.
///   requestedComponents = component count passed to the loader as req_comp.
/// Returns: the pointer produced by the loader; release it with stbi_image_free.
/// Throws: STBImageException on error.
ubyte* stbi_load_from_memory(void[] buffer, out int width, out int height, out int components, int requestedComponents)
{
   // The decoding context stores the length as an int; refuse buffers whose
   // length would be truncated (and possibly turn negative) by the cast below.
   if (buffer.length > int.max)
      throw new STBImageException("Image buffer too large");

   stbi s;
   start_mem(&s, cast(ubyte*)buffer.ptr, cast(int)(buffer.length));
   return stbi_load_main(&s, &width, &height, &components, requestedComponents);
}
164 
165 /// Frees an image loaded by stb_image.
/// Releases a pixel buffer returned by one of the stb_image loading functions.
/// The pointer is handed to the C `free`.
void stbi_image_free(void *retval_from_stbi_load)
{
    void* pixels = retval_from_stbi_load;
    free(pixels);
}
170 
171 /// Load an image from memory and puts it in a Bitmap.
172 /// See_also: Bitmap.
173 /// Throws: STBImageException on error.
/// Load an image from memory and puts it in a Bitmap.
///
/// The image is requested with 4 components per pixel and copied into a
/// freshly created `Bitmap!vec4ub`; the intermediate buffer is always freed.
///
/// Params:
///   buffer = the encoded image file contents.
/// Returns: a bitmap of the decoded size.
/// See_also: Bitmap.
/// Throws: STBImageException on error.
Bitmap!vec4ub stbiLoadImage(void[] buffer)
{
    enum int channels = 4; // one ubyte per channel, one vec4ub per pixel

    int width, height, components;
    ubyte* data = stbi_load_from_memory(buffer, width, height, components, channels);
    scope(exit) stbi_image_free(data);

    // NOTE(review): if the loaders report the file's native channel count in
    // `components` (as the C original does) rather than the converted one,
    // this check rejects every image that is not RGBA on disk -- confirm
    // against the loaders.
    if(components != 4)
        throw new STBImageException("Could't convert image to 4 components");

    auto result = Bitmap!vec4ub(vec2i(width, height));

    // memcpy counts bytes: every pixel takes `channels` bytes, so the size is
    // width * height * channels (computed in size_t to avoid int overflow).
    memcpy(result.ptr, data, cast(size_t) width * height * channels);
    return result;
}
187 
188 //
189 // Common code used by all image loaders
190 //
191 
// Scan-mode constants.
enum : int
{
    SCAN_load   = 0,
    SCAN_type   = 1,
    SCAN_header = 2
}
198 
199 
// Reads the next byte of `s` and advances the cursor.
// Returns 0 without moving once the end of the buffer has been reached.
int get8(stbi *s)
{
    if (s.img_buffer >= s.img_buffer_end)
        return 0;

    int value = *s.img_buffer;
    ++s.img_buffer;
    return value;
}
207 
// Returns 1 when the read cursor of `s` is at (or past) the end of the
// buffer, 0 otherwise.
int at_eof(stbi *s)
{
    return (s.img_buffer < s.img_buffer_end) ? 0 : 1;
}
212 
// Same as get8, with the result narrowed to ubyte.
ubyte get8u(stbi *s)
{
    int wide = get8(s);
    return cast(ubyte) wide;
}
217 
// Advances the read cursor of `s` by `n` bytes.
//
// The cursor is never moved outside the buffer: a negative `n` (which
// callers can produce from corrupt length fields, e.g. `get16(...) - 2`) or
// an `n` larger than what is left parks the cursor at the end of the buffer,
// after which get8 returns 0 and at_eof reports true.
void skip(stbi *s, int n)
{
    if (n < 0
        || s.img_buffer >= s.img_buffer_end
        || n > s.img_buffer_end - s.img_buffer)
    {
        s.img_buffer = s.img_buffer_end;
        return;
    }
    s.img_buffer += n;
}
222 
// Copies the next `n` bytes of `s` into `buffer` and advances the cursor.
//
// Returns 1 when all `n` bytes were available and copied, 0 otherwise (in
// which case nothing is copied and the cursor does not move). A negative `n`
// is rejected; the availability test compares `n` with the number of bytes
// left instead of forming a pointer beyond the buffer.
int getn(stbi *s, ubyte *buffer, int n)
{
    if (n < 0 || s.img_buffer > s.img_buffer_end)
        return 0;
    if (n > s.img_buffer_end - s.img_buffer)
        return 0;

    memcpy(buffer, s.img_buffer, n);
    s.img_buffer += n;
    return 1;
}
232 
// Reads a big-endian 16-bit value: first byte is the high byte.
int get16(stbi *s)
{
    int high = get8(s);
    int low  = get8(s);
    return (high << 8) + low;
}
238 
// Reads a big-endian 32-bit value as two big-endian 16-bit halves,
// high half first.
uint get32(stbi *s)
{
    uint high = get16(s);
    uint low  = get16(s);
    return (high << 16) + low;
}
244 
// Reads a little-endian 16-bit value: first byte is the low byte.
int get16le(stbi *s)
{
    int low  = get8(s);
    int high = get8(s);
    return low + (high << 8);
}
250 
// Reads a little-endian 32-bit value as two little-endian 16-bit halves,
// low half first.
uint get32le(stbi *s)
{
    uint low  = get16le(s);
    uint high = get16le(s);
    return low + (high << 16);
}
256 
257 //
258 //  generic converter from built-in img_n to req_comp
259 //    individual types do this automatically as much as possible (e.g. jpeg
260 //    does all cases internally since it needs to colorspace convert anyway,
261 //    and it never has alpha, so very few cases ). png can automatically
262 //    interleave an alpha=255 channel, but falls back to this for other cases
263 //
264 //  assume data buffer is malloced, so malloc a new one and free that one
265 //  only failure mode is malloc failing
266 
// Collapses an RGB triple into one grey value using the integer weights
// 77, 150 and 29 (which sum to 256), then divides by 256.
ubyte compute_y(int r, int g, int b)
{
    immutable int weighted = 77 * r + 150 * g + 29 * b;
    return cast(ubyte) (weighted >> 8);
}
271 
// Converts an interleaved 8-bit image from `img_n` components per pixel to
// `req_comp` components per pixel.
//
// Params:
//   data     = malloc'ed source pixels, `x * y * img_n` bytes.
//   img_n    = components per pixel in `data` (1 .. 4).
//   req_comp = components per pixel wanted (1 .. 4).
//   x, y     = image width and height in pixels.
// Returns: `data` itself when no conversion is needed; otherwise a newly
//          malloc'ed buffer (and `data` has been freed).
// Throws: STBImageException("Out of memory") when the output buffer cannot
//         be allocated; `data` is freed in that case too.
//
// Grey is produced from RGB with compute_y; a missing alpha channel is
// filled with 255; grey is replicated to R, G and B.
ubyte *convert_format(ubyte *data, int img_n, int req_comp, uint x, uint y)
{
    if (req_comp == img_n) return data;
    assert(req_comp >= 1 && req_comp <= 4);

    // All size arithmetic is done in size_t: with 32-bit uint math the
    // product req_comp * x * y can wrap, yielding a buffer that is too small
    // for the loops below. A size that does not even fit size_t is reported
    // like any other allocation failure.
    if (x != 0 && y > (size_t.max / req_comp) / x) {
        free(data);
        throw new STBImageException("Out of memory");
    }

    ubyte *good = cast(ubyte*) malloc(cast(size_t) x * y * req_comp);
    if (good == null) {
        free(data);
        throw new STBImageException("Out of memory");
    }

    for (int j=0; j < cast(int) y; ++j) {
        ubyte *src  = data + cast(size_t) j * x * img_n;
        ubyte *dest = good + cast(size_t) j * x * req_comp;
        int i;

        // convert source image with img_n components to one with req_comp components;
        // the switch is taken once per scanline rather than once per pixel
        switch (img_n * 8 + req_comp)
        {
            case 1 * 8 + 2:
                for(i=x-1; i >= 0; --i, src += 1, dest += 2) {
                    dest[0] = src[0];
                    dest[1] = 255;
                }
                break;
            case 1 * 8 + 3:
                for(i=x-1; i >= 0; --i, src += 1, dest += 3) {
                    dest[0]=dest[1]=dest[2]=src[0];
                }
                break;
            case 1 * 8 + 4:
                for(i=x-1; i >= 0; --i, src += 1, dest += 4) {
                    dest[0]=dest[1]=dest[2]=src[0];
                    dest[3]=255;
                }
                break;
            case 2 * 8 + 1:
                for(i=x-1; i >= 0; --i, src += 2, dest += 1) {
                    dest[0]=src[0];
                }
                break;
            case 2 * 8 + 3:
                for(i=x-1; i >= 0; --i, src += 2, dest += 3) {
                    dest[0]=dest[1]=dest[2]=src[0];
                }
                break;
            case 2 * 8 + 4:
                for(i=x-1; i >= 0; --i, src += 2, dest += 4) {
                    dest[0]=dest[1]=dest[2]=src[0];
                    dest[3]=src[1];
                }
                break;
            case 3 * 8 + 4:
                for(i=x-1; i >= 0; --i, src += 3, dest += 4) {
                    dest[0]=src[0];
                    dest[1]=src[1];
                    dest[2]=src[2];
                    dest[3]=255;
                }
                break;
            case 3 * 8 + 1:
                for(i=x-1; i >= 0; --i, src += 3, dest += 1) {
                    dest[0]=compute_y(src[0],src[1],src[2]);
                }
                break;
            case 3 * 8 + 2:
                for(i=x-1; i >= 0; --i, src += 3, dest += 2) {
                    dest[0]=compute_y(src[0],src[1],src[2]);
                    dest[1] = 255;
                }
                break;
            case 4 * 8 + 1:
                for(i=x-1; i >= 0; --i, src += 4, dest += 1) {
                    dest[0]=compute_y(src[0],src[1],src[2]);
                }
                break;
            case 4 * 8 + 2:
                for(i=x-1; i >= 0; --i, src += 4, dest += 2) {
                    dest[0]=compute_y(src[0],src[1],src[2]);
                    dest[1] = src[3];
                }
                break;
            case 4 * 8 + 3:
                for(i=x-1; i >= 0; --i, src += 4, dest += 3) {
                    dest[0]=src[0];
                    dest[1]=src[1];
                    dest[2]=src[2];
                }
                break;
            default: assert(0);
        }
    }

    free(data);
    return good;
}
349 
350 //
351 //  "baseline" JPEG/JFIF decoder (not actually fully baseline implementation)
352 //
353 //    simple implementation
354 //      - channel subsampling of at most 2 in each dimension
355 //      - doesn't support delayed output of y-dimension
356 //      - simple interface (only one output format: 8-bit interleaved RGB)
357 //      - doesn't try to recover corrupt jpegs
358 //      - doesn't allow partial loading, loading multiple at once
359 //      - still fast on x86 (copying globals into locals doesn't help x86)
360 //      - allocates lots of intermediate memory (full size of all components)
361 //        - non-interleaved case requires this anyway
362 //        - allows good upsampling (see next)
363 //    high-quality
364 //      - upsampled channels are bilinearly interpolated, even across blocks
365 //      - quality integer IDCT derived from IJG's 'slow'
366 //    performance
367 //      - fast huffman; reasonable integer IDCT
368 //      - uses a lot of intermediate memory, could cache poorly
369 //      - load http://nothings.org/remote/anemones.jpg 3 times on 2.8Ghz P4
370 //          stb_jpeg:   1.34 seconds (MSVC6, default release build)
371 //          stb_jpeg:   1.06 seconds (MSVC6, processor = Pentium Pro)
372 //          IJL11.dll:  1.08 seconds (compiled by intel)
373 //          IJG 1998:   0.98 seconds (MSVC6, makefile provided by IJG)
374 //          IJG 1998:   0.95 seconds (MSVC6, makefile + proc=PPro)
375 
376 // huffman decoding acceleration
// number of leading code bits resolved directly through huffman.fast,
// which has 1 << FAST_BITS entries
enum FAST_BITS = 9;  // larger handles more cases; smaller stomps less cache
378 
// Decoding tables for one JPEG huffman table; everything except `values`
// is filled in by build_huffman.
struct huffman
{
   // acceleration table indexed by the top FAST_BITS bits of the bit buffer:
   // holds a symbol index, or 255 when the code is not accelerated
   ubyte[1 << FAST_BITS] fast;
   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
   ushort[256] code;   // huffman code of each symbol index
   ubyte[256] values;  // decoded value of each symbol index (read from the DHT segment)
   ubyte[257] size;    // code length in bits of each symbol index; list ends with a 0 entry
   uint[18] maxcode;   // per code length: largest code + 1, preshifted to 16 bits; entry 17 is a 0xffffffff sentinel
   int[17] delta;   // old 'firstsymbol' - old 'firstcode'
}
389 
// State of one JPEG decode: input context, tables, per-component data and
// the entropy-decoder bit buffer.
struct jpeg
{
   stbi *s;                // input context the bytes are read from
   huffman[4] huff_dc;     // DC huffman tables, indexed by table id (0 .. 3)
   huffman[4] huff_ac;     // AC huffman tables, indexed by table id (0 .. 3)
   ubyte[64][4] dequant;   // quantization tables, stored in de-zigzagged order

// sizes for components, interleaved MCUs
   int img_h_max, img_v_max;
   int img_mcu_x, img_mcu_y;   // number of MCUs per row / per column (loop bounds of the interleaved scan)
   int img_mcu_w, img_mcu_h;

// definition of jpeg image component
   struct img_comp_
   {
      int id;        // component identifier from the frame header
      int h,v;       // number of blocks per MCU horizontally / vertically
      int tq;        // index into dequant
      int hd,ha;     // indices into huff_dc / huff_ac, set from the scan header
      int dc_pred;   // running DC prediction, cleared by reset

      int x,y,w2,h2; // x,y: component size in pixels; w2: row stride of `data` (h2 presumably the matching height -- confirm)
      ubyte *data;   // decoded samples of this component
      void *raw_data;
      ubyte *linebuf;
   } 
   
   img_comp_[4] img_comp;

   uint         code_buffer; // jpeg entropy-coded buffer
   int            code_bits;   // number of valid bits
   ubyte          marker;      // marker seen while filling entropy buffer
   int            nomore;      // flag if we saw a marker so must stop

   int scan_n;                 // number of components in the current scan
   int[4] order;               // img_comp index of each scan component, in scan order
   int restart_interval, todo; // restart interval in MCUs (0 = none); MCUs left before the next restart
}
428 
429 
// Builds the decoding tables of `h` from `count`, where count[i] is the
// number of codes of length i+1 (16 entries are read).
//
// Fills h.size, h.code, h.delta, h.maxcode and the h.fast acceleration
// table; h.values is not touched here.
//
// Returns: 1 (failures are reported by exception).
// Throws: STBImageException when the code lengths do not describe a valid
//         code.
//
// NOTE(review): nothing here limits the sum of the counts, while h.size has
// 257 entries -- callers must make sure the total does not exceed 256.
int build_huffman(huffman *h, int *count)
{
   int i,j,k=0,code;
   // build size list for each symbol (from JPEG spec)
   for (i=0; i < 16; ++i)
      for (j=0; j < count[i]; ++j)
         h.size[k++] = cast(ubyte) (i+1);
   h.size[k] = 0;   // terminator: stops the `while` below at the end of the list

   // compute actual symbols (from jpeg spec)
   code = 0;
   k = 0;
   for(j=1; j <= 16; ++j) {
      // compute delta to add to code to compute symbol id
      h.delta[j] = k - code;
      if (h.size[k] == j) {
         while (h.size[k] == j)
            h.code[k++] = cast(ushort) (code++);
         // the last code handed out must still fit in j bits
         if (code-1 >= (1 << j)) 
             throw new STBImageException("Bad code lengths, corrupt JPEG");
      }
      // compute largest code + 1 for this size, preshifted as needed later
      h.maxcode[j] = code << (16-j);
      code <<= 1;
   }
   // j is 17 here: sentinel that always terminates the search loop in decode
   h.maxcode[j] = 0xffffffff;

   // build non-spec acceleration table; 255 is flag for not-accelerated
   memset(h.fast.ptr, 255, 1 << FAST_BITS);
   for (i=0; i < k; ++i) {
      int s = h.size[i];
      if (s <= FAST_BITS) {
         // every FAST_BITS-wide bit pattern that starts with this code maps to symbol i
         int c = h.code[i] << (FAST_BITS-s);
         int m = 1 << (FAST_BITS-s);
         for (j=0; j < m; ++j) {
            h.fast[c+j] = cast(ubyte) i;
         }
      }
   }
   return 1;
}
471 
// Refills the entropy-decoder bit buffer of `j`, one byte at a time, until
// it holds more than 24 valid bits.
//
// A 0xff byte followed by a non-zero byte is a marker: the marker byte is
// stored in j.marker, j.nomore is set and the function returns at once.
// From then on zero bytes are fed instead of stream data. A 0xff followed
// by 0 is kept as a literal 0xff data byte.
void grow_buffer_unsafe(jpeg *j)
{
   do {
      int b = j.nomore ? 0 : get8(j.s);
      if (b == 0xff) {
         int c = get8(j.s);
         if (c != 0) {
            j.marker = cast(ubyte) c;
            j.nomore = 1;
            return;
         }
      }
      // new byte goes right below the bits that are already valid
      j.code_buffer |= b << (24 - j.code_bits);
      j.code_bits += 8;
   } while (j.code_bits <= 24);
}
488 
489 // (1 << n) - 1
// bmask[n] has the n lowest bits set, for n in 0 .. 16
// (D-style array declaration; the C-style `uint bmask[17]` form is no longer accepted)
static immutable uint[17] bmask = [0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535];
491 
492 // decode a jpeg huffman value from the bitstream
// Decodes one huffman-coded value from the bit buffer of `j` using table `h`.
//
// Returns: the decoded value (h.values entry), or -1 when no code matches
//          or the matching code is longer than the number of valid bits.
int decode(jpeg *j, huffman *h)
{
   uint temp;
   int c,k;

   if (j.code_bits < 16) grow_buffer_unsafe(j);

   // look at the top FAST_BITS and determine what symbol ID it is,
   // if the code is <= FAST_BITS
   c = (j.code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
   k = h.fast[c];
   if (k < 255) {
      int s = h.size[k];
      if (s > j.code_bits)
         return -1;
      j.code_buffer <<= s;
      j.code_bits -= s;
      return h.values[k];
   }

   // naive test is to shift the code_buffer down so k bits are
   // valid, then test against maxcode. To speed this up, we've
   // preshifted maxcode left so that it has (16-k) 0s at the
   // end; in other words, regardless of the number of bits, it
   // wants to be compared against something shifted to have 16;
   // that way we don't need to shift inside the loop.
   temp = j.code_buffer >> 16;
   // terminates at k == 17 at the latest thanks to the maxcode[17] sentinel
   for (k=FAST_BITS+1 ; ; ++k)
      if (temp < h.maxcode[k])
         break;
   if (k == 17) {
      // error! code not found
      j.code_bits -= 16;
      return -1;
   }

   if (k > j.code_bits)
      return -1;

   // convert the huffman code to the symbol id
   c = ((j.code_buffer >> (32 - k)) & bmask[k]) + h.delta[k];
   assert((((j.code_buffer) >> (32 - h.size[c])) & bmask[h.size[c]]) == h.code[c]);

   // convert the id to a symbol
   j.code_bits -= k;
   j.code_buffer <<= k;
   return h.values[c];
}
541 
542 // combined JPEG 'receive' and JPEG 'extend', since baseline
543 // always extends everything it receives.
// combined JPEG 'receive' and JPEG 'extend', since baseline
// always extends everything it receives.
//
// Takes the next `n` bits out of the bit buffer of `j` and returns them as
// a signed value: when the first of those bits is 0 the value is mapped to
// the negative range, otherwise the bits are returned as is.
// `n` is used as an index into bmask and as a shift count; the callers in
// this file only pass non-zero values taken from a 4-bit field.
int extend_receive(jpeg *j, int n)
{
   uint m = 1 << (n-1);   // smallest n-bit value that has its top bit set
   uint k;
   if (j.code_bits < n) grow_buffer_unsafe(j);

   // rotate the top n bits down to the bottom, then split them off
   k = stbi_lrot(j.code_buffer, n);
   j.code_buffer = k & ~bmask[n];
   k &= bmask[n];
   j.code_bits -= n;

   // the following test is probably a random branch that won't
   // predict well. I tried to table accelerate it but failed.
   // maybe it's compiling as a conditional move?
   if (k < m)
      return (-1 << n) + k + 1;
   else
      return k;
}
563 
564 // given a value that's at position X in the zigzag stream,
565 // where does it appear in the 8x8 matrix coded as row-major?
// dezigzag[k] is the row-major position, inside the 8x8 block, of the k-th
// coefficient of the zigzag stream. 15 extra entries (all 63) follow the 64
// real ones so that an index pushed past the end by corrupt input still
// lands inside the table.
// (D-style array declaration; the C-style `ubyte dezigzag[64+15]` form is no
// longer accepted.)
static immutable ubyte[64+15] dezigzag =
[
    0,  1,  8, 16,  9,  2,  3, 10,
   17, 24, 32, 25, 18, 11,  4,  5,
   12, 19, 26, 33, 40, 48, 41, 34,
   27, 20, 13,  6,  7, 14, 21, 28,
   35, 42, 49, 56, 57, 50, 43, 36,
   29, 22, 15, 23, 30, 37, 44, 51,
   58, 59, 52, 45, 38, 31, 39, 46,
   53, 60, 61, 54, 47, 55, 62, 63,
   // let corrupt input sample past end
   63, 63, 63, 63, 63, 63, 63, 63,
   63, 63, 63, 63, 63, 63, 63
];
580 
581 // decode one 64-entry block--
// Decodes one 64-entry coefficient block into `data`.
//
// Params:
//   j    = decoder state (bit buffer and per-component DC prediction).
//   data = receives the coefficients, already de-zigzagged. It is taken by
//          reference: a D static array parameter is otherwise passed by
//          value, and the decoded coefficients would be written to a copy
//          the caller never sees.
//   hdc  = huffman table for the DC coefficient.
//   hac  = huffman table for the AC coefficients.
//   b    = index of the component in j.img_comp whose DC prediction is used
//          and updated.
// Returns: 1 (failures are reported by exception).
// Throws: STBImageException when a huffman code cannot be decoded.
int decode_block(jpeg *j, ref short[64] data, huffman *hdc, huffman *hac, int b)
{
   int diff,dc,k;
   int t = decode(j, hdc);
   if (t < 0)
       throw new STBImageException("Bad huffman code, corrupt JPEG");

   // clear every coefficient; only the non-zero ones are written below
   data[] = 0;

   // DC coefficient: coded as a difference to the previous block's DC
   diff = t ? extend_receive(j, t) : 0;
   dc = j.img_comp[b].dc_pred + diff;
   j.img_comp[b].dc_pred = dc;
   data[0] = cast(short) dc;

   // decode AC components, see JPEG spec
   k = 1;
   do {
      int r,s;
      int rs = decode(j, hac);
      if (rs < 0)
         throw new STBImageException("Bad huffman code, corrupt JPEG");
      s = rs & 15;   // bit size of the coefficient
      r = rs >> 4;   // number of zero coefficients to skip first
      if (s == 0) {
         if (rs != 0xf0) break; // end block
         k += 16;
      } else {
         k += r;
         // decode into unzigzag'd location
         data[dezigzag[k++]] = cast(short) extend_receive(j,s);
      }
   } while (k < 64);
   return 1;
}
617 
618 // take a -128..127 value and clamp it and convert to 0..255
// Saturates `x` to the range 0 .. 255 and returns it as a ubyte.
ubyte clamp(int x)
{
    if (x < 0)
        return 0;
    if (x > 255)
        return 255;
    return cast(ubyte) x;
}
628 
// Converts a floating-point constant to fixed point with 12 fractional
// bits: scales by 4096, adds 0.5 and truncates to int.
int f2f(double x)
{
    return cast(int)(0.5 + 4096 * x);
}
633 
// Scales an integer to the 12-fractional-bit fixed-point format produced
// by f2f.
int fsh(int x)
{
    enum int fractionBits = 12;
    return x << fractionBits;
}
638 
639 // derived from jidctint -- DCT_ISLOW
// derived from jidctint -- DCT_ISLOW
//
// One 8-point inverse DCT pass in fixed point: the constants go through f2f
// (scaled by 1<<12) and the s0/s4 terms through fsh, so every output is
// scaled by 1<<12.
//
// Params:
//   s0 .. s7 = the eight input coefficients.
//   x0 .. x3 = receive the results of the even part (built from s0, s2, s4, s6).
//   t0 .. t3 = receive the results of the odd part (built from s1, s3, s5, s7).
// The callers form the eight outputs as x0+-t3, x1+-t2, x2+-t1 and x3+-t0.
// t0 .. t3 are also used as scratch while the even part is computed.
void IDCT_1D(int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7,
             out int t0, out int t1, out int t2, out int t3,
             out int x0, out int x1, out int x2, out int x3)
{
   int p1,p2,p3,p4,p5; 
   // even part
   p2 = s2;                                    
   p3 = s6;                                    
   p1 = (p2+p3) * f2f(0.5411961f);             
   t2 = p1 + p3*f2f(-1.847759065f);            
   t3 = p1 + p2*f2f( 0.765366865f);            
   p2 = s0;                                    
   p3 = s4;                                    
   t0 = fsh(p2+p3);                            
   t1 = fsh(p2-p3);                            
   x0 = t0+t3;                                 
   x3 = t0-t3;                                 
   x1 = t1+t2;                                 
   x2 = t1-t2;                                 
   // odd part
   t0 = s7;                                    
   t1 = s5;                                    
   t2 = s3;                                    
   t3 = s1;                                    
   p3 = t0+t2;                                 
   p4 = t1+t3;                                 
   p1 = t0+t3;                                 
   p2 = t1+t2;                                 
   p5 = (p3+p4)*f2f( 1.175875602f);            
   t0 = t0*f2f( 0.298631336f);                 
   t1 = t1*f2f( 2.053119869f);                 
   t2 = t2*f2f( 3.072711026f);                 
   t3 = t3*f2f( 1.501321110f);                 
   p1 = p5 + p1*f2f(-0.899976223f);            
   p2 = p5 + p2*f2f(-2.562915447f);            
   p3 = p3*f2f(-1.961570560f);                 
   p4 = p4*f2f(-0.390180644f);                 
   t3 += p1+p4;                                
   t2 += p2+p3;                                
   t1 += p2+p4;                                
   t0 += p1+p3;
 }
680 
// element type of the dequantization tables handed to idct_block
alias stbi_dequantize_t = ubyte;
682 
683 // .344 seconds on 3*anemones.jpg
// .344 seconds on 3*anemones.jpg
//
// Dequantizes one 8x8 coefficient block, runs the 2D inverse DCT on it
// (columns first, then rows) and writes the 8x8 result as bytes.
//
// Params:
//   out_       = destination of the top-left sample of the block.
//   out_stride = distance in bytes between two output rows.
//   data       = the 64 coefficients, row-major (only read).
//   dequantize = the 64 dequantization factors, same layout as `data`.
//
// The parameter uses the D array syntax `short[64] data`; the C-style form
// `short data[64]` is no longer accepted by D compilers.
void idct_block(ubyte *out_, int out_stride, short[64] data, stbi_dequantize_t *dequantize)
{
   int i;
   int[64] val;
   int*v = val.ptr;
   stbi_dequantize_t *dq = dequantize;
   ubyte *o;
   short *d = data.ptr;

   // columns
   for (i=0; i < 8; ++i,++d,++dq, ++v) {
      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
           && d[40]==0 && d[48]==0 && d[56]==0) {
         //    no shortcut                 0     seconds
         //    (1|2|3|4|5|6|7)==0          0     seconds
         //    all separate               -0.047 seconds
         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
         int dcterm = d[0] * dq[0] << 2;
         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
      } else {
         int t0, t1, t2, t3, x0, x1, x2, x3;
         IDCT_1D(d[ 0]*dq[ 0],d[ 8]*dq[ 8],d[16]*dq[16],d[24]*dq[24],
                 d[32]*dq[32],d[40]*dq[40],d[48]*dq[48],d[56]*dq[56],
                 t0, t1, t2, t3, x0, x1, x2, x3);
         // constants scaled things up by 1<<12; let's bring them back
         // down, but keep 2 extra bits of precision
         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
         v[ 0] = (x0+t3) >> 10;
         v[56] = (x0-t3) >> 10;
         v[ 8] = (x1+t2) >> 10;
         v[48] = (x1-t2) >> 10;
         v[16] = (x2+t1) >> 10;
         v[40] = (x2-t1) >> 10;
         v[24] = (x3+t0) >> 10;
         v[32] = (x3-t0) >> 10;
      }
   }

   // rows
   for (i=0, v=val.ptr, o=out_; i < 8; ++i,v+=8,o+=out_stride) {

      // no fast case since the first 1D IDCT spread components out
      int t0, t1, t2, t3, x0, x1, x2, x3;
      IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7], t0, t1, t2, t3, x0, x1, x2, x3);
      // constants scaled things up by 1<<12, plus we had 1<<2 from first
      // loop, plus horizontal and vertical each scale by sqrt(8) so together
      // we've got an extra 1<<3, so 1<<17 total we need to remove.
      // so we want to round that, which means adding 0.5 * 1<<17,
      // aka 65536. Also, we'll end up with -128 to 127 that we want
      // to encode as 0..255 by adding 128, so we'll add that before the shift
      x0 += 65536 + (128<<17);
      x1 += 65536 + (128<<17);
      x2 += 65536 + (128<<17);
      x3 += 65536 + (128<<17);
      // tried computing the shifts into temps, or'ing the temps to see
      // if any were out of range, but that was slower
      o[0] = clamp((x0+t3) >> 17);
      o[7] = clamp((x0-t3) >> 17);
      o[1] = clamp((x1+t2) >> 17);
      o[6] = clamp((x1-t2) >> 17);
      o[2] = clamp((x2+t1) >> 17);
      o[5] = clamp((x2-t1) >> 17);
      o[3] = clamp((x3+t0) >> 17);
      o[4] = clamp((x3-t0) >> 17);
   }
}
750 
751 
// value meaning "no marker": stored in jpeg.marker when none is pending and
// returned by get_marker when the stream does not hold a marker
enum MARKER_none = 0xff;
753 
754 // if there's a pending marker from the entropy stream, return that
755 // otherwise, fetch from the stream and get a marker. if there's no
756 // marker, return 0xff, which is never a valid marker value
// Returns the next marker.
//
// A marker left pending by the entropy decoder is handed out first (and
// cleared). Otherwise one is read from the stream: the next byte must be
// 0xff, any further 0xff bytes are skipped, and the first other byte is the
// marker. When the next byte is not 0xff, MARKER_none (0xff, never a valid
// marker value) is returned.
ubyte get_marker(jpeg *j)
{
    if (j.marker != MARKER_none)
    {
        ubyte pending = j.marker;
        j.marker = MARKER_none;
        return pending;
    }

    ubyte b = get8u(j.s);
    if (b != 0xff)
        return MARKER_none;

    // swallow the run of 0xff bytes in front of the marker code
    do
    {
        b = get8u(j.s);
    } while (b == 0xff);

    return b;
}
767 
768 // in each scan, we'll have scan_n components, and the order
769 // of the components is specified by order[]
// Tells whether marker value `x` lies in the restart-marker range
// 0xd0 .. 0xd7 (inclusive).
bool RESTART(int x)
{
    return !(x < 0xd0 || x > 0xd7);
}
774 
775 // after a restart interval, reset the entropy decoder and
776 // the dc prediction
// Puts the entropy decoder back into its initial state, as needed at the
// start of a scan and after each restart interval: empties the bit buffer,
// forgets any pending marker, clears the DC prediction of the first three
// components and reloads the MCU countdown.
void reset(jpeg *j)
{
    j.code_buffer = 0;
    j.code_bits = 0;
    j.nomore = 0;
    j.marker = MARKER_none;

    foreach (ref component; j.img_comp[0 .. 3])
        component.dc_pred = 0;

    // Without a restart interval the countdown starts at the largest int:
    // no more than 1<<31 MCUs is plenty safe, since not even 1<<30 pixels
    // are allowed.
    if (j.restart_interval)
        j.todo = j.restart_interval;
    else
        j.todo = 0x7fffffff;
}
788 
// Decodes the entropy-coded data of one scan into the component planes.
//
// With a single component in the scan the blocks are stored one after the
// other in scanline order; with several components they are interleaved MCU
// by MCU, each component contributing h x v blocks per MCU. Every block is
// decoded with decode_block and written to the component's `data` plane by
// idct_block. After each MCU the restart countdown is decremented; when it
// runs out the decoder is reset if a restart marker is pending, otherwise
// decoding stops early.
//
// Returns: 1 when the scan was processed (including the early stop above),
//          0 when decode_block reports failure by return value.
// Throws: whatever decode_block throws.
//
// The coefficient buffers use the D array syntax `short[64] data`; the
// C-style form `short data[64]` is no longer accepted by D compilers.
int parse_entropy_coded_data(jpeg *z)
{
   reset(z);
   if (z.scan_n == 1) {
      int i,j;
      short[64] data;
      int n = z.order[0];
      // non-interleaved data, we just need to process one block at a time,
      // in trivial scanline order
      // number of blocks to do just depends on how many actual "pixels" this
      // component has, independent of interleaved MCU blocking and such
      int w = (z.img_comp[n].x+7) >> 3;
      int h = (z.img_comp[n].y+7) >> 3;
      for (j=0; j < h; ++j) {
         for (i=0; i < w; ++i) {
            if (!decode_block(z, data, z.huff_dc.ptr+z.img_comp[n].hd, z.huff_ac.ptr+z.img_comp[n].ha, n)) return 0;
            idct_block(z.img_comp[n].data+z.img_comp[n].w2*j*8+i*8, z.img_comp[n].w2, data, z.dequant[z.img_comp[n].tq].ptr);
            // every data block is an MCU, so countdown the restart interval
            if (--z.todo <= 0) {
               if (z.code_bits < 24) grow_buffer_unsafe(z);
               // if it's NOT a restart, then just bail, so we get corrupt data
               // rather than no data
               if (!RESTART(z.marker)) return 1;
               reset(z);
            }
         }
      }
   } else { // interleaved!
      int i,j,k,x,y;
      short[64] data;
      for (j=0; j < z.img_mcu_y; ++j) {
         for (i=0; i < z.img_mcu_x; ++i) {
            // scan an interleaved mcu... process scan_n components in order
            for (k=0; k < z.scan_n; ++k) {
               int n = z.order[k];
               // scan out an mcu's worth of this component; that's just determined
               // by the basic H and V specified for the component
               for (y=0; y < z.img_comp[n].v; ++y) {
                  for (x=0; x < z.img_comp[n].h; ++x) {
                     int x2 = (i*z.img_comp[n].h + x)*8;
                     int y2 = (j*z.img_comp[n].v + y)*8;
                     if (!decode_block(z, data, z.huff_dc.ptr+z.img_comp[n].hd, z.huff_ac.ptr+z.img_comp[n].ha, n)) return 0;
                     idct_block(z.img_comp[n].data+z.img_comp[n].w2*y2+x2, z.img_comp[n].w2, data, z.dequant[z.img_comp[n].tq].ptr);
                  }
               }
            }
            // after all interleaved components, that's an interleaved MCU,
            // so now count down the restart interval
            if (--z.todo <= 0) {
               if (z.code_bits < 24) grow_buffer_unsafe(z);
               // if it's NOT a restart, then just bail, so we get corrupt data
               // rather than no data
               if (!RESTART(z.marker)) return 1;
               reset(z);
            }
         }
      }
   }
   return 1;
}
849 
// Handles the marker segment introduced by marker `m`.
//
// Supported segments: DRI (restart interval), DQT (quantization tables),
// DHT (huffman tables), and APPn / COM segments, which are skipped.
//
// Returns: 1 when the segment was consumed; 0 when the marker is not
//          handled here or when a DQT/DHT segment's declared length does not
//          match its content.
// Throws: STBImageException for a missing marker, progressive JPEG, and
//         malformed DRI, DQT, DHT, APPn or COM segments.
int process_marker(jpeg *z, int m)
{
   int L;
   switch (m) {
      
      case MARKER_none: // no marker found
         throw new STBImageException("Expected marker, corrupt JPEG");

      case 0xC2: // SOF - progressive
          throw new STBImageException("JPEG format not supported (progressive)");

      case 0xDD: // DRI - specify restart interval
         if (get16(z.s) != 4) 
             throw new STBImageException("Bad DRI len, corrupt JPEG");
         z.restart_interval = get16(z.s);
         return 1;

      case 0xDB: // DQT - define quantization table
         L = get16(z.s)-2;
         while (L > 0) {
            int q = get8(z.s);
            int p = q >> 4;      // precision: only 8-bit tables (0) are supported
            int t = q & 15,i;    // table id
            if (p != 0)
               throw new STBImageException("Bad DQT type, corrupt JPEG");
            if (t > 3) 
               throw new STBImageException("Bad DQT table, corrupt JPEG");
            // the file stores the table in zigzag order
            for (i=0; i < 64; ++i)
               z.dequant[t][dezigzag[i]] = get8u(z.s);
            L -= 65;
         }
         return L==0;

      case 0xC4: // DHT - define huffman table
         L = get16(z.s)-2;
         while (L > 0) {
            ubyte *v;
            int[16] sizes;
            int i;
            int m_ = 0;          // total number of symbols in this table
            int q = get8(z.s);
            int tc = q >> 4;     // table class: 0 = DC, 1 = AC
            int th = q & 15;     // table id
            if (tc > 1 || th > 3) 
                throw new STBImageException("Bad DHT header, corrupt JPEG");
            for (i=0; i < 16; ++i) {
               sizes[i] = get8(z.s);
               m_ += sizes[i];
            }
            // The sixteen counts come straight from the file and can add up
            // to 4080, while the huffman tables hold at most 256 symbols:
            // reject the table before build_huffman and the loop below write
            // past the end of huffman.size / huffman.values.
            if (m_ > 256)
                throw new STBImageException("Bad DHT header, corrupt JPEG");
            L -= 17;
            if (tc == 0) {
               if (!build_huffman(z.huff_dc.ptr+th, sizes.ptr)) return 0;
               v = z.huff_dc[th].values.ptr;
            } else {
               if (!build_huffman(z.huff_ac.ptr+th, sizes.ptr)) return 0;
               v = z.huff_ac[th].values.ptr;
            }
            for (i=0; i < m_; ++i)
               v[i] = get8u(z.s);
            L -= m_;
         }
         return L==0;

      default:
         break;
   }
   // check for comment block or APP blocks
   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
      // the length field counts its own two bytes, so anything below 2 is
      // corrupt and would otherwise turn into a backwards skip
      int len = get16(z.s);
      if (len < 2)
         throw new STBImageException("Bad COM/APP len, corrupt JPEG");
      skip(z.s, len-2);
      return 1;
   }
   return 0;
}
923 
924 // after we see SOS
// after we see SOS
//
// Parses the scan header: reads the number of components in the scan and,
// for each of them, its component id and the DC/AC huffman table ids.
// Stores the table ids in the matching img_comp entry and the component's
// index in z.order.
//
// Returns: 1 on success, 0 when a component id of the scan does not match
//          any component of the image.
// Throws: STBImageException for a bad component count, a bad header length,
//         a huffman table id above 3, or unexpected values in the three
//         bytes that end the header.
int process_scan_header(jpeg *z)
{
   int i;
   int Ls = get16(z.s);   // declared length of the scan header
   z.scan_n = get8(z.s);
   if (z.scan_n < 1 || z.scan_n > 4 || z.scan_n > cast(int) z.s.img_n) 
      throw new STBImageException("Bad SOS component count, Corrupt JPEG");
      
   if (Ls != 6+2*z.scan_n) 
      throw new STBImageException("Bad SOS length, Corrupt JPEG");
      
   for (i=0; i < z.scan_n; ++i) {
      int id = get8(z.s), which;
      int q = get8(z.s);   // high nibble: DC table id, low nibble: AC table id
      // find the image component carrying this id
      for (which = 0; which < z.s.img_n; ++which)
         if (z.img_comp[which].id == id)
            break;
      if (which == z.s.img_n) return 0;
      z.img_comp[which].hd = q >> 4;   
      if (z.img_comp[which].hd > 3) 
         throw new STBImageException("Bad DC huff, Corrupt JPEG");
      z.img_comp[which].ha = q & 15;   
      if (z.img_comp[which].ha > 3)
         throw new STBImageException("Bad AC huff, Corrupt JPEG");
      z.order[i] = which;
   }
   if (get8(z.s) != 0) 
      throw new STBImageException("Bad SOS, Corrupt JPEG");
   get8(z.s); // should be 63, but might be 0
   if (get8(z.s) != 0) 
      throw new STBImageException("Bad SOS, Corrupt JPEG");

   return 1;
}
959 
/// Parses the frame header that follows an SOF marker and, when `scan` is
/// SCAN_load, allocates the per-component decode buffers.
///
/// Reads from `z.s` in order: segment length, sample precision, image height,
/// image width, component count, then for every component its id, the packed
/// horizontal/vertical sampling factors and the quantization table index.
///
/// Params:
///   z    = decoder state; `z.s.img_x/img_y/img_n` and `z.img_comp` are filled in.
///   scan = when different from SCAN_load, the function stops after parsing
///          and validating the header (no allocation is done).
/// Returns: 1 (every failure is reported by throwing).
/// Throws: STBImageException on unsupported or inconsistent header values and
///         when a component buffer cannot be allocated.
int process_frame_header(jpeg *z, int scan)
{
   stbi *s = z.s;
   int Lf,p,i,q, h_max=1,v_max=1,c;
   Lf = get16(s);         if (Lf < 11) throw new STBImageException("Bad SOF len, Corrupt JPEG");
   p  = get8(s);          if (p != 8) throw new STBImageException("JPEG format not supported: 8-bit only"); // JPEG baseline
   s.img_y = get16(s);   if (s.img_y == 0) throw new STBImageException("No header height, JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
   s.img_x = get16(s);   if (s.img_x == 0) throw new STBImageException("0 width, corrupt JPEG"); // JPEG requires
   c = get8(s);
   if (c != 3 && c != 1) throw new STBImageException("Bad component count, corrupt JPEG");    // JFIF requires
   s.img_n = c;
   // clear the buffer pointers first so cleanup_jpeg never sees stale values
   for (i=0; i < c; ++i) {
      z.img_comp[i].data = null;
      z.img_comp[i].linebuf = null;
   }

   // the segment length must match exactly: 8 fixed bytes + 3 per component
   if (Lf != 8+3*s.img_n) throw new STBImageException("Bad SOF len, corrupt JPEG"); 

   for (i=0; i < s.img_n; ++i) {
      z.img_comp[i].id = get8(s);
      if (z.img_comp[i].id != i+1)   // JFIF requires
         if (z.img_comp[i].id != i)  // some version of jpegtran outputs non-JFIF-compliant files!
            throw new STBImageException("Bad component ID, corrupt JPEG");
      q = get8(s);
      // high nibble = horizontal sampling factor, low nibble = vertical; both must be 1..4
      z.img_comp[i].h = (q >> 4);  if (!z.img_comp[i].h || z.img_comp[i].h > 4) throw new STBImageException("Bad H, corrupt JPEG");
      z.img_comp[i].v = q & 15;    if (!z.img_comp[i].v || z.img_comp[i].v > 4) throw new STBImageException("Bad V, corrupt JPEG");
      z.img_comp[i].tq = get8(s);  if (z.img_comp[i].tq > 3) throw new STBImageException("Bad TQ, corrupt JPEG");
   }

   // header-only callers are done here; nothing has been allocated yet
   if (scan != SCAN_load) return 1;

   // reject images whose x * y * n would exceed 1 << 30
   if ((1 << 30) / s.img_x / s.img_n < s.img_y) throw new STBImageException("Image too large to decode");

   // largest sampling factors over all components
   for (i=0; i < s.img_n; ++i) {
      if (z.img_comp[i].h > h_max) h_max = z.img_comp[i].h;
      if (z.img_comp[i].v > v_max) v_max = z.img_comp[i].v;
   }

   // compute interleaved mcu info
   z.img_h_max = h_max;
   z.img_v_max = v_max;
   z.img_mcu_w = h_max * 8;
   z.img_mcu_h = v_max * 8;
   // number of MCUs per row / column, rounded up
   z.img_mcu_x = (s.img_x + z.img_mcu_w-1) / z.img_mcu_w;
   z.img_mcu_y = (s.img_y + z.img_mcu_h-1) / z.img_mcu_h;

   for (i=0; i < s.img_n; ++i) {
      // number of effective pixels (e.g. for non-interleaved MCU)
      z.img_comp[i].x = (s.img_x * z.img_comp[i].h + h_max-1) / h_max;
      z.img_comp[i].y = (s.img_y * z.img_comp[i].v + v_max-1) / v_max;
      // to simplify generation, we'll allocate enough memory to decode
      // the bogus oversized data from using interleaved MCUs and their
      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
      // discard the extra data until colorspace conversion
      z.img_comp[i].w2 = z.img_mcu_x * z.img_comp[i].h * 8;
      z.img_comp[i].h2 = z.img_mcu_y * z.img_comp[i].v * 8;
      // 15 spare bytes leave room to round `data` up to a 16-byte boundary below
      z.img_comp[i].raw_data = malloc(z.img_comp[i].w2 * z.img_comp[i].h2+15);
      if (z.img_comp[i].raw_data == null) {
         // release the buffers of the components allocated so far
         for(--i; i >= 0; --i) {
            free(z.img_comp[i].raw_data);
            z.img_comp[i].data = null;
         }
         throw new STBImageException("Out of memory");
      }
      // align blocks for installable-idct using mmx/sse
      z.img_comp[i].data = cast(ubyte*) (( cast(size_t) z.img_comp[i].raw_data + 15) & ~15);
      z.img_comp[i].linebuf = null;
   }

   return 1;
}
1031 
1032 // use comparisons since in some cases we handle more than one case (e.g. SOF)
/// True when `x` is the DNL marker code (0xdc).
bool DNL(int x)
{
   return 0xdc == x;
}

/// True when `x` is the SOI marker code (0xd8).
bool SOI(int x)
{
   return 0xd8 == x;
}

/// True when `x` is the EOI marker code (0xd9).
bool EOI(int x)
{
   return 0xd9 == x;
}

/// True when `x` is one of the two accepted SOF marker codes (0xc0, 0xc1).
bool SOF(int x)
{
   switch (x)
   {
      case 0xc0:
      case 0xc1:
         return true;
      default:
         return false;
   }
}

/// True when `x` is the SOS marker code (0xda).
bool SOS(int x)
{
   return 0xda == x;
}
1038 
/// Reads the stream up to and including the frame header.
///
/// Expects an SOI marker first. With `scan == SCAN_type` it stops right
/// there. Otherwise every marker before the SOF is handed to process_marker,
/// and the SOF itself to process_frame_header.
/// Returns: 1 on success, 0 when process_marker or process_frame_header
///          report failure.
/// Throws: STBImageException when SOI is missing or the input ends before
///         an SOF marker is found.
int decode_jpeg_header(jpeg *z, int scan)
{
   z.marker = MARKER_none; // initialize cached marker to empty

   int marker = get_marker(z);
   if (!SOI(marker))
      throw new STBImageException("No SOI, corrupt JPEG");
   if (scan == SCAN_type)
      return 1;

   for (marker = get_marker(z); !SOF(marker); )
   {
      if (!process_marker(z, marker))
         return 0;

      marker = get_marker(z);
      // some files have extra padding after their blocks, so ok, we'll scan
      while (marker == MARKER_none)
      {
         if (at_eof(z.s))
            throw new STBImageException("No SOF, corrupt JPEG");
         marker = get_marker(z);
      }
   }

   return process_frame_header(z, scan) ? 1 : 0;
}
1065 
/// Decodes the whole JPEG stream: the header first, then every marker
/// segment until EOI. An SOS marker triggers the scan header parse followed
/// by the entropy-coded data; every other marker goes to process_marker.
/// Returns: 1 when EOI is reached, 0 as soon as any step reports failure.
int decode_jpeg_image(jpeg *j)
{
   j.restart_interval = 0;
   if (!decode_jpeg_header(j, SCAN_load))
      return 0;

   for (int m = get_marker(j); !EOI(m); m = get_marker(j))
   {
      if (!SOS(m))
      {
         if (!process_marker(j, m))
            return 0;
         continue;
      }

      if (!process_scan_header(j))
         return 0;
      if (!parse_entropy_coded_data(j))
         return 0;

      if (j.marker != MARKER_none)
         continue;

      // handle 0s at the end of image data from IP Kamera 9060:
      // skip zero bytes until a 0xFF introduces the next marker
      while (!at_eof(j.s))
      {
         int x = get8(j.s);
         if (x == 255)
         {
            j.marker = get8u(j.s);
            break;
         }
         if (x != 0)
            return 0;
      }
      // if we reach eof without hitting a marker, the next get_marker() will fail and we'll eventually return 0
   }
   return 1;
}
1096 
1097 // static jfif-centered resampling (across block boundaries)
1098 
/// Signature shared by all row resamplers below: `out_` is the destination
/// buffer, `in0`/`in1` are two input rows, `w` is the input width in samples
/// and `hs` the horizontal expansion factor. The returned pointer is the row
/// to use (either `out_` or, for the pass-through resampler, an input row).
alias resample_row_func = ubyte* function(ubyte *out_, ubyte *in0, ubyte *in1, int w, int hs);
1100 
/// Shifts `x` right by two bits and truncates the result to a byte.
ubyte div4(int x)
{
    int quarter = x >> 2;
    return cast(ubyte) quarter;
}
1105 
/// Pass-through resampler: nothing is written to `out_`, the near input row
/// is returned unchanged.
ubyte *resample_row_1(ubyte *out_, ubyte *in_near, ubyte *in_far, int w, int hs)
{
   ubyte* row = in_near;
   return row;
}
1110 
/// Vertical 2x resampler: each output sample is the rounded blend
/// (3*near + far + 2) / 4 of the two input rows at the same column.
ubyte* resample_row_v_2(ubyte *out_, ubyte *in_near, ubyte *in_far, int w, int hs)
{
   // need to generate two samples vertically for every one in input
   foreach (col; 0 .. w)
   {
      int blended = 3*in_near[col] + in_far[col] + 2;
      out_[col] = div4(blended);
   }
   return out_;
}
1119 
/// Horizontal 2x resampler: writes two output samples per input sample of
/// `in_near`, each a 3:1 blend with the left or right neighbour. The first
/// and last output samples copy the edge input samples.
ubyte*  resample_row_h_2(ubyte *out_, ubyte *in_near, ubyte *in_far, int w, int hs)
{
   // need to generate two samples horizontally for every one in input
   ubyte *src = in_near;

   // if only one sample, can't do any interpolation
   if (w == 1)
   {
      out_[1] = src[0];
      out_[0] = out_[1];
      return out_;
   }

   // left edge
   out_[0] = src[0];
   out_[1] = div4(src[0]*3 + src[1] + 2);

   // interior samples
   int i = 1;
   while (i < w-1)
   {
      int weighted = 3*src[i]+2;
      out_[i*2+0] = div4(weighted+src[i-1]);
      out_[i*2+1] = div4(weighted+src[i+1]);
      ++i;
   }

   // right edge
   ubyte *tail = out_ + i*2;
   tail[0] = div4(src[w-2]*3 + src[w-1] + 2);
   tail[1] = src[w-1];

   return out_;
}
1144 
/// Shifts `x` right by four bits and truncates the result to a byte.
ubyte div16(int x)
{
    int sixteenth = x >> 4;
    return cast(ubyte) sixteenth;
}
1149 
1150 
/// 2x2 resampler: blends the near and far rows 3:1 vertically, then blends
/// neighbouring columns 3:1 horizontally, writing 2*w samples to `out_`.
ubyte *resample_row_hv_2(ubyte *out_, ubyte *in_near, ubyte *in_far, int w, int hs)
{
   // need to generate 2x2 samples for every one in input
   if (w == 1)
   {
      out_[1] = div4(3*in_near[0] + in_far[0] + 2);
      out_[0] = out_[1];
      return out_;
   }

   // `cur` is the vertical blend of the current column, `prev` of the one before
   int prev;
   int cur = 3*in_near[0] + in_far[0];
   out_[0] = div4(cur+2);

   foreach (col; 1 .. w)
   {
      prev = cur;
      cur = 3*in_near[col]+in_far[col];
      out_[col*2-1] = div16(3*prev + cur + 8);
      out_[col*2  ] = div16(3*cur + prev + 8);
   }
   out_[w*2-1] = div4(cur+2);

   return out_;
}
1172 
/// Nearest-neighbour resampler: every sample of `in_near` is repeated `hs`
/// times in `out_`. `in_far` is not used.
ubyte *resample_row_generic(ubyte *out_, ubyte *in_near, ubyte *in_far, int w, int hs)
{
   ubyte *dst = out_;
   for (int col = 0; col < w; ++col)
   {
      for (int rep = 0; rep < hs; ++rep)
      {
         *dst = in_near[col];
         ++dst;
      }
   }
   return out_;
}
1183 
/// Converts `x` to 16.16 fixed point: scales by 65536, adds 0.5 and
/// truncates to int.
int float2fixed(double x)
{
    enum double scale = 65536;
    return cast(int)(x * scale + 0.5);
}
1188 
1189 // 0.38 seconds on 3*anemones.jpg   (0.25 with processor = Pro)
1190 // VC6 without processor=Pro is generating multiple LEAs per multiply!
/// Converts `count` YCbCr samples to RGB using 16.16 fixed-point arithmetic.
/// For every pixel, four bytes (R, G, B, 255) are written at the current
/// output position, which then advances by `step` bytes.
void YCbCr_to_RGB_row(ubyte *out_, const ubyte *y, const ubyte *pcb, const ubyte *pcr, int count, int step)
{
   // saturate to 0..255
   static ubyte clamp8(int v)
   {
      return cast(ubyte)(v < 0 ? 0 : (v > 255 ? 255 : v));
   }

   // fixed-point conversion coefficients
   int crToR = float2fixed(1.40200f);
   int crToG = float2fixed(0.71414f);
   int cbToG = float2fixed(0.34414f);
   int cbToB = float2fixed(1.77200f);

   ubyte *px = out_;
   foreach (idx; 0 .. count)
   {
      int luma = (y[idx] << 16) + 32768; // rounding
      int cr = pcr[idx] - 128;
      int cb = pcb[idx] - 128;

      int r = (luma + cr*crToR) >> 16;
      int g = (luma - cr*crToG - cb*cbToG) >> 16;
      int b = (luma + cb*cbToB) >> 16;

      px[0] = clamp8(r);
      px[1] = clamp8(g);
      px[2] = clamp8(b);
      px[3] = 255;
      px += step;
   }
}
1215 
1216 // clean up the temporary component buffers
/// Frees the temporary buffers of the first `j.s.img_n` components: the raw
/// decode buffer (when `data` is set) and the line buffer. The `data` and
/// `linebuf` pointers are reset to null afterwards.
void cleanup_jpeg(jpeg *j)
{
   for (int c = 0; c < j.s.img_n; ++c)
   {
      auto component = &j.img_comp[c];

      if (component.data)
      {
         // `data` points into `raw_data`, which is the pointer malloc returned
         free(component.raw_data);
         component.data = null;
      }

      if (component.linebuf)
      {
         free(component.linebuf);
         component.linebuf = null;
      }
   }
}
1231 
/// Per-component upsampling state used while converting decoded component
/// planes into output rows.
struct stbi_resample
{
   resample_row_func resample; // row resampler selected from the hs/vs factors
   ubyte* line0;               // the two neighbouring input rows handed to `resample`
   ubyte* line1;               // (line1 is advanced once `ystep` reaches `vs`)
   int hs,vs;   // expansion factor in each axis
   int w_lores; // horizontal pixels pre-expansion 
   int ystep;   // how far through vertical expansion we are
   int ypos;    // which pre-expansion row we're on
} ;
1242 
/// Decodes a JPEG from `z.s` and returns the pixels as an interleaved,
/// malloc-allocated buffer.
///
/// Params:
///   z        = decoder state with `z.s` set to the input.
///   out_x    = receives the image width.
///   out_y    = receives the image height.
///   comp     = if non-null, receives the number of components in the file
///              (not the number of components in the returned buffer).
///   req_comp = number of components wanted per output pixel (1..4), or 0 to
///              keep the file's own component count.
/// Returns: the pixel buffer, or null when decode_jpeg_image reports failure.
/// Throws: STBImageException on a bad `req_comp`, on allocation failure and
///         on any decode error raised further down.
ubyte *load_jpeg_image(jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
{
   int n, decode_n;
   // validate req_comp
   if (req_comp < 0 || req_comp > 4)
       throw new STBImageException("Internal error: bad req_comp");
   z.s.img_n = 0;

   // Most decode errors are reported by throwing. Release the component and
   // line buffers on that path too; cleanup_jpeg resets the pointers it
   // frees, so running it again on a cleaned decoder is harmless.
   scope(failure) cleanup_jpeg(z);

   // load a jpeg image from whichever source
   if (!decode_jpeg_image(z)) { cleanup_jpeg(z); return null; }

   // determine actual number of components to generate
   n = req_comp ? req_comp : z.s.img_n;

   // a 3-component image reduced to grey only needs its first plane
   if (z.s.img_n == 3 && n < 3)
      decode_n = 1;
   else
      decode_n = z.s.img_n;

   // resample and color-convert
   {
      int k;
      uint i,j;
      ubyte *output;
      ubyte*[4] coutput;

      stbi_resample[4] res_comp;

      for (k=0; k < decode_n; ++k) {
         stbi_resample *r = &res_comp[k];

         // allocate line buffer big enough for upsampling off the edges
         // with upsample factor of 4
         z.img_comp[k].linebuf = cast(ubyte*) malloc(z.s.img_x + 3);
         if (!z.img_comp[k].linebuf)
             throw new STBImageException("Out of memory");

         r.hs      = z.img_h_max / z.img_comp[k].h;
         r.vs      = z.img_v_max / z.img_comp[k].v;
         r.ystep   = r.vs >> 1;
         r.w_lores = (z.s.img_x + r.hs-1) / r.hs;
         r.ypos    = 0;
         r.line0   = r.line1 = z.img_comp[k].data;

         if      (r.hs == 1 && r.vs == 1) r.resample = &resample_row_1;
         else if (r.hs == 1 && r.vs == 2) r.resample = &resample_row_v_2;
         else if (r.hs == 2 && r.vs == 1) r.resample = &resample_row_h_2;
         else if (r.hs == 2 && r.vs == 2) r.resample = &resample_row_hv_2;
         else                             r.resample = &resample_row_generic;
      }

      // one spare byte: YCbCr_to_RGB_row always writes a 4th byte per pixel,
      // which for n == 3 lands one byte past the last pixel
      output = cast(ubyte*) malloc(n * z.s.img_x * z.s.img_y + 1);
      if (!output)
         throw new STBImageException("Out of memory");

      // now go ahead and resample
      for (j=0; j < z.s.img_y; ++j) {
         ubyte *out_ = output + n * z.s.img_x * j;
         for (k=0; k < decode_n; ++k) {
            stbi_resample *r = &res_comp[k];
            // in the second half of the vertical expansion the lower row is the near one
            int y_bot = r.ystep >= (r.vs >> 1);
            coutput[k] = r.resample(z.img_comp[k].linebuf,
                                     y_bot ? r.line1 : r.line0,
                                     y_bot ? r.line0 : r.line1,
                                     r.w_lores, r.hs);
            if (++r.ystep >= r.vs) {
               r.ystep = 0;
               r.line0 = r.line1;
               if (++r.ypos < z.img_comp[k].y)
                  r.line1 += z.img_comp[k].w2;
            }
         }
         if (n >= 3) {
            ubyte *y = coutput[0];
            if (z.s.img_n == 3) {
               YCbCr_to_RGB_row(out_, y, coutput[1], coutput[2], z.s.img_x, n);
            } else
               for (i=0; i < z.s.img_x; ++i) {
                  out_[0] = out_[1] = out_[2] = y[i];
                  out_[3] = 255; // not used if n==3
                  out_ += n;
               }
         } else {
            ubyte *y = coutput[0];
            if (n == 1)
               for (i=0; i < z.s.img_x; ++i) out_[i] = y[i];
            else
               for (i=0; i < z.s.img_x; ++i) {
                  *out_++ = y[i];
                  *out_++ = 255;
               }
         }
      }
      cleanup_jpeg(z);
      *out_x = z.s.img_x;
      *out_y = z.s.img_y;
      if (comp) *comp  = z.s.img_n; // report original components, not output
      return output;
   }
}
1343 
/// Decodes a JPEG from `s` with a fresh decoder state; see load_jpeg_image
/// for the meaning of the remaining parameters and of the result.
ubyte* stbi_jpeg_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   jpeg decoder;
   decoder.s = s;
   ubyte* pixels = load_jpeg_image(&decoder, x, y, comp, req_comp);
   return pixels;
}
1350 
/// Checks that `s` starts like a JPEG by running the header decoder in
/// SCAN_type mode.
/// Throws: STBImageException when the header cannot be decoded.
void stbi_jpeg_test(stbi *s)
{
   jpeg probe;
   probe.s = s;
   if (!decode_jpeg_header(&probe, SCAN_type))
       throw new STBImageException("Couldn't decode JPEG header");
}
1359 
1360 
1361 // public domain zlib decode    v0.2  Sean Barrett 2006-11-18
1362 //    simple implementation
1363 //      - all input must be provided in an upfront buffer
1364 //      - all output is written to a single output buffer (can malloc/realloc)
1365 //    performance
1366 //      - fast huffman
1367 
1368 // fast-way is faster to check than jpeg huffman, but slow way is slower
// Number of low code-buffer bits used to index the `fast` lookup table.
enum ZFAST_BITS = 9; // accelerate all cases in default tables
// Bit mask selecting those low ZFAST_BITS bits.
enum ZFAST_MASK = ((1 << ZFAST_BITS) - 1);
1371 
1372 // zlib-style huffman encoding
1373 // (jpegs packs from left, zlib from right, so can't share code)
/// Decoding tables for one zlib huffman code, filled by zbuild_huffman.
struct zhuffman
{
   ushort[1 << ZFAST_BITS] fast; // symbol slot per low-bit pattern; 0xffff = not resolvable by the fast path
   ushort[16] firstcode;         // first code of each code length
   int[17] maxcode;              // per length: end of the code range, preshifted to 16 bits; [16] is a sentinel
   ushort[16] firstsymbol;       // slot index of the first symbol of each code length
   ubyte[288] size;              // code length stored per symbol slot
   ushort[288] value;            // symbol value stored per symbol slot
} ;
1383 
/// Reverses the order of the low 16 bits of `n`; higher bits are discarded.
int bitreverse16(int n)
{
  int r = n;
  r = ((r >> 1) & 0x5555) | ((r & 0x5555) << 1); // swap adjacent bits
  r = ((r >> 2) & 0x3333) | ((r & 0x3333) << 2); // swap bit pairs
  r = ((r >> 4) & 0x0F0F) | ((r & 0x0F0F) << 4); // swap nibbles
  r = ((r >> 8) & 0x00FF) | ((r & 0x00FF) << 8); // swap bytes
  return r;
}
1392 
/// Reverses the low `bits` bits of `v` (`bits` must not exceed 16).
int bit_reverse(int v, int bits)
{
   assert(bits <= 16);
   // reverse all 16 bits, then drop the (16 - bits) positions that are now
   // at the bottom; e.g. for 11 bits, reverse and shift away 5
   int reversed = bitreverse16(v);
   return reversed >> (16 - bits);
}
1400 
/// Builds the decoding tables of `z` from a list of code lengths.
///
/// Params:
///   z        = table set to fill.
///   sizelist = `num` code lengths, one per symbol; 0 means "symbol unused".
///   num      = number of entries in `sizelist`.
/// Returns: 1 (every failure is reported by throwing).
/// Throws: STBImageException when a length is above 15 or when the lengths
///         do not describe a valid code.
int zbuild_huffman(zhuffman *z, ubyte *sizelist, int num)
{
   int i,k=0;
   int code;
   int[16] next_code;
   int[17] sizes; // count of symbols per code length, starts zeroed

   // DEFLATE spec for generating codes
   z.fast[] = 0xffff; // 0xffff marks "not resolvable by the fast table"
   for (i=0; i < num; ++i) {
      // the lengths come from the compressed stream: validate before indexing
      if (sizelist[i] > 15)
         throw new STBImageException("Bad codelength, corrupt PNG");
      ++sizes[sizelist[i]];
   }
   sizes[0] = 0;
   for (i=1; i < 16; ++i)
      if (sizes[i] > (1 << i))
         throw new STBImageException("Bad sizes, corrupt PNG");
   code = 0;
   for (i=1; i < 16; ++i) {
      next_code[i] = code;
      z.firstcode[i] = cast(ushort) code;
      z.firstsymbol[i] = cast(ushort) k;
      code = (code + sizes[i]);
      if (sizes[i])
         if (code-1 >= (1 << i))
            throw new STBImageException("Bad codelength, corrupt PNG");
      z.maxcode[i] = code << (16-i); // preshift for inner loop
      code <<= 1;
      k += sizes[i];
   }
   z.maxcode[16] = 0x10000; // sentinel
   for (i=0; i < num; ++i) {
      int s = sizelist[i];
      if (s) {
         // slot of this symbol among all symbols, ordered by code
         int c = next_code[s] - z.firstcode[s] + z.firstsymbol[s];
         z.size[c] = cast(ubyte)s;
         z.value[c] = cast(ushort)i;
         if (s <= ZFAST_BITS) {
            // short codes: fill every fast-table entry whose low s bits match
            int k_ = bit_reverse(next_code[s],s);
            while (k_ < (1 << ZFAST_BITS)) {
               z.fast[k_] = cast(ushort) c;
               k_ += (1 << s);
            }
         }
         ++next_code[s];
      }
   }
   return 1;
}
1448 
1449 // zlib-from-memory implementation for PNG reading
1450 //    because PNG allows splitting the zlib stream arbitrarily,
1451 //    and it's annoying structurally to have PNG call ZLIB call PNG,
1452 //    we require PNG read all the IDATs and combine them into a single
1453 //    memory buffer
1454 
/// State of one inflate run: input cursor, bit buffer, output buffer and the
/// two huffman tables of the block being decoded.
struct zbuf
{
   const(ubyte) *zbuffer;     // next input byte to read
   const(ubyte) *zbuffer_end; // one past the last input byte
   int num_bits;              // number of valid bits in code_buffer
   uint code_buffer;          // bit buffer; new bytes are OR-ed in above the valid bits

   ubyte *zout;               // next output byte to write
   ubyte *zout_start;         // start of the output buffer
   ubyte *zout_end;           // one past the end of the output buffer
   int   z_expandable;        // nonzero: the output buffer may be grown with realloc

   zhuffman z_length, z_distance; // literal/length and distance codes
} ;
1469 
/// Returns the next input byte and advances the input cursor; returns 0
/// without advancing once the input is exhausted.
int zget8(zbuf *z)
{
   if (z.zbuffer < z.zbuffer_end)
   {
      int value = *z.zbuffer;
      ++z.zbuffer;
      return value;
   }
   return 0;
}
1475 
/// Appends input bytes to the bit buffer, one at a time, until it holds
/// more than 24 bits. At least one byte is always appended.
void fill_bits(zbuf *z)
{
   for (;;)
   {
      assert(z.code_buffer < (1U << z.num_bits));
      int next = zget8(z);
      z.code_buffer |= next << z.num_bits;
      z.num_bits += 8;
      if (z.num_bits > 24)
         break;
   }
}
1484 
/// Removes the lowest `n` bits from the bit buffer and returns them,
/// refilling the buffer first when it holds fewer than `n` bits.
uint zreceive(zbuf *z, int n)
{
   if (z.num_bits < n)
      fill_bits(z);

   uint mask = (1 << n) - 1;
   uint bits = z.code_buffer & mask;
   z.code_buffer >>= n;
   z.num_bits -= n;
   return bits;
}
1494 
/// Decodes one symbol of code `z` from the bit buffer of `a`.
/// Returns: the symbol value, or -1 when the bits do not form a valid code.
int zhuffman_decode(zbuf *a, zhuffman *z)
{
   int b,s,k;
   if (a.num_bits < 16) fill_bits(a);

   // fast path: codes of at most ZFAST_BITS bits are resolved by table lookup
   b = z.fast[a.code_buffer & ZFAST_MASK];
   if (b < 0xffff) {
      s = z.size[b];
      a.code_buffer >>= s;
      a.num_bits -= s;
      return z.value[b];
   }

   // not resolved by fast table, so compute it the slow way
   // use jpeg approach, which requires MSbits at top
   k = bit_reverse(a.code_buffer, 16);
   // maxcode[16] is a sentinel above every 16-bit value, so this terminates
   for (s=ZFAST_BITS+1; ; ++s)
      if (k < z.maxcode[s])
         break;
   if (s == 16) return -1; // invalid code!
   // code size is s, so:
   b = (k >> (16-s)) - z.firstcode[s] + z.firstsymbol[s];
   // the tables come from the compressed stream: report an inconsistent
   // slot as an invalid code instead of asserting or indexing out of range
   if (b < 0 || b >= cast(int) z.size.length) return -1;
   if (z.size[b] != s) return -1;
   a.code_buffer >>= s;
   a.num_bits -= s;
   return z.value[b];
}
1521 
/// Grows the output buffer of `z` so that at least `n` more bytes fit after
/// the current write position. The capacity is doubled until it is large
/// enough; the output pointers are updated to the reallocated block.
/// Returns: 1 (every failure is reported by throwing).
/// Throws: STBImageException when the buffer is not expandable, when the
///         required capacity does not fit in an int, or when realloc fails.
int expand(zbuf *z, int n)  // need to make room for n bytes
{
   if (!z.z_expandable)
      throw new STBImageException("Output buffer limit, corrupt PNG");

   // 64-bit arithmetic so the doubling below cannot wrap around
   long cur    = z.zout     - z.zout_start;
   long limit  = z.zout_end - z.zout_start;
   long needed = cur + n;

   // doubling an empty buffer would never make progress
   if (limit <= 0)
      limit = 1;
   while (needed > limit)
      limit *= 2;
   if (limit > int.max)
      throw new STBImageException("Out of memory");

   ubyte *q = cast(ubyte*) realloc(z.zout_start, cast(size_t) limit);
   if (q == null)
      throw new STBImageException("Out of memory");
   z.zout_start = q;
   z.zout       = q + cur;
   z.zout_end   = q + limit;
   return 1;
}
1540 
// Base match length per length symbol (indexed by symbol - 257).
static immutable int[31] length_base = [
   3,4,5,6,7,8,9,10,11,13,
   15,17,19,23,27,31,35,43,51,59,
   67,83,99,115,131,163,195,227,258,0,0 ];

// Number of extra bits to read and add to length_base, same indexing.
static immutable int[31] length_extra =
[ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 ];

// Base match distance per distance symbol.
static immutable int[32] dist_base = [ 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0];

// Number of extra bits to read and add to dist_base, same indexing.
// The last two entries are written out explicitly so that the initializer
// matches the declared length.
static immutable int[32] dist_extra =
[ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,0,0];
1554 
/// Decodes one huffman-compressed block into the output buffer of `a`,
/// using `a.z_length` and `a.z_distance`, until the end-of-block symbol
/// (256) is read.
/// Returns: 1 at end of block, 0 when expand reports failure.
/// Throws: STBImageException on an invalid code or a distance reaching
///         before the start of the output.
int parse_huffman_block(zbuf *a)
{
   while (true)
   {
      int sym = zhuffman_decode(a, &a.z_length);
      if (sym < 0)
          throw new STBImageException("Bad Huffman code, corrupt PNG");

      // literal byte
      if (sym < 256)
      {
         if (a.zout >= a.zout_end)
            if (!expand(a, 1))
               return 0;
         *a.zout = cast(ubyte) sym;
         ++a.zout;
         continue;
      }

      // end of block
      if (sym == 256)
         return 1;

      // match: length first, then distance
      int lengthIndex = sym - 257;
      int len = length_base[lengthIndex];
      if (length_extra[lengthIndex])
         len += zreceive(a, length_extra[lengthIndex]);

      int distIndex = zhuffman_decode(a, &a.z_distance);
      if (distIndex < 0)
         throw new STBImageException("Bad Huffman code, corrupt PNG");
      int dist = dist_base[distIndex];
      if (dist_extra[distIndex])
         dist += zreceive(a, dist_extra[distIndex]);

      if (a.zout - a.zout_start < dist)
         throw new STBImageException("Bad dist, corrupt PNG");
      if (a.zout + len > a.zout_end)
         if (!expand(a, len))
            return 0;

      // take the source pointer only now: expand may have moved the buffer.
      // Source and destination may overlap, so copy byte by byte, forwards.
      ubyte *src = a.zout - dist;
      for (; len > 0; --len)
      {
         *a.zout = *src;
         ++a.zout;
         ++src;
      }
   }
}
1583 
/// Reads the code-length description of a dynamic-huffman block and builds
/// `a.z_length` and `a.z_distance` from it.
/// Returns: 1 on success, 0 when zbuild_huffman reports failure.
/// Throws: STBImageException when the code lengths are malformed.
int compute_huffman_codes(zbuf *a)
{
   static immutable ubyte[19] length_dezigzag = [ 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 ];
   zhuffman z_codelength;
   ubyte[286+32+137] lencodes;//padding for maximum single op
   ubyte[19] codelength_sizes; // starts zeroed
   int i,n;

   int hlit  = zreceive(a,5) + 257;
   int hdist = zreceive(a,5) + 1;
   int hclen = zreceive(a,4) + 4;
   int total = hlit + hdist;

   // the lengths of the code-length code arrive in a fixed permuted order
   for (i=0; i < hclen; ++i) {
      int s = zreceive(a,3);
      codelength_sizes[length_dezigzag[i]] = cast(ubyte) s;
   }
   if (!zbuild_huffman(&z_codelength, codelength_sizes.ptr, 19)) return 0;

   n = 0;
   while (n < total) {
      int c = zhuffman_decode(a, &z_codelength);
      // the symbol comes from the compressed stream: validate, don't assert
      if (c < 0 || c >= 19)
         throw new STBImageException("Bad codelengths, corrupt PNG");
      if (c < 16) {
         lencodes[n++] = cast(ubyte) c;
         continue;
      }

      // 16 repeats the previous length, 17 and 18 emit runs of zeros
      ubyte fill = 0;
      int run;
      if (c == 16) {
         run = zreceive(a,2)+3;
         // there is no previous length to repeat at the very start
         if (n == 0)
            throw new STBImageException("Bad codelengths, corrupt PNG");
         fill = lencodes[n-1];
      } else if (c == 17) {
         run = zreceive(a,3)+3;
      } else {
         run = zreceive(a,7)+11;
      }
      // a run must not go past the announced number of lengths
      if (total - n < run)
         throw new STBImageException("Bad codelengths, corrupt PNG");
      lencodes[n .. n+run] = fill;
      n += run;
   }
   if (n != total) throw new STBImageException("Bad codelengths, corrupt PNG");
   if (!zbuild_huffman(&a.z_length, lencodes.ptr, hlit)) return 0;
   if (!zbuild_huffman(&a.z_distance, lencodes.ptr+hlit, hdist)) return 0;
   return 1;
}
1629 
/// Copies one stored (uncompressed) block from the input to the output.
/// The bit buffer is first aligned to a byte boundary and drained into the
/// 4-byte block header (length and its complement).
/// Returns: 1 on success, 0 when expand reports failure.
/// Throws: STBImageException when the two length fields disagree or the
///         block extends past the end of the input.
int parse_uncompressed_block(zbuf *a)
{
   ubyte[4] header;
   int len,nlen,k;
   if (a.num_bits & 7)
      zreceive(a, a.num_bits & 7); // discard
   // drain the bit-packed data into header
   k = 0;
   while (a.num_bits > 0) {
      header[k++] = cast(ubyte) (a.code_buffer & 255);
      a.code_buffer >>= 8;
      a.num_bits -= 8;
   }
   assert(a.num_bits == 0);
   // now fill header the normal way
   while (k < 4)
      header[k++] = cast(ubyte) zget8(a);
   // both fields are little-endian 16-bit values
   len  = header[1] * 256 + header[0];
   nlen = header[3] * 256 + header[2];
   if (nlen != (len ^ 0xffff)) throw new STBImageException("Zlib corrupt, corrupt PNG");
   if (a.zbuffer + len > a.zbuffer_end) throw new STBImageException("Read past buffer, corrupt PNG");
   if (a.zout + len > a.zout_end)
      if (!expand(a, len)) return 0;
   memcpy(a.zout, a.zbuffer, len);
   a.zbuffer += len;
   a.zout += len;
   return 1;
}
1658 
/// Reads and validates the two-byte zlib stream header.
/// Returns: 1 (every failure is reported by throwing).
/// Throws: STBImageException when the header check fails, a preset
///         dictionary is requested, or the compression method is not 8.
int parse_zlib_header(zbuf *a)
{
   int cmf = zget8(a);
   int flg = zget8(a);

   // zlib spec: the two bytes, read as a big-endian number, are a multiple of 31
   if ((cmf*256+flg) % 31 != 0)
      throw new STBImageException("Bad zlib header, corrupt PNG");

   // preset dictionary not allowed in png
   if (flg & 32)
      throw new STBImageException("No preset dict, corrupt PNG");

   // DEFLATE required for png; the method is the low nibble of the first byte
   int method = cmf & 15;
   if (method != 8)
      throw new STBImageException("Bad compression, corrupt PNG");

   // the window size in the high nibble is ignored: output is fully buffered
   return 1;
}
1671 
1672 // @TODO: should statically initialize these for optimal thread safety
// Code lengths of the fixed huffman codes, filled in by init_defaults().
__gshared ubyte[288] default_length;
__gshared ubyte[32] default_distance;

/// Fills `default_length` and `default_distance` with the code lengths of
/// the fixed huffman codes.
void init_defaults()
{
   // ranges are half-open: symbols 0..143, 144..255, 256..279 and 280..287
   default_length[  0 .. 144] = 8;
   default_length[144 .. 256] = 9;
   default_length[256 .. 280] = 7;
   default_length[280 .. 288] = 8;

   default_distance[] = 5;
}
1686 
__gshared int stbi_png_partial; // a quick hack to only allow decoding some of a PNG... I should implement real streaming support instead

/// Inflates the stream of `a` block by block into its output buffer.
///
/// Params:
///   a            = inflate state with input and output already set up.
///   parse_header = nonzero when the input starts with a zlib header.
/// Returns: 1 on success, 0 when a block has the reserved type 3 or a helper
///          reports failure.
int parse_zlib(zbuf *a, int parse_header)
{
   if (parse_header && !parse_zlib_header(a))
      return 0;

   a.num_bits = 0;
   a.code_buffer = 0;

   for (;;)
   {
      // block header: 1 "final block" bit, then 2 bits of block type
      int final_ = zreceive(a,1);
      int type = zreceive(a,2);

      if (type == 3)
         return 0;

      if (type == 0)
      {
         if (!parse_uncompressed_block(a))
            return 0;
      }
      else
      {
         if (type == 1)
         {
            // use fixed code lengths (built lazily on first use)
            if (!default_distance[31])
               init_defaults();
            if (!zbuild_huffman(&a.z_length  , default_length.ptr  , 288))
               return 0;
            if (!zbuild_huffman(&a.z_distance, default_distance.ptr,  32))
               return 0;
         }
         else if (!compute_huffman_codes(a))
         {
            return 0;
         }

         if (!parse_huffman_block(a))
            return 0;
      }

      // partial mode: stop once more than 64K of output is available
      if (stbi_png_partial && a.zout - a.zout_start > 65536)
         break;
      if (final_)
         break;
   }
   return 1;
}
1718 
/// Points `a` at the output buffer `obuf` of `olen` bytes, records whether
/// it may be grown (`exp`), and runs parse_zlib.
/// Returns: the result of parse_zlib.
int do_zlib(zbuf *a, ubyte *obuf, int olen, int exp, int parse_header)
{
   a.z_expandable = exp;

   a.zout_start = obuf;
   a.zout_end   = obuf + olen;
   a.zout       = a.zout_start;

   int ok = parse_zlib(a, parse_header);
   return ok;
}
1728 
/// Inflates a zlib stream (with header) into a malloc-allocated buffer that
/// starts at `initial_size` bytes and grows as needed.
///
/// Params:
///   buffer       = compressed input.
///   len          = number of input bytes.
///   initial_size = initial capacity of the output buffer.
///   outlen       = if non-null, receives the number of bytes produced.
/// Returns: the output buffer, or null when the initial allocation fails or
///          inflate reports failure. The buffer comes from malloc/realloc.
/// Throws: STBImageException on corrupt input; the output buffer is freed
///         before the exception propagates.
ubyte *stbi_zlib_decode_malloc_guesssize(const(ubyte) *buffer, int len, int initial_size, int *outlen)
{
   zbuf a;
   ubyte *p = cast(ubyte*) malloc(initial_size);
   if (p == null) return null;
   a.zbuffer = buffer;
   a.zbuffer_end = buffer + len;
   // Decode errors are mostly reported by throwing: release the (possibly
   // reallocated) output buffer on that path as well.
   a.zout_start = p;
   scope(failure) free(a.zout_start);
   if (do_zlib(&a, p, initial_size, 1, 1)) {
      if (outlen) *outlen = cast(int) (a.zout - a.zout_start);
      return a.zout_start;
   } else {
      free(a.zout_start);
      return null;
   }
}
1744 
/// Same as stbi_zlib_decode_malloc_guesssize with an initial output
/// capacity of 16384 bytes.
ubyte *stbi_zlib_decode_malloc(const(ubyte) *buffer, int len, int *outlen)
{
   enum int initialCapacity = 16384;
   return stbi_zlib_decode_malloc_guesssize(buffer, len, initialCapacity, outlen);
}
1749 
/// Inflates a stream into a malloc-allocated buffer that starts at
/// `initial_size` bytes and grows as needed.
///
/// Params:
///   buffer       = compressed input.
///   len          = number of input bytes.
///   initial_size = initial capacity of the output buffer.
///   outlen       = if non-null, receives the number of bytes produced.
///   parse_header = nonzero when the input starts with a zlib header.
/// Returns: the output buffer, or null when the initial allocation fails or
///          inflate reports failure. The buffer comes from malloc/realloc.
/// Throws: STBImageException on corrupt input; the output buffer is freed
///         before the exception propagates.
ubyte *stbi_zlib_decode_malloc_guesssize_headerflag(const(ubyte) *buffer, int len, int initial_size, int *outlen, int parse_header)
{
   zbuf a;
   ubyte *p = cast(ubyte*) malloc(initial_size);
   if (p == null) return null;
   a.zbuffer = buffer;
   a.zbuffer_end = buffer + len;
   // Decode errors are mostly reported by throwing: release the (possibly
   // reallocated) output buffer on that path as well.
   a.zout_start = p;
   scope(failure) free(a.zout_start);
   if (do_zlib(&a, p, initial_size, 1, parse_header)) {
      if (outlen) *outlen = cast(int) (a.zout - a.zout_start);
      return a.zout_start;
   } else {
      free(a.zout_start);
      return null;
   }
}
1765 
/// Inflates a zlib stream (with header) into the caller's fixed-size buffer.
/// Returns: the number of bytes written, or -1 when inflate reports failure.
int stbi_zlib_decode_buffer(ubyte* obuffer, int olen, const(ubyte)* ibuffer, int ilen)
{
   zbuf state;
   state.zbuffer = ibuffer;
   state.zbuffer_end = ibuffer + ilen;

   // not expandable (0), header expected (1)
   if (!do_zlib(&state, obuffer, olen, 0, 1))
      return -1;
   return cast(int) (state.zout - state.zout_start);
}
1776 
/// Inflates a raw deflate stream (no zlib header) into a malloc-allocated
/// buffer that starts at 16384 bytes and grows as needed.
///
/// Params:
///   buffer = compressed input.
///   len    = number of input bytes.
///   outlen = if non-null, receives the number of bytes produced.
/// Returns: the output buffer, or null when the initial allocation fails or
///          inflate reports failure. The buffer comes from malloc/realloc.
/// Throws: STBImageException on corrupt input; the output buffer is freed
///         before the exception propagates.
ubyte *stbi_zlib_decode_noheader_malloc(const(ubyte) *buffer, int len, int *outlen)
{
   zbuf a;
   ubyte *p = cast(ubyte*) malloc(16384);
   if (p == null) return null;
   a.zbuffer = buffer;
   a.zbuffer_end = buffer+len;
   // Decode errors are mostly reported by throwing: release the (possibly
   // reallocated) output buffer on that path as well.
   a.zout_start = p;
   scope(failure) free(a.zout_start);
   if (do_zlib(&a, p, 16384, 1, 0)) {
      if (outlen) *outlen = cast(int) (a.zout - a.zout_start);
      return a.zout_start;
   } else {
      free(a.zout_start);
      return null;
   }
}
1792 
/// Inflates a raw deflate stream (no zlib header) into the caller's
/// fixed-size buffer.
/// Returns: the number of bytes written, or -1 when inflate reports failure.
int stbi_zlib_decode_noheader_buffer(ubyte *obuffer, int olen, const(ubyte) *ibuffer, int ilen)
{
   zbuf state;
   state.zbuffer = ibuffer;
   state.zbuffer_end = ibuffer + ilen;

   // not expandable (0), no header (0)
   if (!do_zlib(&state, obuffer, olen, 0, 0))
      return -1;
   return cast(int) (state.zout - state.zout_start);
}
1803 
1804 // public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
1805 //    simple implementation
1806 //      - only 8-bit samples
1807 //      - no CRC checking
1808 //      - allocates lots of intermediate memory
1809 //        - avoids problem of streaming data between subsystems
1810 //        - avoids explicit window management
1811 //    performance
1812 //      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
1813 
1814 
/// Header of one PNG chunk as read by get_chunk_header: two consecutive
/// 32-bit values.
struct chunk
{
   uint length; // first 32-bit value of the header
   uint type;   // second 32-bit value; compare against PNG_TYPE(...) results
}
1820 
/// Packs four bytes into one 32-bit value, `a` in the most significant byte
/// and `d` in the least significant one.
uint PNG_TYPE(ubyte a, ubyte b, ubyte c, ubyte d)
{
   int upper = (a << 24) + (b << 16);
   int lower = (c << 8) + d;
   return upper + lower;
}
1825 
/// Reads a chunk header from `s`: the length first, then the type, each
/// with get32.
chunk get_chunk_header(stbi *s)
{
   uint length = get32(s);
   uint type   = get32(s);
   return chunk(length, type);
}
1833 
/// Consumes the first bytes of `s` and compares them with the 8-byte PNG
/// signature, stopping at the first mismatch.
/// Returns: 1 when all eight bytes match.
/// Throws: STBImageException on the first byte that differs.
static int check_png_header(stbi *s)
{
   static immutable ubyte[8] png_sig = [ 137, 80, 78, 71, 13, 10, 26, 10 ];
   foreach (expected; png_sig)
   {
       ubyte actual = get8u(s);
       if (actual != expected)
           throw new STBImageException("Bad PNG sig, not a PNG");
   }
   return 1;
}
1846 
/// Working state of the PNG decoder. The three buffers are malloc'ed by
/// parse_png_file and its helpers and released by do_png.
struct png
{
   /// Input stream and image description.
   stbi* s;

   /// Concatenated payload of the IDAT chunks (still compressed).
   ubyte* idata;

   /// Inflated IDAT data.
   ubyte* expanded;

   /// Decoded pixels.
   ubyte* out_;
}
1854 
1855 
/// Scanline filter identifiers. Values 0..4 are the filter bytes accepted from
/// the stream; the two *_first values are internal variants selected through
/// first_row_filter for the first row, which has no previous row to sample.
enum : int
{
   F_none        = 0,
   F_sub         = 1,
   F_up          = 2,
   F_avg         = 3,
   F_paeth       = 4,
   F_avg_first   = 5,
   F_paeth_first = 6
}
1861 
/// Maps a filter byte (0..4) to the filter actually applied on the first row,
/// so that the first row never samples a previous row.
static immutable ubyte[5] first_row_filter =
[
   F_none,         // F_none
   F_sub,          // F_sub
   F_none,         // F_up
   F_avg_first,    // F_avg
   F_paeth_first   // F_paeth
];
1866 
/// Paeth predictor: returns whichever of a, b, c is closest to a + b - c.
/// Ties are resolved in the order a, then b, then c.
static int paeth(int a, int b, int c)
{
   immutable int estimate = a + b - c;
   immutable int distA = abs(estimate - a);
   immutable int distB = abs(estimate - b);
   immutable int distC = abs(estimate - c);

   if (distA <= distB && distA <= distC)
      return a;
   return (distB <= distC) ? b : c;
}
1877 
1878 // create the png data from post-deflated data
/// Un-filters inflated PNG scanlines into a freshly malloc'ed a.out_.
///
/// Params:
///   a       = decoder state; a.out_ receives the output buffer
///   raw     = inflated data: per row, one filter byte followed by x * img_n bytes
///   raw_len = number of bytes available at raw
///   out_n   = components per output pixel; either s.img_n or s.img_n + 1
///             (the extra component is filled with 255)
///   x       = row width in pixels
///   y       = number of rows (forced to 1 when stbi_png_partial is set)
///
/// Returns: 1 on success.
/// Throws: STBImageException when raw_len does not fit the image, when a
///         filter byte is invalid, or when the allocation fails. On failure
///         a.out_ is released and reset to null.
static int create_png_image_raw(png *a, ubyte *raw, uint raw_len, int out_n, uint x, uint y)
{
   stbi *s = a.s;
   uint i,j,stride = x*out_n;
   int k;
   int img_n = s.img_n; // copy it into a local for later
   assert(out_n == s.img_n || out_n == s.img_n+1);
   if (stbi_png_partial) y = 1;

   // validate the input size before allocating, so that a corrupt file does
   // not leave an orphaned output buffer behind
   if (!stbi_png_partial) {
      if (s.img_x == x && s.img_y == y) {
         if (raw_len != (img_n * x + 1) * y) throw new STBImageException("Not enough pixels, corrupt PNG");
      } else { // interlaced:
         if (raw_len < (img_n * x + 1) * y) throw new STBImageException("Not enough pixels, corrupt PNG");
      }
   }

   a.out_ = cast(ubyte*) malloc(x * y * out_n);
   if (!a.out_) throw new STBImageException("Out of memory");

   // an invalid filter byte aborts decoding part-way; release the buffer then
   scope(failure) { free(a.out_); a.out_ = null; }

   for (j=0; j < y; ++j) {
      ubyte *cur = a.out_ + stride*j;
      ubyte *prior = cur - stride; // not dereferenced on the first row (see first_row_filter)
      int filter = *raw++;
      if (filter > 4) throw new STBImageException("Invalid filter, corrupt PNG");
      // if first row, use special filter that doesn't sample previous row
      if (j == 0) filter = first_row_filter[filter];
      // handle first pixel explicitly
      for (k=0; k < img_n; ++k) {
         switch (filter) {
            case F_none       : cur[k] = raw[k]; break;
            case F_sub        : cur[k] = raw[k]; break;
            case F_up         : cur[k] = cast(ubyte)(raw[k] + prior[k]); break;
            case F_avg        : cur[k] = cast(ubyte)(raw[k] + (prior[k]>>1)); break;
            case F_paeth      : cur[k] = cast(ubyte) (raw[k] + paeth(0,prior[k],0)); break;
            case F_avg_first  : cur[k] = raw[k]; break;
            case F_paeth_first: cur[k] = raw[k]; break;
            default: break;
         }
      }
      if (img_n != out_n) cur[img_n] = 255;
      raw += img_n;
      cur += out_n;
      prior += out_n;
      // this is a little gross, so that we don't switch per-pixel or per-component
      if (img_n == out_n) {

         for (i=x-1; i >= 1; --i, raw+=img_n,cur+=img_n,prior+=img_n)
            for (k=0; k < img_n; ++k)
            {
               switch (filter) {
                  case F_none:  cur[k] = raw[k]; break;
                  case F_sub:   cur[k] = cast(ubyte)(raw[k] + cur[k-img_n]); break;
                  case F_up:    cur[k] = cast(ubyte)(raw[k] + prior[k]); break;
                  case F_avg:   cur[k] = cast(ubyte)(raw[k] + ((prior[k] + cur[k-img_n])>>1)); break;
                  case F_paeth:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-img_n],prior[k],prior[k-img_n])); break;
                  case F_avg_first:    cur[k] = cast(ubyte)(raw[k] + (cur[k-img_n] >> 1)); break;
                  case F_paeth_first:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-img_n],0,0)); break;
                  default: break;
               }
            }
      } else {
         assert(img_n+1 == out_n);

         // same as above, but the output has one extra component per pixel,
         // which is set to 255 after each pixel
         for (i=x-1; i >= 1; --i, cur[img_n]=255,raw+=img_n,cur+=out_n,prior+=out_n)
            for (k=0; k < img_n; ++k)
            {
               switch (filter) {
                  case F_none:  cur[k] = raw[k]; break;
                  case F_sub:   cur[k] = cast(ubyte)(raw[k] + cur[k-out_n]); break;
                  case F_up:    cur[k] = cast(ubyte)(raw[k] + prior[k]); break;
                  case F_avg:   cur[k] = cast(ubyte)(raw[k] + ((prior[k] + cur[k-out_n])>>1)); break;
                  case F_paeth:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-out_n],prior[k],prior[k-out_n])); break;
                  case F_avg_first:    cur[k] = cast(ubyte)(raw[k] + (cur[k-out_n] >> 1)); break;
                  case F_paeth_first:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-out_n],0,0)); break;
                  default: break;
               }
            }
      }
   }
   return 1;
}
1958 
/// Decodes the inflated PNG data into a.out_, de-interlacing when required.
///
/// Params:
///   a          = decoder state; a.out_ receives the malloc'ed image
///   raw        = inflated scanline data
///   raw_len    = number of bytes available at raw
///   out_n      = components per output pixel
///   interlaced = zero for a plain image, non-zero for a seven-pass
///                interlaced one
///
/// Returns: 1 on success, 0 when a pass reports failure.
/// Throws: STBImageException on allocation failure or corrupt data.
int create_png_image(png *a, ubyte *raw, uint raw_len, int out_n, int interlaced)
{
   if (!interlaced)
      return create_png_image_raw(a, raw, raw_len, out_n, a.s.img_x, a.s.img_y);

   // origin and spacing of the pixels that belong to each of the 7 passes
   static immutable int[7] xorig = [ 0,4,0,2,0,1,0 ];
   static immutable int[7] yorig = [ 0,0,4,0,2,0,1 ];
   static immutable int[7] xspc  = [ 8,8,4,4,2,2,1 ];
   static immutable int[7] yspc  = [ 8,8,8,4,4,2,2 ];

   // every pass has to be decoded completely; restore the flag on every
   // exit path, including exceptions
   immutable int save = stbi_png_partial;
   stbi_png_partial = 0;
   scope(exit) stbi_png_partial = save;

   // de-interlacing
   ubyte *final_ = cast(ubyte*) malloc(a.s.img_x * a.s.img_y * out_n);
   if (final_ is null) throw new STBImageException("Out of memory");
   scope(failure) free(final_);

   immutable int img_n = a.s.img_n;

   for (int p=0; p < 7; ++p) {
      int i,j,x,y;
      // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
      x = (a.s.img_x - xorig[p] + xspc[p]-1) / xspc[p];
      y = (a.s.img_y - yorig[p] + yspc[p]-1) / yspc[p];
      if (x && y) {
         if (!create_png_image_raw(a, raw, raw_len, out_n, x, y)) {
            free(final_);
            return 0;
         }
         // scatter the pass pixels to their positions in the full image
         for (j=0; j < y; ++j)
            for (i=0; i < x; ++i)
               memcpy(final_ + (j*yspc[p]+yorig[p])*a.s.img_x*out_n + (i*xspc[p]+xorig[p])*out_n,
                      a.out_ + (j*x+i)*out_n, out_n);
         free(a.out_);
         a.out_ = null;

         // each raw row holds one filter byte plus img_n (not out_n) bytes
         // per pixel, so the input advances by the raw row size
         immutable int consumed = (x*img_n+1)*y;
         raw += consumed;
         raw_len -= consumed;
      }
   }
   a.out_ = final_;

   return 1;
}
1999 
/// Applies colour-key transparency to z.out_, which must already carry an
/// alpha component of 255 as its last component.
///
/// Params:
///   z     = decoder state holding the decoded pixels
///   tc    = key value; only tc[0] is used for 2-component output
///   out_n = components per pixel, 2 or 4
///
/// Returns: always 1.
static int compute_transparency(png *z, ubyte[3] tc, int out_n)
{
   stbi *s = z.s;
   immutable uint pixel_count = s.img_x * s.img_y;
   ubyte *px = z.out_;

   assert(out_n == 2 || out_n == 4);

   if (out_n == 2) {
      // grey + alpha: alpha is rewritten for every pixel
      for (uint left = pixel_count; left != 0; --left, px += 2)
         px[1] = (px[0] == tc[0] ? 0 : 255);
   } else {
      // RGBA: only pixels matching the key are touched
      for (uint left = pixel_count; left != 0; --left, px += 4) {
         immutable bool isKey = px[0] == tc[0] && px[1] == tc[1] && px[2] == tc[2];
         if (isKey)
            px[3] = 0;
      }
   }
   return 1;
}
2024 
/// Replaces the palette indices in a.out_ with the colours they refer to.
///
/// Params:
///   a         = decoder state; a.out_ holds one index byte per pixel and is
///               replaced by a new buffer (the old one is freed)
///   palette   = table with 4 bytes per entry
///   len       = not used by this function
///   pal_img_n = 3 to write three components per pixel; any other value
///               writes four
///
/// Returns: always 1.
/// Throws: STBImageException when the new buffer cannot be allocated.
int expand_palette(png *a, ubyte *palette, int len, int pal_img_n)
{
   immutable uint pixel_count = a.s.img_x * a.s.img_y;
   const(ubyte) *indices = a.out_;

   ubyte *expanded = cast(ubyte*) malloc(pixel_count * pal_img_n);
   if (expanded is null)
      throw new STBImageException("Out of memory");

   // nothing below can fail, so the new buffer cannot leak
   immutable bool withAlpha = (pal_img_n != 3);
   ubyte *dst = expanded;

   foreach (i; 0 .. pixel_count) {
      const(ubyte) *entry = palette + indices[i]*4;
      dst[0] = entry[0];
      dst[1] = entry[1];
      dst[2] = entry[2];
      if (withAlpha) {
         dst[3] = entry[3];
         dst += 4;
      } else {
         dst += 3;
      }
   }

   free(a.out_);
   a.out_ = expanded;

   return 1;
}
2062 
/// Walks the chunks of a PNG stream and, depending on `scan`, stops after the
/// signature, after the header information is known, or after the whole image
/// has been decoded into z.out_.
///
/// Params:
///   z        = decoder state; its three buffers are reset to null on entry
///   scan     = SCAN_type, SCAN_header or SCAN_load
///   req_comp = requested number of components (0 keeps the file's own)
///
/// Returns: 1 on success, 0 when inflating or image creation reports failure.
/// Throws: STBImageException for malformed or unsupported files and on
///         allocation failure. Buffers already stored in `z` stay owned by `z`.
int parse_png_file(png *z, int scan, int req_comp)
{
   ubyte[1024] palette;       // 256 entries of 4 bytes each
   ubyte pal_img_n=0;         // 0 = not paletted, otherwise 3 or 4
   ubyte has_trans=0;         // set by a tRNS chunk on a non-paletted image
   ubyte[3] tc;               // colour key read from tRNS
   uint ioff=0, idata_limit=0, i, pal_len=0;
   int first=1,k,interlace=0;
   stbi *s = z.s;

   z.expanded = null;
   z.idata = null;
   z.out_ = null;

   if (!check_png_header(s)) return 0;

   if (scan == SCAN_type) return 1;

   for (;;) {
      chunk c = get_chunk_header(s);
      switch (c.type) {
         case PNG_TYPE('I','H','D','R'): {
            int depth,color,comp,filter;
            if (!first) throw new STBImageException("Multiple IHDR, corrupt PNG");
            first = 0;
            if (c.length != 13) throw new STBImageException("Bad IHDR len, corrupt PNG");
            s.img_x = get32(s); if (s.img_x > (1 << 24)) throw new STBImageException("Very large image (corrupt?)");
            s.img_y = get32(s); if (s.img_y > (1 << 24)) throw new STBImageException("Very large image (corrupt?)");
            depth = get8(s);  if (depth != 8)        throw new STBImageException("8bit only, PNG not supported: 8-bit only");
            color = get8(s);  if (color > 6)         throw new STBImageException("Bad ctype, corrupt PNG");
            if (color == 3) pal_img_n = 3; else if (color & 1) throw new STBImageException("Bad ctype, corrupt PNG");
            comp  = get8(s);  if (comp) throw new STBImageException("Bad comp method, corrupt PNG");
            filter= get8(s);  if (filter) throw new STBImageException("Bad filter method, corrupt PNG");
            interlace = get8(s); if (interlace>1) throw new STBImageException("Bad interlace method, corrupt PNG");
            if (!s.img_x || !s.img_y) throw new STBImageException("0-pixel image, corrupt PNG");
            if (!pal_img_n) {
               s.img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
               if ((1 << 30) / s.img_x / s.img_n < s.img_y) throw new STBImageException("Image too large to decode");
               if (scan == SCAN_header) return 1;
            } else {
               // if paletted, then pal_n is our final components, and
               // img_n is # components to decompress/filter.
               s.img_n = 1;
               if ((1 << 30) / s.img_x / 4 < s.img_y) throw new STBImageException("Too large, corrupt PNG");
               // if SCAN_header, have to scan to see if we have a tRNS
            }
            break;
         }

         case PNG_TYPE('P','L','T','E'):  {
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if (c.length > 256*3) throw new STBImageException("invalid PLTE, corrupt PNG");
            pal_len = c.length / 3;
            if (pal_len * 3 != c.length) throw new STBImageException("invalid PLTE, corrupt PNG");
            for (i=0; i < pal_len; ++i) {
               palette[i*4+0] = get8u(s);
               palette[i*4+1] = get8u(s);
               palette[i*4+2] = get8u(s);
               palette[i*4+3] = 255;
            }
            break;
         }

         case PNG_TYPE('t','R','N','S'): {
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if (z.idata) throw new STBImageException("tRNS after IDAT, corrupt PNG");
            if (pal_img_n) {
               if (scan == SCAN_header) { s.img_n = 4; return 1; }
               if (pal_len == 0) throw new STBImageException("tRNS before PLTE, corrupt PNG");
               if (c.length > pal_len) throw new STBImageException("bad tRNS len, corrupt PNG");
               pal_img_n = 4;
               for (i=0; i < c.length; ++i)
                  palette[i*4+3] = get8u(s);
            } else {
               if (!(s.img_n & 1)) throw new STBImageException("tRNS with alpha, corrupt PNG");
               if (c.length != cast(uint) s.img_n*2) throw new STBImageException("bad tRNS len, corrupt PNG");
               has_trans = 1;
               for (k=0; k < s.img_n; ++k)
                  tc[k] = cast(ubyte) get16(s); // non 8-bit images will be larger
            }
            break;
         }

         case PNG_TYPE('I','D','A','T'): {
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if (pal_img_n && !pal_len) throw new STBImageException("no PLTE, corrupt PNG");
            if (scan == SCAN_header) { s.img_n = pal_img_n; return 1; }
            // keep the running total within int.max: a larger total would wrap
            // the 32-bit sums below, and the doubling loop could wrap
            // idata_limit to 0 and never terminate
            if (c.length > cast(uint) int.max - ioff) throw new STBImageException("IDAT too large, corrupt PNG");
            if (ioff + c.length > idata_limit) {
               ubyte *p;
               if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
               while (ioff + c.length > idata_limit)
                  idata_limit *= 2;
               // z.idata is only replaced on success, so the old block stays owned by z
               p = cast(ubyte*) realloc(z.idata, idata_limit); if (p == null) throw new STBImageException("Out of memory");
               z.idata = p;
            }
            if (!getn(s, z.idata+ioff,c.length)) throw new STBImageException("outofdata, corrupt PNG");
            ioff += c.length;
            break;
         }

         case PNG_TYPE('I','E','N','D'): {
            uint raw_len;
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if (scan != SCAN_load) return 1;
            if (z.idata == null) throw new STBImageException("no IDAT, corrupt PNG");
            z.expanded = stbi_zlib_decode_malloc_guesssize_headerflag(z.idata, ioff, 16384, cast(int *) &raw_len, 1);
            if (z.expanded == null) return 0; // zlib should set error
            free(z.idata); z.idata = null;
            // add an alpha component when the caller asked for exactly one
            // more component than the file has, or when a colour key is present
            if ((req_comp == s.img_n+1 && req_comp != 3 && !pal_img_n) || has_trans)
               s.img_out_n = s.img_n+1;
            else
               s.img_out_n = s.img_n;
            if (!create_png_image(z, z.expanded, raw_len, s.img_out_n, interlace)) return 0;
            if (has_trans)
               if (!compute_transparency(z, tc, s.img_out_n)) return 0;
            if (pal_img_n) {
               // pal_img_n == 3 or 4
               s.img_n = pal_img_n; // record the actual colors we had
               s.img_out_n = pal_img_n;
               if (req_comp >= 3) s.img_out_n = req_comp;
               if (!expand_palette(z, palette.ptr, pal_len, s.img_out_n))
                  return 0;
            }
            free(z.expanded); z.expanded = null;
            return 1;
         }

         default:
            // if critical, fail
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if ((c.type & (1 << 29)) == 0) {

               throw new STBImageException("PNG not supported: unknown chunk type");
            }
            skip(s, c.length);
            break;
      }
      // end of chunk, read and skip CRC
      get32(s);
   }
}
2204 
/// Decodes a complete PNG and hands the pixel buffer over to the caller.
///
/// Params:
///   p        = decoder state with p.s set to the input stream
///   x        = receives the image width
///   y        = receives the image height
///   n        = when non-null, receives the number of components in the file
///   req_comp = requested components per pixel, 0..4 (0 keeps the file's own)
///
/// Returns: the malloc'ed pixel buffer, or null when decoding or the format
///          conversion fails.
/// Throws: STBImageException for an invalid req_comp and for any error raised
///         while parsing. The decoder's intermediate buffers are released in
///         every case.
ubyte *do_png(png *p, int *x, int *y, int *n, int req_comp)
{
   if (req_comp < 0 || req_comp > 4)
      throw new STBImageException("Internal error: bad req_comp");

   // Errors are reported through exceptions, so the cleanup must also run
   // while one is propagating; otherwise out_/expanded/idata would leak.
   scope(exit)
   {
      free(p.out_);     p.out_     = null;
      free(p.expanded); p.expanded = null;
      free(p.idata);    p.idata    = null;
   }

   ubyte *result = null;
   if (parse_png_file(p, SCAN_load, req_comp)) {
      // take ownership of the pixels so the cleanup above does not free them
      result = p.out_;
      p.out_ = null;
      if (req_comp && req_comp != p.s.img_out_n) {
         result = convert_format(result, p.s.img_out_n, req_comp, p.s.img_x, p.s.img_y);
         p.s.img_out_n = req_comp;
         if (result == null) return result;
      }
      *x = p.s.img_x;
      *y = p.s.img_y;
      if (n) *n = p.s.img_n;
   }

   return result;
}
2228 
/// Loads a PNG from `s`; thin wrapper that sets up the decoder state for do_png.
ubyte *stbi_png_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   // first member is the stream; the buffer pointers start out null
   auto decoder = png(s);
   return do_png(&decoder, x, y, comp, req_comp);
}
2235 
/// Checks that the stream starts with the PNG signature.
/// Throws: STBImageException when it does not.
void stbi_png_test(stbi *s)
{
   immutable bool recognised = check_png_header(s) != 0;
   if (!recognised)
       throw new STBImageException("Couldn't decode PNG header");
}
2242 
2243 // Microsoft/Windows BMP image
2244 
/// Checks that the stream starts with a BMP file header followed by an info
/// header of a supported size (12, 40, 56 or 108 bytes).
/// Throws: STBImageException otherwise.
void stbi_bmp_test(stbi *s)
{
    foreach (char expected; "BM")
    {
        if (get8(s) != expected)
            throw new STBImageException("Couldn't decode BMP header");
    }

    get32le(s); // discard filesize
    get16le(s); // discard reserved
    get16le(s); // discard reserved
    get32le(s); // discard data offset

    immutable int headerSize = get32le(s);
    switch (headerSize)
    {
        case 12, 40, 56, 108:
            return;
        default:
            throw new STBImageException("Couldn't decode BMP header");
    }
}
2259 
2260 
2261 // returns 0..31 for the highest set bit
/// Returns the index (0..31) of the highest set bit of z, or -1 when z is 0.
int high_bit(uint z)
{
   if (z == 0)
      return -1;

   // binary search: drop the low half whenever the high half is non-empty
   static immutable int[5] steps = [ 16, 8, 4, 2, 1 ];
   int index = 0;
   foreach (step; steps)
   {
      if ((z >> step) != 0)
      {
         index += step;
         z >>= step;
      }
   }
   return index;
}
2273 
/// Returns the number of set bits in a.
int bitcount(uint a)
{
   // sum neighbouring fields of 1, 2 and 4 bits, then fold the byte sums together
   immutable uint pairs   = (a & 0x55555555) + ((a >> 1) & 0x55555555);         // max 2 per field
   immutable uint nibbles = (pairs & 0x33333333) + ((pairs >> 2) & 0x33333333); // max 4 per field
   immutable uint bytes   = (nibbles + (nibbles >> 4)) & 0x0f0f0f0f;            // max 8 per byte
   immutable uint halves  = bytes + (bytes >> 8);                               // max 16 in low byte
   immutable uint total   = halves + (halves >> 16);                            // max 32 in low byte
   return total & 0xff;
}
2283 
/// Shifts v right by `shift` (left when `shift` is negative) and then adds
/// copies of the result shifted right by bits, 2*bits, ... while that amount
/// is below 8, which replicates a `bits`-wide value across 8 bits.
int shiftsigned(int v, int shift, int bits)
{
   immutable int aligned = (shift < 0) ? (v << -shift) : (v >> shift);

   int result = aligned;
   for (int z = bits; z < 8; z += bits)
      result += aligned >> z;
   return result;
}
2300 
/// Decodes a BMP image (no 1-bpp, no RLE) into a malloc'ed pixel buffer.
///
/// Params:
///   s        = input stream positioned at the start of the file
///   x        = receives the image width
///   y        = receives the image height
///   comp     = when non-null, receives the number of components in the file
///              (4 when an alpha mask is present, otherwise 3)
///   req_comp = requested components per pixel; 0 keeps the file's own count
///
/// Returns: the pixel buffer, or null when the final format conversion fails.
/// Throws: STBImageException for malformed or unsupported files and on
///         allocation failure.
ubyte *bmp_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   ubyte *out_;
   uint mr=0,mg=0,mb=0,ma=0, fake_a=0;
   ubyte[4][256] pal;
   int psize=0,i,j,compress=0,width;
   int bpp, flip_vertically, pad, target, offset, hsz;
   if (get8(s) != 'B' || get8(s) != 'M') throw new STBImageException("not BMP, Corrupt BMP");
   get32le(s); // discard filesize
   get16le(s); // discard reserved
   get16le(s); // discard reserved
   offset = get32le(s);
   hsz = get32le(s);
   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108) throw new STBImageException("unknown BMP, BMP type not supported: unknown");
   if (hsz == 12) {
      s.img_x = get16le(s);
      s.img_y = get16le(s);
   } else {
      s.img_x = get32le(s);
      s.img_y = get32le(s);
   }
   if (get16le(s) != 1) throw new STBImageException("bad BMP");
   bpp = get16le(s);
   if (bpp == 1) throw new STBImageException("monochrome, BMP type not supported: 1-bit");
   // a positive height means the rows have to be flipped after decoding
   flip_vertically = (cast(int) s.img_y) > 0;
   s.img_y = abs(cast(int) s.img_y);
   if (hsz == 12) {
      if (bpp < 24)
         psize = (offset - 14 - 24) / 3;
   } else {
      compress = get32le(s);
      if (compress == 1 || compress == 2) throw new STBImageException("BMP RLE, BMP type not supported: RLE");
      get32le(s); // discard sizeof
      get32le(s); // discard hres
      get32le(s); // discard vres
      get32le(s); // discard colorsused
      get32le(s); // discard max important
      if (hsz == 40 || hsz == 56) {
         if (hsz == 56) {
            get32le(s);
            get32le(s);
            get32le(s);
            get32le(s);
         }
         if (bpp == 16 || bpp == 32) {
            mr = mg = mb = 0;
            if (compress == 0) {
               if (bpp == 32) {
                  mr = 0xffu << 16;
                  mg = 0xffu <<  8;
                  mb = 0xffu <<  0;
                  ma = 0xffu << 24;
                  fake_a = 1; // @TODO: check for cases like alpha value is all 0 and switch it to 255
               } else {
                  mr = 31u << 10;
                  mg = 31u <<  5;
                  mb = 31u <<  0;
               }
            } else if (compress == 3) {
               mr = get32le(s);
               mg = get32le(s);
               mb = get32le(s);
               // not documented, but generated by photoshop and handled by mspaint
               if (mr == mg && mg == mb) {
                  // ?!?!?
                  throw new STBImageException("bad BMP");
               }
            } else
               throw new STBImageException("bad BMP");
         }
      } else {
         assert(hsz == 108);
         mr = get32le(s);
         mg = get32le(s);
         mb = get32le(s);
         ma = get32le(s);
         get32le(s); // discard color space
         for (i=0; i < 12; ++i)
            get32le(s); // discard color space parameters
      }
      if (bpp < 16)
         psize = (offset - 14 - hsz) >> 2;
   }
   s.img_n = ma ? 4 : 3;
   if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
      target = req_comp;
   else
      target = s.img_n; // if they want monochrome, we'll post-convert
   out_ = cast(ubyte*) malloc(target * s.img_x * s.img_y);
   if (!out_) throw new STBImageException("Out of memory");
   if (bpp < 16) {
      // paletted image
      int z=0;
      if (psize == 0 || psize > 256) { free(out_); throw new STBImageException("invalid, Corrupt BMP"); }
      for (i=0; i < psize; ++i) {
         pal[i][2] = get8u(s);
         pal[i][1] = get8u(s);
         pal[i][0] = get8u(s);
         if (hsz != 12) get8(s);
         pal[i][3] = 255;
      }
      skip(s, offset - 14 - hsz - psize * (hsz == 12 ? 3 : 4));
      if (bpp == 4) width = (s.img_x + 1) >> 1;
      else if (bpp == 8) width = s.img_x;
      else { free(out_); throw new STBImageException("bad bpp, corrupt BMP"); }
      pad = (-width)&3; // rows are padded to a multiple of 4 bytes
      for (j=0; j < cast(int) s.img_y; ++j) {
         // two pixels per iteration: one byte holds two 4-bit indices
         for (i=0; i < cast(int) s.img_x; i += 2) {
            int v=get8(s),v2=0;
            if (bpp == 4) {
               v2 = v & 15;
               v >>= 4;
            }
            out_[z++] = pal[v][0];
            out_[z++] = pal[v][1];
            out_[z++] = pal[v][2];
            if (target == 4) out_[z++] = 255;
            if (i+1 == cast(int) s.img_x) break;
            v = (bpp == 8) ? get8(s) : v2;
            out_[z++] = pal[v][0];
            out_[z++] = pal[v][1];
            out_[z++] = pal[v][2];
            if (target == 4) out_[z++] = 255;
         }
         skip(s, pad);
      }
   } else {
      int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
      int z = 0;
      int easy=0;
      skip(s, offset - 14 - hsz);
      if (bpp == 24) width = 3 * s.img_x;
      else if (bpp == 16) width = 2*s.img_x;
      else /* bpp = 32 and pad = 0 */ width=0;
      pad = (-width) & 3;
      if (bpp == 24) {
         easy = 1;
      } else if (bpp == 32) {
         if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
            easy = 2;
      }
      if (!easy) {
         if (!mr || !mg || !mb) { free(out_); throw new STBImageException("bad masks, corrupt BMP"); }
         // right shift amt to put high bit in position #7;
         // each channel's width comes from its own mask
         rshift = high_bit(mr)-7; rcount = bitcount(mr);
         gshift = high_bit(mg)-7; gcount = bitcount(mg);
         bshift = high_bit(mb)-7; bcount = bitcount(mb);
         ashift = high_bit(ma)-7; acount = bitcount(ma); // only used when ma != 0
      }
      for (j=0; j < cast(int) s.img_y; ++j) {
         if (easy) {
            for (i=0; i < cast(int) s.img_x; ++i) {
               int a;
               out_[z+2] = get8u(s);
               out_[z+1] = get8u(s);
               out_[z+0] = get8u(s);
               z += 3;
               a = (easy == 2 ? get8(s) : 255);
               if (target == 4) out_[z++] = cast(ubyte) a;
            }
         } else {
            for (i=0; i < cast(int) s.img_x; ++i) {
               uint v = (bpp == 16 ? get16le(s) : get32le(s));
               int a;
               out_[z++] = cast(ubyte) shiftsigned(v & mr, rshift, rcount);
               out_[z++] = cast(ubyte) shiftsigned(v & mg, gshift, gcount);
               out_[z++] = cast(ubyte) shiftsigned(v & mb, bshift, bcount);
               a = (ma ? shiftsigned(v & ma, ashift, acount) : 255);
               if (target == 4) out_[z++] = cast(ubyte) a;
            }
         }
         skip(s, pad);
      }
   }
   if (flip_vertically) {
      ubyte t;
      // swap row j with row (height-1-j)
      for (j=0; j < cast(int) s.img_y>>1; ++j) {
         ubyte *p1 = out_ +      j     *s.img_x*target;
         ubyte *p2 = out_ + (s.img_y-1-j)*s.img_x*target;
         for (i=0; i < cast(int) s.img_x*target; ++i) {
            t = p1[i];
            p1[i] = p2[i];
            p2[i] = t;
         }
      }
   }

   if (req_comp && req_comp != target) {
      out_ = convert_format(out_, target, req_comp, s.img_x, s.img_y);
      if (out_ == null) return out_; // convert_format frees input on failure
   }

   *x = s.img_x;
   *y = s.img_y;
   if (comp) *comp = s.img_n;
   return out_;
}
2495 
/// Loads a BMP from `s`; forwards all arguments to bmp_load unchanged.
ubyte *stbi_bmp_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   ubyte *pixels = bmp_load(s, x, y, comp, req_comp);
   return pixels;
}
2500 
2501 // *************************************************************************************************
2502 // GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
/// One entry of the GIF LZW code table.
struct stbi_gif_lzw
{
   /// Code this entry extends; negative when the entry stands on its own.
   short prefix;

   /// `first` is copied from the prefix entry when a code is added;
   /// `suffix` is the palette index that stbi_out_gif_code emits for the entry.
   ubyte first, suffix;
}
2509 
/// Working state of the GIF decoder.
struct stbi_gif
{
   int w, h;                   // dimensions read by stbi_gif_header
   ubyte* out_;                // output buffer (always 4 components)
   int flags, bgindex, ratio, transparent, eflags;
   ubyte[4][256] pal;          // palette filled by stbi_gif_header; 256 entries of 4 bytes
   ubyte[4][256] lpal;         // second palette -- presumably the per-image (local) one; confirm in stbi_gif_load_next
   stbi_gif_lzw[4096] codes;   // LZW code table used by stbi_process_gif_raster
   ubyte* color_table;         // palette that stbi_out_gif_code reads colours from
   int parse, step;            // row-stepping state updated by stbi_out_gif_code
   int lflags;
   int start_x, start_y;
   int max_x, max_y;
   int cur_x, cur_y;           // added together to form the byte offset into out_
   int line_size;
}
2526 
/// Checks that the stream starts with "GIF87a" or "GIF89a".
/// Throws: STBImageException at the first byte that does not match.
void stbi_gif_test(stbi *s)
{
    foreach (char expected; "GIF8")
    {
        if (get8(s) != expected)
            throw new STBImageException("Couldn't decode GIF header");
    }

    immutable int versionDigit = get8(s);
    if (!(versionDigit == '9' || versionDigit == '7'))
        throw new STBImageException("Couldn't decode GIF header");

    if (get8(s) != 'a')
        throw new STBImageException("Couldn't decode GIF header");
}
2538 
/// Reads `num_entries` colours of 3 bytes each from the stream into `pal`.
///
/// Params:
///   s           = input stream
///   pal         = palette to fill. Taken by reference: a D static array
///                 parameter is otherwise copied, and the caller's palette
///                 would stay untouched.
///   num_entries = number of entries to read (at most 256)
///   transp      = index of the entry that gets alpha 0, or -1 for none;
///                 every other entry gets alpha 255
void stbi_gif_parse_colortable(stbi *s, ref ubyte[4][256] pal, int num_entries, int transp)
{
   for (int i = 0; i < num_entries; ++i) {
      // the three stream bytes are stored in reverse order
      pal[i][2] = get8u(s);
      pal[i][1] = get8u(s);
      pal[i][0] = get8u(s);
      pal[i][3] = (transp == i) ? 0 : 255;
   }
}
2549 
/// Reads the GIF signature and the screen descriptor into `g`.
///
/// Params:
///   s       = input stream
///   g       = decoder state receiving w, h, flags, bgindex and ratio;
///             `transparent` is reset to -1
///   comp    = when non-null, set to 4
///   is_info = non-zero to stop before the palette is read
///
/// Returns: always 1.
/// Throws: STBImageException when the signature is not "GIF87a"/"GIF89a".
int stbi_gif_header(stbi *s, stbi_gif *g, int *comp, int is_info)
{
   foreach (char expected; "GIF8")
   {
      if (get8(s) != expected)
         throw new STBImageException("not GIF, corrupt GIF");
   }

   immutable ubyte version_ = get8u(s);
   if (!(version_ == '7' || version_ == '9'))
      throw new STBImageException("not GIF, corrupt GIF");
   if (get8(s) != 'a')
      throw new STBImageException("not GIF, corrupt GIF");

   g.w           = get16le(s);
   g.h           = get16le(s);
   g.flags       = get8(s);
   g.bgindex     = get8(s);
   g.ratio       = get8(s);
   g.transparent = -1;

   if (comp != null) *comp = 4;  // can't actually tell whether it's 3 or 4 until we parse the comments

   // bit 7 of flags announces a palette; the low 3 bits encode its size
   immutable bool hasPalette = (g.flags & 0x80) != 0;
   if (!is_info && hasPalette)
      stbi_gif_parse_colortable(s,g.pal, 2 << (g.flags & 7), -1);

   return 1;
}
2576 
/// Writes the pixels encoded by LZW code `code` to g.out_ and advances the
/// output position.
void stbi_out_gif_code(stbi_gif *g, ushort code)
{
   // the prefix chain is stored backwards, so emit the prefixes first
   immutable short parent = g.codes[code].prefix;
   if (parent >= 0)
      stbi_out_gif_code(g, parent);

   // past the last row: drop the pixel
   if (g.cur_y >= g.max_y)
      return;

   ubyte *dst   = g.out_ + (g.cur_x + g.cur_y);
   ubyte *color = g.color_table + g.codes[code].suffix * 4;

   // entries with alpha below 128 leave the existing pixel untouched
   if (color[3] >= 128) {
      dst[0] = color[2];
      dst[1] = color[1];
      dst[2] = color[0];
      dst[3] = color[3];
   }
   g.cur_x += 4;

   if (g.cur_x < g.max_x)
      return;

   // end of row: back to the left edge, then down by the current step
   g.cur_x = g.start_x;
   g.cur_y += g.step;

   // when running off the bottom, restart from the top with a smaller step
   while (g.cur_y >= g.max_y && g.parse > 0) {
      g.step = (1 << g.parse) * g.line_size;
      g.cur_y = g.start_y + (g.step >> 1);
      --g.parse;
   }
}
2611 
/// Decodes the LZW-compressed raster data of one GIF image into g.out_.
///
/// Params:
///   s = input stream positioned at the LZW minimum-code-size byte
///   g = decoder state; the code table is rebuilt and pixels are written
///       through stbi_out_gif_code
///
/// Returns: g.out_ once the end-of-stream code or an empty data block is reached.
/// Throws: STBImageException for corrupt LZW data.
ubyte *stbi_process_gif_raster(stbi *s, stbi_gif *g)
{
   ubyte lzw_cs;
   int len;
   uint first;
   int codesize, codemask, avail, oldcode, bits, valid_bits, clear;
   stbi_gif_lzw *p;

   lzw_cs = get8u(s);
   // the table holds 4096 codes, so larger code sizes cannot be valid and
   // would overrun g.codes in the initialisation loop below
   if (lzw_cs > 12) throw new STBImageException("bad LZW code size, corrupt GIF");
   clear = 1 << lzw_cs;
   first = 1;
   codesize = lzw_cs + 1;
   codemask = (1 << codesize) - 1;
   bits = 0;
   valid_bits = 0;
   for (int init = 0; init < clear; init++) {
      g.codes[init].prefix = -1;
      g.codes[init].first = cast(ubyte) init;
      g.codes[init].suffix = cast(ubyte) init;
   }

   // support no starting clear code
   avail = clear+2;
   oldcode = -1;

   len = 0;
   for(;;) {
      if (valid_bits < codesize) {
         // not enough bits buffered for one code: pull in another byte
         if (len == 0) {
            len = get8(s); // start new block
            if (len == 0)
               return g.out_;
         }
         --len;
         bits |= cast(int) get8(s) << valid_bits;
         valid_bits += 8;
      } else {
         int code = bits & codemask;
         bits >>= codesize;
         valid_bits -= codesize;
         // @OPTIMIZE: is there some way we can accelerate the non-clear path?
         if (code == clear) {  // clear code
            codesize = lzw_cs + 1;
            codemask = (1 << codesize) - 1;
            avail = clear + 2;
            oldcode = -1;
            first = 0;
         } else if (code == clear + 1) { // end of stream code
            skip(s, len);
            while ((len = get8(s)) > 0)
               skip(s,len);
            return g.out_;
         } else if (code <= avail) {
            if (first) throw new STBImageException("no clear code, corrupt GIF");

            if (oldcode >= 0) {
               // check before touching the table so that a full table is
               // reported as corrupt data instead of indexing past the end
               if (avail >= 4096)        throw new STBImageException("too many codes, corrupt GIF");
               p = &g.codes[avail++];
               p.prefix = cast(short) oldcode;
               p.first = g.codes[oldcode].first;
               p.suffix = (code == avail) ? p.first : g.codes[code].first;
            } else if (code == avail)
               throw new STBImageException("illegal code in raster, corrupt GIF");

            // emit the code that was just decoded
            stbi_out_gif_code(g, cast(ushort) code);

            if ((avail & codemask) == 0 && avail <= 0x0FFF) {
               codesize++;
               codemask = (1 << codesize) - 1;
            }

            oldcode = code;
         } else {
            throw new STBImageException("illegal code in raster, corrupt GIF");
         }
      }
   }
}
2690 
/// Fills the whole output canvas `g.out_` (g.w * g.h pixels, 4 bytes each)
/// with the palette entry selected by `g.bgindex`. The first and third
/// palette bytes are swapped while copying; the fourth is copied unchanged.
void stbi_fill_gif_background(stbi_gif *g)
{
   ubyte *bg = g.pal[g.bgindex].ptr;
   const total = g.w * g.h * 4;

   // @OPTIMIZE: write a dword at a time
   int offset = 0;
   while (offset < total) {
      ubyte *px = g.out_ + offset;
      px[0] = bg[2];
      px[1] = bg[1];
      px[2] = bg[0];
      px[3] = bg[3];
      offset += 4;
   }
}
2704 
/// Decodes the next image of a GIF stream into the canvas held by `g`.
///
/// This function is designed to support animated gifs, although stb_image
/// doesn't support it. On the first call (`g.out_ == null`) the GIF header
/// is parsed and the canvas is allocated and filled with the background
/// colour; on later calls the existing canvas is reused (or copied, see
/// below).
///
/// Returns: the decoded pixels (passed through `convert_format` when
///          `req_comp` is non-zero and not 4), `null` if the header could
///          not be parsed, or `cast(ubyte*) 1` when the stream terminator
///          (0x3B) is reached.
/// Throws: STBImageException on allocation failure, oversized dimensions or
///         corrupt data.
ubyte *stbi_gif_load_next(stbi *s, stbi_gif *g, int *comp, int req_comp)
{
   int i;
   ubyte *old_out = null;

   if (g.out_ == null) {
      if (!stbi_gif_header(s, g, comp,0))     return null; // failure_reason set by stbi_gif_header
      // 4 * w * h is computed in int; refuse sizes that would wrap around
      // and produce an undersized allocation.
      if (g.h > 0 && g.w > (int.max / 4) / g.h)
         throw new STBImageException("image too large");
      g.out_ = cast(ubyte*) malloc(4 * g.w * g.h);
      if (g.out_ == null)                      throw new STBImageException("Out of memory");
      stbi_fill_gif_background(g);
   } else {
      // animated-gif-only path
      // Bits 2..4 of the graphic control flags equal to 3: decode onto a
      // copy so the previous frame stays intact in old_out.
      if (((g.eflags & 0x1C) >> 2) == 3) {
         old_out = g.out_;
         g.out_ = cast(ubyte*) malloc(4 * g.w * g.h);
         if (g.out_ == null)                   throw new STBImageException("Out of memory");
         memcpy(g.out_, old_out, g.w*g.h*4);
      }
   }
    
   for (;;) {
      switch (get8(s)) {
         case 0x2C: /* Image Descriptor */
         {
            int x, y, w, h;
            ubyte *o;

            x = get16le(s);
            y = get16le(s);
            w = get16le(s);
            h = get16le(s);
            // The sub-image must lie entirely inside the canvas.
            if (((x + w) > (g.w)) || ((y + h) > (g.h)))
               throw new STBImageException("bad Image Descriptor, corrupt GIF");

            // All positions below are byte offsets into the 4-bytes-per-pixel canvas.
            g.line_size = g.w * 4;
            g.start_x = x * 4;
            g.start_y = y * g.line_size;
            g.max_x   = g.start_x + w * 4;
            g.max_y   = g.start_y + h * g.line_size;
            g.cur_x   = g.start_x;
            g.cur_y   = g.start_y;

            g.lflags = get8(s);

            if (g.lflags & 0x40) {
               g.step = 8 * g.line_size; // first interlaced spacing
               g.parse = 3;
            } else {
               g.step = g.line_size;
               g.parse = 0;
            }

            if (g.lflags & 0x80) {
               // Local colour table present: it takes precedence for this image.
               stbi_gif_parse_colortable(s,g.lpal, 2 << (g.lflags & 7), g.eflags & 0x01 ? g.transparent : -1);
               g.color_table = &g.lpal[0][0];       
            } else if (g.flags & 0x80) {
               // Fall back to the global table, refreshing its alpha channel.
               for (i=0; i < 256; ++i)  // @OPTIMIZE: reset only the previous transparent
                  g.pal[i][3] = 255; 
               if (g.transparent >= 0 && (g.eflags & 0x01))
                  g.pal[g.transparent][3] = 0;
               g.color_table = &g.pal[0][0];
            } else
               throw new STBImageException("missing color table, corrupt GIF");
   
            o = stbi_process_gif_raster(s, g);
            if (o == null) return null;

            if (req_comp && req_comp != 4)
               o = convert_format(o, 4, req_comp, g.w, g.h);
            return o;
         }

         case 0x21: // Extension block (any label).
         {
            int len;
            if (get8(s) == 0xF9) { // Graphic Control Extension.
               len = get8(s);
               if (len == 4) {
                  g.eflags = get8(s);
                  get16le(s); // delay
                  g.transparent = get8(s);
               } else {
                  skip(s, len);
                  break;
               }
            }
            // Skip the remaining sub-blocks up to the zero-length terminator.
            while ((len = get8(s)) != 0)
               skip(s, len);
            break;
         }

         case 0x3B: // gif stream termination code
            return cast(ubyte*) 1;

         default:
            throw new STBImageException("unknown code, corrupt GIF");
      }
   }
}
2805 
/// Loads the first image of a GIF stream.
///
/// On success stores the canvas dimensions in `*x` and `*y` and returns the
/// pixel buffer produced by `stbi_gif_load_next`. Returns `null` if the
/// header could not be parsed or the stream ended before any image was
/// found.
ubyte *stbi_gif_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   ubyte *u = null;
   stbi_gif g={0};

   u = stbi_gif_load_next(s, &g, comp, req_comp);
   if (u == cast(void *) 1) {
      // end of animated gif marker: no image was decoded, but the canvas was
      // already allocated by stbi_gif_load_next, so release it here.
      free(g.out_);
      g.out_ = null;
      u = null;
   }
   if (u) {
      *x = g.w;
      *y = g.h;
   }

   return u;
}
2820 
2821