1 
2 /// D translation of stb_image-1.33 (http://nothings.org/stb_image.c)
3 ///
4 /// This port only supports:
5 /// $(UL
6 ///   $(LI PNG 8-bit-per-channel only.)
7 ///   $(LI JPEG baseline (no JPEG progressive).)
8 ///   $(LI GIF.)
9 ///   $(LI BMP non-1bpp, non-RLE.)
10 /// )
11 ///
12 /// TODO:
13 /// $(UL
14 ///   $(LI Support a range as input.)
15 ///  )
16 
17 //============================    Contributors    =========================
18 //
19 // Image formats                                Optimizations & bugfixes
20 // Sean Barrett (jpeg, png, bmp)                Fabian "ryg" Giesen
21 // Nicolas Schulz (hdr, psd)                                                 
22 // Jonathan Dummer (tga)                     Bug fixes & warning fixes           
23 // Jean-Marc Lienher (gif)                      Marc LeBlanc               
24 // Tom Seddon (pic)                             Christpher Lloyd           
25 // Thatcher Ulrich (psd)                        Dave Moore                 
26 // Won Chun                   
27 // the Horde3D community      
28 // Extensions, features                            Janez Zemva                
29 // Jetro Lauha (stbi_info)                      Jonathan Blow              
30 // James "moose2000" Brown (iPhone PNG)         Laurent Gomila                             
31 // Ben "Disch" Wenger (io callbacks)            Aruelien Pocheville
32 // Martin "SpartanJ" Golini                     Ryamond Barbiero
33 // David Woo
34 
35 module gfm.image.stb_image;
36 
import core.stdc.stdlib;
import core.stdc.string;
39 
40 import gfm.math.vector,
41        gfm.image.bitmap;
42 
43 import ae.utils.graphics.image;
44 
/// Version tag exposed by this module (a compile-time constant, value 1).
enum STBI_VERSION = 1;
46 
/// Exception raised whenever stb_image fails to decode an image.
class STBImageException : Exception
{
    /// Builds the exception; every argument is forwarded unchanged to
    /// the base Exception constructor.
    public @safe pure nothrow this(string message,
                                   string file = __FILE__,
                                   size_t line = __LINE__,
                                   Throwable next = null)
    {
        super(message, file, line, next);
    }
}
58 
/// Component-count constants; the numeric value of each named layout equals
/// its number of channels per pixel.
enum : int
{
   STBI_default    = 0, // only used for req_comp
   STBI_grey       = 1, // one channel
   STBI_grey_alpha = 2, // two channels
   STBI_rgb        = 3, // three channels
   STBI_rgb_alpha  = 4  // four channels
};
67 
68 // define faster low-level operations (typically SIMD support)
69 
70 
/// Rotates the 32-bit value `x` left by `y` bits.
///
/// Only the low five bits of `y` are used, so a count of 0 (or any multiple
/// of 32) returns `x` unchanged instead of performing a full-width shift,
/// which is not a valid shift count for a 32-bit operand.
/// For counts 1..31 the result is identical to the classic
/// `(x << y) | (x >> (32 - y))` formulation.
uint stbi_lrot(uint x, uint y)
{
    immutable uint n = y & 31;
    // (32 - n) & 31 maps n == 0 to a right shift of 0 rather than 32.
    return (x << n) | (x >> ((32 - n) & 31));
}
75 
// stbi structure is our basic context used by all images, so it
// contains all the IO context, plus some basic image information
struct stbi
{
   // Basic image information; presumably filled in by the individual
   // loaders (width/height and channel counts) — confirm against callers.
   uint img_x, img_y;
   int img_n, img_out_n;

   int buflen;
   // D-style static array declaration (the C-style `ubyte name[128]` form
   // is not supported by current D compilers).
   ubyte[128] buffer_start;

   const(ubyte) *img_buffer;          // current read position
   const(ubyte) *img_buffer_end;      // one past the last readable byte
   const(ubyte) *img_buffer_original; // start of the buffer, used by stbi_rewind
}
90 
91 
// Point the context `s` at an in-memory image: `buffer` is the first byte
// and `len` the number of readable bytes that follow it.
void start_mem(stbi *s, const(ubyte)*buffer, int len)
{
   const(ubyte)* first = buffer;
   const(ubyte)* pastLast = first + len;

   s.img_buffer_original = first;
   s.img_buffer = first;
   s.img_buffer_end = pastLast;
}
99 
// Move the read cursor of `s` back to the first byte of the buffer it was
// started on.
void stbi_rewind(stbi *s)
{
   // A true rewind would go back to the beginning of the stream; going back
   // to the start of the initial buffer is enough here because rewinding is
   // only done after a format 'test', which never looks at more than 92 bytes.
   auto origin = s.img_buffer_original;
   s.img_buffer = origin;
}
107 
108 
// Try each supported decoder in turn (JPEG, PNG, BMP, GIF) on the context `s`.
//
// For every format the matching *_test function is called first; the stream
// is then rewound and the matching *_load result is returned. If either step
// throws an STBImageException, the stream is rewound and the next format is
// tried.
//
// Params: x, y, comp receive the decoder's outputs; req_comp is forwarded
//         unchanged to the decoder.
// Returns: whatever the successful *_load function returns.
// Throws: STBImageException when no decoder succeeded.
ubyte *stbi_load_main(stbi *s, int *x, int *y, int *comp, int req_comp)
{
    // JPEG
    try
    {
        stbi_jpeg_test(s);
        stbi_rewind(s);
        return stbi_jpeg_load(s,x,y,comp,req_comp);
    }
    catch(STBImageException e)
    {
        // not a (decodable) JPEG: start over for the next format
        stbi_rewind(s);
    }

    // PNG
    try
    {
        stbi_png_test(s);
        stbi_rewind(s);
        return stbi_png_load(s,x,y,comp,req_comp);
    }
    catch(STBImageException e)
    {
        stbi_rewind(s);
    }

    // BMP
    try
    {
        stbi_bmp_test(s);
        stbi_rewind(s);
        return stbi_bmp_load(s,x,y,comp,req_comp);
    }
    catch(STBImageException e)
    {
        stbi_rewind(s);
    }

    // GIF
    try
    {
        stbi_gif_test(s);
        stbi_rewind(s);
        return stbi_gif_load(s,x,y,comp,req_comp);
    }
    catch(STBImageException e)
    {
        stbi_rewind(s);
    }

    // every decoder rejected the data
    throw new STBImageException("Image not of any known type, or corrupt");
}
157 
/// Loads an image from memory.
///
/// Params:
///   buffer              = the encoded image bytes.
///   width, height       = receive the values reported by the decoder.
///   components          = receives the component count reported by the decoder.
///   requestedComponents = forwarded to the decoder as the requested
///                         number of components.
/// Returns: the decoded pixel data; release it with stbi_image_free.
/// Throws: STBImageException on error, including when `buffer` is larger
///         than the decoder's int-sized length can describe.
ubyte* stbi_load_from_memory(void[] buffer, out int width, out int height, out int components, int requestedComponents)
{
   // The context stores the length as an int; refuse buffers that would be
   // silently truncated (and possibly turn negative) by the cast below.
   if (buffer.length > int.max)
      throw new STBImageException("Image buffer too large");

   stbi s;
   start_mem(&s, cast(ubyte*)buffer.ptr, cast(int)(buffer.length));
   return stbi_load_main(&s, &width, &height, &components, requestedComponents);
}
166 
/// Releases pixel data previously returned by one of the stb_image loading
/// functions by handing it to the C runtime's `free`.
void stbi_image_free(void *retval_from_stbi_load)
{
    core.stdc.stdlib.free(retval_from_stbi_load);
}
172 
/// Loads an image from memory and puts it in a Bitmap.
///
/// The image is decoded with four requested components and the decoded
/// bytes are copied into a freshly created `Bitmap!vec4ub`; the temporary
/// decoder buffer is freed on exit.
/// See_also: Bitmap.
/// Throws: STBImageException on error.
deprecated("Use ae.utils.graphics instead")
Bitmap!vec4ub stbiLoadImage(void[] buffer)
{
    int width, height, components;
    ubyte* data = stbi_load_from_memory(buffer, width, height, components, 4);
    scope(exit) stbi_image_free(data);

    if(components != 4)
        throw new STBImageException("Could't convert image to 4 components");

    auto result = Bitmap!vec4ub(vec2i(width, height));

    // Copy whole pixels: the byte count is pixels * bytes-per-pixel, not
    // just the pixel count (which would copy only part of the image).
    immutable size_t byteCount = cast(size_t) width * height * vec4ub.sizeof;
    memcpy(result.ptr, data, byteCount);
    return result;
}
190 
/// Decodes an image held in memory into an ae.utils.graphics.image.Image
/// of vec4ub pixels. The temporary decoder buffer is freed on exit.
/// Throws: STBImageException on error.
Image!vec4ub stbiLoadImageAE(void[] buffer)
{
    int width;
    int height;
    int components;

    ubyte* data = stbi_load_from_memory(buffer, width, height, components, 4);
    scope(exit) stbi_image_free(data);

    if (components != 4)
    {
        throw new STBImageException("Could't convert image to 4 components");
    }

    auto result = Image!vec4ub(width, height);

    // View the decoded bytes as pixels and copy them element-wise.
    size_t byteCount = width * height * vec4ub.sizeof;
    ubyte[] rawBytes = data[0..byteCount];
    result.pixels[] = cast(vec4ub[])(rawBytes);
    return result;
}
207 
208 //
209 // Common code used by all image loaders
210 //
211 
// Scan-mode constants with consecutive values 0, 1, 2. Their use is not
// visible in this part of the file; judging by the names they select between
// full loading, type detection only, and header parsing only.
enum : int
{
   SCAN_load=0,
   SCAN_type,
   SCAN_header
};
218 
219 
// Read the next byte from the context and advance past it.
// Returns 0, without moving, once the end of the buffer has been reached.
int get8(stbi *s)
{
   if (s.img_buffer >= s.img_buffer_end)
      return 0;

   int value = *s.img_buffer;
   ++s.img_buffer;
   return value;
}
227 
// Returns 1 when the read cursor is at or beyond the end of the buffer,
// 0 while there are still bytes left.
int at_eof(stbi *s)
{
   return (s.img_buffer < s.img_buffer_end) ? 0 : 1;
}
232 
// Same as get8, but with the result narrowed to an unsigned byte.
ubyte get8u(stbi *s)
{
   int raw = get8(s);
   return cast(ubyte) raw;
}
237 
// Move the read cursor by `n` bytes. No bounds checking is performed here.
void skip(stbi *s, int n)
{
   s.img_buffer = s.img_buffer + n;
}
242 
// Copy the next `n` bytes of the stream into `buffer` and advance past them.
// Returns 1 on success; returns 0 and copies nothing when fewer than `n`
// bytes remain.
int getn(stbi *s, ubyte *buffer, int n)
{
   const(ubyte)* next = s.img_buffer + n;
   if (next > s.img_buffer_end)
      return 0;

   memcpy(buffer, s.img_buffer, n);
   s.img_buffer = next;
   return 1;
}
252 
// Read a 16-bit big-endian value (most significant byte first).
int get16(stbi *s)
{
   int high = get8(s);
   int low = get8(s);
   return (high << 8) + low;
}
258 
// Read a 32-bit big-endian value as two consecutive 16-bit big-endian halves.
uint get32(stbi *s)
{
   uint high = get16(s);
   uint low = get16(s);
   return (high << 16) + low;
}
264 
// Read a 16-bit little-endian value (least significant byte first).
int get16le(stbi *s)
{
   int low = get8(s);
   int high = get8(s);
   return low + (high << 8);
}
270 
// Read a 32-bit little-endian value as two consecutive 16-bit little-endian
// halves (low half first).
uint get32le(stbi *s)
{
   uint low = get16le(s);
   uint high = get16le(s);
   return low + (high << 16);
}
276 
277 //
278 //  generic converter from built-in img_n to req_comp
279 //    individual types do this automatically as much as possible (e.g. jpeg
280 //    does all cases internally since it needs to colorspace convert anyway,
281 //    and it never has alpha, so very few cases ). png can automatically
282 //    interleave an alpha=255 channel, but falls back to this for other cases
283 //
284 //  assume data buffer is malloced, so malloc a new one and free that one
285 //  only failure mode is malloc failing
286 
// Weighted sum of the three colour channels (weights 77, 150 and 29 out of
// 256), used to collapse RGB into a single grey value.
ubyte compute_y(int r, int g, int b)
{
   immutable int weighted = (77 * r) + (150 * g) + (29 * b);
   return cast(ubyte) (weighted >> 8);
}
291 
// Convert an interleaved image of `x` by `y` pixels from `img_n` components
// per pixel to `req_comp` components per pixel.
//
// When the two counts are equal, `data` is returned untouched. Otherwise a
// new buffer is malloc'ed, filled row by row, `data` is freed and the new
// buffer is returned. Added alpha is 255; reductions to grey use compute_y
// for colour sources and the first channel for grey+alpha sources.
//
// Throws: STBImageException("Out of memory") when the allocation fails
//         (`data` is freed first).
ubyte *convert_format(ubyte *data, int img_n, int req_comp, uint x, uint y)
{
    if (req_comp == img_n) return data;
    assert(req_comp >= 1 && req_comp <= 4);

    ubyte *converted = cast(ubyte*) malloc(req_comp * x * y);
    if (converted == null) {
        free(data);
        throw new STBImageException("Out of memory");
    }

    // The (source, target) pair is constant for the whole image, so the
    // dispatch key is computed once and switched on once per scanline.
    immutable int conversion = img_n * 8 + req_comp;

    for (int row = 0; row < cast(int) y; ++row) {
        ubyte *src  = data + row * x * img_n;
        ubyte *dest = converted + row * x * req_comp;
        int left;

        switch (conversion)
        {
            case 1 * 8 + 2: // grey -> grey+alpha
                for (left = x-1; left >= 0; --left, src += 1, dest += 2) {
                    dest[0] = src[0];
                    dest[1] = 255;
                }
                break;

            case 1 * 8 + 3: // grey -> rgb
                for (left = x-1; left >= 0; --left, src += 1, dest += 3) {
                    immutable ubyte grey = src[0];
                    dest[2] = grey;
                    dest[1] = grey;
                    dest[0] = grey;
                }
                break;

            case 1 * 8 + 4: // grey -> rgba
                for (left = x-1; left >= 0; --left, src += 1, dest += 4) {
                    immutable ubyte grey = src[0];
                    dest[2] = grey;
                    dest[1] = grey;
                    dest[0] = grey;
                    dest[3] = 255;
                }
                break;

            case 2 * 8 + 1: // grey+alpha -> grey
                for (left = x-1; left >= 0; --left, src += 2, dest += 1) {
                    dest[0] = src[0];
                }
                break;

            case 2 * 8 + 3: // grey+alpha -> rgb
                for (left = x-1; left >= 0; --left, src += 2, dest += 3) {
                    immutable ubyte grey = src[0];
                    dest[2] = grey;
                    dest[1] = grey;
                    dest[0] = grey;
                }
                break;

            case 2 * 8 + 4: // grey+alpha -> rgba
                for (left = x-1; left >= 0; --left, src += 2, dest += 4) {
                    immutable ubyte grey = src[0];
                    dest[2] = grey;
                    dest[1] = grey;
                    dest[0] = grey;
                    dest[3] = src[1];
                }
                break;

            case 3 * 8 + 4: // rgb -> rgba
                for (left = x-1; left >= 0; --left, src += 3, dest += 4) {
                    dest[0] = src[0];
                    dest[1] = src[1];
                    dest[2] = src[2];
                    dest[3] = 255;
                }
                break;

            case 3 * 8 + 1: // rgb -> grey
                for (left = x-1; left >= 0; --left, src += 3, dest += 1) {
                    dest[0] = compute_y(src[0], src[1], src[2]);
                }
                break;

            case 3 * 8 + 2: // rgb -> grey+alpha
                for (left = x-1; left >= 0; --left, src += 3, dest += 2) {
                    dest[0] = compute_y(src[0], src[1], src[2]);
                    dest[1] = 255;
                }
                break;

            case 4 * 8 + 1: // rgba -> grey
                for (left = x-1; left >= 0; --left, src += 4, dest += 1) {
                    dest[0] = compute_y(src[0], src[1], src[2]);
                }
                break;

            case 4 * 8 + 2: // rgba -> grey+alpha
                for (left = x-1; left >= 0; --left, src += 4, dest += 2) {
                    dest[0] = compute_y(src[0], src[1], src[2]);
                    dest[1] = src[3];
                }
                break;

            case 4 * 8 + 3: // rgba -> rgb
                for (left = x-1; left >= 0; --left, src += 4, dest += 3) {
                    dest[0] = src[0];
                    dest[1] = src[1];
                    dest[2] = src[2];
                }
                break;

            default: assert(0);
        }
    }

    free(data);
    return converted;
}
369 
370 //
371 //  "baseline" JPEG/JFIF decoder (not actually fully baseline implementation)
372 //
373 //    simple implementation
374 //      - channel subsampling of at most 2 in each dimension
375 //      - doesn't support delayed output of y-dimension
376 //      - simple interface (only one output format: 8-bit interleaved RGB)
377 //      - doesn't try to recover corrupt jpegs
378 //      - doesn't allow partial loading, loading multiple at once
379 //      - still fast on x86 (copying globals into locals doesn't help x86)
380 //      - allocates lots of intermediate memory (full size of all components)
381 //        - non-interleaved case requires this anyway
382 //        - allows good upsampling (see next)
383 //    high-quality
384 //      - upsampled channels are bilinearly interpolated, even across blocks
385 //      - quality integer IDCT derived from IJG's 'slow'
386 //    performance
387 //      - fast huffman; reasonable integer IDCT
388 //      - uses a lot of intermediate memory, could cache poorly
389 //      - load http://nothings.org/remote/anemones.jpg 3 times on 2.8Ghz P4
390 //          stb_jpeg:   1.34 seconds (MSVC6, default release build)
391 //          stb_jpeg:   1.06 seconds (MSVC6, processor = Pentium Pro)
392 //          IJL11.dll:  1.08 seconds (compiled by intel)
393 //          IJG 1998:   0.98 seconds (MSVC6, makefile provided by IJG)
394 //          IJG 1998:   0.95 seconds (MSVC6, makefile + proc=PPro)
395 
// huffman decoding acceleration
// Number of leading bits used to index the huffman `fast` lookup table,
// which therefore has 1 << FAST_BITS entries.
enum FAST_BITS = 9;  // larger handles more cases; smaller stomps less cache
398 
// One JPEG huffman table, as built by build_huffman and read by decode.
struct huffman
{
   // Lookup indexed by the next FAST_BITS bits of the stream: holds the
   // symbol index for codes no longer than FAST_BITS, or 255 when the
   // code is longer and the slow path must be used.
   ubyte[1 << FAST_BITS] fast;
   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
   ushort[256] code;   // huffman code assigned to each symbol index
   ubyte[256] values;  // decoded value for each symbol index
   ubyte[257] size;    // code length in bits per symbol index, 0-terminated
   uint[18] maxcode;   // per code length: largest code + 1, preshifted to 16 bits
   int[17] delta;   // old 'firstsymbol' - old 'firstcode'
}
409 
// Decoder state for one JPEG image.
struct jpeg
{
   stbi *s;                 // input context the bytes are read from
   huffman[4] huff_dc;      // DC huffman tables, selected per component by `hd`
   huffman[4] huff_ac;      // AC huffman tables, selected per component by `ha`
   ubyte[64][4] dequant;    // dequantization tables, selected per component by `tq`

// sizes for components, interleaved MCUs
   int img_h_max, img_v_max;
   int img_mcu_x, img_mcu_y; // MCU columns / rows iterated by the interleaved scan
   int img_mcu_w, img_mcu_h;

// definition of jpeg image component
   struct img_comp_
   {
      int id;        // component identifier matched against the scan header
      int h,v;       // blocks per MCU horizontally / vertically
      int tq;        // index into `dequant`
      int hd,ha;     // indexes into `huff_dc` / `huff_ac`
      int dc_pred;   // running DC predictor, cleared by reset()

      int x,y,w2,h2; // x,y: size used to count blocks; w2: row stride of `data`
      ubyte *data;   // plane the IDCT output is written to
      void *raw_data;
      ubyte *linebuf;
   } 
   
   img_comp_[4] img_comp;

   uint         code_buffer; // jpeg entropy-coded buffer
   int            code_bits;   // number of valid bits
   ubyte          marker;      // marker seen while filling entropy buffer
   int            nomore;      // flag if we saw a marker so must stop

   int scan_n;                  // number of components in the current scan
   int[4] order;                // component index for each scan slot
   int restart_interval, todo;  // restart interval and MCUs left before the next restart
}
448 
449 
// Build the decoding tables of `h` from `count`, where count[i] is the
// number of codes of length i+1 bits (16 entries are read).
//
// Fills h.size, h.code, h.delta, h.maxcode and the h.fast acceleration
// table; h.values is not touched here.
// Returns: 1 on success.
// Throws: STBImageException when the code lengths cannot form a valid code.
// NOTE(review): the total of count[] is not checked against the capacity of
// h.size / h.code in this function.
int build_huffman(huffman *h, int *count)
{
   int i,j,k=0,code;
   // build size list for each symbol (from JPEG spec)
   for (i=0; i < 16; ++i)
      for (j=0; j < count[i]; ++j)
         h.size[k++] = cast(ubyte) (i+1);
   h.size[k] = 0; // terminator: no symbol has length 0

   // compute actual symbols (from jpeg spec)
   code = 0;
   k = 0;
   for(j=1; j <= 16; ++j) {
      // compute delta to add to code to compute symbol id
      h.delta[j] = k - code;
      if (h.size[k] == j) {
         while (h.size[k] == j)
            h.code[k++] = cast(ushort) (code++);
         // the last code handed out must still fit in j bits
         if (code-1 >= (1 << j)) 
             throw new STBImageException("Bad code lengths, corrupt JPEG");
      }
      // compute largest code + 1 for this size, preshifted as needed later
      h.maxcode[j] = code << (16-j);
      code <<= 1;
   }
   // sentinel at index 17 so the search loop in decode always terminates
   h.maxcode[j] = 0xffffffff;

   // build non-spec acceleration table; 255 is flag for not-accelerated
   memset(h.fast.ptr, 255, 1 << FAST_BITS);
   for (i=0; i < k; ++i) {
      int s = h.size[i];
      if (s <= FAST_BITS) {
         // every FAST_BITS-wide bit pattern that starts with this code
         // maps to symbol index i
         int c = h.code[i] << (FAST_BITS-s);
         int m = 1 << (FAST_BITS-s);
         for (j=0; j < m; ++j) {
            h.fast[c+j] = cast(ubyte) i;
         }
      }
   }
   return 1;
}
491 
// Refill the entropy bit buffer of `j` one byte at a time until it holds
// more than 24 valid bits.
//
// A 0xff byte followed by a non-zero byte is a marker: it is recorded in
// j.marker, j.nomore is set and refilling stops immediately. Once j.nomore
// is set, zero bytes are fed instead of reading further input.
void grow_buffer_unsafe(jpeg *j)
{
   for (;;) {
      int next = 0;
      if (!j.nomore)
         next = get8(j.s);

      if (next == 0xff) {
         int following = get8(j.s);
         if (following != 0) {
            j.marker = cast(ubyte) following;
            j.nomore = 1;
            return;
         }
      }

      // append the byte just below the bits that are already valid
      j.code_buffer |= next << (24 - j.code_bits);
      j.code_bits += 8;

      if (j.code_bits > 24)
         break;
   }
}
508 
// (1 << n) - 1
// bmask[n] has the low n bits set, for n in 0..16.
// Declared with D-style array syntax; the C-style `uint bmask[17]` form is
// not supported by current D compilers.
static immutable uint[17] bmask = [0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535];
511 
// decode a jpeg huffman value from the bitstream
//
// Consumes one huffman code from the bit buffer of `j` using table `h`.
// Returns: the decoded value (an entry of h.values), or -1 when no code
//          matches or the buffer does not hold enough valid bits.
int decode(jpeg *j, huffman *h)
{
   uint temp;
   int c,k;

   if (j.code_bits < 16) grow_buffer_unsafe(j);

   // look at the top FAST_BITS and determine what symbol ID it is,
   // if the code is <= FAST_BITS
   c = (j.code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
   k = h.fast[c];
   if (k < 255) {
      int s = h.size[k];
      if (s > j.code_bits)
         return -1; // the code would use bits that are not valid
      j.code_buffer <<= s;
      j.code_bits -= s;
      return h.values[k];
   }

   // naive test is to shift the code_buffer down so k bits are
   // valid, then test against maxcode. To speed this up, we've
   // preshifted maxcode left so that it has (16-k) 0s at the
   // end; in other words, regardless of the number of bits, it
   // wants to be compared against something shifted to have 16;
   // that way we don't need to shift inside the loop.
   temp = j.code_buffer >> 16;
   // terminates at k == 17 at the latest thanks to the 0xffffffff sentinel
   for (k=FAST_BITS+1 ; ; ++k)
      if (temp < h.maxcode[k])
         break;
   if (k == 17) {
      // error! code not found
      j.code_bits -= 16;
      return -1;
   }

   if (k > j.code_bits)
      return -1;

   // convert the huffman code to the symbol id
   c = ((j.code_buffer >> (32 - k)) & bmask[k]) + h.delta[k];
   assert((((j.code_buffer) >> (32 - h.size[c])) & bmask[h.size[c]]) == h.code[c]);

   // convert the id to a symbol
   j.code_bits -= k;
   j.code_buffer <<= k;
   return h.values[c];
}
561 
// combined JPEG 'receive' and JPEG 'extend', since baseline
// always extends everything it receives.
//
// Reads the next `n` bits from the bit buffer of `j` and returns them as a
// signed value: when the top bit of the n-bit field is clear the result is
// the field plus (-1 << n) + 1, otherwise the field itself.
// The visible callers only pass a non-zero `n`.
int extend_receive(jpeg *j, int n)
{
   uint m = 1 << (n-1); // value of the top bit of an n-bit field
   uint k;
   if (j.code_bits < n) grow_buffer_unsafe(j);

   // rotate the top n bits down to the bottom, then split them off
   k = stbi_lrot(j.code_buffer, n);
   j.code_buffer = k & ~bmask[n];
   k &= bmask[n];
   j.code_bits -= n;

   // the following test is probably a random branch that won't
   // predict well. I tried to table accelerate it but failed.
   // maybe it's compiling as a conditional move?
   if (k < m)
      return (-1 << n) + k + 1;
   else
      return k;
}
583 
// given a value that's at position X in the zigzag stream,
// where does it appear in the 8x8 matrix coded as row-major?
//
// Declared with D-style array syntax; the C-style `ubyte dezigzag[64+15]`
// form is not supported by current D compilers. The 15 extra entries keep
// an index that ran past 63 on corrupt input inside the table.
static immutable ubyte[64+15] dezigzag =
[
    0,  1,  8, 16,  9,  2,  3, 10,
   17, 24, 32, 25, 18, 11,  4,  5,
   12, 19, 26, 33, 40, 48, 41, 34,
   27, 20, 13,  6,  7, 14, 21, 28,
   35, 42, 49, 56, 57, 50, 43, 36,
   29, 22, 15, 23, 30, 37, 44, 51,
   58, 59, 52, 45, 38, 31, 39, 46,
   53, 60, 61, 54, 47, 55, 62, 63,
   // let corrupt input sample past end
   63, 63, 63, 63, 63, 63, 63, 63,
   63, 63, 63, 63, 63, 63, 63
];
600 
// decode one 64-entry block--
//
// Decodes the DC and AC coefficients of one 8x8 block into `data`, stored
// in row-major (de-zigzagged) order.
//
// Params:
//   j    = decoder state (bit buffer and per-component DC predictors).
//   data = receives the 64 coefficients. Taken by `ref`: a D static array
//          parameter is otherwise passed by value, and the decoded
//          coefficients would be written to a copy the caller never sees.
//   hdc  = huffman table for the DC coefficient.
//   hac  = huffman table for the AC coefficients.
//   b    = index of the component whose DC predictor is updated.
// Returns: 1 on success.
// Throws: STBImageException when a huffman code cannot be decoded.
int decode_block(jpeg *j, ref short[64] data, huffman *hdc, huffman *hac, int b)
{
   int diff,dc,k;
   int t = decode(j, hdc);
   if (t < 0)
       throw new STBImageException("Bad huffman code, corrupt JPEG");

   // clear every coefficient; only the non-zero ones are written below
   data[] = 0;

   // the DC value is coded as a difference from the previous block
   diff = t ? extend_receive(j, t) : 0;
   dc = j.img_comp[b].dc_pred + diff;
   j.img_comp[b].dc_pred = dc;
   data[0] = cast(short) dc;

   // decode AC components, see JPEG spec
   k = 1;
   do {
      int r,s;
      int rs = decode(j, hac);
      if (rs < 0)
         throw new STBImageException("Bad huffman code, corrupt JPEG");
      s = rs & 15; // number of bits of the coefficient
      r = rs >> 4; // number of zero coefficients to skip first
      if (s == 0) {
         if (rs != 0xf0) break; // end block
         k += 16;               // 0xf0: run of 16 zeroes
      } else {
         k += r;
         // decode into unzigzag'd location
         data[dezigzag[k++]] = cast(short) extend_receive(j,s);
      }
   } while (k < 64);
   return 1;
}
637 
// Saturate an integer to the 0..255 range and return it as a byte:
// negative inputs give 0, inputs above 255 give 255.
ubyte clamp(int x)
{
   if (x < 0)
      return 0;
   if (x > 255)
      return 255;
   return cast(ubyte) x;
}
648 
// Convert a floating-point constant to fixed point with 12 fractional bits:
// scale by 4096, add 0.5 and truncate to int.
int f2f(double x)
{
    return cast(int)(0.5 + 4096 * x);
}
653 
// Shift an integer up by 12 bits, i.e. into the same fixed-point scale that
// f2f produces.
int fsh(int x)
{
    enum int FIXED_POINT_BITS = 12;
    return x << FIXED_POINT_BITS;
}
658 
// derived from jidctint -- DCT_ISLOW
//
// One 8-point 1-D inverse DCT pass in 12-bit fixed point (see f2f / fsh).
// Inputs s0..s7 are the eight samples; the results are returned through
// the out parameters as the even part (x0..x3) and the odd part (t0..t3),
// which the caller combines as x0+t3, x0-t3, x1+t2, x1-t2, x2+t1, x2-t1,
// x3+t0 and x3-t0.
// Note that t0..t3 are reused as scratch before receiving their final
// values, so the statement order below matters.
void IDCT_1D(int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7,
             out int t0, out int t1, out int t2, out int t3,
             out int x0, out int x1, out int x2, out int x3)
{
   int p1,p2,p3,p4,p5; 
   // even part: inputs 2 and 6
   p2 = s2;                                    
   p3 = s6;                                    
   p1 = (p2+p3) * f2f(0.5411961f);             
   t2 = p1 + p3*f2f(-1.847759065f);            
   t3 = p1 + p2*f2f( 0.765366865f);            
   // even part: inputs 0 and 4
   p2 = s0;                                    
   p3 = s4;                                    
   t0 = fsh(p2+p3);                            
   t1 = fsh(p2-p3);                            
   x0 = t0+t3;                                 
   x3 = t0-t3;                                 
   x1 = t1+t2;                                 
   x2 = t1-t2;                                 
   // odd part: inputs 7, 5, 3 and 1 (t0..t3 are reused from here on)
   t0 = s7;                                    
   t1 = s5;                                    
   t2 = s3;                                    
   t3 = s1;                                    
   p3 = t0+t2;                                 
   p4 = t1+t3;                                 
   p1 = t0+t3;                                 
   p2 = t1+t2;                                 
   p5 = (p3+p4)*f2f( 1.175875602f);            
   t0 = t0*f2f( 0.298631336f);                 
   t1 = t1*f2f( 2.053119869f);                 
   t2 = t2*f2f( 3.072711026f);                 
   t3 = t3*f2f( 1.501321110f);                 
   p1 = p5 + p1*f2f(-0.899976223f);            
   p2 = p5 + p2*f2f(-2.562915447f);            
   p3 = p3*f2f(-1.961570560f);                 
   p4 = p4*f2f(-0.390180644f);                 
   t3 += p1+p4;                                
   t2 += p2+p3;                                
   t1 += p2+p4;                                
   t0 += p1+p3;
 }
700 
/// Element type of a dequantization table as consumed by idct_block.
alias stbi_dequantize_t = ubyte;
702 
// .344 seconds on 3*anemones.jpg
//
// Dequantize one 8x8 coefficient block and run the 2-D inverse DCT on it,
// writing 8 rows of 8 clamped bytes to `out_`.
//
// Params:
//   out_       = destination of the first output row.
//   out_stride = distance in bytes between consecutive output rows.
//   data       = the 64 coefficients in row-major order (D-style static
//                array parameter; the C-style `short data[64]` form is not
//                supported by current D compilers).
//   dequantize = 64 multipliers, applied element-wise to `data`.
void idct_block(ubyte *out_, int out_stride, short[64] data, stbi_dequantize_t *dequantize)
{
   int i;
   int[64] val;
   int*v = val.ptr;
   stbi_dequantize_t *dq = dequantize;
   ubyte *o;
   short *d = data.ptr;

   // columns
   for (i=0; i < 8; ++i,++d,++dq, ++v) {
      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
           && d[40]==0 && d[48]==0 && d[56]==0) {
         //    no shortcut                 0     seconds
         //    (1|2|3|4|5|6|7)==0          0     seconds
         //    all separate               -0.047 seconds
         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
         int dcterm = d[0] * dq[0] << 2;
         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
      } else {
         int t0, t1, t2, t3, x0, x1, x2, x3;
         IDCT_1D(d[ 0]*dq[ 0],d[ 8]*dq[ 8],d[16]*dq[16],d[24]*dq[24],
                 d[32]*dq[32],d[40]*dq[40],d[48]*dq[48],d[56]*dq[56],
                 t0, t1, t2, t3, x0, x1, x2, x3);
         // constants scaled things up by 1<<12; let's bring them back
         // down, but keep 2 extra bits of precision
         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
         v[ 0] = (x0+t3) >> 10;
         v[56] = (x0-t3) >> 10;
         v[ 8] = (x1+t2) >> 10;
         v[48] = (x1-t2) >> 10;
         v[16] = (x2+t1) >> 10;
         v[40] = (x2-t1) >> 10;
         v[24] = (x3+t0) >> 10;
         v[32] = (x3-t0) >> 10;
      }
   }

   // rows
   for (i=0, v=val.ptr, o=out_; i < 8; ++i,v+=8,o+=out_stride) {

      // no fast case since the first 1D IDCT spread components out
      int t0, t1, t2, t3, x0, x1, x2, x3;
      IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7], t0, t1, t2, t3, x0, x1, x2, x3);
      // constants scaled things up by 1<<12, plus we had 1<<2 from first
      // loop, plus horizontal and vertical each scale by sqrt(8) so together
      // we've got an extra 1<<3, so 1<<17 total we need to remove.
      // so we want to round that, which means adding 0.5 * 1<<17,
      // aka 65536. Also, we'll end up with -128 to 127 that we want
      // to encode as 0..255 by adding 128, so we'll add that before the shift
      x0 += 65536 + (128<<17);
      x1 += 65536 + (128<<17);
      x2 += 65536 + (128<<17);
      x3 += 65536 + (128<<17);
      // tried computing the shifts into temps, or'ing the temps to see
      // if any were out of range, but that was slower
      o[0] = clamp((x0+t3) >> 17);
      o[7] = clamp((x0-t3) >> 17);
      o[1] = clamp((x1+t2) >> 17);
      o[6] = clamp((x1-t2) >> 17);
      o[2] = clamp((x2+t1) >> 17);
      o[5] = clamp((x2-t1) >> 17);
      o[3] = clamp((x3+t0) >> 17);
      o[4] = clamp((x3-t0) >> 17);
   }
}
770 
771 
// Sentinel stored in jpeg.marker / returned by get_marker when no marker is
// pending or found.
enum MARKER_none = 0xff;
773 
// Return the next JPEG marker.
// A marker remembered from the entropy stream is handed out first (and
// cleared). Otherwise a byte is read: anything other than 0xff means there
// is no marker and MARKER_none (0xff, never a valid marker value) is
// returned; after a 0xff, the first following byte that is not 0xff is
// returned.
ubyte get_marker(jpeg *j)
{
   if (j.marker != MARKER_none) {
      immutable ubyte pending = j.marker;
      j.marker = MARKER_none;
      return pending;
   }

   ubyte current = get8u(j.s);
   if (current != 0xff)
      return MARKER_none;

   // skip any run of 0xff fill bytes
   do {
      current = get8u(j.s);
   } while (current == 0xff);

   return current;
}
787 
788 // in each scan, we'll have scan_n components, and the order
789 // of the components is specified by order[]
// True when `x` lies in the restart marker range 0xd0..0xd7 (inclusive).
bool RESTART(int x)
{
    return !(x < 0xd0 || x > 0xd7);
}
794 
// Bring the entropy decoder back to its initial state: empty the bit
// buffer, forget any pending marker, clear the DC predictors of the first
// three components and reload the MCU countdown.
void reset(jpeg *j)
{
   j.code_buffer = 0;
   j.code_bits = 0;
   j.nomore = 0;
   j.marker = MARKER_none;

   foreach (component; 0 .. 3)
      j.img_comp[component].dc_pred = 0;

   // Without a restart interval the countdown is effectively infinite:
   // no more than 1<<31 MCUs is plenty safe, since we don't even allow
   // 1<<30 pixels.
   if (j.restart_interval)
      j.todo = j.restart_interval;
   else
      j.todo = 0x7fffffff;
}
808 
// Decode the entropy-coded data of the current scan into the component
// planes of `z`.
//
// A scan with a single component is processed block by block in scanline
// order; otherwise the components are interleaved MCU by MCU. After each
// MCU the restart countdown is decremented; when it expires, decoding
// continues only if a restart marker is pending.
//
// Returns: 1 (also when decoding stops early at a missing restart marker,
//          so that partial data is kept); 0 if decode_block reports failure.
// Throws: whatever decode_block throws on corrupt data.
int parse_entropy_coded_data(jpeg *z)
{
   reset(z);
   if (z.scan_n == 1) {
      int i,j;
      short[64] data; // D-style declaration, matching the interleaved branch
      int n = z.order[0];
      // non-interleaved data, we just need to process one block at a time,
      // in trivial scanline order
      // number of blocks to do just depends on how many actual "pixels" this
      // component has, independent of interleaved MCU blocking and such
      int w = (z.img_comp[n].x+7) >> 3;
      int h = (z.img_comp[n].y+7) >> 3;
      for (j=0; j < h; ++j) {
         for (i=0; i < w; ++i) {
            if (!decode_block(z, data, z.huff_dc.ptr+z.img_comp[n].hd, z.huff_ac.ptr+z.img_comp[n].ha, n)) return 0;
            idct_block(z.img_comp[n].data+z.img_comp[n].w2*j*8+i*8, z.img_comp[n].w2, data, z.dequant[z.img_comp[n].tq].ptr);
            // every data block is an MCU, so countdown the restart interval
            if (--z.todo <= 0) {
               if (z.code_bits < 24) grow_buffer_unsafe(z);
               // if it's NOT a restart, then just bail, so we get corrupt data
               // rather than no data
               if (!RESTART(z.marker)) return 1;
               reset(z);
            }
         }
      }
   } else { // interleaved!
      int i,j,k,x,y;
      short[64] data;
      for (j=0; j < z.img_mcu_y; ++j) {
         for (i=0; i < z.img_mcu_x; ++i) {
            // scan an interleaved mcu... process scan_n components in order
            for (k=0; k < z.scan_n; ++k) {
               int n = z.order[k];
               // scan out an mcu's worth of this component; that's just determined
               // by the basic H and V specified for the component
               for (y=0; y < z.img_comp[n].v; ++y) {
                  for (x=0; x < z.img_comp[n].h; ++x) {
                     // top-left pixel of this block inside the component plane
                     int x2 = (i*z.img_comp[n].h + x)*8;
                     int y2 = (j*z.img_comp[n].v + y)*8;
                     if (!decode_block(z, data, z.huff_dc.ptr+z.img_comp[n].hd, z.huff_ac.ptr+z.img_comp[n].ha, n)) return 0;
                     idct_block(z.img_comp[n].data+z.img_comp[n].w2*y2+x2, z.img_comp[n].w2, data, z.dequant[z.img_comp[n].tq].ptr);
                  }
               }
            }
            // after all interleaved components, that's an interleaved MCU,
            // so now count down the restart interval
            if (--z.todo <= 0) {
               if (z.code_bits < 24) grow_buffer_unsafe(z);
               // if it's NOT a restart, then just bail, so we get corrupt data
               // rather than no data
               if (!RESTART(z.marker)) return 1;
               reset(z);
            }
         }
      }
   }
   return 1;
}
869 
// Handle the marker segment identified by `m`, reading its payload from z.s.
//
// Supported segments: DRI (restart interval), DQT (quantization tables),
// DHT (huffman tables), and APPn / COM segments, which are skipped.
//
// Returns: 1 when the segment was consumed successfully; 0 for an
//          unhandled marker or when a DQT/DHT segment length does not match
//          its contents.
// Throws: STBImageException for a missing marker, progressive JPEG, or
//         malformed segment contents.
int process_marker(jpeg *z, int m)
{
   int L;
   switch (m) {
      
      case MARKER_none: // no marker found
         throw new STBImageException("Expected marker, corrupt JPEG");

      case 0xC2: // SOF - progressive
          throw new STBImageException("JPEG format not supported (progressive)");

      case 0xDD: // DRI - specify restart interval
         if (get16(z.s) != 4) 
             throw new STBImageException("Bad DRI len, corrupt JPEG");
         z.restart_interval = get16(z.s);
         return 1;

      case 0xDB: // DQT - define quantization table
         L = get16(z.s)-2;
         while (L > 0) {
            int q = get8(z.s);
            int p = q >> 4;    // precision: only 0 is accepted
            int t = q & 15,i;  // destination table index
            if (p != 0)
               throw new STBImageException("Bad DQT type, corrupt JPEG");
            if (t > 3) 
               throw new STBImageException("Bad DQT table, corrupt JPEG");
            // entries arrive in zigzag order; store them row-major
            for (i=0; i < 64; ++i)
               z.dequant[t][dezigzag[i]] = get8u(z.s);
            L -= 65;
         }
         return L==0;

      case 0xC4: // DHT - define huffman table
         L = get16(z.s)-2;
         while (L > 0) {
            ubyte *v;
            int[16] sizes;
            int i;
            int m_ = 0;         // total number of symbols in this table
            int q = get8(z.s);
            int tc = q >> 4;    // table class: 0 = DC, otherwise AC
            int th = q & 15;    // destination table index
            if (tc > 1 || th > 3) 
                throw new STBImageException("Bad DHT header, corrupt JPEG");
            for (i=0; i < 16; ++i) {
               sizes[i] = get8(z.s);
               m_ += sizes[i];
            }
            // A table holds at most 256 symbols; a larger total would
            // overrun the fixed-size huffman arrays (values is written
            // through a raw pointer below, so nothing else would catch it).
            if (m_ > 256)
                throw new STBImageException("Bad DHT header, corrupt JPEG");
            L -= 17;
            if (tc == 0) {
               if (!build_huffman(z.huff_dc.ptr+th, sizes.ptr)) return 0;
               v = z.huff_dc[th].values.ptr;
            } else {
               if (!build_huffman(z.huff_ac.ptr+th, sizes.ptr)) return 0;
               v = z.huff_ac[th].values.ptr;
            }
            for (i=0; i < m_; ++i)
               v[i] = get8u(z.s);
            L -= m_;
         }
         return L==0;

      default:
         break;
   }
   // check for comment block or APP blocks
   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
      int len = get16(z.s);
      // The length field counts its own two bytes; anything smaller would
      // make skip() move the read cursor backwards.
      if (len < 2)
         throw new STBImageException("Bad segment length, corrupt JPEG");
      skip(z.s, len-2);
      return 1;
   }
   return 0;
}
943 
944 // after we see SOS
/// Parses the SOS (start of scan) header that follows an SOS marker.
///
/// Stores the scan component count in `z.scan_n`, the Huffman table selectors
/// of every scan component in `hd`/`ha`, and the component order in `z.order`.
///
/// Returns: 1 on success, 0 when a scan component id matches no frame component.
/// Throws: STBImageException when the header is malformed.
int process_scan_header(jpeg *z)
{
   auto src = z.s;
   const int headerLen = get16(src);

   z.scan_n = get8(src);
   const bool countOk = z.scan_n >= 1 && z.scan_n <= 4 && z.scan_n <= cast(int) src.img_n;
   if (!countOk)
      throw new STBImageException("Bad SOS component count, Corrupt JPEG");

   if (headerLen != 6 + 2*z.scan_n)
      throw new STBImageException("Bad SOS length, Corrupt JPEG");

   for (int n = 0; n < z.scan_n; ++n) {
      const int id = get8(src);
      const int tables = get8(src);

      // find the frame component carrying this id
      int which = 0;
      while (which < src.img_n && z.img_comp[which].id != id)
         ++which;
      if (which == src.img_n)
         return 0;

      // high nibble goes to hd, low nibble to ha; both must be 0..3
      z.img_comp[which].hd = tables >> 4;
      if (z.img_comp[which].hd > 3)
         throw new STBImageException("Bad DC huff, Corrupt JPEG");

      z.img_comp[which].ha = tables & 15;
      if (z.img_comp[which].ha > 3)
         throw new STBImageException("Bad AC huff, Corrupt JPEG");

      z.order[n] = which;
   }

   // three trailing bytes: the first and last must be 0
   if (get8(src) != 0)
      throw new STBImageException("Bad SOS, Corrupt JPEG");
   get8(src); // should be 63, but might be 0
   if (get8(src) != 0)
      throw new STBImageException("Bad SOS, Corrupt JPEG");

   return 1;
}
979 
/// Parses the SOF (start of frame) header and, when `scan == SCAN_load`,
/// allocates the per-component decode buffers.
///
/// Params:
///   z    = decoder state; image dimensions and component count are written to `z.s`
///   scan = when different from SCAN_load only the header fields are parsed and
///          no memory is allocated
/// Returns: 1 on success (this function has no failing return path; errors are thrown).
/// Throws: STBImageException on a malformed or unsupported header, on an image
///         that is too large, or when a component buffer cannot be allocated.
int process_frame_header(jpeg *z, int scan)
{
   stbi *s = z.s;
   int Lf,p,i,q, h_max=1,v_max=1,c;
   Lf = get16(s);         if (Lf < 11) throw new STBImageException("Bad SOF len, Corrupt JPEG");
   p  = get8(s);          if (p != 8) throw new STBImageException("JPEG format not supported: 8-bit only"); // JPEG baseline
   s.img_y = get16(s);   if (s.img_y == 0) throw new STBImageException("No header height, JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
   s.img_x = get16(s);   if (s.img_x == 0) throw new STBImageException("0 width, corrupt JPEG"); // JPEG requires
   c = get8(s);
   if (c != 3 && c != 1) throw new STBImageException("Bad component count, corrupt JPEG");    // JFIF requires
   s.img_n = c;
   // clear the buffer pointers first so that cleanup_jpeg can be called safely
   // even if a later step in this function throws
   for (i=0; i < c; ++i) {
      z.img_comp[i].data = null;
      z.img_comp[i].linebuf = null;
   }

   // the header length must match the component count exactly
   if (Lf != 8+3*s.img_n) throw new STBImageException("Bad SOF len, corrupt JPEG"); 

   // per component: id, sampling factors (h in the high nibble, v in the low
   // nibble) and quantization table index
   for (i=0; i < s.img_n; ++i) {
      z.img_comp[i].id = get8(s);
      if (z.img_comp[i].id != i+1)   // JFIF requires
         if (z.img_comp[i].id != i)  // some version of jpegtran outputs non-JFIF-compliant files!
            throw new STBImageException("Bad component ID, corrupt JPEG");
      q = get8(s);
      z.img_comp[i].h = (q >> 4);  if (!z.img_comp[i].h || z.img_comp[i].h > 4) throw new STBImageException("Bad H, corrupt JPEG");
      z.img_comp[i].v = q & 15;    if (!z.img_comp[i].v || z.img_comp[i].v > 4) throw new STBImageException("Bad V, corrupt JPEG");
      z.img_comp[i].tq = get8(s);  if (z.img_comp[i].tq > 3) throw new STBImageException("Bad TQ, corrupt JPEG");
   }

   // header-only callers stop here; nothing has been allocated yet
   if (scan != SCAN_load) return 1;

   // reject images whose x * y * n would exceed 1 << 30
   if ((1 << 30) / s.img_x / s.img_n < s.img_y) throw new STBImageException("Image too large to decode");

   // largest sampling factor in each direction over all components
   for (i=0; i < s.img_n; ++i) {
      if (z.img_comp[i].h > h_max) h_max = z.img_comp[i].h;
      if (z.img_comp[i].v > v_max) v_max = z.img_comp[i].v;
   }

   // compute interleaved mcu info
   z.img_h_max = h_max;
   z.img_v_max = v_max;
   z.img_mcu_w = h_max * 8;
   z.img_mcu_h = v_max * 8;
   // number of MCUs per row / column, rounded up
   z.img_mcu_x = (s.img_x + z.img_mcu_w-1) / z.img_mcu_w;
   z.img_mcu_y = (s.img_y + z.img_mcu_h-1) / z.img_mcu_h;

   for (i=0; i < s.img_n; ++i) {
      // number of effective pixels (e.g. for non-interleaved MCU)
      z.img_comp[i].x = (s.img_x * z.img_comp[i].h + h_max-1) / h_max;
      z.img_comp[i].y = (s.img_y * z.img_comp[i].v + v_max-1) / v_max;
      // to simplify generation, we'll allocate enough memory to decode
      // the bogus oversized data from using interleaved MCUs and their
      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
      // discard the extra data until colorspace conversion
      z.img_comp[i].w2 = z.img_mcu_x * z.img_comp[i].h * 8;
      z.img_comp[i].h2 = z.img_mcu_y * z.img_comp[i].v * 8;
      // 15 spare bytes leave room to round the data pointer up to 16 bytes below
      z.img_comp[i].raw_data = malloc(z.img_comp[i].w2 * z.img_comp[i].h2+15);
      if (z.img_comp[i].raw_data == null) {
         // release the buffers of the components allocated so far
         for(--i; i >= 0; --i) {
            free(z.img_comp[i].raw_data);
            z.img_comp[i].data = null;
         }
         throw new STBImageException("Out of memory");
      }
      // align blocks for installable-idct using mmx/sse
      z.img_comp[i].data = cast(ubyte*) (( cast(size_t) z.img_comp[i].raw_data + 15) & ~15);
      z.img_comp[i].linebuf = null;
   }

   return 1;
}
1051 
1052 // use comparisons since in some cases we handle more than one case (e.g. SOF)
/// True when marker byte `x` is 0xDC.
bool DNL(int x)
{
   return x == 0xdc;
}

/// True when marker byte `x` is 0xD8.
bool SOI(int x)
{
   return x == 0xd8;
}

/// True when marker byte `x` is 0xD9.
bool EOI(int x)
{
   return x == 0xd9;
}

/// True when marker byte `x` is one of the two accepted frame markers, 0xC0 or 0xC1.
bool SOF(int x)
{
   if (x == 0xc0)
      return true;
   return x == 0xc1;
}

/// True when marker byte `x` is 0xDA.
bool SOS(int x)
{
   return x == 0xda;
}
1058 
/// Reads the stream up to and including the frame header.
///
/// Params:
///   z    = decoder state
///   scan = SCAN_type stops right after the SOI check; any other value goes on
///          to process markers until a frame marker is found
/// Returns: 1 on success, 0 when process_marker or process_frame_header reports failure.
/// Throws: STBImageException when SOI is missing or no SOF is found before end of input.
int decode_jpeg_header(jpeg *z, int scan)
{
   z.marker = MARKER_none; // initialize cached marker to empty

   int marker = get_marker(z);
   if (!SOI(marker))
      throw new STBImageException("No SOI, corrupt JPEG");
   if (scan == SCAN_type)
      return 1;

   for (marker = get_marker(z); !SOF(marker); ) {
      if (!process_marker(z, marker))
         return 0;

      // some files have extra padding after their blocks, so keep scanning
      // until a real marker shows up or the input runs out
      for (;;) {
         marker = get_marker(z);
         if (marker != MARKER_none)
            break;
         if (at_eof(z.s))
            throw new STBImageException("No SOF, corrupt JPEG");
      }
   }

   return process_frame_header(z, scan) ? 1 : 0;
}
1085 
/// Decodes every segment of a JPEG stream until the EOI marker.
///
/// Returns: 1 once EOI has been reached, 0 as soon as any stage reports failure
///          or a non-zero, non-0xFF byte follows the entropy-coded data.
int decode_jpeg_image(jpeg *j)
{
   j.restart_interval = 0;
   if (!decode_jpeg_header(j, SCAN_load))
      return 0;

   for (int m = get_marker(j); !EOI(m); m = get_marker(j)) {
      if (!SOS(m)) {
         if (!process_marker(j, m))
            return 0;
         continue;
      }

      if (!process_scan_header(j))
         return 0;
      if (!parse_entropy_coded_data(j))
         return 0;
      if (j.marker != MARKER_none)
         continue;

      // handle 0s at the end of image data from IP Kamera 9060:
      // skip zero bytes until a 0xFF introduces the next marker
      while (!at_eof(j.s)) {
         const int x = get8(j.s);
         if (x == 255) {
            j.marker = get8u(j.s);
            break;
         }
         if (x != 0)
            return 0;
      }
      // if we reach eof without hitting a marker, the get_marker() in the loop
      // header will fail and we'll eventually return 0
   }
   return 1;
}
1116 
1117 // static jfif-centered resampling (across block boundaries)
1118 
/// Signature shared by the resample_row_* functions: given an output buffer,
/// two input rows, the input width `w` and the horizontal factor `hs`, the
/// function returns a pointer to the resampled row.
alias resample_row_func = ubyte* function(ubyte *out_, ubyte *in0, ubyte *in1, int w, int hs);
1120 
/// Shifts `x` right by two bits (divide by four) and truncates to a byte.
ubyte div4(int x)
{
    const int quarter = x >> 2;
    return cast(ubyte) quarter;
}
1125 
/// Resampler for components that need no expansion: the near input row is
/// returned as is, `out_` is never written.
ubyte *resample_row_1(ubyte *out_, ubyte *in_near, ubyte *in_far, int w, int hs)
{
   ubyte *unchanged = in_near;
   return unchanged;
}
1130 
/// Vertical 2x resampler: every output sample is (3*near + far + 2) / 4 of the
/// two input rows at the same column. Returns `out_`.
ubyte* resample_row_v_2(ubyte *out_, ubyte *in_near, ubyte *in_far, int w, int hs)
{
   // need to generate two samples vertically for every one in input
   foreach (x; 0 .. w) {
      const int blended = 3*in_near[x] + in_far[x] + 2;
      out_[x] = div4(blended);
   }
   return out_;
}
1139 
/// Horizontal 2x resampler: writes 2*w samples to `out_`, each a 3:1 blend of
/// an input sample and its left or right neighbour; the two outermost outputs
/// copy the edge samples. Only `in_near` is read. Returns `out_`.
ubyte*  resample_row_h_2(ubyte *out_, ubyte *in_near, ubyte *in_far, int w, int hs)
{
   // need to generate two samples horizontally for every one in input
   ubyte *src = in_near;

   if (w == 1) {
      // if only one sample, can't do any interpolation
      const ubyte only = src[0];
      out_[1] = only;
      out_[0] = only;
      return out_;
   }

   // left edge
   out_[0] = src[0];
   out_[1] = div4(src[0]*3 + src[1] + 2);

   // interior samples; x keeps its final value for the right edge below
   int x = 1;
   while (x < w-1) {
      const int centre = 3*src[x] + 2;
      out_[x*2]   = div4(centre + src[x-1]);
      out_[x*2+1] = div4(centre + src[x+1]);
      ++x;
   }

   // right edge
   out_[x*2]   = div4(src[w-2]*3 + src[w-1] + 2);
   out_[x*2+1] = src[w-1];

   return out_;
}
1164 
/// Shifts `x` right by four bits (divide by sixteen) and truncates to a byte.
ubyte div16(int x)
{
    const int sixteenth = x >> 4;
    return cast(ubyte) sixteenth;
}
1169 
1170 
/// 2x2 resampler: blends the two input rows 3:1 vertically, then blends
/// neighbouring columns 3:1 horizontally, writing 2*w samples to `out_`.
/// Returns `out_`.
ubyte *resample_row_hv_2(ubyte *out_, ubyte *in_near, ubyte *in_far, int w, int hs)
{
   // need to generate 2x2 samples for every one in input
   if (w == 1) {
      const ubyte only = div4(3*in_near[0] + in_far[0] + 2);
      out_[1] = only;
      out_[0] = only;
      return out_;
   }

   // `cur` is the vertical blend of the current column, `prev` of the one before
   int cur = 3*in_near[0] + in_far[0];
   out_[0] = div4(cur + 2);
   foreach (x; 1 .. w) {
      const int prev = cur;
      cur = 3*in_near[x] + in_far[x];
      out_[x*2-1] = div16(3*prev + cur + 8);
      out_[x*2]   = div16(3*cur + prev + 8);
   }
   out_[w*2-1] = div4(cur + 2);

   return out_;
}
1192 
/// Fallback resampler: repeats every sample of `in_near` `hs` times
/// (nearest neighbour). `in_far` is not used. Returns `out_`.
ubyte *resample_row_generic(ubyte *out_, ubyte *in_near, ubyte *in_far, int w, int hs)
{
   // resample with nearest-neighbor
   foreach (x; 0 .. w) {
      foreach (rep; 0 .. hs)
         out_[x*hs + rep] = in_near[x];
   }
   return out_;
}
1203 
/// Converts `x` to 16.16 fixed point: scales by 65536, adds 0.5 and truncates to int.
int float2fixed(double x)
{
    const double scaled = x * 65536 + 0.5;
    return cast(int) scaled;
}
1208 
1209 // 0.38 seconds on 3*anemones.jpg   (0.25 with processor = Pro)
1210 // VC6 without processor=Pro is generating multiple LEAs per multiply!
/// Converts `count` YCbCr samples to RGB using 16.16 fixed-point arithmetic.
///
/// For every pixel four bytes are written at the current output position
/// (R, G, B, then 255) and the position advances by `step`; with `step == 3`
/// the fourth byte is overwritten by the next pixel.
void YCbCr_to_RGB_row(ubyte *out_, const ubyte *y, const ubyte *pcb, const ubyte *pcr, int count, int step)
{
   // clamps a fixed-point result to 0..255
   static ubyte clampToByte(int v)
   {
      if (cast(uint) v <= 255)
         return cast(ubyte) v;
      return v < 0 ? 0 : 255;
   }

   const int crToR = float2fixed(1.40200f);
   const int crToG = float2fixed(0.71414f);
   const int cbToG = float2fixed(0.34414f);
   const int cbToB = float2fixed(1.77200f);

   ubyte *dst = out_;
   foreach (px; 0 .. count) {
      const int yFixed = (y[px] << 16) + 32768; // rounding
      const int cr = pcr[px] - 128;
      const int cb = pcb[px] - 128;

      const int r = (yFixed + cr*crToR) >> 16;
      const int g = (yFixed - cr*crToG - cb*cbToG) >> 16;
      const int b = (yFixed + cb*cbToB) >> 16;

      dst[0] = clampToByte(r);
      dst[1] = clampToByte(g);
      dst[2] = clampToByte(b);
      dst[3] = 255;
      dst += step;
   }
}
1235 
1236 // clean up the temporary component buffers
/// Frees the temporary buffers of the first `j.s.img_n` components and nulls
/// the `data` / `linebuf` pointers, so calling it again is harmless.
void cleanup_jpeg(jpeg *j)
{
   foreach (idx; 0 .. j.s.img_n) {
      auto comp = &j.img_comp[idx];

      // `data` points into `raw_data`, so it doubles as the "allocated" flag
      if (comp.data) {
         free(comp.raw_data);
         comp.data = null;
      }
      if (comp.linebuf) {
         free(comp.linebuf);
         comp.linebuf = null;
      }
   }
}
1251 
/// Per-component upsampling state used by load_jpeg_image.
struct stbi_resample
{
   resample_row_func resample; // row resampler chosen for this component
   ubyte* line0;               // the two input rows handed to the resampler
   ubyte* line1;
   int hs;                     // horizontal expansion factor
   int vs;                     // vertical expansion factor
   int w_lores;                // horizontal pixels pre-expansion
   int ystep;                  // how far through vertical expansion we are
   int ypos;                   // which pre-expansion row we're on
}
1262 
/// Decodes a complete JPEG and returns the pixels in a malloc'ed buffer.
///
/// Params:
///   z        = decoder state; `z.s` must already reference the input stream
///   out_x    = receives the image width
///   out_y    = receives the image height
///   comp     = when non-null, receives the component count stored in the file
///   req_comp = number of output components wanted (1..4), or 0 to keep the
///              file's own component count
/// Returns: a buffer of `n * width * height + 1` bytes allocated with malloc
///          (it is not freed here), or null when decode_jpeg_image returns 0.
/// Throws: STBImageException on a bad `req_comp`, on corrupt input, or when an
///         allocation fails. The temporary component buffers are released in
///         every one of these cases.
ubyte *load_jpeg_image(jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
{
   int n, decode_n;
   // validate req_comp
   if (req_comp < 0 || req_comp > 4)
       throw new STBImageException("Internal error: bad req_comp");
   z.s.img_n = 0;

   // Decoding errors are reported by throwing, so without this guard any
   // exception raised after process_frame_header allocated the component
   // buffers would leak them. cleanup_jpeg nulls what it frees, so it is
   // safe to run it whatever state the decoder is in.
   scope(failure) cleanup_jpeg(z);

   // load a jpeg image from whichever source
   if (!decode_jpeg_image(z)) { cleanup_jpeg(z); return null; }

   // determine actual number of components to generate
   n = req_comp ? req_comp : z.s.img_n;

   // grey output from a 3-component image only needs the first component
   if (z.s.img_n == 3 && n < 3)
      decode_n = 1;
   else
      decode_n = z.s.img_n;

   // resample and color-convert
   {
      int k;
      uint i,j;
      ubyte *output;
      ubyte*[4] coutput;

      stbi_resample[4] res_comp;

      for (k=0; k < decode_n; ++k) {
         stbi_resample *r = &res_comp[k];

         // allocate line buffer big enough for upsampling off the edges
         // with upsample factor of 4
         z.img_comp[k].linebuf = cast(ubyte*) malloc(z.s.img_x + 3);
         if (!z.img_comp[k].linebuf)
             throw new STBImageException("Out of memory"); // buffers freed by the guard above

         r.hs      = z.img_h_max / z.img_comp[k].h;
         r.vs      = z.img_v_max / z.img_comp[k].v;
         r.ystep   = r.vs >> 1;
         r.w_lores = (z.s.img_x + r.hs-1) / r.hs;
         r.ypos    = 0;
         r.line0   = r.line1 = z.img_comp[k].data;

         if      (r.hs == 1 && r.vs == 1) r.resample = &resample_row_1;
         else if (r.hs == 1 && r.vs == 2) r.resample = &resample_row_v_2;
         else if (r.hs == 2 && r.vs == 1) r.resample = &resample_row_h_2;
         else if (r.hs == 2 && r.vs == 2) r.resample = &resample_row_hv_2;
         else                             r.resample = &resample_row_generic;
      }

      // nothing below this allocation throws, so `output` cannot leak;
      // the extra byte absorbs the 4th byte YCbCr_to_RGB_row writes per pixel
      output = cast(ubyte*) malloc(n * z.s.img_x * z.s.img_y + 1);
      if (!output)
         throw new STBImageException("Out of memory"); // buffers freed by the guard above

      // now go ahead and resample
      for (j=0; j < z.s.img_y; ++j) {
         ubyte *out_ = output + n * z.s.img_x * j;
         for (k=0; k < decode_n; ++k) {
            stbi_resample *r = &res_comp[k];
            // in the second half of a vertical step the rows swap roles
            int y_bot = r.ystep >= (r.vs >> 1);
            coutput[k] = r.resample(z.img_comp[k].linebuf,
                                     y_bot ? r.line1 : r.line0,
                                     y_bot ? r.line0 : r.line1,
                                     r.w_lores, r.hs);
            if (++r.ystep >= r.vs) {
               r.ystep = 0;
               r.line0 = r.line1;
               if (++r.ypos < z.img_comp[k].y)
                  r.line1 += z.img_comp[k].w2;
            }
         }
         if (n >= 3) {
            ubyte *y = coutput[0];
            if (z.s.img_n == 3) {
               YCbCr_to_RGB_row(out_, y, coutput[1], coutput[2], z.s.img_x, n);
            } else
               for (i=0; i < z.s.img_x; ++i) {
                  out_[0] = out_[1] = out_[2] = y[i];
                  out_[3] = 255; // not used if n==3
                  out_ += n;
               }
         } else {
            ubyte *y = coutput[0];
            if (n == 1)
               for (i=0; i < z.s.img_x; ++i) out_[i] = y[i];
            else
               for (i=0; i < z.s.img_x; ++i) {
                  *out_++ = y[i];
                  *out_++ = 255;
               }
         }
      }
      cleanup_jpeg(z);
      *out_x = z.s.img_x;
      *out_y = z.s.img_y;
      if (comp) *comp  = z.s.img_n; // report original components, not output
      return output;
   }
}
1363 
/// Decodes a JPEG from stream `s` with a fresh decoder state; the arguments
/// and the result are those of load_jpeg_image.
ubyte* stbi_jpeg_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   jpeg decoder;
   decoder.s = s;
   return load_jpeg_image(&decoder, x, y, comp, req_comp);
}
1370 
/// Checks that stream `s` starts like a JPEG by running decode_jpeg_header in
/// SCAN_type mode.
/// Throws: STBImageException when the header check fails.
void stbi_jpeg_test(stbi *s)
{
   jpeg probe;
   probe.s = s;
   if (decode_jpeg_header(&probe, SCAN_type) == 0)
       throw new STBImageException("Couldn't decode JPEG header");
}
1379 
1380 
1381 // public domain zlib decode    v0.2  Sean Barrett 2006-11-18
1382 //    simple implementation
1383 //      - all input must be provided in an upfront buffer
1384 //      - all output is written to a single output buffer (can malloc/realloc)
1385 //    performance
1386 //      - fast huffman
1387 
1388 // fast-way is faster to check than jpeg huffman, but slow way is slower
/// Number of low bits of the bit buffer used to index the fast lookup table;
/// 9 accelerates all cases in the default tables.
enum ZFAST_BITS = 9;
/// Mask selecting the low ZFAST_BITS bits.
enum ZFAST_MASK = (1 << ZFAST_BITS) - 1;
1391 
1392 // zlib-style huffman encoding
// (JPEG packs bits from the left, zlib from the right, so the code can't be shared)
// Decoding tables for one Huffman code; filled by zbuild_huffman and read by
// zhuffman_decode.
struct zhuffman
{
   ushort[1 << ZFAST_BITS] fast; // indexed by the low ZFAST_BITS bits of the bit buffer; 0xffff means "no fast entry"
   ushort[16] firstcode;         // first code of each code length
   int[17] maxcode;              // per code length: end of the code range, pre-shifted to 16 bits; [16] is a sentinel
   ushort[16] firstsymbol;       // index into size/value of the first entry of each code length
   ubyte[288] size;              // code length of each entry
   ushort[288] value;            // symbol of each entry
} ;
1403 
/// Reverses the order of the low 16 bits of `n`; higher bits are discarded.
int bitreverse16(int n)
{
  int v = n;
  v = ((v >> 1) & 0x5555) | ((v & 0x5555) << 1); // swap neighbouring bits
  v = ((v >> 2) & 0x3333) | ((v & 0x3333) << 2); // swap bit pairs
  v = ((v >> 4) & 0x0F0F) | ((v & 0x0F0F) << 4); // swap nibbles
  v = ((v >> 8) & 0x00FF) | ((v & 0x00FF) << 8); // swap bytes
  return v;
}
1412 
/// Reverses the low `bits` bits of `v` (`bits` must not exceed 16).
int bit_reverse(int v, int bits)
{
   assert(bits <= 16);
   // reverse all 16 bits, then drop the (16 - bits) low bits that used to be
   // the unused high ones; e.g. for 11 bits, reverse and shift away 5
   const int reversed = bitreverse16(v);
   return reversed >> (16 - bits);
}
1420 
/// Builds the decoding tables of `z` from a list of code lengths.
///
/// Params:
///   z        = tables to fill
///   sizelist = code length of every symbol (0 = symbol unused)
///   num      = number of entries in `sizelist`
/// Returns: 1 on success (errors are reported by throwing).
/// Throws: STBImageException when a code length exceeds 15 or the lengths do
///         not describe a valid code. These conditions used to be caught only
///         by `assert`, or not at all, so release builds could index out of
///         bounds on corrupt input.
int zbuild_huffman(zhuffman *z, ubyte *sizelist, int num)
{
   int[16] next_code;
   int[17] sizes;       // zero-initialised by D

   // DEFLATE spec for generating codes
   z.fast[] = 0xFFFF;   // 0xffff marks "no fast entry"
   for (int i = 0; i < num; ++i) {
      // longer lengths would index past next_code below
      if (sizelist[i] > 15)
         throw new STBImageException("Bad sizes, corrupt PNG");
      ++sizes[sizelist[i]];
   }
   sizes[0] = 0;
   for (int i = 1; i < 16; ++i)
      if (sizes[i] > (1 << i))
         throw new STBImageException("Bad sizes, corrupt PNG");

   int code = 0;
   int k = 0;
   for (int i = 1; i < 16; ++i) {
      next_code[i] = code;
      z.firstcode[i] = cast(ushort) code;
      z.firstsymbol[i] = cast(ushort) k;
      code = (code + sizes[i]);
      if (sizes[i])
         if (code-1 >= (1 << i))
            throw new STBImageException("Bad codelengths, corrupt PNG");
      z.maxcode[i] = code << (16-i); // preshift for inner loop
      code <<= 1;
      k += sizes[i];
   }
   z.maxcode[16] = 0x10000; // sentinel

   for (int i = 0; i < num; ++i) {
      int s = sizelist[i];
      if (s) {
         int c = next_code[s] - z.firstcode[s] + z.firstsymbol[s];
         z.size[c] = cast(ubyte)s;
         z.value[c] = cast(ushort)i;
         if (s <= ZFAST_BITS) {
            // short codes: fill every fast-table slot whose low s bits match
            int k_ = bit_reverse(next_code[s],s);
            while (k_ < (1 << ZFAST_BITS)) {
               z.fast[k_] = cast(ushort) c;
               k_ += (1 << s);
            }
         }
         ++next_code[s];
      }
   }
   return 1;
}
1468 
1469 // zlib-from-memory implementation for PNG reading
1470 //    because PNG allows splitting the zlib stream arbitrarily,
1471 //    and it's annoying structurally to have PNG call ZLIB call PNG,
1472 //    we require PNG read all the IDATs and combine them into a single
1473 //    memory buffer
1474 
/// State of one zlib decode: input cursor, bit buffer, output buffer and the
/// two Huffman tables of the current block.
struct zbuf
{
   // input
   const(ubyte) *zbuffer;     // next input byte
   const(ubyte) *zbuffer_end; // one past the last input byte

   // bit buffer
   int num_bits;              // number of valid bits in code_buffer
   uint code_buffer;          // pending input bits, least significant first

   // output
   ubyte *zout;               // next output byte
   ubyte *zout_start;         // start of the output buffer
   ubyte *zout_end;           // one past the end of the output buffer
   int   z_expandable;        // non-zero when expand() may realloc the output buffer

   // Huffman tables of the current block
   zhuffman z_length;
   zhuffman z_distance;
}
1489 
/// Returns the next input byte and advances the cursor, or 0 when the input is
/// exhausted.
int zget8(zbuf *z)
{
   return (z.zbuffer < z.zbuffer_end) ? *z.zbuffer++ : 0;
}
1495 
/// Appends input bytes above the bits already held in the bit buffer; at least
/// one byte is added, and bytes keep being added until more than 24 bits are held.
void fill_bits(zbuf *z)
{
   for (;;) {
      assert(z.code_buffer < (1U << z.num_bits));
      const int incoming = zget8(z);
      z.code_buffer |= incoming << z.num_bits;
      z.num_bits += 8;
      if (z.num_bits > 24)
         break;
   }
}
1504 
/// Removes the lowest `n` bits from the bit buffer and returns them, refilling
/// the buffer first when it holds fewer than `n` bits.
uint zreceive(zbuf *z, int n)
{
   if (z.num_bits < n)
      fill_bits(z);

   const uint mask = (1 << n) - 1;
   const uint bits = z.code_buffer & mask;
   z.code_buffer >>= n;
   z.num_bits -= n;
   return bits;
}
1514 
/// Decodes one symbol from the bit buffer of `a` using the tables in `z`.
///
/// Returns: the decoded symbol, or -1 when the pending bits do not form a valid
///          code. A computed table index that is out of range, or whose stored
///          length disagrees with the length found, is also reported as -1;
///          this used to be an `assert` and therefore unchecked in release builds.
int zhuffman_decode(zbuf *a, zhuffman *z)
{
   int b,s,k;
   if (a.num_bits < 16) fill_bits(a);
   b = z.fast[a.code_buffer & ZFAST_MASK];
   if (b < 0xffff) {
      // fast path: the low ZFAST_BITS bits identify the code directly
      s = z.size[b];
      a.code_buffer >>= s;
      a.num_bits -= s;
      return z.value[b];
   }

   // not resolved by fast table, so compute it the slow way
   // use jpeg approach, which requires MSbits at top
   k = bit_reverse(a.code_buffer, 16);
   for (s=ZFAST_BITS+1; ; ++s)
      if (k < z.maxcode[s])
         break;
   if (s == 16) return -1; // invalid code!
   // code size is s, so:
   b = (k >> (16-s)) - z.firstcode[s] + z.firstsymbol[s];
   // corrupt tables can yield an index outside size/value
   if (b < 0 || b >= cast(int) z.size.length) return -1;
   if (z.size[b] != s) return -1;
   a.code_buffer >>= s;
   a.num_bits -= s;
   return z.value[b];
}
1541 
/// Grows the output buffer of `z` so that `n` more bytes fit, doubling the
/// capacity as often as needed and preserving the data already written.
///
/// Returns: 1 on success (errors are reported by throwing).
/// Throws: STBImageException when the buffer is not expandable, when the
///         required size does not fit in an int, or when realloc fails (the
///         old buffer stays valid and referenced by `z` in that case).
int expand(zbuf *z, int n)  // need to make room for n bytes
{
   if (!z.z_expandable)
      throw new STBImageException("Output buffer limit, corrupt PNG");

   int cur   = cast(int) (z.zout     - z.zout_start);
   int limit = cast(int) (z.zout_end - z.zout_start);

   // cur + n must be representable, otherwise the loop below misbehaves
   if (n < 0 || cur > int.max - n)
      throw new STBImageException("Out of memory");
   // doubling 0 never makes progress: start from one byte instead
   if (limit < 1)
      limit = 1;
   while (cur + n > limit) {
      if (limit > int.max / 2)
         throw new STBImageException("Out of memory");
      limit *= 2;
   }

   ubyte *q = cast(ubyte*) realloc(z.zout_start, limit);
   if (q == null)
      throw new STBImageException("Out of memory");
   z.zout_start = q;
   z.zout       = q + cur;
   z.zout_end   = q + limit;
   return 1;
}
1560 
/// Base match length of each length symbol, indexed by (symbol - 257).
static immutable int[31] length_base = [
   3, 4, 5, 6, 7, 8, 9, 10, 11, 13,
   15, 17, 19, 23, 27, 31, 35, 43, 51, 59,
   67, 83, 99, 115, 131, 163, 195, 227, 258, 0,
   0 ];

/// Number of extra bits read after each length symbol, same indexing.
static immutable int[31] length_extra = [
   0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
   1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
   4, 4, 4, 4, 5, 5, 5, 5, 0, 0,
   0 ];

/// Base distance of each distance symbol.
static immutable int[32] dist_base = [
   1, 2, 3, 4, 5, 7, 9, 13, 17, 25,
   33, 49, 65, 97, 129, 193, 257, 385, 513, 769,
   1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577,
   0, 0 ];

/// Number of extra bits read after each distance symbol; the last two entries
/// are the zeros the shorter initializer used to leave implicit.
static immutable int[32] dist_extra = [
   0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
   4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
   9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
   0, 0 ];
1574 
/// Decodes one Huffman-compressed DEFLATE block into the output buffer of `a`,
/// using the tables already stored in `a.z_length` and `a.z_distance`.
///
/// Returns: 1 when the end-of-block symbol (256) is reached, 0 when expand()
///          reports failure.
/// Throws: STBImageException on an invalid Huffman code, on the reserved
///         length codes 286/287 or distance codes 30/31, or on a distance that
///         reaches before the start of the output.
int parse_huffman_block(zbuf *a)
{
   for(;;) {
      int z = zhuffman_decode(a, &a.z_length);
      if (z < 256) {
         // literal byte
         if (z < 0)
             throw new STBImageException("Bad Huffman code, corrupt PNG");
         if (a.zout >= a.zout_end) if (!expand(a, 1)) return 0;
         *a.zout++ = cast(ubyte) z;
      } else {
         ubyte *p;
         int len,dist;
         if (z == 256) return 1;
         z -= 257;
         // length codes 286 and 287 must not appear in compressed data
         if (z > 28)
            throw new STBImageException("Bad Huffman code, corrupt PNG");
         len = length_base[z];
         if (length_extra[z]) len += zreceive(a, length_extra[z]);
         z = zhuffman_decode(a, &a.z_distance);
         // distance codes 30 and 31 must not appear either; their table entry
         // is 0, which would make the copy below read its own unwritten output
         if (z < 0 || z > 29)
            throw new STBImageException("Bad Huffman code, corrupt PNG");
         dist = dist_base[z];
         if (dist_extra[z]) dist += zreceive(a, dist_extra[z]);
         if (a.zout - a.zout_start < dist) throw new STBImageException("Bad dist, corrupt PNG");
         if (a.zout + len > a.zout_end) if (!expand(a, len)) return 0;
         // byte-wise copy: source and destination may overlap on purpose
         p = a.zout - dist;
         while (len--)
            *a.zout++ = *p++;
      }
   }
}
1603 
/// Reads the code-length description of a dynamic-Huffman DEFLATE block and
/// builds `a.z_length` and `a.z_distance` from it.
///
/// Returns: 1 on success, 0 when zbuild_huffman reports failure.
/// Throws: STBImageException when the code lengths are corrupt: an invalid
///         code-length symbol, a "repeat previous" symbol with no previous
///         length, or a run that goes past the announced number of lengths.
///         These cases used to be asserted or unchecked, and the last one
///         could write past `lencodes`.
int compute_huffman_codes(zbuf *a)
{
   static immutable ubyte[19] length_dezigzag = [ 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 ];
   zhuffman z_codelength;
   ubyte[286+32+137] lencodes;   //padding for maximum single op
   ubyte[19] codelength_sizes;   // zero-initialised by D

   int hlit  = zreceive(a,5) + 257;
   int hdist = zreceive(a,5) + 1;
   int hclen = zreceive(a,4) + 4;
   int ntot  = hlit + hdist;

   // lengths of the code-length code itself, stored in a fixed permuted order
   for (int i = 0; i < hclen; ++i) {
      int s = zreceive(a,3);
      codelength_sizes[length_dezigzag[i]] = cast(ubyte) s;
   }
   if (!zbuild_huffman(&z_codelength, codelength_sizes.ptr, 19)) return 0;

   int n = 0;
   while (n < ntot) {
      int c = zhuffman_decode(a, &z_codelength);
      if (c < 0 || c >= 19)
         throw new STBImageException("Bad codelengths, corrupt PNG");
      if (c < 16) {
         // literal code length
         lencodes[n++] = cast(ubyte) c;
         continue;
      }

      // run of repeated lengths: 16 repeats the previous one, 17/18 repeat zero
      ubyte fill = 0;
      int run;
      if (c == 16) {
         if (n == 0)
            throw new STBImageException("Bad codelengths, corrupt PNG");
         run = zreceive(a,2)+3;
         fill = lencodes[n-1];
      } else if (c == 17) {
         run = zreceive(a,3)+3;
      } else {
         run = zreceive(a,7)+11;
      }
      if (run > ntot - n)
         throw new STBImageException("Bad codelengths, corrupt PNG");
      lencodes[n .. n + run] = fill;
      n += run;
   }
   if (n != ntot) throw new STBImageException("Bad codelengths, corrupt PNG");
   if (!zbuild_huffman(&a.z_length, lencodes.ptr, hlit)) return 0;
   if (!zbuild_huffman(&a.z_distance, lencodes.ptr+hlit, hdist)) return 0;
   return 1;
}
1649 
/// Copies one stored (uncompressed) DEFLATE block from the input to the output.
///
/// Returns: 1 on success, 0 when expand() reports failure.
/// Throws: STBImageException when the length check fails or the block would
///         read past the end of the input.
int parse_uncompressed_block(zbuf *a)
{
   ubyte[4] header;

   // drop the bits left over from the current partial byte
   const int strayBits = a.num_bits & 7;
   if (strayBits)
      zreceive(a, strayBits);

   // drain the whole bytes still sitting in the bit buffer into header
   int filled = 0;
   for (; a.num_bits > 0; a.num_bits -= 8) {
      header[filled++] = cast(ubyte) (a.code_buffer & 255);
      a.code_buffer >>= 8;
   }
   assert(a.num_bits == 0);

   // now fill the rest of the header the normal way
   for (; filled < 4; ++filled)
      header[filled] = cast(ubyte) zget8(a);

   // two little-endian 16-bit values; the second must be the complement of the first
   const int len  = header[1] * 256 + header[0];
   const int nlen = header[3] * 256 + header[2];
   if (nlen != (len ^ 0xffff))
      throw new STBImageException("Zlib corrupt, corrupt PNG");
   if (a.zbuffer + len > a.zbuffer_end)
      throw new STBImageException("Read past buffer, corrupt PNG");
   if (a.zout + len > a.zout_end && !expand(a, len))
      return 0;

   memcpy(a.zout, a.zbuffer, len);
   a.zbuffer += len;
   a.zout += len;
   return 1;
}
1678 
/// Reads and validates the two-byte zlib stream header.
///
/// Returns: 1 on success (errors are reported by throwing).
/// Throws: STBImageException when the check value is wrong, a preset
///         dictionary is requested, or the compression method is not 8.
int parse_zlib_header(zbuf *a)
{
   const int cmf = zget8(a);
   const int flg = zget8(a);
   const int method = cmf & 15;
   /* cmf >> 4 would give the window size (1 << (8 + cinfo)), but who cares:
      we fully buffer output */

   if ((cmf*256 + flg) % 31 != 0)
      throw new STBImageException("Bad zlib header, corrupt PNG"); // zlib spec
   if ((flg & 32) != 0)
      throw new STBImageException("No preset dict, corrupt PNG"); // preset dictionary not allowed in png
   if (method != 8)
      throw new STBImageException("Bad compression, corrupt PNG");  // DEFLATE required for png
   return 1;
}
1691 
// @TODO: should statically initialize these for optimal thread safety
// Code lengths of the fixed Huffman codes used by blocks of type 1. They are
// filled lazily by init_defaults(); parse_zlib treats default_distance[31] == 0
// as "not initialised yet".
__gshared ubyte[288] default_length;
__gshared ubyte[32] default_distance;
1695 
/// Fills default_length and default_distance with the code lengths of the
/// fixed Huffman codes.
void init_defaults()
{
   // inclusive symbol ranges 0-143, 144-255, 256-279 and 280-287
   default_length[  0 .. 144] = 8;
   default_length[144 .. 256] = 9;
   default_length[256 .. 280] = 7;
   default_length[280 .. 288] = 8;

   default_distance[] = 5;
}
1706 
// A quick hack to only allow decoding some of a PNG (real streaming support
// should be implemented instead). When non-zero, parse_zlib stops reading
// further blocks once more than 65536 bytes of output have been produced.
__gshared int stbi_png_partial;
/// Inflates the DEFLATE blocks of `a`, optionally preceded by a zlib header.
///
/// Params:
///   a            = decode state with input and output buffers already set up
///   parse_header = non-zero to read and validate the zlib header first
/// Returns: 1 on success, 0 when a block has the reserved type 3 or a helper
///          reports failure.
int parse_zlib(zbuf *a, int parse_header)
{
   if (parse_header && !parse_zlib_header(a))
      return 0;

   a.num_bits = 0;
   a.code_buffer = 0;

   for (;;) {
      const bool lastBlock = zreceive(a,1) != 0;
      const int type = zreceive(a,2);

      switch (type) {
         case 0:
            if (!parse_uncompressed_block(a)) return 0;
            break;

         case 3:
            return 0;

         case 1:
            // use fixed code lengths
            if (!default_distance[31]) init_defaults();
            if (!zbuild_huffman(&a.z_length  , default_length.ptr  , 288)) return 0;
            if (!zbuild_huffman(&a.z_distance, default_distance.ptr,  32)) return 0;
            if (!parse_huffman_block(a)) return 0;
            break;

         default:
            // type 2: code lengths are stored in the block itself
            if (!compute_huffman_codes(a)) return 0;
            if (!parse_huffman_block(a)) return 0;
            break;
      }

      if (stbi_png_partial && a.zout - a.zout_start > 65536)
         break;
      if (lastBlock)
         break;
   }
   return 1;
}
1738 
/// Points `a` at the output buffer `obuf` of `olen` bytes, records whether it
/// may be grown (`exp`), and runs parse_zlib. Returns parse_zlib's result.
int do_zlib(zbuf *a, ubyte *obuf, int olen, int exp, int parse_header)
{
   a.z_expandable = exp;

   a.zout_start = obuf;
   a.zout_end   = obuf + olen;
   a.zout       = a.zout_start;

   return parse_zlib(a, parse_header);
}
1748 
/// Inflates a zlib stream (with header) into a malloc'ed buffer that starts at
/// `initial_size` bytes and grows as needed.
///
/// Params:
///   buffer       = compressed input
///   len          = number of input bytes
///   initial_size = initial capacity of the output buffer
///   outlen       = when non-null, receives the number of bytes produced
/// Returns: the output buffer (not freed here), or null when the initial
///          allocation fails or do_zlib returns 0.
/// Throws: whatever do_zlib throws; the output buffer is released first, which
///         the previous code failed to do.
ubyte *stbi_zlib_decode_malloc_guesssize(const(ubyte) *buffer, int len, int initial_size, int *outlen)
{
   zbuf a;
   ubyte *p = cast(ubyte*) malloc(initial_size);
   if (p == null) return null;
   a.zbuffer = buffer;
   a.zbuffer_end = buffer + len;

   // expand() may move the buffer, so always free through a.zout_start
   a.zout_start = p;
   scope(failure) free(a.zout_start);

   if (do_zlib(&a, p, initial_size, 1, 1)) {
      if (outlen) *outlen = cast(int) (a.zout - a.zout_start);
      return a.zout_start;
   } else {
      free(a.zout_start);
      return null;
   }
}
1764 
/// Same as stbi_zlib_decode_malloc_guesssize with an initial output size of
/// 16384 bytes.
ubyte *stbi_zlib_decode_malloc(const(ubyte) *buffer, int len, int *outlen)
{
   enum int initialGuess = 16384;
   return stbi_zlib_decode_malloc_guesssize(buffer, len, initialGuess, outlen);
}
1769 
/// Inflates a DEFLATE stream into a malloc'ed buffer that starts at
/// `initial_size` bytes and grows as needed.
///
/// Params:
///   buffer       = compressed input
///   len          = number of input bytes
///   initial_size = initial capacity of the output buffer
///   outlen       = when non-null, receives the number of bytes produced
///   parse_header = non-zero when the input starts with a zlib header
/// Returns: the output buffer (not freed here), or null when the initial
///          allocation fails or do_zlib returns 0.
/// Throws: whatever do_zlib throws; the output buffer is released first, which
///         the previous code failed to do.
ubyte *stbi_zlib_decode_malloc_guesssize_headerflag(const(ubyte) *buffer, int len, int initial_size, int *outlen, int parse_header)
{
   zbuf a;
   ubyte *p = cast(ubyte*) malloc(initial_size);
   if (p == null) return null;
   a.zbuffer = buffer;
   a.zbuffer_end = buffer + len;

   // expand() may move the buffer, so always free through a.zout_start
   a.zout_start = p;
   scope(failure) free(a.zout_start);

   if (do_zlib(&a, p, initial_size, 1, parse_header)) {
      if (outlen) *outlen = cast(int) (a.zout - a.zout_start);
      return a.zout_start;
   } else {
      free(a.zout_start);
      return null;
   }
}
1785 
/// Inflates a zlib stream (with header) into the caller's fixed-size buffer.
/// Returns: the number of bytes written, or -1 when do_zlib returns 0.
int stbi_zlib_decode_buffer(ubyte* obuffer, int olen, const(ubyte)* ibuffer, int ilen)
{
   zbuf a;
   a.zbuffer = ibuffer;
   a.zbuffer_end = ibuffer + ilen;

   const bool ok = do_zlib(&a, obuffer, olen, 0, 1) != 0;
   return ok ? cast(int) (a.zout - a.zout_start) : -1;
}
1796 
/// Inflates a raw DEFLATE stream (no zlib header) into a malloc'ed buffer that
/// starts at 16384 bytes and grows as needed.
///
/// Params:
///   buffer = compressed input
///   len    = number of input bytes
///   outlen = when non-null, receives the number of bytes produced
/// Returns: the output buffer (not freed here), or null when the initial
///          allocation fails or do_zlib returns 0.
/// Throws: whatever do_zlib throws; the output buffer is released first, which
///         the previous code failed to do.
ubyte *stbi_zlib_decode_noheader_malloc(const(ubyte) *buffer, int len, int *outlen)
{
   zbuf a;
   ubyte *p = cast(ubyte*) malloc(16384);
   if (p == null) return null;
   a.zbuffer = buffer;
   a.zbuffer_end = buffer+len;

   // expand() may move the buffer, so always free through a.zout_start
   a.zout_start = p;
   scope(failure) free(a.zout_start);

   if (do_zlib(&a, p, 16384, 1, 0)) {
      if (outlen) *outlen = cast(int) (a.zout - a.zout_start);
      return a.zout_start;
   } else {
      free(a.zout_start);
      return null;
   }
}
1812 
/// Inflates a raw DEFLATE stream (no zlib header) into the caller's fixed-size
/// buffer.
/// Returns: the number of bytes written, or -1 when do_zlib returns 0.
int stbi_zlib_decode_noheader_buffer(ubyte *obuffer, int olen, const(ubyte) *ibuffer, int ilen)
{
   zbuf a;
   a.zbuffer = ibuffer;
   a.zbuffer_end = ibuffer + ilen;

   const bool ok = do_zlib(&a, obuffer, olen, 0, 0) != 0;
   return ok ? cast(int) (a.zout - a.zout_start) : -1;
}
1823 
1824 // public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
1825 //    simple implementation
1826 //      - only 8-bit samples
1827 //      - no CRC checking
1828 //      - allocates lots of intermediate memory
1829 //        - avoids problem of streaming data between subsystems
1830 //        - avoids explicit window management
1831 //    performance
1832 //      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
1833 
1834 
/// Header of one PNG chunk, as filled in by get_chunk_header.
struct chunk
{
   uint length; // first 32-bit value of the header; used as the chunk's payload size
   uint type;   // second 32-bit value; matched against PNG_TYPE(...) tags
}
1840 
/// Packs four bytes into one 32-bit tag, `a` in the most significant byte and
/// `d` in the least significant one. Used to build the chunk-type constants
/// that parse_png_file switches on.
uint PNG_TYPE(ubyte a, ubyte b, ubyte c, ubyte d)
{
   // the four byte lanes never overlap, so OR-ing them equals summing them
   return (a << 24) | (b << 16) | (c << 8) | d;
}
1845 
/// Reads the next chunk header from `s`: two consecutive get32 values, the
/// length first and the type second.
chunk get_chunk_header(stbi *s)
{
   chunk header;
   header.length = get32(s); // must be read before the type
   header.type = get32(s);
   return header;
}
1853 
/// Consumes bytes from `s` and compares them with the 8-byte PNG signature.
///
/// Returns: 1 when all eight bytes match.
/// Throws: STBImageException at the first mismatching byte (the remaining
/// signature bytes are then left unread).
static int check_png_header(stbi *s)
{
   static immutable ubyte[8] signature = [ 137, 80, 78, 71, 13, 10, 26, 10 ];
   foreach (ubyte wanted; signature)
   {
       ubyte got = get8u(s);
       if (got != wanted)
           throw new STBImageException("Bad PNG sig, not a PNG");
   }
   return 1;
}
1866 
/// Working state of one PNG decode.
struct png
{
   stbi *s;          // input/decoder context the PNG is read from
   ubyte *idata;     // concatenated IDAT payload (grown with realloc in parse_png_file)
   ubyte *expanded;  // result of inflating idata
   ubyte *out_;      // decoded pixel buffer produced by create_png_image / expand_palette
}
1874 
1875 
// Scanline filter identifiers used by create_png_image_raw.
// F_none..F_paeth (0..4) are the values accepted from the data stream;
// F_avg_first and F_paeth_first (5 and 6) are internal variants that the
// first row is remapped to through first_row_filter, because the first row
// has no previous row to sample.
enum : int 
{
   F_none=0, F_sub=1, F_up=2, F_avg=3, F_paeth=4,
   F_avg_first, F_paeth_first
}
1881 
// Maps a stream filter value (index 0..4) to the filter actually applied on
// the first row of an image: F_up becomes F_none, F_avg and F_paeth become
// their *_first variants, F_none and F_sub are unchanged.
static immutable ubyte[5] first_row_filter =
[
   F_none, F_sub, F_none, F_avg_first, F_paeth_first
];
1886 
/// Paeth predictor: returns whichever of `a`, `b`, `c` is closest to
/// `a + b - c`. Ties are resolved in favour of `a`, then `b`, then `c`.
static int paeth(int a, int b, int c)
{
   int estimate = a + b - c;
   int distA = abs(estimate - a);
   int distB = abs(estimate - b);
   int distC = abs(estimate - c);

   if (distA <= distB && distA <= distC)
      return a;
   return (distB <= distC) ? b : c;
}
1897 
// create the png data from post-deflated data
//
// Un-filters `raw` (one filter byte followed by x*img_n sample bytes per row)
// into a newly malloc'ed buffer stored in a.out_, holding x*y pixels of out_n
// components each. out_n must be img_n or img_n+1; in the latter case the
// extra component of every pixel is set to 255.
// When stbi_png_partial is set, only the first row is decoded.
// Returns 1; throws STBImageException on allocation failure, on a raw_len
// that does not match the expected size, or on a filter byte greater than 4.
static int create_png_image_raw(png *a, ubyte *raw, uint raw_len, int out_n, uint x, uint y)
{
   stbi *s = a.s;
   uint i,j,stride = x*out_n;
   int k;
   int img_n = s.img_n; // copy it into a local for later
   assert(out_n == s.img_n || out_n == s.img_n+1);
   if (stbi_png_partial) y = 1;
   a.out_ = cast(ubyte*) malloc(x * y * out_n);
   if (!a.out_) throw new STBImageException("Out of memory");
   if (!stbi_png_partial) {
      // a full-size image must match exactly; a smaller (x, y) is one
      // interlace pass, which only needs at least that many bytes left
      if (s.img_x == x && s.img_y == y) {
         if (raw_len != (img_n * x + 1) * y) throw new STBImageException("Not enough pixels, corrupt PNG");
      } else { // interlaced:
         if (raw_len < (img_n * x + 1) * y) throw new STBImageException("Not enough pixels, corrupt PNG");
      }
   }
   for (j=0; j < y; ++j) {
      ubyte *cur = a.out_ + stride*j;
      // previous output row; not dereferenced for j == 0 because the first
      // row is remapped to filters that do not sample it
      ubyte *prior = cur - stride;
      int filter = *raw++;
      if (filter > 4) throw new STBImageException("Invalid filter, corrupt PNG");
      // if first row, use special filter that doesn't sample previous row
      if (j == 0) filter = first_row_filter[filter];
      // handle first pixel explicitly
      // (there is no pixel to the left, so left-neighbour terms are dropped)
      for (k=0; k < img_n; ++k) {
         switch (filter) {
            case F_none       : cur[k] = raw[k]; break;
            case F_sub        : cur[k] = raw[k]; break;
            case F_up         : cur[k] = cast(ubyte)(raw[k] + prior[k]); break;
            case F_avg        : cur[k] = cast(ubyte)(raw[k] + (prior[k]>>1)); break;
            case F_paeth      : cur[k] = cast(ubyte) (raw[k] + paeth(0,prior[k],0)); break;
            case F_avg_first  : cur[k] = raw[k]; break;
            case F_paeth_first: cur[k] = raw[k]; break;
            default: break;
         }
      }
      // fill the added component of the first pixel
      if (img_n != out_n) cur[img_n] = 255;
      raw += img_n;
      cur += out_n;
      prior += out_n;
      // this is a little gross, so that we don't switch per-pixel or per-component
      if (img_n == out_n) {

         // remaining x-1 pixels, input and output pixel sizes are equal
         for (i=x-1; i >= 1; --i, raw+=img_n,cur+=img_n,prior+=img_n)
            for (k=0; k < img_n; ++k)
            {
               switch (filter) {
                  case F_none:  cur[k] = raw[k]; break;
                  case F_sub:   cur[k] = cast(ubyte)(raw[k] + cur[k-img_n]); break;
                  case F_up:    cur[k] = cast(ubyte)(raw[k] + prior[k]); break;
                  case F_avg:   cur[k] = cast(ubyte)(raw[k] + ((prior[k] + cur[k-img_n])>>1)); break;
                  case F_paeth:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-img_n],prior[k],prior[k-img_n])); break;
                  case F_avg_first:    cur[k] = cast(ubyte)(raw[k] + (cur[k-img_n] >> 1)); break;
                  case F_paeth_first:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-img_n],0,0)); break;
                  default: break;
               }
            }
      } else {
         assert(img_n+1 == out_n);

         // remaining x-1 pixels, output has one extra component per pixel:
         // neighbours are out_n bytes apart and the extra byte is set to 255
         for (i=x-1; i >= 1; --i, cur[img_n]=255,raw+=img_n,cur+=out_n,prior+=out_n)
            for (k=0; k < img_n; ++k)
            {
               switch (filter) {
                  case F_none:  cur[k] = raw[k]; break;
                  case F_sub:   cur[k] = cast(ubyte)(raw[k] + cur[k-out_n]); break;
                  case F_up:    cur[k] = cast(ubyte)(raw[k] + prior[k]); break;
                  case F_avg:   cur[k] = cast(ubyte)(raw[k] + ((prior[k] + cur[k-out_n])>>1)); break;
                  case F_paeth:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-out_n],prior[k],prior[k-out_n])); break;
                  case F_avg_first:    cur[k] = cast(ubyte)(raw[k] + (cur[k-out_n] >> 1)); break;
                  case F_paeth_first:  cur[k] = cast(ubyte) (raw[k] + paeth(cur[k-out_n],0,0)); break;
                  default: break;
               }
            }
      }
   }
   return 1;
}
1978 
/// Builds the decoded pixel buffer `a.out_` from inflated PNG data.
///
/// Non-interlaced images are handed straight to create_png_image_raw.
/// Interlaced images are decoded as seven separate passes whose pixels are
/// then scattered into one full-size buffer.
///
/// Params:
///   a          = decode state; `a.out_` receives the result
///   raw        = inflated (still filtered) image data
///   raw_len    = number of bytes available at `raw`
///   out_n      = components per output pixel (img_n or img_n+1)
///   interlaced = non-zero when the image uses interlacing
///
/// Returns: 1 on success, 0 if a pass reports failure.
/// Throws: STBImageException on allocation failure, or whatever
/// create_png_image_raw throws.
int create_png_image(png *a, ubyte *raw, uint raw_len, int out_n, int interlaced)
{
   if (!interlaced)
      return create_png_image_raw(a, raw, raw_len, out_n, a.s.img_x, a.s.img_y);

   // Origin and spacing, in full-image pixels, of each of the seven passes.
   static immutable int[7] xorig = [ 0,4,0,2,0,1,0 ];
   static immutable int[7] yorig = [ 0,0,4,0,2,0,1 ];
   static immutable int[7] xspc  = [ 8,8,4,4,2,2,1 ];
   static immutable int[7] yspc  = [ 8,8,8,4,4,2,2 ];

   // Every pass must be decoded completely; put the caller's setting back on
   // every exit path, including early returns and exceptions.
   int save = stbi_png_partial;
   stbi_png_partial = 0;
   scope(exit) stbi_png_partial = save;

   // de-interlacing
   ubyte *final_ = cast(ubyte*) malloc(a.s.img_x * a.s.img_y * out_n);
   if (final_ == null) throw new STBImageException("Out of memory");
   scope(failure) free(final_); // a pass may throw instead of returning 0

   for (int p = 0; p < 7; ++p) {
      int i,j,x,y;
      // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
      x = (a.s.img_x - xorig[p] + xspc[p]-1) / xspc[p];
      y = (a.s.img_y - yorig[p] + yspc[p]-1) / yspc[p];
      if (x && y) {
         if (!create_png_image_raw(a, raw, raw_len, out_n, x, y)) {
            free(final_);
            return 0;
         }
         // scatter this pass's pixels to their positions in the full image
         for (j=0; j < y; ++j)
            for (i=0; i < x; ++i)
               memcpy(final_ + (j*yspc[p]+yorig[p])*a.s.img_x*out_n + (i*xspc[p]+xorig[p])*out_n,
                      a.out_ + (j*x+i)*out_n, out_n);
         free(a.out_);
         a.out_ = null; // never leave a dangling pointer behind between passes

         // The pass consumed one filter byte per row plus img_n *input*
         // components per pixel. out_n may be img_n+1, so it must not be used
         // to advance through the raw data.
         int consumed = (x*a.s.img_n+1)*y;
         raw += consumed;
         raw_len -= consumed;
      }
   }
   a.out_ = final_;

   return 1;
}
2019 
/// Applies colour-key transparency to the decoded image in `z.out_`.
///
/// With out_n == 2 the second byte of every pixel becomes 0 when the first
/// byte equals tc[0] and 255 otherwise. With out_n == 4 the fourth byte is set
/// to 0 for pixels whose first three bytes equal tc[0..3]; other pixels are
/// left untouched (their alpha is expected to be 255 already).
///
/// Returns: always 1.
static int compute_transparency(png *z, ubyte tc[3], int out_n)
{
   stbi *img = z.s;
   uint total = img.img_x * img.img_y;
   ubyte *px = z.out_;

   // compute color-based transparency, assuming we've
   // already got 255 as the alpha value in the output
   assert(out_n == 2 || out_n == 4);

   if (out_n == 2) {
      for (uint n = 0; n < total; ++n, px += 2)
         px[1] = (px[0] == tc[0] ? 0 : 255);
      return 1;
   }

   for (uint n = 0; n < total; ++n, px += 4) {
      bool isKey = px[0] == tc[0] && px[1] == tc[1] && px[2] == tc[2];
      if (isKey)
         px[3] = 0;
   }
   return 1;
}
2044 
/// Replaces the one-byte-per-pixel index image in `a.out_` with a new
/// malloc'ed image of palette colours.
///
/// Params:
///   a         = decode state; `a.out_` is freed and replaced
///   palette   = palette with 4 bytes per entry
///   len       = not used by this function
///   pal_img_n = 3 copies three bytes per pixel; any other value copies four
///
/// Returns: always 1.
/// Throws: STBImageException when the new buffer cannot be allocated.
int expand_palette(png *a, ubyte *palette, int len, int pal_img_n)
{
   uint total = a.s.img_x * a.s.img_y;
   ubyte *indices = a.out_;

   ubyte *expandedPixels = cast(ubyte*) malloc(total * pal_img_n);
   if (expandedPixels == null) 
      throw new STBImageException("Out of memory");

   // between here and free(out) below, exitting would leak
   bool withAlpha = (pal_img_n != 3);
   ubyte *dst = expandedPixels;

   for (uint n = 0; n < total; ++n) {
      ubyte *entry = palette + indices[n]*4;
      dst[0] = entry[0];
      dst[1] = entry[1];
      dst[2] = entry[2];
      if (withAlpha) {
         dst[3] = entry[3];
         dst += 4;
      } else {
         dst += 3;
      }
   }

   free(a.out_);
   a.out_ = expandedPixels;

   return 1;
}
2082 
/// Walks the chunks of a PNG stream and, depending on `scan`, stops after the
/// signature, after the header information, or decodes the whole image.
///
/// Params:
///   z        = decode state; idata/expanded/out_ are reset to null on entry
///   scan     = SCAN_type (signature only), SCAN_header (stop once the
///              component count is known) or SCAN_load (full decode)
///   req_comp = requested number of output components (0 = keep as stored)
///
/// Returns: 1 on success; 0 when inflating or image construction reports
/// failure by return value.
/// Throws: STBImageException for malformed or unsupported files and for
/// allocation failures.
int parse_png_file(png *z, int scan, int req_comp)
{
   ubyte[1024] palette;
   ubyte pal_img_n=0;
   ubyte has_trans=0;
   ubyte tc[3];
   uint ioff=0, idata_limit=0, i, pal_len=0;
   int first=1,k,interlace=0;
   stbi *s = z.s;

   z.expanded = null;
   z.idata = null;
   z.out_ = null;

   if (!check_png_header(s)) return 0;

   if (scan == SCAN_type) return 1;

   for (;;) {
      chunk c = get_chunk_header(s);
      switch (c.type) {
         case PNG_TYPE('I','H','D','R'): {
            int depth,color,comp,filter;
            if (!first) throw new STBImageException("Multiple IHDR, corrupt PNG");
            first = 0;
            if (c.length != 13) throw new STBImageException("Bad IHDR len, corrupt PNG");
            s.img_x = get32(s); if (s.img_x > (1 << 24)) throw new STBImageException("Very large image (corrupt?)");
            s.img_y = get32(s); if (s.img_y > (1 << 24)) throw new STBImageException("Very large image (corrupt?)");
            depth = get8(s);  if (depth != 8)        throw new STBImageException("8bit only, PNG not supported: 8-bit only");
            color = get8(s);  if (color > 6)         throw new STBImageException("Bad ctype, corrupt PNG");
            if (color == 3) pal_img_n = 3; else if (color & 1) throw new STBImageException("Bad ctype, corrupt PNG");
            comp  = get8(s);  if (comp) throw new STBImageException("Bad comp method, corrupt PNG");
            filter= get8(s);  if (filter) throw new STBImageException("Bad filter method, corrupt PNG");
            interlace = get8(s); if (interlace>1) throw new STBImageException("Bad interlace method, corrupt PNG");
            if (!s.img_x || !s.img_y) throw new STBImageException("0-pixel image, corrupt PNG");
            if (!pal_img_n) {
               s.img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
               if ((1 << 30) / s.img_x / s.img_n < s.img_y) throw new STBImageException("Image too large to decode");
               if (scan == SCAN_header) return 1;
            } else {
               // if paletted, then pal_n is our final components, and
               // img_n is # components to decompress/filter.
               s.img_n = 1;
               if ((1 << 30) / s.img_x / 4 < s.img_y) throw new STBImageException("Too large, corrupt PNG");
               // if SCAN_header, have to scan to see if we have a tRNS
            }
            break;
         }

         case PNG_TYPE('P','L','T','E'):  {
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if (c.length > 256*3) throw new STBImageException("invalid PLTE, corrupt PNG");
            pal_len = c.length / 3;
            if (pal_len * 3 != c.length) throw new STBImageException("invalid PLTE, corrupt PNG");
            // stored as 4 bytes per entry; alpha defaults to opaque
            for (i=0; i < pal_len; ++i) {
               palette[i*4+0] = get8u(s);
               palette[i*4+1] = get8u(s);
               palette[i*4+2] = get8u(s);
               palette[i*4+3] = 255;
            }
            break;
         }

         case PNG_TYPE('t','R','N','S'): {
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if (z.idata) throw new STBImageException("tRNS after IDAT, corrupt PNG");
            if (pal_img_n) {
               if (scan == SCAN_header) { s.img_n = 4; return 1; }
               if (pal_len == 0) throw new STBImageException("tRNS before PLTE, corrupt PNG");
               if (c.length > pal_len) throw new STBImageException("bad tRNS len, corrupt PNG");
               pal_img_n = 4;
               for (i=0; i < c.length; ++i)
                  palette[i*4+3] = get8u(s);
            } else {
               if (!(s.img_n & 1)) throw new STBImageException("tRNS with alpha, corrupt PNG");
               if (c.length != cast(uint) s.img_n*2) throw new STBImageException("bad tRNS len, corrupt PNG");
               has_trans = 1;
               for (k=0; k < s.img_n; ++k)
                  tc[k] = cast(ubyte) get16(s); // non 8-bit images will be larger
            }
            break;
         }

         case PNG_TYPE('I','D','A','T'): {
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if (pal_img_n && !pal_len) throw new STBImageException("no PLTE, corrupt PNG");
            if (scan == SCAN_header) { s.img_n = pal_img_n; return 1; }
            // The PNG format caps a chunk length at 2^31-1. Rejecting larger
            // values, and totals beyond that, keeps `ioff + c.length` and the
            // doubling of idata_limit below from wrapping around, which would
            // otherwise let getn write past an undersized buffer.
            if (c.length > 0x7FFF_FFFFu || ioff > 0x7FFF_FFFFu - c.length)
               throw new STBImageException("IDAT too large, corrupt PNG");
            if (ioff + c.length > idata_limit) {
               ubyte *p;
               if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
               while (ioff + c.length > idata_limit)
                  idata_limit *= 2;
               p = cast(ubyte*) realloc(z.idata, idata_limit); if (p == null) throw new STBImageException("Out of memory");
               z.idata = p;
            }
            if (!getn(s, z.idata+ioff,c.length)) throw new STBImageException("outofdata, corrupt PNG");
            ioff += c.length;
            break;
         }

         case PNG_TYPE('I','E','N','D'): {
            uint raw_len;
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            if (scan != SCAN_load) return 1;
            if (z.idata == null) throw new STBImageException("no IDAT, corrupt PNG");
            z.expanded = stbi_zlib_decode_malloc_guesssize_headerflag(z.idata, ioff, 16384, cast(int *) &raw_len, 1);
            if (z.expanded == null) return 0; // zlib should set error
            free(z.idata); z.idata = null;
            // decode with an extra alpha component when the caller wants one
            // (non-paletted only) or when colour-key transparency is present
            if ((req_comp == s.img_n+1 && req_comp != 3 && !pal_img_n) || has_trans)
               s.img_out_n = s.img_n+1;
            else
               s.img_out_n = s.img_n;
            if (!create_png_image(z, z.expanded, raw_len, s.img_out_n, interlace)) return 0;
            if (has_trans)
               if (!compute_transparency(z, tc, s.img_out_n)) return 0;
            if (pal_img_n) {
               // pal_img_n == 3 or 4
               s.img_n = pal_img_n; // record the actual colors we had
               s.img_out_n = pal_img_n;
               if (req_comp >= 3) s.img_out_n = req_comp;
               if (!expand_palette(z, palette.ptr, pal_len, s.img_out_n))
                  return 0;
            }
            free(z.expanded); z.expanded = null;
            return 1;
         }

         default:
            // if critical, fail
            if (first) throw new STBImageException("first not IHDR, corrupt PNG");
            // bit 29 clear means the first letter of the type is upper case
            if ((c.type & (1 << 29)) == 0) {

               throw new STBImageException("PNG not supported: unknown chunk type");
            }
            skip(s, c.length);
            break;
      }
      // end of chunk, read and skip CRC
      get32(s);
   }
}
2224 
/// Decodes a complete PNG through `p` and returns the pixel buffer.
///
/// Params:
///   p        = decode state (its `s` member must be set)
///   x, y     = receive the image width and height
///   n        = if non-null, receives the number of components in the file
///   req_comp = requested components per pixel, 0..4 (0 = keep as stored)
///
/// Returns: the decoded pixels, or null when parsing or format conversion
/// reports failure by return value.
/// Throws: STBImageException for a bad `req_comp` and for every error raised
/// while parsing.
ubyte *do_png(png *p, int *x, int *y, int *n, int req_comp)
{
   if (req_comp < 0 || req_comp > 4) 
      throw new STBImageException("Internal error: bad req_comp");

   // parse_png_file reports most errors by throwing, so the intermediate
   // buffers have to be released on every exit path, not only on normal
   // fall-through.
   scope(exit)
   {
      free(p.out_);     p.out_     = null;
      free(p.expanded); p.expanded = null;
      free(p.idata);    p.idata    = null;
   }

   ubyte *result=null;
   if (parse_png_file(p, SCAN_load, req_comp)) {
      // take ownership of the pixels so the cleanup above does not free them
      result = p.out_;
      p.out_ = null;
      if (req_comp && req_comp != p.s.img_out_n) {
         result = convert_format(result, p.s.img_out_n, req_comp, p.s.img_x, p.s.img_y);
         p.s.img_out_n = req_comp;
         if (result == null) return result;
      }
      *x = p.s.img_x;
      *y = p.s.img_y;
      if (n) *n = p.s.img_n;
   }

   return result;
}
2248 
/// Entry point of the PNG loader: sets up a fresh decode state bound to `s`
/// and hands it to do_png.
ubyte *stbi_png_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   png decoder;
   decoder.s = s;
   return do_png(&decoder, x, y, comp, req_comp);
}
2255 
/// Checks that `s` starts with a PNG signature.
/// Throws: STBImageException when check_png_header throws or returns 0.
void stbi_png_test(stbi *s)
{
   if (check_png_header(s) == 0)
       throw new STBImageException("Couldn't decode PNG header");
}
2262 
2263 // Microsoft/Windows BMP image
2264 
/// Checks that `s` starts with a BMP file header this loader knows: the
/// letters 'B' 'M', four skipped fields, and an info-header size of 12, 40,
/// 56 or 108.
/// Throws: STBImageException otherwise.
void stbi_bmp_test(stbi *s)
{
    enum failure = "Couldn't decode BMP header";

    if (get8(s) != 'B') throw new STBImageException(failure);
    if (get8(s) != 'M') throw new STBImageException(failure);

    get32le(s); // discard filesize
    get16le(s); // discard reserved
    get16le(s); // discard reserved
    get32le(s); // discard data offset

    int headerSize = get32le(s);
    switch (headerSize)
    {
        case 12, 40, 56, 108:
            return;
        default:
            throw new STBImageException(failure);
    }
}
2279 
2280 
// returns the index (0..31) of the highest set bit of z, or -1 when z is 0
int high_bit(uint z)
{
   int index = -1;
   // every right shift that is still needed moves the top bit one place up
   for (; z != 0; z >>= 1)
      ++index;
   return index;
}
2293 
/// Returns the number of set bits in `a` (0..32).
int bitcount(uint a)
{
   int count = 0;
   // a & (a - 1) clears the lowest set bit, so the loop runs once per set bit
   while (a != 0) {
      a &= a - 1;
      ++count;
   }
   return count;
}
2303 
/// Moves a masked channel value to the low 8 bits and widens it.
///
/// `v` is shifted right by `shift` (left when `shift` is negative); the
/// shifted value is then added to copies of itself shifted right by `bits`,
/// 2*`bits`, ... for as long as that amount stays below 8, so that a channel
/// narrower than 8 bits fills the whole byte.
/// `bits` must be positive, otherwise the replication loop never ends.
int shiftsigned(int v, int shift, int bits)
{
   if (shift < 0)
      v <<= -shift;
   else
      v >>= shift;

   int widened = v;
   for (int offset = bits; offset < 8; offset += bits)
      widened += v >> offset;
   return widened;
}
2320 
/// Decodes a BMP image from `s`.
///
/// Supported: info-header sizes 12, 40, 56 and 108; 4- and 8-bit paletted,
/// 16-, 24- and 32-bit images; uncompressed or bit-field (compress == 3)
/// data. 1-bit and RLE files are rejected.
///
/// Params:
///   s        = input context; img_x, img_y and img_n are filled in
///   x, y     = receive the image width and height
///   comp     = if non-null, receives 4 when an alpha mask is present, else 3
///   req_comp = requested components per pixel (0 = keep as stored)
///
/// Returns: a malloc'ed pixel buffer, or null if convert_format fails.
/// Throws: STBImageException for malformed or unsupported files and for
/// allocation failure.
ubyte *bmp_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   ubyte *out_;
   uint mr=0,mg=0,mb=0,ma=0, fake_a=0;
   ubyte pal[256][4];
   int psize=0,i,j,compress=0,width;
   int bpp, flip_vertically, pad, target, offset, hsz;
   if (get8(s) != 'B' || get8(s) != 'M') throw new STBImageException("not BMP, Corrupt BMP");
   get32le(s); // discard filesize
   get16le(s); // discard reserved
   get16le(s); // discard reserved
   offset = get32le(s);
   hsz = get32le(s);
   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108) throw new STBImageException("unknown BMP, BMP type not supported: unknown");
   if (hsz == 12) {
      s.img_x = get16le(s);
      s.img_y = get16le(s);
   } else {
      s.img_x = get32le(s);
      s.img_y = get32le(s);
   }
   if (get16le(s) != 1) throw new STBImageException("bad BMP");
   bpp = get16le(s);
   if (bpp == 1) throw new STBImageException("monochrome, BMP type not supported: 1-bit");
   // a positive height means the rows are stored bottom-up
   flip_vertically = (cast(int) s.img_y) > 0;
   s.img_y = abs(cast(int) s.img_y);
   if (hsz == 12) {
      if (bpp < 24)
         psize = (offset - 14 - 24) / 3;
   } else {
      compress = get32le(s);
      if (compress == 1 || compress == 2) throw new STBImageException("BMP RLE, BMP type not supported: RLE");
      get32le(s); // discard sizeof
      get32le(s); // discard hres
      get32le(s); // discard vres
      get32le(s); // discard colorsused
      get32le(s); // discard max important
      if (hsz == 40 || hsz == 56) {
         if (hsz == 56) {
            get32le(s);
            get32le(s);
            get32le(s);
            get32le(s);
         }
         if (bpp == 16 || bpp == 32) {
            mr = mg = mb = 0;
            if (compress == 0) {
               if (bpp == 32) {
                  mr = 0xffu << 16;
                  mg = 0xffu <<  8;
                  mb = 0xffu <<  0;
                  ma = 0xffu << 24;
                  fake_a = 1; // @TODO: check for cases like alpha value is all 0 and switch it to 255
               } else {
                  mr = 31u << 10;
                  mg = 31u <<  5;
                  mb = 31u <<  0;
               }
            } else if (compress == 3) {
               mr = get32le(s);
               mg = get32le(s);
               mb = get32le(s);
               // not documented, but generated by photoshop and handled by mspaint
               if (mr == mg && mg == mb) {
                  // ?!?!?
                  throw new STBImageException("bad BMP");
               }
            } else
               throw new STBImageException("bad BMP");
         }
      } else {
         assert(hsz == 108);
         mr = get32le(s);
         mg = get32le(s);
         mb = get32le(s);
         ma = get32le(s);
         get32le(s); // discard color space
         for (i=0; i < 12; ++i)
            get32le(s); // discard color space parameters
      }
      if (bpp < 16)
         psize = (offset - 14 - hsz) >> 2;
   }
   s.img_n = ma ? 4 : 3;
   if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
      target = req_comp;
   else
      target = s.img_n; // if they want monochrome, we'll post-convert
   out_ = cast(ubyte*) malloc(target * s.img_x * s.img_y);
   if (!out_) throw new STBImageException("Out of memory");
   if (bpp < 16) {
      // paletted image: read the colour table, then one index per pixel
      int z=0;
      if (psize == 0 || psize > 256) { free(out_); throw new STBImageException("invalid, Corrupt BMP"); }
      for (i=0; i < psize; ++i) {
         pal[i][2] = get8u(s);
         pal[i][1] = get8u(s);
         pal[i][0] = get8u(s);
         if (hsz != 12) get8(s);
         pal[i][3] = 255;
      }
      skip(s, offset - 14 - hsz - psize * (hsz == 12 ? 3 : 4));
      if (bpp == 4) width = (s.img_x + 1) >> 1;
      else if (bpp == 8) width = s.img_x;
      else { free(out_); throw new STBImageException("bad bpp, corrupt BMP"); }
      pad = (-width)&3; // rows are padded to a multiple of 4 bytes
      for (j=0; j < cast(int) s.img_y; ++j) {
         // two pixels per iteration: one byte holds two 4-bit indices
         for (i=0; i < cast(int) s.img_x; i += 2) {
            int v=get8(s),v2=0;
            if (bpp == 4) {
               v2 = v & 15;
               v >>= 4;
            }
            out_[z++] = pal[v][0];
            out_[z++] = pal[v][1];
            out_[z++] = pal[v][2];
            if (target == 4) out_[z++] = 255;
            if (i+1 == cast(int) s.img_x) break;
            v = (bpp == 8) ? get8(s) : v2;
            out_[z++] = pal[v][0];
            out_[z++] = pal[v][1];
            out_[z++] = pal[v][2];
            if (target == 4) out_[z++] = 255;
         }
         skip(s, pad);
      }
   } else {
      int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
      int z = 0;
      int easy=0;
      skip(s, offset - 14 - hsz);
      if (bpp == 24) width = 3 * s.img_x;
      else if (bpp == 16) width = 2*s.img_x;
      else /* bpp = 32 and pad = 0 */ width=0;
      pad = (-width) & 3;
      // "easy" layouts are byte-aligned BGR(A) and need no mask arithmetic
      if (bpp == 24) {
         easy = 1;
      } else if (bpp == 32) {
         if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
            easy = 2;
      }
      if (!easy) {
         if (!mr || !mg || !mb) { free(out_); throw new STBImageException("bad masks, corrupt BMP"); }
         // right shift amt to put high bit in position #7
         // Each channel's width must come from its own mask: with e.g. 5-6-5
         // masks, widening green as if it were 5 bits wide overflows the byte.
         rshift = high_bit(mr)-7; rcount = bitcount(mr);
         gshift = high_bit(mg)-7; gcount = bitcount(mg);
         bshift = high_bit(mb)-7; bcount = bitcount(mb);
         ashift = high_bit(ma)-7; acount = bitcount(ma);
      }
      for (j=0; j < cast(int) s.img_y; ++j) {
         if (easy) {
            for (i=0; i < cast(int) s.img_x; ++i) {
               int a;
               out_[z+2] = get8u(s);
               out_[z+1] = get8u(s);
               out_[z+0] = get8u(s);
               z += 3;
               a = (easy == 2 ? get8(s) : 255);
               if (target == 4) out_[z++] = cast(ubyte) a;
            }
         } else {
            for (i=0; i < cast(int) s.img_x; ++i) {
               uint v = (bpp == 16 ? get16le(s) : get32le(s));
               int a;
               out_[z++] = cast(ubyte) shiftsigned(v & mr, rshift, rcount);
               out_[z++] = cast(ubyte) shiftsigned(v & mg, gshift, gcount);
               out_[z++] = cast(ubyte) shiftsigned(v & mb, bshift, bcount);
               // shiftsigned is only called when an alpha mask exists, so
               // acount is never 0 here
               a = (ma ? shiftsigned(v & ma, ashift, acount) : 255);
               if (target == 4) out_[z++] = cast(ubyte) a;
            }
         }
         skip(s, pad);
      }
   }
   if (flip_vertically) {
      // swap rows top <-> bottom in place
      ubyte t;
      for (j=0; j < cast(int) s.img_y>>1; ++j) {
         ubyte *p1 = out_ +      j     *s.img_x*target;
         ubyte *p2 = out_ + (s.img_y-1-j)*s.img_x*target;
         for (i=0; i < cast(int) s.img_x*target; ++i) {
            t = p1[i], p1[i] = p2[i], p2[i] = t;
         }
      }
   }

   if (req_comp && req_comp != target) {
      out_ = convert_format(out_, target, req_comp, s.img_x, s.img_y);
      if (out_ == null) return out_; // convert_format frees input on failure
   }

   *x = s.img_x;
   *y = s.img_y;
   if (comp) *comp = s.img_n;
   return out_;
}
2515 
/// Entry point of the BMP loader; forwards every argument to bmp_load and
/// returns its result unchanged.
ubyte *stbi_bmp_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   ubyte *pixels = bmp_load(s, x, y, comp, req_comp);
   return pixels;
}
2520 
2521 // *************************************************************************************************
2522 // GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
/// One entry of the LZW code table used by stbi_process_gif_raster.
struct stbi_gif_lzw 
{
   short prefix;  // code of the preceding string, or -1 for a root entry
   ubyte first;   // first byte of the string this code stands for
   ubyte suffix;  // byte this code appends to its prefix string
}
2529 
/// Decoder state for one GIF stream.
struct stbi_gif
{
   int w,h;                     // dimensions read by stbi_gif_header
   ubyte *out_;                 // output buffer (always 4 components)
   // flags, bgindex and ratio are read by stbi_gif_header; transparent starts
   // at -1 there. eflags is not written in this part of the file.
   int flags, bgindex, ratio, transparent, eflags;
   ubyte  pal[256][4];          // palette filled by stbi_gif_header when flags has bit 0x80 set
   ubyte lpal[256][4];          // presumably the per-image (local) palette -- TODO confirm against the frame loader
   stbi_gif_lzw codes[4096];    // LZW code table
   ubyte *color_table;          // 4-bytes-per-entry palette used by stbi_out_gif_code
   int parse, step;             // remaining row-stepping rounds and current row step (in bytes) used by stbi_out_gif_code
   int lflags;
   int start_x, start_y;        // byte offsets where output restarts after a row / a stepping round
   int max_x, max_y;            // exclusive byte-offset limits for cur_x / cur_y
   int cur_x, cur_y;            // current output position; cur_x + cur_y indexes out_
   int line_size;               // multiplied into step in stbi_out_gif_code; presumably bytes per output row -- TODO confirm
}
2546 
/// Checks that `s` starts with "GIF87a" or "GIF89a".
/// Throws: STBImageException at the first byte that does not fit; later
/// bytes are then left unread.
void stbi_gif_test(stbi *s)
{
    enum failure = "Couldn't decode GIF header";
    static immutable char[4] magic = "GIF8";

    foreach (char expected; magic)
    {
        if (get8(s) != expected)
            throw new STBImageException(failure);
    }

    int versionDigit = get8(s);
    bool knownVersion = (versionDigit == '9' || versionDigit == '7');
    if (!knownVersion)
        throw new STBImageException(failure);

    if (get8(s) != 'a') 
        throw new STBImageException(failure);
}
2558 
/// Reads `num_entries` colour-table entries (three bytes each) from `s` into
/// `pal`.
///
/// The three bytes of an entry are stored in reverse order in pal[i][2],
/// pal[i][1], pal[i][0]. pal[i][3] is set to 0 for the entry whose index
/// equals `transp` and to 255 for every other entry.
///
/// Params:
///   s           = input context
///   pal         = palette to fill; taken by reference, because a static
///                 array parameter would otherwise be a private copy and the
///                 caller's table would never be written
///   num_entries = number of entries to read (at most 256)
///   transp      = index of the transparent entry, or a negative value for none
void stbi_gif_parse_colortable(stbi *s, ref ubyte[4][256] pal, int num_entries, int transp)
{
   for (int i = 0; i < num_entries; ++i) {
      pal[i][2] = get8u(s);
      pal[i][1] = get8u(s);
      pal[i][0] = get8u(s);
      // only the designated entry is transparent; testing `transp` for
      // non-zero would also hide every colour when it is -1 ("none")
      pal[i][3] = (transp == i) ? 0 : 255;
   }
}
2569 
/// Reads the GIF signature and the screen-descriptor fields into `g`.
///
/// Params:
///   s       = input context
///   g       = receives w, h, flags, bgindex and ratio; transparent is set to -1
///   comp    = if non-null, receives 4
///   is_info = non-zero to stop before the colour table is read
///
/// Returns: always 1.
/// Throws: STBImageException when the stream does not start with "GIF87a"
/// or "GIF89a".
int stbi_gif_header(stbi *s, stbi_gif *g, int *comp, int is_info)
{
   enum failure = "not GIF, corrupt GIF";
   static immutable char[4] magic = "GIF8";

   foreach (char expected; magic)
   {
      if (get8(s) != expected)
         throw new STBImageException(failure);
   }

   ubyte version_ = get8u(s);
   bool knownVersion = (version_ == '7' || version_ == '9');
   if (!knownVersion)
      throw new STBImageException(failure);
   if (get8(s) != 'a')
      throw new STBImageException(failure);

   g.w = get16le(s);
   g.h = get16le(s);
   g.flags = get8(s);
   g.bgindex = get8(s);
   g.ratio = get8(s);
   g.transparent = -1;

   // can't actually tell whether it's 3 or 4 until we parse the comments
   if (comp != null)
      *comp = 4;

   if (is_info)
      return 1;

   // bit 0x80 announces a colour table of 2 << (flags & 7) entries
   bool hasColorTable = (g.flags & 0x80) != 0;
   if (hasColorTable)
      stbi_gif_parse_colortable(s,g.pal, 2 << (g.flags & 7), -1);

   return 1;
}
2596 
/// Writes the pixel string that LZW code `code` stands for into `g.out_`,
/// advancing the output cursor in `g`.
void stbi_out_gif_code(stbi_gif *g, ushort code)
{
   stbi_gif_lzw *entry = &g.codes[code];

   // recurse to decode the prefixes, since the linked-list is backwards,
   // and working backwards through an interleaved image would be nasty
   if (entry.prefix >= 0)
      stbi_out_gif_code(g, entry.prefix);

   // past the end of the image: drop the pixel
   if (g.cur_y >= g.max_y)
      return;

   ubyte *dst = g.out_ + (g.cur_x + g.cur_y);
   ubyte *color = g.color_table + entry.suffix * 4;

   // entries with alpha below 128 leave the existing pixel untouched
   if (color[3] >= 128) {
      dst[0] = color[2];
      dst[1] = color[1];
      dst[2] = color[0];
      dst[3] = color[3];
   }
   g.cur_x += 4;

   if (g.cur_x < g.max_x)
      return;

   // end of row: wrap to the start column and step to the next row
   g.cur_x = g.start_x;
   g.cur_y += g.step;

   // ran off the bottom while stepping rounds remain: start the next round
   // with a halved row stride
   while (g.cur_y >= g.max_y && g.parse > 0) {
      g.step = (1 << g.parse) * g.line_size;
      g.cur_y = g.start_y + (g.step >> 1);
      --g.parse;
   }
}
2631 
/// Decodes the LZW-compressed raster data of one GIF image from `s`, writing
/// pixels through stbi_out_gif_code.
///
/// Params:
///   s = input context, positioned at the LZW minimum-code-size byte
///   g = decoder state; its code table is rebuilt here
///
/// Returns: `g.out_` once the end-of-stream code or a zero-length data block
/// is reached.
/// Throws: STBImageException for an unusable code size, a missing clear code,
/// a full code table, or an out-of-range code.
ubyte *stbi_process_gif_raster(stbi *s, stbi_gif *g)
{
   ubyte lzw_cs;
   int len;
   uint first;
   int codesize, codemask, avail, oldcode, bits, valid_bits, clear;
   stbi_gif_lzw *p;

   lzw_cs = get8u(s);
   // the code table has 4096 entries, i.e. codes of at most 12 bits
   if (lzw_cs > 12) throw new STBImageException("bad LZW code size, corrupt GIF");
   clear = 1 << lzw_cs;
   first = 1;
   codesize = lzw_cs + 1;
   codemask = (1 << codesize) - 1;
   bits = 0;
   valid_bits = 0;
   // root entries: every literal value stands for itself
   for (int root = 0; root < clear; root++) {
      g.codes[root].prefix = -1;
      g.codes[root].first = cast(ubyte) root;
      g.codes[root].suffix = cast(ubyte) root;
   }

   // support no starting clear code
   avail = clear+2;
   oldcode = -1;

   len = 0;
   for(;;) {
      if (valid_bits < codesize) {
         // not enough bits buffered for one code: pull in another byte
         if (len == 0) {
            len = get8(s); // start new block
            if (len == 0) 
               return g.out_;
         }
         --len;
         bits |= cast(int) get8(s) << valid_bits;
         valid_bits += 8;
      } else {
         int code_ = bits & codemask;
         bits >>= codesize;
         valid_bits -= codesize;
         // @OPTIMIZE: is there some way we can accelerate the non-clear path?
         if (code_ == clear) {  // clear code
            codesize = lzw_cs + 1;
            codemask = (1 << codesize) - 1;
            avail = clear + 2;
            oldcode = -1;
            first = 0;
         } else if (code_ == clear + 1) { // end of stream code
            skip(s, len);
            while ((len = get8(s)) > 0)
               skip(s,len);
            return g.out_;
         } else if (code_ <= avail) {
            if (first) throw new STBImageException("no clear code, corrupt GIF");

            if (oldcode >= 0) {
               // check for a full table before indexing into it
               if (avail >= 4096)       throw new STBImageException("too many codes, corrupt GIF");
               p = &g.codes[avail++];
               p.prefix = cast(short) oldcode;
               p.first = g.codes[oldcode].first;
               p.suffix = (code_ == avail) ? p.first : g.codes[code_].first;
            } else if (code_ == avail)
               throw new STBImageException("illegal code in raster, corrupt GIF");

            // emit the code that was just decoded
            stbi_out_gif_code(g, cast(ushort) code_);

            if ((avail & codemask) == 0 && avail <= 0x0FFF) {
               codesize++;
               codemask = (1 << codesize) - 1;
            }

            oldcode = code_;
         } else {
            throw new STBImageException("illegal code in raster, corrupt GIF");
         }
      } 
   }
}
2710 
/// Fills the whole g.w x g.h canvas in g.out_ (4 bytes per pixel) with the
/// palette entry selected by g.bgindex, swapping bytes 0 and 2 of the entry.
void stbi_fill_gif_background(stbi_gif *g)
{
   ubyte *bg = g.pal[g.bgindex].ptr;
   const total = g.w * g.h * 4;

   // @OPTIMIZE: write a dword at a time
   for (int offset = 0; offset < total; offset += 4) {
      ubyte *px = g.out_ + offset;
      px[0] = bg[2];
      px[1] = bg[1];
      px[2] = bg[0];
      px[3] = bg[3];
   }
}
2724 
/// Decodes the next image of a GIF stream.
///
/// Written so that it could be called repeatedly for animated GIFs, although
/// stb_image itself does not support animation.
///
/// On the first call (g.out_ is null) the header is parsed, the canvas is
/// allocated and filled with the background. On later calls a fresh canvas
/// is allocated and seeded from the previous one only when bits 2..4 of
/// g.eflags equal 3.
///
/// Params:
///   s        = input stream.
///   g        = decoder state carried across calls.
///   comp     = forwarded to stbi_gif_header.
///   req_comp = requested component count; when non-zero and not 4 the
///              result is passed through convert_format.
///
/// Returns: the decoded pixels, null when stbi_gif_header fails or no raster
///          is produced, or the sentinel `cast(ubyte*) 1` when the stream
///          terminator is reached.
/// Throws: STBImageException on allocation failure or a corrupt stream.
ubyte *stbi_gif_load_next(stbi *s, stbi_gif *g, int *comp, int req_comp)
{
   if (g.out_ is null) {
      if (!stbi_gif_header(s, g, comp, 0))
         return null; // failure_reason set by stbi_gif_header
      g.out_ = cast(ubyte*) malloc(4 * g.w * g.h);
      if (g.out_ is null)
         throw new STBImageException("Out of memory");
      stbi_fill_gif_background(g);
   } else if (((g.eflags >> 2) & 0x07) == 3) {
      // animated-gif-only path: start the new canvas as a copy of the old one
      ubyte *previous = g.out_;
      g.out_ = cast(ubyte*) malloc(4 * g.w * g.h);
      if (g.out_ is null)
         throw new STBImageException("Out of memory");
      memcpy(g.out_, previous, g.w*g.h*4);
   }

   while (true) {
      const marker = get8(s);

      // gif stream termination code
      if (marker == 0x3B)
         return cast(ubyte*) 1;

      // Extension block
      if (marker == 0x21) {
         if (get8(s) == 0xF9) { // Graphic Control Extension.
            int gceLen = get8(s);
            if (gceLen != 4) {
               skip(s, gceLen);
               continue;
            }
            g.eflags = get8(s);
            get16le(s); // delay
            g.transparent = get8(s);
         }
         // Skip the remaining sub-blocks up to the zero-length terminator.
         for (int len = get8(s); len != 0; len = get8(s))
            skip(s, len);
         continue;
      }

      if (marker != 0x2C)
         throw new STBImageException("unknown code, corrupt GIF");

      // Image Descriptor: position and size of this image on the canvas.
      int x = get16le(s);
      int y = get16le(s);
      int w = get16le(s);
      int h = get16le(s);
      if (x + w > g.w || y + h > g.h)
         throw new STBImageException("bad Image Descriptor, corrupt GIF");

      // Cursor bookkeeping is kept in byte offsets into g.out_.
      g.line_size = g.w * 4;
      g.start_x = x * 4;
      g.max_x   = g.start_x + w * 4;
      g.cur_x   = g.start_x;
      g.start_y = y * g.line_size;
      g.max_y   = g.start_y + h * g.line_size;
      g.cur_y   = g.start_y;

      g.lflags = get8(s);

      const interlaced = (g.lflags & 0x40) != 0;
      g.step  = interlaced ? 8 * g.line_size : g.line_size; // first interlaced spacing
      g.parse = interlaced ? 3 : 0;

      if (g.lflags & 0x80) {
         // Local color table follows the descriptor.
         const transp = (g.eflags & 0x01) ? g.transparent : -1;
         stbi_gif_parse_colortable(s, g.lpal, 2 << (g.lflags & 7), transp);
         g.color_table = &g.lpal[0][0];
      } else if (g.flags & 0x80) {
         // Global color table: reset every alpha, then re-apply transparency.
         for (int idx = 0; idx < 256; ++idx)  // @OPTIMIZE: reset only the previous transparent
            g.pal[idx][3] = 255;
         if (g.transparent >= 0 && (g.eflags & 0x01))
            g.pal[g.transparent][3] = 0;
         g.color_table = &g.pal[0][0];
      } else {
         throw new STBImageException("missing color table, corrupt GIF");
      }

      ubyte *pixels = stbi_process_gif_raster(s, g);
      if (pixels is null)
         return null;

      if (req_comp != 0 && req_comp != 4)
         pixels = convert_format(pixels, 4, req_comp, g.w, g.h);
      return pixels;
   }
}
2825 
/// Loads the first image of a GIF stream.
///
/// Params:
///   s        = input stream.
///   x, y     = receive the canvas width and height when an image is returned.
///   comp     = forwarded to stbi_gif_load_next.
///   req_comp = requested component count, forwarded to stbi_gif_load_next.
///
/// Returns: the decoded pixels, or null when the stream holds no image.
ubyte *stbi_gif_load(stbi *s, int *x, int *y, int *comp, int req_comp)
{
   ubyte *u = null;
   stbi_gif g={0};

   u = stbi_gif_load_next(s, &g, comp, req_comp);
   if (u == cast(void *) 1) {
      // end of animated gif marker: the stream ended before any image was
      // decoded. The canvas allocated by stbi_gif_load_next is still owned
      // by g and would otherwise leak when g goes out of scope.
      free(g.out_);
      g.out_ = null;
      u = null;
   }
   if (u) {
      *x = g.w;
      *y = g.h;
   }

   return u;
}
2840 
2841