[Zlib-devel] Proposed code change

Steve Snyder swsnyder at insightbb.com
Sat Oct 28 10:04:02 EDT 2006


While profiling the display of large PNG graphics in Firefox recently 
I discovered that Zlib's inflate_fast() is the bottleneck in the 
entire decode-and-display procedure.  The patch below improves the 
performance of data output from inflate_fast().  

The improvement comes from copying the data in multiple bytes rather 
than one byte at a time.  The original code is a looped "*out++ = 
*in++" meaning 2 memory access and 2 pointer increments for each byte 
copied.  The patch attempts to copy the data in 32-bit blocks, falling 
back to the original code if the 32-bit copy is not practical.  

The display of large (10+ megabytes) PNG graphics isn't exactly 
mainsteam, so here's a more typical scenario.  The patch only does the 
multi-byte copy if the data to be copied is greater than 16 bytes.  
These are statistics seen when bringing up Firefox 2.0 to 
http://www.yahoo.com (Yahoo uses compressed pages): 

    5825 (0x16c1) calls to 32-bit loops (len > 16)
   33136 (0x8170) calls to multi-byte loops (len <= 16)
   38961 total calls to macro (32-bit = 14% by cnt)

   198765 (0x3086d) bytes copied by (> 16) code
   481822 (0x75a1e) bytes copied by (<=16) code
   680587 total bytes copied (32-bit = 30% by bytes)

The impact of this patch in my observation is that execution time for 
inflate_fast() is typically reduced by %5.  Even through 16+ byte 
output is not the typical case, it happen often enough to favorably 
impact the routine as a whole.  

Let me a address a concern.  You'll see below that my change is 
#ifdef'd to the symbol _X86_.  This is a CYA move because I only have 
x86 machines to test on.  This code does not rely on the byte ordering 
or instruction set of a given CPU type, though it does assume that the 
CPU can do both 32-bit and 16-bit memory accesses (implied: 32-bit 
integers and 16-bit short data types).  This is plain C code, and has 
been tested with Microsoft's VS2003 on Windows and Gnu's GCC v4.0.2 on 
Linux.  I hope and expect that the _X86_ dependency will go away once 
this patch gets wider testing.  

The patch was developed on Zlib 1.2.3, but cleanly patches the inffast.c 
from version 1.2.3.3.

I welcome your questions and comments on this proposed code change.

Thank you.

------ Patch starts below this line.---------------------------------

--- mozilla.orig/modules/zlib/src/inffast.c	2005-08-04 14:14:14.000000000 -0500
+++ mozilla/modules/zlib/src/inffast.c	2006-10-28 09:36:01.000000000 -0400
@@ -21,6 +21,12 @@
    - Pentium III (Anderson)
    - M68060 (Nikl)
  */
+
+#if defined(_X86_)
+/* for PUPCPY() */
+#define POSTINC
+#endif /* _X86_ */
+
 #ifdef POSTINC
 #  define OFF 0
 #  define PUP(a) *(a)++
@@ -29,6 +35,58 @@
 #  define PUP(a) *++(a)
 #endif
 
+#if defined(_X86_)
+#define PUPCPY(dst, src, len)    \
+   /* 32-bit copy justified by length and addresses? */ \
+   if ( (len > 16) && ((dst<=src) || (dst>(src+len))) ) \
+   {                             \
+      /* align to 32-bit dest */ \
+      while ((unsigned)dst & 3) {\
+         *dst++ = *src++; len--; \
+      }                          \
+      /* copy bulk of data */    \
+      while (len > 7) {          \
+         *((unsigned *)dst) =    \
+            *((unsigned *)src);  \
+         *((unsigned *)dst+1) =  \
+            *((unsigned *)src+1);\
+         dst+=8; src+=8; len-=8; \
+      }                          \
+      /* copy most odd bytes */  \
+      while (len > 1) {          \
+         *((short *)dst) =       \
+            *((short *)src);     \
+         dst+=2; src+=2; len-=2; \
+      }                          \
+   }                             \
+   else                          \
+   {                             \
+      /* copy bulk of data */    \
+      while (len > 3) {          \
+         dst[0] = src[0];        \
+         dst[1] = src[1];        \
+         dst[2] = src[2];        \
+         dst[3] = src[3];        \
+         dst+=4; src+=4; len-=4; \
+      }                          \
+   }                             \
+   /* singly copy odd byte(s) */ \
+   while (len--) *dst++ = *src++;
+#else /* _X86_ */
+#define PUPCPY(dst, src, len)    \
+   while (len > 2) {             \
+      PUP(dst) = PUP(src);       \
+      PUP(dst) = PUP(src);       \
+      PUP(dst) = PUP(src);       \
+      len -= 3;                  \
+   }                             \
+   if (len) {                    \
+      PUP(dst) = PUP(src);       \
+      if (len > 1)               \
+         PUP(dst) = PUP(src);    \
+   }
+#endif /* _X86_ */
+
 /*
    Decode literal, length, and distance codes and write out the resulting
    literal and match bytes until either not enough input or output is
@@ -231,31 +289,11 @@
                             from = out - dist;  /* rest from output */
                         }
                     }
-                    while (len > 2) {
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        len -= 3;
-                    }
-                    if (len) {
-                        PUP(out) = PUP(from);
-                        if (len > 1)
-                            PUP(out) = PUP(from);
-                    }
+                    PUPCPY(out, from, len)
                 }
                 else {
                     from = out - dist;          /* copy direct from output */
-                    do {                        /* minimum length is three */
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        PUP(out) = PUP(from);
-                        len -= 3;
-                    } while (len > 2);
-                    if (len) {
-                        PUP(out) = PUP(from);
-                        if (len > 1)
-                            PUP(out) = PUP(from);
-                    }
+                    PUPCPY(out, from, len)      /* minimum length is three */
                 }
             }
             else if ((op & 64) == 0) {          /* 2nd level distance code */





More information about the Zlib-devel mailing list