[Zlib-devel] RFC: Improve longest_match performance

Andreas Krebbel krebbel at linux.vnet.ibm.com
Wed Jun 6 02:59:27 EDT 2012


Hi,

I'm trying to improve performance of the zlib compression on S/390.

The code currently generated for longest_match looks far from optimal
due to a bunch of pointless zero/sign extend instructions.

By just promoting a few data types in the function I was able to get
rid of all but two. The new hotloop is almost half the size of the
original version providing quite a performance win for S/390:

Measured on a zEnterprise z196
zlib compiled with upstream GCC 4.8 branch

32 bit	      	   	old 	new
			
256MB randomdata:	11.65s	10.92s	6.68%	
100MB manpages:		4.23s	4.14s	2.17%
217MB PDF:		10.54s	9.44s	11.65%

unaligned ok

256MB randomdata:	10.90s	10.54s	3.41%
100MB manpages:		3.94s	3.87s	1.81%
217MB PDF:		8.77s	8.64s   1.50%

64 bit	      	   	old 	new
			
256MB randomdata:	11.90s	11.43s	4.11%	
100MB manpages:		4.51s	4.44s	1.58%
217MB PDF:		10.11s	9.89s	2.22%

unaligned ok

256MB randomdata:	11.51s	11.15s	3.23%
100MB manpages:		4.33s	3.99s	8.52%
217MB PDF:		9.81s	9.02s   8.76%

I also did some measurements on x86 and Power:

For Power (64 bit, unaligned_ok) an additional zero extend
appears. However, the impact is not measurable. There are minor wins
and minor regressions. The overall result is flat.

For Core2 32 bit the patch is a clear winner, with up to 9% for the pdf
test.  On 64 bit the code optimized for Core2 also gets a bit smaller,
but unfortunately it causes some regressions which I cannot explain.

For mainframe customers the performance of zlib is very important so I
would be very happy to see the patch integrated into upstream
zlib. Given that the patch might cause minor regressions on other
targets, would it be possible to enable it on a per-architecture basis?

See below for the patch and some code snippets from my tests.

Bye,

-Andreas-


---
 deflate.c |   17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

Index: zlib/deflate.c
===================================================================
--- zlib.orig/deflate.c
+++ zlib/deflate.c
@@ -1143,15 +1143,16 @@ local void lm_init (s)
 /* For 80x86 and 680x0, an optimized version will be provided in match.asm or
  * match.S. The code will be functionally equivalent.
  */
-local uInt longest_match(s, cur_match)
+local uInt longest_match(s, pcur_match)
     deflate_state *s;
-    IPos cur_match;                             /* current match */
+    IPos pcur_match;                             /* current match */
 {
+    uLong cur_match = pcur_match; /* extend to pointer width */
     unsigned chain_length = s->max_chain_length;/* max hash chain length */
     register Bytef *scan = s->window + s->strstart; /* current string */
     register Bytef *match;                       /* matched string */
     register int len;                           /* length of current match */
-    int best_len = s->prev_length;              /* best match length so far */
+    uLong best_len = s->prev_length;              /* best match length so far */
     int nice_match = s->nice_match;             /* stop if match long enough */
     IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
         s->strstart - (IPos)MAX_DIST(s) : NIL;
@@ -1159,19 +1160,19 @@ local uInt longest_match(s, cur_match)
      * we prevent matches with the string of window index 0.
      */
     Posf *prev = s->prev;
-    uInt wmask = s->w_mask;
+    uLong wmask = s->w_mask;
 
 #ifdef UNALIGNED_OK
     /* Compare two bytes at a time. Note: this is not always beneficial.
      * Try with and without -DUNALIGNED_OK to check.
      */
     register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1;
-    register ush scan_start = *(ushf*)scan;
-    register ush scan_end   = *(ushf*)(scan+best_len-1);
+    register uInt scan_start = *(ushf*)scan;
+    register uInt scan_end   = *(ushf*)(scan+best_len-1);
 #else
     register Bytef *strend = s->window + s->strstart + MAX_MATCH;
-    register Byte scan_end1  = scan[best_len-1];
-    register Byte scan_end   = scan[best_len];
+    register uInt scan_end1  = scan[best_len-1];
+    register uInt scan_end   = scan[best_len];
 #endif
 
     /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.

  

64 bit -O3 -march=z196 -DUNALIGNED_OK

Unpatched:

    80004616:	14 37             	nr	%r3,%r7
    80004618:	b9 16 00 33       	llgfr	%r3,%r3		32>64 zero extend
    8000461c:	eb 33 00 01 00 0d 	sllg	%r3,%r3,1
    80004622:	e3 43 60 00 00 95 	llh	%r4,0(%r3,%r6)  16>32 zero extend
    80004628:	b9 16 00 34       	llgfr	%r3,%r4 	32>64 zero extend
    8000462c:	15 e4             	clr	%r14,%r4
    8000462e:	a7 a4 00 04       	jhe	80004636 <longest_match+0xee>
    80004632:	a7 16 00 11       	brct	%r1,80004654 <longest_match+0x10c>
...
    80004654:	b9 e8 30 49       	agrk	%r4,%r9,%r3
    80004658:	b9 14 00 5a       	lgfr	%r5,%r10        32>64 sign extend
    8000465c:	b9 95 00 c8       	llhr	%r12,%r8	16>32 zero extend
    80004660:	e3 55 4f ff ff 78 	lhy	%r5,-1(%r5,%r4)
    80004666:	b9 95 00 b5       	llhr	%r11,%r5	16>32 zero extend
    8000466a:	18 05             	lr	%r0,%r5
    8000466c:	19 bc             	cr	%r11,%r12
    8000466e:	a7 74 ff d4       	jne	80004616 <longest_match+0xce>

Patched:

    80004680:   b9 e8 30 5b             agrk    %r5,%r11,%r3
    80004684:   e3 ec 5f ff ff 95       llh     %r14,-1(%r12,%r5)
    8000468a:   19 e0                   cr      %r14,%r0
    8000468c:   a7 84 00 20             je      800046cc <longest_match+0xf8>
    80004690:   b9 80 00 3a             ngr     %r3,%r10
    80004694:   eb 33 00 01 00 0d       sllg    %r3,%r3,1
    8000469a:   e3 33 90 00 00 91       llgh    %r3,0(%r3,%r9)
    800046a0:   b9 21 00 38             clgr    %r3,%r8
    800046a4:   a7 c4 00 04             jle     800046ac <longest_match+0xd8>
    800046a8:   a7 16 ff ec             brct    %r1,80004680 <longest_match+0xac>


64 bit -O3 -march=core2 -DUNALIGNED_OK
Unpatched:

  403eb0:	21 de                	and    %ebx,%esi
  403eb2:	0f b7 74 75 00       	movzwl 0x0(%rbp,%rsi,2),%esi
  403eb7:	41 39 f2             	cmp    %esi,%r10d
  403eba:	0f 83 e0 00 00 00    	jae    403fa0 <longest_match+0x190>
  403ec0:	83 ea 01             	sub    $0x1,%edx
  403ec3:	0f 84 d7 00 00 00    	je     403fa0 <longest_match+0x190>
  403ec9:	89 f1                	mov    %esi,%ecx
  403ecb:	4c 63 c0             	movslq %eax,%r8
  403ece:	4c 01 c9             	add    %r9,%rcx
  403ed1:	46 0f b7 44 01 ff    	movzwl -0x1(%rcx,%r8,1),%r8d
  403ed7:	66 45 39 d8          	cmp    %r11w,%r8w
  403edb:	75 d3                	jne    403eb0 <longest_match+0xa0>

Patched:
  403f40:       48 21 ee                and    %rbp,%rsi
  403f43:       41 0f b7 34 74          movzwl (%r12,%rsi,2),%esi
  403f48:       48 39 de                cmp    %rbx,%rsi
  403f4b:       0f 86 df 00 00 00       jbe    404030 <longest_match+0x180>
  403f51:       83 ea 01                sub    $0x1,%edx
  403f54:       0f 84 d6 00 00 00       je     404030 <longest_match+0x180>
  403f5a:       49 8d 0c 32             lea    (%r10,%rsi,1),%rcx
  403f5e:       46 0f b7 44 09 ff       movzwl -0x1(%rcx,%r9,1),%r8d
  403f64:       45 39 d8                cmp    %r11d,%r8d
  403f67:       75 d7                   jne    403f40 <longest_match+0x90>


64 bit -O3 -mcpu=power7 -DUNALIGNED_OK

unpatched:

    100053f0:   7c 84 f8 38     and     r4,r4,r31                                                            
    100053f4:   78 84 0f a4     rldicr  r4,r4,1,62                                                           
    100053f8:   7c 8c 22 2e     lhzx    r4,r12,r4                                                            
    100053fc:   7f 80 20 40     cmplw   cr7,r0,r4                                                            
    10005400:   40 9c 00 f0     bge-    cr7,100054f0 <.longest_match+0x1a0>                                  
    10005404:   39 27 ff ff     addi    r9,r7,-1                                                             
    10005408:   79 27 00 21     clrldi. r7,r9,32                                                             
    1000540c:   41 82 00 e4     beq-    100054f0 <.longest_match+0x1a0>                                      
    10005410:   7d 26 22 14     add     r9,r6,r4                                                             
    10005414:   7d 49 2a 14     add     r10,r9,r5                                                            
    10005418:   a1 0a ff ff     lhz     r8,-1(r10)                                                           
    1000541c:   7f 88 58 40     cmplw   cr7,r8,r11                                                           
    10005420:   40 9e ff d0     bne+    cr7,100053f0 <.longest_match+0xa0> 

patched:

    100054d0:   7c 84 f8 38     and     r4,r4,r31
    100054d4:   78 84 0f a4     rldicr  r4,r4,1,62
    100054d8:   7c 8c 22 2e     lhzx    r4,r12,r4
    100054dc:   7f a4 00 40     cmpld   cr7,r4,r0
    100054e0:   40 9d 00 f0     ble-    cr7,100055d0 <.longest_match+0x1b0>
    100054e4:   39 28 ff ff     addi    r9,r8,-1
    100054e8:   79 28 00 21     clrldi. r8,r9,32
    100054ec:   41 82 00 e4     beq-    100055d0 <.longest_match+0x1b0>
    100054f0:   7c e6 22 14     add     r7,r6,r4
    100054f4:   7d 27 5a 14     add     r9,r7,r11
    100054f8:   a1 29 ff ff     lhz     r9,-1(r9)
    100054fc:   7f 89 50 40     cmplw   cr7,r9,r10
    10005500:   79 29 00 20     clrldi  r9,r9,32         <-- NEW
    10005504:   40 9e ff cc     bne+    cr7,100054d0 <.longest_match+0xb0>





More information about the Zlib-devel mailing list