[Zlib-devel] RFC: Improve longest_match performance
Andreas Krebbel
krebbel at linux.vnet.ibm.com
Wed Jun 6 02:59:27 EDT 2012
Hi,
I'm trying to improve performance of the zlib compression on S/390.
The code currently generated for longest_match looks far from optimal
due to a bunch of pointless zero/sign extend instructions.
By just promoting a few data types in the function I was able to get
rid of all but two. The new hot loop is almost half the size of the
original version, providing quite a performance win for S/390:
Measured on a zEnterprise z196
zlib compiled with upstream GCC 4.8 branch
32 bit old new
256MB randomdata: 11.65s 10.92s 6.68%
100MB manpages: 4.23s 4.14s 2.17%
217MB PDF: 10.54s 9.44s 11.65%
unaligned ok
256MB randomdata: 10.90s 10.54s 3.41%
100MB manpages: 3.94s 3.87s 1.81%
217MB PDF: 8.77s 8.64s 1.50%
64 bit old new
256MB randomdata: 11.90s 11.43s 4.11%
100MB manpages: 4.51s 4.44s 1.58%
217MB PDF: 10.11s 9.89s 2.22%
unaligned ok
256MB randomdata: 11.51s 11.15s 3.23%
100MB manpages: 4.33s 3.99s 8.52%
217MB PDF: 9.81s 9.02s 8.76%
I also did some measurements on x86 and Power:
For Power (64 bit, unaligned_ok) an additional zero extend
appears. However, the impact is not measurable. There are minor wins
and minor regressions. The overall result is flat.
For Core2 32 bit the patch is a clear winner with up to 9% for the pdf
test. Also on 64 bit the code optimized for Core2 gets a bit smaller
but unfortunately causes some regressions which I cannot explain.
For mainframe customers the performance of zlib is very important so I
would be very happy to see the patch integrated into upstream
zlib. Given that the patch might cause minor regressions on other
targets, would it be possible to enable it in an architecture-dependent manner?
See below for the patch and some code snippets from my tests.
Bye,
-Andreas-
---
deflate.c | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
Index: zlib/deflate.c
===================================================================
--- zlib.orig/deflate.c
+++ zlib/deflate.c
@@ -1143,15 +1143,16 @@ local void lm_init (s)
/* For 80x86 and 680x0, an optimized version will be provided in match.asm or
* match.S. The code will be functionally equivalent.
*/
-local uInt longest_match(s, cur_match)
+local uInt longest_match(s, pcur_match)
deflate_state *s;
- IPos cur_match; /* current match */
+ IPos pcur_match; /* current match */
{
+ uLong cur_match = pcur_match; /* extend to pointer width */
unsigned chain_length = s->max_chain_length;/* max hash chain length */
register Bytef *scan = s->window + s->strstart; /* current string */
register Bytef *match; /* matched string */
register int len; /* length of current match */
- int best_len = s->prev_length; /* best match length so far */
+ uLong best_len = s->prev_length; /* best match length so far */
int nice_match = s->nice_match; /* stop if match long enough */
IPos limit = s->strstart > (IPos)MAX_DIST(s) ?
s->strstart - (IPos)MAX_DIST(s) : NIL;
@@ -1159,19 +1160,19 @@ local uInt longest_match(s, cur_match)
* we prevent matches with the string of window index 0.
*/
Posf *prev = s->prev;
- uInt wmask = s->w_mask;
+ uLong wmask = s->w_mask;
#ifdef UNALIGNED_OK
/* Compare two bytes at a time. Note: this is not always beneficial.
* Try with and without -DUNALIGNED_OK to check.
*/
register Bytef *strend = s->window + s->strstart + MAX_MATCH - 1;
- register ush scan_start = *(ushf*)scan;
- register ush scan_end = *(ushf*)(scan+best_len-1);
+ register uInt scan_start = *(ushf*)scan;
+ register uInt scan_end = *(ushf*)(scan+best_len-1);
#else
register Bytef *strend = s->window + s->strstart + MAX_MATCH;
- register Byte scan_end1 = scan[best_len-1];
- register Byte scan_end = scan[best_len];
+ register uInt scan_end1 = scan[best_len-1];
+ register uInt scan_end = scan[best_len];
#endif
/* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
64 bit -O3 -march=z196 -DUNALIGNED_OK
Unpatched:
80004616: 14 37 nr %r3,%r7
80004618: b9 16 00 33 llgfr %r3,%r3 32>64 zero extend
8000461c: eb 33 00 01 00 0d sllg %r3,%r3,1
80004622: e3 43 60 00 00 95 llh %r4,0(%r3,%r6) 16>32 zero extend
80004628: b9 16 00 34 llgfr %r3,%r4 32>64 zero extend
8000462c: 15 e4 clr %r14,%r4
8000462e: a7 a4 00 04 jhe 80004636 <longest_match+0xee>
80004632: a7 16 00 11 brct %r1,80004654 <longest_match+0x10c>
...
80004654: b9 e8 30 49 agrk %r4,%r9,%r3
80004658: b9 14 00 5a lgfr %r5,%r10 32>64 sign extend
8000465c: b9 95 00 c8 llhr %r12,%r8 16>32 zero extend
80004660: e3 55 4f ff ff 78 lhy %r5,-1(%r5,%r4)
80004666: b9 95 00 b5 llhr %r11,%r5 16>32 zero extend
8000466a: 18 05 lr %r0,%r5
8000466c: 19 bc cr %r11,%r12
8000466e: a7 74 ff d4 jne 80004616 <longest_match+0xce>
Patched:
80004680: b9 e8 30 5b agrk %r5,%r11,%r3
80004684: e3 ec 5f ff ff 95 llh %r14,-1(%r12,%r5)
8000468a: 19 e0 cr %r14,%r0
8000468c: a7 84 00 20 je 800046cc <longest_match+0xf8>
80004690: b9 80 00 3a ngr %r3,%r10
80004694: eb 33 00 01 00 0d sllg %r3,%r3,1
8000469a: e3 33 90 00 00 91 llgh %r3,0(%r3,%r9)
800046a0: b9 21 00 38 clgr %r3,%r8
800046a4: a7 c4 00 04 jle 800046ac <longest_match+0xd8>
800046a8: a7 16 ff ec brct %r1,80004680 <longest_match+0xac>
64 bit -O3 -march=core2 -DUNALIGNED_OK
Unpatched:
403eb0: 21 de and %ebx,%esi
403eb2: 0f b7 74 75 00 movzwl 0x0(%rbp,%rsi,2),%esi
403eb7: 41 39 f2 cmp %esi,%r10d
403eba: 0f 83 e0 00 00 00 jae 403fa0 <longest_match+0x190>
403ec0: 83 ea 01 sub $0x1,%edx
403ec3: 0f 84 d7 00 00 00 je 403fa0 <longest_match+0x190>
403ec9: 89 f1 mov %esi,%ecx
403ecb: 4c 63 c0 movslq %eax,%r8
403ece: 4c 01 c9 add %r9,%rcx
403ed1: 46 0f b7 44 01 ff movzwl -0x1(%rcx,%r8,1),%r8d
403ed7: 66 45 39 d8 cmp %r11w,%r8w
403edb: 75 d3 jne 403eb0 <longest_match+0xa0>
Patched:
403f40: 48 21 ee and %rbp,%rsi
403f43: 41 0f b7 34 74 movzwl (%r12,%rsi,2),%esi
403f48: 48 39 de cmp %rbx,%rsi
403f4b: 0f 86 df 00 00 00 jbe 404030 <longest_match+0x180>
403f51: 83 ea 01 sub $0x1,%edx
403f54: 0f 84 d6 00 00 00 je 404030 <longest_match+0x180>
403f5a: 49 8d 0c 32 lea (%r10,%rsi,1),%rcx
403f5e: 46 0f b7 44 09 ff movzwl -0x1(%rcx,%r9,1),%r8d
403f64: 45 39 d8 cmp %r11d,%r8d
403f67: 75 d7 jne 403f40 <longest_match+0x90>
64 bit -O3 -mcpu=power7 -DUNALIGNED_OK
unpatched:
100053f0: 7c 84 f8 38 and r4,r4,r31
100053f4: 78 84 0f a4 rldicr r4,r4,1,62
100053f8: 7c 8c 22 2e lhzx r4,r12,r4
100053fc: 7f 80 20 40 cmplw cr7,r0,r4
10005400: 40 9c 00 f0 bge- cr7,100054f0 <.longest_match+0x1a0>
10005404: 39 27 ff ff addi r9,r7,-1
10005408: 79 27 00 21 clrldi. r7,r9,32
1000540c: 41 82 00 e4 beq- 100054f0 <.longest_match+0x1a0>
10005410: 7d 26 22 14 add r9,r6,r4
10005414: 7d 49 2a 14 add r10,r9,r5
10005418: a1 0a ff ff lhz r8,-1(r10)
1000541c: 7f 88 58 40 cmplw cr7,r8,r11
10005420: 40 9e ff d0 bne+ cr7,100053f0 <.longest_match+0xa0>
patched:
100054d0: 7c 84 f8 38 and r4,r4,r31
100054d4: 78 84 0f a4 rldicr r4,r4,1,62
100054d8: 7c 8c 22 2e lhzx r4,r12,r4
100054dc: 7f a4 00 40 cmpld cr7,r4,r0
100054e0: 40 9d 00 f0 ble- cr7,100055d0 <.longest_match+0x1b0>
100054e4: 39 28 ff ff addi r9,r8,-1
100054e8: 79 28 00 21 clrldi. r8,r9,32
100054ec: 41 82 00 e4 beq- 100055d0 <.longest_match+0x1b0>
100054f0: 7c e6 22 14 add r7,r6,r4
100054f4: 7d 27 5a 14 add r9,r7,r11
100054f8: a1 29 ff ff lhz r9,-1(r9)
100054fc: 7f 89 50 40 cmplw cr7,r9,r10
10005500: 79 29 00 20 clrldi r9,r9,32 <-- NEW
10005504: 40 9e ff cc bne+ cr7,100054d0 <.longest_match+0xb0>
More information about the Zlib-devel
mailing list