[Zlib-devel] Optimizations needed for gzgets() (Zlib version 1.2.3)

Gilles Vollant info at winimage.com
Thu Jan 7 18:37:27 EST 2010


gzseek takes a lot of time.

Try this:
- first, initialize a buffer_gets_t structure with init_buffer_gets_t
- then call another_gzgets_buffered for each line, reusing the same structure

If you need to stop using xx_gets and do some gzread calls (if the file contains
binary data after the text), you can just call terminate_buffer_gets_t once.

struct {
	unsigned char buffer_gets->tab_in[BUFFER_IN_CACHE_SIZE];
	size_t read_ascii_in_file ;

} buffer_gets_t;


/*
 * Put a line-reader cache into its empty state.
 *
 * Only the pending-byte counter is reset; the contents of the cache array
 * are left untouched. With the counter at zero, another_gzgets_buffered()
 * fetches fresh data from the file on its next call.
 *
 * buffer_gets: cache to reset; it is dereferenced without a NULL check.
 */
void ZEXPORT init_buffer_gets_t(struct buffer_gets_t *buffer_gets)
{
    buffer_gets->read_ascii_in_file = (size_t)0;
}


/*
 * Stop buffered line reading and hand the stream back for plain gzread().
 *
 * The cache may hold bytes that were read from the file but not yet
 * returned as part of a line. This seeks the gz stream backwards by that
 * many bytes (relative to the current position) and then marks the cache
 * as empty.
 *
 * buffer_gets: cache whose pending byte count determines the seek distance.
 * flatfile_p:  wrapper holding the gz handle in its gz_file_p member.
 *
 * The result of gzseek() is not checked, so a failed seek goes unreported.
 */
void ZEXPORT terminate_buffer_gets_t(struct buffer_gets_t *buffer_gets,
                                     flatfile_t *flatfile_p)
{
    long pending = (long)(buffer_gets->read_ascii_in_file);

    gzseek(flatfile_p->gz_file_p, -1 * pending, SEEK_CUR);
    buffer_gets->read_ascii_in_file = 0;
}

char* ZEXPORT another_gzgets_buffered(struct buffer_gets_t* buffer_gets,char
*line_buffer,int i_is_size,int nchars,flatfile_t *flatfile_p)
{
	char *b = line_buffer;
	int pos_in_char_line = 0;

	if ((i_is_size != 0) && (nchars == 0))
		return NULL;

	for (;;)
	{
		int size_to_read_binary = BUFFER_IN_CACHE_SIZE;
		if (i_is_size != 0)
		{
			if ((nchars-1) < BUFFER_IN_CACHE_SIZE)
		 		size_to_read_binary = nchars-1;
		}

		if ((size_to_read_binary>0) &&
(buffer_gets->read_ascii_in_file == 0))
		{
		//	buffer_gets->read_ascii_in_file =
fread(&buffer_gets->tab_in[0],1,(size_t)size_to_read_binary,f);
			buffer_gets->read_ascii_in_file =
gzread(flatfile_p->gz_file_p, &buffer_gets->tab_in[0], size_to_read_binary);
		}

		if (buffer_gets->read_ascii_in_file<=0)
		{
			if (pos_in_char_line == 0)
		 		   return NULL;
			else
			{
				line_buffer[pos_in_char_line]='\0';
				return b == line_buffer && pos_in_char_line
> 0 ? b: NULL;
			}
		}

		if (buffer_gets->read_ascii_in_file > 0)
		{
			int i;
			for (i=0;i<(int)buffer_gets->read_ascii_in_file;i++)
			{
		 		char c;
		 		c = (((char)buffer_gets->tab_in[(i)])) ;

		 		if (c!=0x0d)
		 		{
		 			if (c=='\n')
		 			{
		 			//	fseek(f,-1 *
(long)(buffer_gets->read_ascii_in_file- (i+1)),SEEK_CUR);
						//
gzseek(flatfile_p->gz_file_p, -1 * (long)(buffer_gets->read_ascii_in_file -
(i+1)),SEEK_CUR);
                        size_t do_loop;
                        for
(do_loop=0;do_loop<buffer_gets->read_ascii_in_file - (i+1);do_loop++)
						{
		 		 			/* only if you want
\n at end of string:
	
line_buffer[pos_in_char_line++]='\n';*/
                            buffer_gets->tab_in[do_loop] =
buffer_gets->tab_in[do_loop+i+1] ;
		 		 		}
                        
		 		 		if (i_is_size != 0)
						{
		 		 			/* only if you want
\n at end of string:
	
line_buffer[pos_in_char_line++]='\n';*/
		 		 		}
	
buffer_gets->read_ascii_in_file -= i+1;
	
line_buffer[pos_in_char_line] ='\0';
		 		 		return b == line_buffer &&
pos_in_char_line > 0 ? b : NULL;
		 			}

		 			if ((i_is_size == 1) &&
(pos_in_char_line == (nchars-1)))
		 		 	{
		 		 	//	fseek(f, -1 *
(long)(buffer_gets->read_ascii_in_file - i),SEEK_CUR);
						/*
gzseek(flatfile_p->gz_file_p,-1 *(long)(buffer_gets->read_ascii_in_file -
i),SEEK_CUR); */
                        size_t do_loop;
                        for
(do_loop=0;do_loop<buffer_gets->read_ascii_in_file - (i);do_loop++)
						{
		 		 			/* only if you want
\n at end of string:
	
line_buffer[pos_in_char_line++]='\n';*/
                            buffer_gets->tab_in[do_loop] =
buffer_gets->tab_in[do_loop+i] ;
		 		 		}
	
buffer_gets->read_ascii_in_file -= i;
	
line_buffer[pos_in_char_line]='\0';
		 		 		return b == line_buffer &&
pos_in_char_line > 0 ? b : NULL;
		 			}

		 			line_buffer[pos_in_char_line++] = c;
				}
			}
		}
		buffer_gets->read_ascii_in_file = 0;
	}

	return b == line_buffer && pos_in_char_line > 0 ? b : NULL; 
}




-----Message d'origine-----
De : zlib-devel-bounces at madler.net [mailto:zlib-devel-bounces at madler.net] De
la part de Scott_Riley at amat.com
Envoyé : jeudi 7 janvier 2010 23:15
À : zlib-devel at madler.net
Cc : zlib-devel at madler.net; zlib-devel-bounces at madler.net
Objet : Re: [Zlib-devel] Optimizations needed for gzgets() (Zlib version
1.2.3)

Hi Gilles,

I used your code with some minor modifications replacing fread() with
gzread() and fseek() with gzseek(). This approach actually slowed down the
decompression way more. In order to process a file of 14619 rows, it took
almost 26 seconds using the new approach whereas using gzgets() took about
750 milliseconds. It takes approximately 250 milliseconds to process an
uncompressed file with fgets() and fseek().

The only other change I made to your code is that I return a pointer to the
current buffer like gzgets(). The code is below:

Code used that takes 26 seconds to read 14619 lines all of which are
slightly less than 256 bytes long:





More information about the Zlib-devel mailing list