diff --git a/Lib/test/test_bz2.py b/Lib/test/test_bz2.py --- a/Lib/test/test_bz2.py +++ b/Lib/test/test_bz2.py @@ -21,7 +21,30 @@ class BaseTest(unittest.TestCase): "Base for other testcases." - TEXT = b'root:x:0:0:root:/root:/bin/bash\nbin:x:1:1:bin:/bin:\ndaemon:x:2:2:daemon:/sbin:\nadm:x:3:4:adm:/var/adm:\nlp:x:4:7:lp:/var/spool/lpd:\nsync:x:5:0:sync:/sbin:/bin/sync\nshutdown:x:6:0:shutdown:/sbin:/sbin/shutdown\nhalt:x:7:0:halt:/sbin:/sbin/halt\nmail:x:8:12:mail:/var/spool/mail:\nnews:x:9:13:news:/var/spool/news:\nuucp:x:10:14:uucp:/var/spool/uucp:\noperator:x:11:0:operator:/root:\ngames:x:12:100:games:/usr/games:\ngopher:x:13:30:gopher:/usr/lib/gopher-data:\nftp:x:14:50:FTP User:/var/ftp:/bin/bash\nnobody:x:65534:65534:Nobody:/home:\npostfix:x:100:101:postfix:/var/spool/postfix:\nniemeyer:x:500:500::/home/niemeyer:/bin/bash\npostgres:x:101:102:PostgreSQL Server:/var/lib/pgsql:/bin/bash\nmysql:x:102:103:MySQL server:/var/lib/mysql:/bin/bash\nwww:x:103:104::/var/www:/bin/false\n' + TEXT_LINES = [ + b'root:x:0:0:root:/root:/bin/bash\n', + b'bin:x:1:1:bin:/bin:\n', + b'daemon:x:2:2:daemon:/sbin:\n', + b'adm:x:3:4:adm:/var/adm:\n', + b'lp:x:4:7:lp:/var/spool/lpd:\n', + b'sync:x:5:0:sync:/sbin:/bin/sync\n', + b'shutdown:x:6:0:shutdown:/sbin:/sbin/shutdown\n', + b'halt:x:7:0:halt:/sbin:/sbin/halt\n', + b'mail:x:8:12:mail:/var/spool/mail:\n', + b'news:x:9:13:news:/var/spool/news:\n', + b'uucp:x:10:14:uucp:/var/spool/uucp:\n', + b'operator:x:11:0:operator:/root:\n', + b'games:x:12:100:games:/usr/games:\n', + b'gopher:x:13:30:gopher:/usr/lib/gopher-data:\n', + b'ftp:x:14:50:FTP User:/var/ftp:/bin/bash\n', + b'nobody:x:65534:65534:Nobody:/home:\n', + b'postfix:x:100:101:postfix:/var/spool/postfix:\n', + b'niemeyer:x:500:500::/home/niemeyer:/bin/bash\n', + b'postgres:x:101:102:PostgreSQL Server:/var/lib/pgsql:/bin/bash\n', + b'mysql:x:102:103:MySQL server:/var/lib/mysql:/bin/bash\n', + b'www:x:103:104::/var/www:/bin/false\n', + ] + TEXT = b''.join(TEXT_LINES) DATA = 
b'BZh91AY&SY.\xc8N\x18\x00\x01>_\x80\x00\x10@\x02\xff\xf0\x01\x07n\x00?\xe7\xff\xe00\x01\x99\xaa\x00\xc0\x03F\x86\x8c#&\x83F\x9a\x03\x06\xa6\xd0\xa6\x93M\x0fQ\xa7\xa8\x06\x804hh\x12$\x11\xa4i4\xf14S\xd2\x88\xe5\xcd9gd6\x0b\n\xe9\x9b\xd5\x8a\x99\xf7\x08.K\x8ev\xfb\xf7xw\xbb\xdf\xa1\x92\xf1\xdd|/";\xa2\xba\x9f\xd5\xb1#A\xb6\xf6\xb3o\xc9\xc5y\\\xebO\xe7\x85\x9a\xbc\xb6f8\x952\xd5\xd7"%\x89>V,\xf7\xa6z\xe2\x9f\xa3\xdf\x11\x11"\xd6E)I\xa9\x13^\xca\xf3r\xd0\x03U\x922\xf26\xec\xb6\xed\x8b\xc3U\x13\x9d\xc5\x170\xa4\xfa^\x92\xacDF\x8a\x97\xd6\x19\xfe\xdd\xb8\xbd\x1a\x9a\x19\xa3\x80ankR\x8b\xe5\xd83]\xa9\xc6\x08\x82f\xf6\xb9"6l$\xb8j@\xc0\x8a\xb0l1..\xbak\x83ls\x15\xbc\xf4\xc1\x13\xbe\xf8E\xb8\x9d\r\xa8\x9dk\x84\xd3n\xfa\xacQ\x07\xb1%y\xaav\xb4\x08\xe0z\x1b\x16\xf5\x04\xe9\xcc\xb9\x08z\x1en7.G\xfc]\xc9\x14\xe1B@\xbb!8`' DATA_CRLF = b'BZh91AY&SY\xaez\xbbN\x00\x01H\xdf\x80\x00\x12@\x02\xff\xf0\x01\x07n\x00?\xe7\xff\xe0@\x01\xbc\xc6`\x86*\x8d=M\xa9\x9a\x86\xd0L@\x0fI\xa6!\xa1\x13\xc8\x88jdi\x8d@\x03@\x1a\x1a\x0c\x0c\x83 \x00\xc4h2\x19\x01\x82D\x84e\t\xe8\x99\x89\x19\x1ah\x00\r\x1a\x11\xaf\x9b\x0fG\xf5(\x1b\x1f?\t\x12\xcf\xb5\xfc\x95E\x00ps\x89\x12^\xa4\xdd\xa2&\x05(\x87\x04\x98\x89u\xe40%\xb6\x19\'\x8c\xc4\x89\xca\x07\x0e\x1b!\x91UIFU%C\x994!DI\xd2\xfa\xf0\xf1N8W\xde\x13A\xf5\x9cr%?\x9f3;I45A\xd1\x8bT\xb1\xa4\xc7\x8d\x1a\\"\xad\xa1\xabyBg\x15\xb9l\x88\x88\x91k"\x94\xa4\xd4\x89\xae*\xa6\x0b\x10\x0c\xd6\xd4m\xe86\xec\xb5j\x8a\x86j\';\xca.\x01I\xf2\xaaJ\xe8\x88\x8cU+t3\xfb\x0c\n\xa33\x13r2\r\x16\xe0\xb3(\xbf\x1d\x83r\xe7M\xf0D\x1365\xd8\x88\xd3\xa4\x92\xcb2\x06\x04\\\xc1\xb0\xea//\xbek&\xd8\xe6+t\xe5\xa1\x13\xada\x16\xder5"w]\xa2i\xb7[\x97R \xe2IT\xcd;Z\x04dk4\xad\x8a\t\xd3\x81z\x10\xf1:^`\xab\x1f\xc5\xdc\x91N\x14$+\x9e\xae\xd3\x80' @@ -54,13 +77,15 @@ if os.path.isfile(self.filename): os.unlink(self.filename) - def createTempFile(self, crlf=0): + def getData(self, crlf=False): + if crlf: + return self.DATA_CRLF + else: + return self.DATA + + def createTempFile(self, crlf=False): 
with open(self.filename, "wb") as f: - if crlf: - data = self.DATA_CRLF - else: - data = self.DATA - f.write(data) + f.write(self.getData(crlf)) def testRead(self): # "Test BZ2File.read()" @@ -70,7 +95,7 @@ self.assertEqual(bz2f.read(), self.TEXT) def testRead0(self): - # Test BBZ2File.read(0)" + # "Test BBZ2File.read(0)" self.createTempFile() with BZ2File(self.filename) as bz2f: self.assertRaises(TypeError, bz2f.read, None) @@ -125,7 +150,7 @@ bz2f = BZ2File(self.filename) bz2f.close() self.assertRaises(ValueError, bz2f.__next__) - # This call will deadlock of the above .__next__ call failed to + # This call will deadlock if the above .__next__ call failed to # release the lock. self.assertRaises(ValueError, bz2f.readlines) @@ -278,17 +303,56 @@ t.join() def testMixedIterationReads(self): - # Issue #8397: mixed iteration and reads should be forbidden. - with bz2.BZ2File(self.filename, 'wb') as f: - # The internal buffer size is hard-wired to 8192 bytes, we must - # write out more than that for the test to stop half through - # the buffer. - f.write(self.TEXT * 100) - with bz2.BZ2File(self.filename, 'rb') as f: - next(f) - self.assertRaises(ValueError, f.read) - self.assertRaises(ValueError, f.readline) - self.assertRaises(ValueError, f.readlines) + # "Test mixed iteration and reads." 
+ self.createTempFile() + linelen = len(self.TEXT_LINES[0]) + halflen = linelen // 2 + with bz2.BZ2File(self.filename) as bz2f: + bz2f.read(halflen) + self.assertEqual(next(bz2f), self.TEXT_LINES[0][halflen:]) + self.assertEqual(bz2f.read(), self.TEXT[linelen:]) + with bz2.BZ2File(self.filename) as bz2f: + bz2f.readline() + self.assertEqual(next(bz2f), self.TEXT_LINES[1]) + self.assertEqual(bz2f.readline(), self.TEXT_LINES[2]) + with bz2.BZ2File(self.filename) as bz2f: + bz2f.readlines() + with self.assertRaises(StopIteration): + next(bz2f) + self.assertEqual(bz2f.readlines(), []) + + def testReadBytesIO(self): + # "Test BZ2File.read() with BytesIO source" + with BytesIO(self.getData()) as bio: + with BZ2File(fileobj=bio) as bz2f: + self.assertRaises(TypeError, bz2f.read, None) + self.assertEqual(bz2f.read(), self.TEXT) + self.assertFalse(bio.closed) + + def testWriteBytesIO(self): + # "Test BZ2File.write() with BytesIO destination" + with BytesIO() as bio: + with BZ2File(fileobj=bio, mode="w") as bz2f: + self.assertRaises(TypeError, bz2f.write) + bz2f.write(self.TEXT) + self.assertEqual(self.decompress(bio.getvalue()), self.TEXT) + self.assertFalse(bio.closed) + + def testSeekForwardBytesIO(self): + # "Test BZ2File.seek(150, 0) with BytesIO source" + with BytesIO(self.getData()) as bio: + with BZ2File(fileobj=bio) as bz2f: + self.assertRaises(TypeError, bz2f.seek) + bz2f.seek(150) + self.assertEqual(bz2f.read(), self.TEXT[150:]) + + def testSeekBackwardsBytesIO(self): + # "Test BZ2File.seek(-150, 1) with BytesIO source" + with BytesIO(self.getData()) as bio: + with BZ2File(fileobj=bio) as bz2f: + bz2f.read(500) + bz2f.seek(-150, 1) + self.assertEqual(bz2f.read(), self.TEXT[500-150:]) class BZ2CompressorTest(BaseTest): def testCompress(self): diff --git a/Modules/bz2module.c b/Modules/bz2module.c --- a/Modules/bz2module.c +++ b/Modules/bz2module.c @@ -1,6 +1,6 @@ /* -python-bz2 - python bz2 library interface +bz2 - Python interface to libbzip2 Copyright (c) 2002 
Gustavo Niemeyer Copyright (c) 2002 Python Software Foundation; All Rights Reserved @@ -8,7 +8,6 @@ */ #include "Python.h" -#include #include #include "structmember.h" @@ -16,17 +15,10 @@ #include "pythread.h" #endif -static char __author__[] = -"The bz2 python module was written by:\n\ -\n\ - Gustavo Niemeyer \n\ -"; +static char __author__[] = "Gustavo Niemeyer "; -/* Our very own off_t-like type, 64-bit if possible */ -/* copied from Objects/fileobject.c */ -#if !defined(HAVE_LARGEFILE_SUPPORT) -typedef off_t Py_off_t; -#elif SIZEOF_OFF_T >= 8 +/* Our own off_t-like type; 64-bit if we support large files. */ +#if !defined(HAVE_LARGEFILE_SUPPORT) || SIZEOF_OFF_T >= 8 typedef off_t Py_off_t; #elif SIZEOF_FPOS_T >= 8 typedef fpos_t Py_off_t; @@ -41,8 +33,6 @@ #define MODE_READ_EOF 2 #define MODE_WRITE 3 -#define BZ2FileObject_Check(v) (Py_TYPE(v) == &BZ2File_Type) - #ifdef BZ_CONFIG_ERROR @@ -59,12 +49,6 @@ #else /* ! BZ_CONFIG_ERROR */ -#define BZ2_bzRead bzRead -#define BZ2_bzReadOpen bzReadOpen -#define BZ2_bzReadClose bzReadClose -#define BZ2_bzWrite bzWrite -#define BZ2_bzWriteOpen bzWriteOpen -#define BZ2_bzWriteClose bzWriteClose #define BZ2_bzCompress bzCompress #define BZ2_bzCompressInit bzCompressInit #define BZ2_bzCompressEnd bzCompressEnd @@ -90,27 +74,40 @@ #define RELEASE_LOCK(obj) #endif -/* Bits in f_newlinetypes */ -#define NEWLINE_UNKNOWN 0 /* No newline seen, yet */ -#define NEWLINE_CR 1 /* \r newline seen */ -#define NEWLINE_LF 2 /* \n newline seen */ -#define NEWLINE_CRLF 4 /* \r\n newline seen */ /* ===================================================================== */ /* Structure definitions. 
*/ typedef struct { PyObject_HEAD - FILE *rawfp; - char* f_buf; /* Allocated readahead buffer */ - char* f_bufend; /* Points after last occupied position */ - char* f_bufptr; /* Current buffer position */ + PyObject *fp; + int closefp; /* Flag: close() should close fp */ + int mode; - BZFILE *fp; - int mode; + /* Position in the uncompressed data stream, not counting the + contents of the readahead buffers. */ Py_off_t pos; + + /* Size of the uncompressed data stream; -1 if unknown. */ Py_off_t size; + + /* Compressor/decompressor state. */ + bz_stream bzs; + + /* Raw readahead buffer - data that has not yet been decompressed. + Either NULL, or a bytes object returned by fp's read() method. */ + PyObject *rawobj; + char *rawptr; + char *rawend; + + /* Decompressed readahead buffer - data waiting to be read. + If no buffer has been allocated, all three pointers are NULL. */ + char *decbase; + char *decptr; + char *decend; + int buffered_eof; /* Flag: EOF occurs at end of buffer */ + #ifdef WITH_THREAD PyThread_type_lock lock; #endif @@ -138,78 +135,53 @@ /* ===================================================================== */ /* Utility functions. */ -/* Refuse regular I/O if there's data in the iteration-buffer. - * Mixing them would cause data to arrive out of order, as the read* - * methods don't use the iteration buffer. 
*/ static int -check_iterbuffered(BZ2FileObject *f) +catch_bz2_error(int bzerror) { - if (f->f_buf != NULL && - (f->f_bufend - f->f_bufptr) > 0 && - f->f_buf[0] != '\0') { - PyErr_SetString(PyExc_ValueError, - "Mixing iteration and read methods would lose data"); - return -1; - } - return 0; -} - -static int -Util_CatchBZ2Error(int bzerror) -{ - int ret = 0; switch(bzerror) { case BZ_OK: + case BZ_RUN_OK: + case BZ_FLUSH_OK: + case BZ_FINISH_OK: case BZ_STREAM_END: - break; + return 0; #ifdef BZ_CONFIG_ERROR case BZ_CONFIG_ERROR: PyErr_SetString(PyExc_SystemError, - "the bz2 library was not compiled " - "correctly"); - ret = 1; - break; + "libbzip2 was not compiled correctly"); + return 1; #endif - case BZ_PARAM_ERROR: PyErr_SetString(PyExc_ValueError, - "the bz2 library has received wrong " - "parameters"); - ret = 1; - break; - + "Internal error - invalid parameters passed to " + "libbzip2"); + return 1; case BZ_MEM_ERROR: PyErr_NoMemory(); - ret = 1; - break; - + return 1; case BZ_DATA_ERROR: case BZ_DATA_ERROR_MAGIC: - PyErr_SetString(PyExc_IOError, "invalid data stream"); - ret = 1; - break; - + PyErr_SetString(PyExc_IOError, "Invalid data stream"); + return 1; case BZ_IO_ERROR: - PyErr_SetString(PyExc_IOError, "unknown IO error"); - ret = 1; - break; - + PyErr_SetString(PyExc_IOError, "Unknown I/O error"); + return 1; case BZ_UNEXPECTED_EOF: PyErr_SetString(PyExc_EOFError, - "compressed file ended before the " - "logical end-of-stream was detected"); - ret = 1; - break; - + "Compressed file ended before the logical " + "end-of-stream was detected"); + return 1; case BZ_SEQUENCE_ERROR: PyErr_SetString(PyExc_RuntimeError, - "wrong sequence of bz2 library " - "commands used"); - ret = 1; - break; + "Internal error - libbzip2 received an invalid " + "sequence of commands"); + return 1; + default: + PyErr_Format(PyExc_IOError, + "Unrecognized error from libbzip2: %d", bzerror); + return 1; } - return ret; } #if BUFSIZ < 8192 @@ -224,822 +196,660 @@ #define 
BIGCHUNK (512 * 1024) #endif -/* This is a hacked version of Python's fileobject.c:new_buffersize(). */ +/* Adapted from _io/fileio.c:new_buffersize(). */ static size_t -Util_NewBufferSize(size_t currentsize) +grow_buffer_size(size_t currentsize) { - if (currentsize > SMALLCHUNK) { - /* Keep doubling until we reach BIGCHUNK; - then keep adding BIGCHUNK. */ - if (currentsize <= BIGCHUNK) - return currentsize + currentsize; - else - return currentsize + BIGCHUNK; - } - return currentsize + SMALLCHUNK; + if (currentsize <= SMALLCHUNK) + return currentsize + SMALLCHUNK; + else if (currentsize <= BIGCHUNK) + return currentsize * 2; + else + return currentsize + BIGCHUNK; } -/* This is a hacked version of Python's fileobject.c:get_line(). */ -static PyObject * -Util_GetLine(BZ2FileObject *f, int n) +/* Check that the file's mode allows the requested type of operation. */ + +static int +can_read(BZ2FileObject *f) { - char c; - char *buf, *end; - size_t total_v_size; /* total # of slots in buffer */ - size_t used_v_size; /* # used slots in buffer */ - size_t increment; /* amount to increment the buffer */ - PyObject *v; - int bzerror; - int bytes_read; - - total_v_size = n > 0 ? 
n : 100; - v = PyBytes_FromStringAndSize((char *)NULL, total_v_size); - if (v == NULL) - return NULL; - - buf = BUF(v); - end = buf + total_v_size; - - for (;;) { - Py_BEGIN_ALLOW_THREADS - do { - bytes_read = BZ2_bzRead(&bzerror, f->fp, &c, 1); - f->pos++; - if (bytes_read == 0) - break; - *buf++ = c; - } while (bzerror == BZ_OK && c != '\n' && buf != end); - Py_END_ALLOW_THREADS - if (bzerror == BZ_STREAM_END) { - f->size = f->pos; - f->mode = MODE_READ_EOF; - break; - } else if (bzerror != BZ_OK) { - Util_CatchBZ2Error(bzerror); - Py_DECREF(v); - return NULL; - } - if (c == '\n') - break; - /* Must be because buf == end */ - if (n > 0) - break; - used_v_size = total_v_size; - increment = total_v_size >> 2; /* mild exponential growth */ - total_v_size += increment; - if (total_v_size > INT_MAX) { - PyErr_SetString(PyExc_OverflowError, - "line is longer than a Python string can hold"); - Py_DECREF(v); - return NULL; - } - if (_PyBytes_Resize(&v, total_v_size) < 0) { - return NULL; - } - buf = BUF(v) + used_v_size; - end = BUF(v) + total_v_size; - } - - used_v_size = buf - BUF(v); - if (used_v_size != total_v_size) { - if (_PyBytes_Resize(&v, used_v_size) < 0) { - v = NULL; - } - } - return v; -} - -/* This is a hacked version of Python's fileobject.c:drop_readahead(). */ -static void -Util_DropReadAhead(BZ2FileObject *f) -{ - if (f->f_buf != NULL) { - PyMem_Free(f->f_buf); - f->f_buf = NULL; + switch (f->mode) { + case MODE_READ: + case MODE_READ_EOF: + return 1; + case MODE_CLOSED: + PyErr_SetString(PyExc_ValueError, "I/O operation on closed file"); + return 0; + default: + PyErr_SetString(PyExc_IOError, "File not open for reading"); + return 0; } } -/* This is a hacked version of Python's fileobject.c:readahead(). 
*/ static int -Util_ReadAhead(BZ2FileObject *f, int bufsize) +can_write(BZ2FileObject *f) { - int chunksize; - int bzerror; + switch (f->mode) { + case MODE_WRITE: + return 1; + case MODE_CLOSED: + PyErr_SetString(PyExc_ValueError, "I/O operation on closed file"); + return 0; + default: + PyErr_SetString(PyExc_IOError, "File not open for writing"); + return 0; + } +} - if (f->f_buf != NULL) { - if((f->f_bufend - f->f_bufptr) >= 1) +static int +can_seek(BZ2FileObject *f) +{ + switch (f->mode) { + case MODE_READ: + case MODE_READ_EOF: + return 1; + case MODE_CLOSED: + PyErr_SetString(PyExc_ValueError, "I/O operation on closed file"); return 0; - else - Util_DropReadAhead(f); + default: + PyErr_SetString(PyExc_IOError, "seek() works only while reading"); + return 0; } - if (f->mode == MODE_READ_EOF) { - f->f_bufptr = f->f_buf; - f->f_bufend = f->f_buf; +} + +/* Ensure that f's raw readahead buffer contains data to decompress. */ +static int +fill_raw_readahead(BZ2FileObject *f) +{ + PyObject *obj; + + if (f->rawptr != f->rawend) /* Buffer already contains data */ return 0; - } - if ((f->f_buf = PyMem_Malloc(bufsize)) == NULL) { - PyErr_NoMemory(); + + Py_XDECREF(f->rawobj); + f->rawobj = NULL; + obj = PyObject_CallMethod(f->fp, "read", "i", SMALLCHUNK); + if (obj == NULL) + return -1; + if (!PyBytes_Check(obj)) { + Py_DECREF(obj); + PyErr_SetString(PyExc_TypeError, + "File's read() method should return bytes"); return -1; } - Py_BEGIN_ALLOW_THREADS - chunksize = BZ2_bzRead(&bzerror, f->fp, f->f_buf, bufsize); - Py_END_ALLOW_THREADS - f->pos += chunksize; - if (bzerror == BZ_STREAM_END) { - f->size = f->pos; - f->mode = MODE_READ_EOF; - } else if (bzerror != BZ_OK) { - Util_CatchBZ2Error(bzerror); - Util_DropReadAhead(f); + if (PyBytes_GET_SIZE(obj) == 0) { + Py_DECREF(obj); + catch_bz2_error(BZ_UNEXPECTED_EOF); return -1; } - f->f_bufptr = f->f_buf; - f->f_bufend = f->f_buf + chunksize; + f->rawobj = obj; + f->rawptr = PyBytes_AS_STRING(obj); + f->rawend = f->rawptr + 
PyBytes_GET_SIZE(obj); return 0; } -/* This is a hacked version of Python's - * fileobject.c:readahead_get_line_skip(). */ -static PyBytesObject * -Util_ReadAheadGetLineSkip(BZ2FileObject *f, int skip, int bufsize) +/* Read and decompress data from f, without updating file position, + size field, or EOF status. */ +static ssize_t +read_noupdate(BZ2FileObject *f, char *buffer, size_t size, int *p_eos_reached) { - PyBytesObject* s; - char *bufptr; + int bzerror; + + f->bzs.next_out = buffer; + f->bzs.avail_out = size; + while (f->bzs.avail_out > 0) { + if (fill_raw_readahead(f) < 0) + return -1; + Py_BEGIN_ALLOW_THREADS + f->bzs.next_in = f->rawptr; + f->bzs.avail_in = f->rawend - f->rawptr; + bzerror = BZ2_bzDecompress(&f->bzs); + f->rawptr = f->bzs.next_in; + Py_END_ALLOW_THREADS + if (catch_bz2_error(bzerror)) + return -1; + if (bzerror == BZ_STREAM_END) { + *p_eos_reached = 1; + break; + } + } + return f->bzs.next_out - buffer; +} + +/* Read and decompress a block of data from f. */ +static ssize_t +read_decompressed(BZ2FileObject *f, char *buffer, size_t size) +{ + ssize_t total_read = 0; + + /* Get data from readahead buffer before reading more. */ + while (f->decptr != f->decend && size > 0) { + *buffer++ = *f->decptr++; + f->pos++; + total_read++; + size--; + } + + /* If the readahead buffer was emptied, try to read more. */ + if (f->decptr == f->decend) { + if (f->buffered_eof) { + f->mode = MODE_READ_EOF; + f->size = f->pos; + } else if (size > 0) { + int eof = 0; + ssize_t nread = read_noupdate(f, buffer, size, &eof); + if (nread < 0) + return -1; + f->pos += nread; + total_read += nread; + if (eof) { + f->mode = MODE_READ_EOF; + f->size = f->pos; + } + } + } + return total_read; +} + +/* Ensure that f's decompressed readahead buffer contains data. 
*/ +static int +fill_dec_readahead(BZ2FileObject *f) +{ + ssize_t nread; + + if (f->decptr != f->decend) /* Buffer already contains data */ + return 0; + + if (f->buffered_eof) { + f->mode = MODE_READ_EOF; + f->size = f->pos; + return 0; + } + + if (f->decbase == NULL) { + f->decbase = PyMem_Malloc(SMALLCHUNK); + if (f->decbase == NULL) { + PyErr_NoMemory(); + return -1; + } + } + + nread = read_noupdate(f, f->decbase, SMALLCHUNK, &f->buffered_eof); + if (nread < 0) + return -1; + f->decptr = f->decbase; + f->decend = f->decbase + nread; + return 0; +} + +/* Read and decompress a line of data from f. */ +static PyObject * +read_line(BZ2FileObject *f, ssize_t max_size) +{ + ssize_t capacity, size; + PyObject *line; char *buf; - int len; + char c; - if (f->f_buf == NULL) - if (Util_ReadAhead(f, bufsize) < 0) - return NULL; + if (f->mode == MODE_READ_EOF || max_size == 0) + return PyBytes_FromStringAndSize("", 0); + capacity = (max_size < 0) ? 100 : max_size; + line = PyBytes_FromStringAndSize(NULL, capacity); + if (line == NULL) + return NULL; + size = 0; + buf = PyBytes_AS_STRING(line); - len = f->f_bufend - f->f_bufptr; - if (len == 0) - return (PyBytesObject *) - PyBytes_FromStringAndSize(NULL, skip); - bufptr = memchr(f->f_bufptr, '\n', len); - if (bufptr != NULL) { - bufptr++; /* Count the '\n' */ - len = bufptr - f->f_bufptr; - s = (PyBytesObject *) - PyBytes_FromStringAndSize(NULL, skip+len); - if (s == NULL) - return NULL; - memcpy(PyBytes_AS_STRING(s)+skip, f->f_bufptr, len); - f->f_bufptr = bufptr; - if (bufptr == f->f_bufend) - Util_DropReadAhead(f); - } else { - bufptr = f->f_bufptr; - buf = f->f_buf; - f->f_buf = NULL; /* Force new readahead buffer */ - s = Util_ReadAheadGetLineSkip(f, skip+len, - bufsize + (bufsize>>2)); - if (s == NULL) { - PyMem_Free(buf); + for (;;) { + if (fill_dec_readahead(f) < 0) { + Py_DECREF(line); return NULL; } - memcpy(PyBytes_AS_STRING(s)+skip, bufptr, len); - PyMem_Free(buf); + if (f->mode == MODE_READ_EOF) + break; + + c 
= *buf++ = *f->decptr++; + f->pos++; + size++; + if (c == '\n' || size == max_size) + break; + + if (size == capacity) { + /* Since lines are usually short, expand the buffer + more slowly than we would for a block read. */ + capacity += capacity / 4; + if (_PyBytes_Resize(&line, capacity) < 0) + return NULL; + buf = PyBytes_AS_STRING(line); + } } - return s; + + if (size != capacity) + if (_PyBytes_Resize(&line, size) < 0) + return NULL; + return line; } +/* Compress and write a block of data to f. action can be BZ_RUN + (for regular operation), BZ_FLUSH (when flushing the stream), + or BZ_FINISH (when flushing and closing the stream). */ +static int +write_compressed(BZ2FileObject *f, int action, char *data, int size) +{ + f->bzs.next_in = data; + f->bzs.avail_in = size; + for (;;) { + char *this_in; + char buffer[SMALLCHUNK]; + int bzerror, chunk_size; + PyObject *result; + + /* In regular compression mode, stop when data is exhausted. */ + if (action == BZ_RUN && f->bzs.avail_in == 0) + return 0; + + Py_BEGIN_ALLOW_THREADS + this_in = f->bzs.next_in; + f->bzs.next_out = buffer; + f->bzs.avail_out = sizeof buffer; + bzerror = BZ2_bzCompress(&f->bzs, action); + chunk_size = f->bzs.next_out - buffer; + Py_END_ALLOW_THREADS + + if (catch_bz2_error(bzerror)) + return -1; + + result = PyObject_CallMethod(f->fp, "write", "y#", buffer, chunk_size); + if (result == NULL) + return -1; + Py_DECREF(result); + f->pos += f->bzs.next_in - this_in; + + /* In either flushing mode, keep going until bzCompress returns + the appropriate sentinel value. */ + if ((action == BZ_FINISH && bzerror == BZ_STREAM_END) || + (action == BZ_FLUSH && bzerror == BZ_RUN_OK)) + return 0; + } +} + +/* Rewind f to the beginning of the data stream. 
*/ +static int +rewind_stream(BZ2FileObject *f) +{ + PyObject *seek_result; + + seek_result = PyObject_CallMethod(f->fp, "seek", "ii", 0, 0); + if (seek_result == NULL) + return -1; + Py_DECREF(seek_result); + + f->mode = MODE_READ; + f->pos = 0; + /* Discard stale readahead buffers. */ + Py_XDECREF(f->rawobj); + f->rawobj = NULL; + f->rawptr = NULL; + f->rawend = NULL; + PyMem_Free(f->decbase); + f->decbase = NULL; + f->decptr = NULL; + f->decend = NULL; + f->buffered_eof = 0; + + /* Reinitialize decompressor state. */ + if (catch_bz2_error(BZ2_bzDecompressEnd(&f->bzs))) + return -1; + memset(&f->bzs, 0, sizeof f->bzs); + if (catch_bz2_error(BZ2_bzDecompressInit(&f->bzs, 0, 0))) + return -1; + return 0; +} + +/* Close f, flushing any buffered output. */ +static int +flush_and_close(BZ2FileObject *f) +{ + int bzerror; + int result = 0; + + switch (f->mode) { + case MODE_CLOSED: + return 0; + case MODE_READ: + case MODE_READ_EOF: + if (catch_bz2_error(BZ2_bzDecompressEnd(&f->bzs))) + result = -1; + break; + case MODE_WRITE: + /* Flush any buffered data before closing. */ + if (write_compressed(f, BZ_FINISH, NULL, 0) < 0) + result = -1; + bzerror = BZ2_bzCompressEnd(&f->bzs); + if (result == 0 && catch_bz2_error(bzerror)) + result = -1; + break; + } + if (f->closefp) { + PyObject *close_result = PyObject_CallMethod(f->fp, "close", NULL); + if (close_result == NULL) + result = -1; + else + Py_DECREF(close_result); + } + Py_DECREF(f->fp); + f->fp = NULL; + f->mode = MODE_CLOSED; + PyMem_Free(f->decbase); + f->decbase = NULL; + return result; +} + +/* Open a file using the Python function builtins.open(). */ +static PyObject * +builtins_open(PyObject *filename, int mode) +{ + PyObject *py_builtins = NULL; + PyObject *py_open = NULL; + PyObject *file = NULL; + const char *mode_str = (mode == MODE_READ) ? 
"rb" : "wb"; + + py_builtins = PyImport_ImportModule("builtins"); + if (py_builtins == NULL) + goto cleanup; + py_open = PyObject_GetAttrString(py_builtins, "open"); + if (py_open == NULL) { + PyErr_SetString(PyExc_AttributeError, + "Unable to access builtins.open()"); + goto cleanup; + } + + file = PyObject_CallFunction(py_open, "Os", filename, mode_str); + +cleanup: + Py_XDECREF(py_open); + Py_XDECREF(py_builtins); + return file; +} + + /* ===================================================================== */ /* Methods of BZ2File. */ PyDoc_STRVAR(BZ2File_read__doc__, -"read([size]) -> string\n\ -\n\ -Read at most size uncompressed bytes, returned as a string. If the size\n\ -argument is negative or omitted, read until EOF is reached.\n\ -"); +"read([size]) -> bytes\n" +"\n" +"Read at most size uncompressed bytes, returned as bytes. If size\n" +"is omitted or negative, read until EOF is reached. On end-of-file,\n" +"returns b''.\n"); -/* This is a hacked version of Python's fileobject.c:file_read(). 
*/ static PyObject * BZ2File_read(BZ2FileObject *self, PyObject *args) { - long bytesrequested = -1; - size_t bytesread, buffersize, chunksize; - int bzerror; - PyObject *ret = NULL; + long requested_size = -1; + size_t capacity, size; + PyObject *data = NULL; - if (!PyArg_ParseTuple(args, "|l:read", &bytesrequested)) + if (!PyArg_ParseTuple(args, "|l:read", &requested_size)) return NULL; ACQUIRE_LOCK(self); - switch (self->mode) { - case MODE_READ: - break; - case MODE_READ_EOF: - ret = PyBytes_FromStringAndSize("", 0); - goto cleanup; - case MODE_CLOSED: - PyErr_SetString(PyExc_ValueError, - "I/O operation on closed file"); - goto cleanup; - default: - PyErr_SetString(PyExc_IOError, - "file is not ready for reading"); - goto cleanup; + if (!can_read(self)) + goto cleanup; + if (self->mode == MODE_READ_EOF || requested_size == 0) { + data = PyBytes_FromStringAndSize("", 0); + goto cleanup; } - /* refuse to mix with f.next() */ - if (check_iterbuffered(self)) + capacity = (requested_size < 0) ? 
SMALLCHUNK : requested_size; + data = PyBytes_FromStringAndSize(NULL, capacity); + if (data == NULL) goto cleanup; - - if (bytesrequested < 0) - buffersize = Util_NewBufferSize((size_t)0); - else - buffersize = bytesrequested; - if (buffersize > INT_MAX) { - PyErr_SetString(PyExc_OverflowError, - "requested number of bytes is " - "more than a Python string can hold"); - goto cleanup; - } - ret = PyBytes_FromStringAndSize((char *)NULL, buffersize); - if (ret == NULL || buffersize == 0) - goto cleanup; - bytesread = 0; + size = 0; for (;;) { - Py_BEGIN_ALLOW_THREADS - chunksize = BZ2_bzRead(&bzerror, self->fp, - BUF(ret)+bytesread, - buffersize-bytesread); - self->pos += chunksize; - Py_END_ALLOW_THREADS - bytesread += chunksize; - if (bzerror == BZ_STREAM_END) { - self->size = self->pos; - self->mode = MODE_READ_EOF; - break; - } else if (bzerror != BZ_OK) { - Util_CatchBZ2Error(bzerror); - Py_DECREF(ret); - ret = NULL; + ssize_t nread = read_decompressed(self, + PyBytes_AS_STRING(data) + size, + capacity - size); + if (nread < 0) { + Py_DECREF(data); + data = NULL; goto cleanup; } - if (bytesrequested < 0) { - buffersize = Util_NewBufferSize(buffersize); - if (_PyBytes_Resize(&ret, buffersize) < 0) { - ret = NULL; + size += nread; + if (self->mode == MODE_READ_EOF || size == requested_size) { + break; + } else if (size == capacity) { + capacity = grow_buffer_size(capacity); + if (_PyBytes_Resize(&data, capacity) < 0) { + data = NULL; goto cleanup; } - } else { - break; - } - } - if (bytesread != buffersize) { - if (_PyBytes_Resize(&ret, bytesread) < 0) { - ret = NULL; } } + if (size != capacity) + if (_PyBytes_Resize(&data, size) < 0) + data = NULL; + cleanup: RELEASE_LOCK(self); - return ret; + return data; } PyDoc_STRVAR(BZ2File_readline__doc__, -"readline([size]) -> string\n\ -\n\ -Return the next line from the file, as a string, retaining newline.\n\ -A non-negative size argument will limit the maximum number of bytes to\n\ -return (an incomplete line may be 
returned then). Return an empty\n\ -string at EOF.\n\ -"); +"readline([size]) -> bytes\n" +"\n" +"Read and return a line from the file as bytes, retaining newline.\n" +"A non-negative size argument will limit the maximum number of\n" +"bytes to return (in which case the line may be incomplete).\n" +"On end-of-file, returns b''.\n"); static PyObject * BZ2File_readline(BZ2FileObject *self, PyObject *args) { - PyObject *ret = NULL; - int sizehint = -1; + long size = -1; + PyObject *line; - if (!PyArg_ParseTuple(args, "|i:readline", &sizehint)) + if (!PyArg_ParseTuple(args, "|l:readline", &size)) return NULL; ACQUIRE_LOCK(self); - switch (self->mode) { - case MODE_READ: + if (can_read(self)) + line = read_line(self, size); + else + line = NULL; + RELEASE_LOCK(self); + return line; +} + +PyDoc_STRVAR(BZ2File_readlines__doc__, +"readlines([size]) -> list\n" +"\n" +"Read and return a list of lines from the file, as a list of bytes.\n" +"If size is given, read lines until the total size (in bytes) of\n" +"all the lines read equals or exceeds size. 
Otherwise, reads lines\n" +"until end-of-file is reached.\n"); + +static PyObject * +BZ2File_readlines(BZ2FileObject *self, PyObject *args) +{ + long size = -1, list_size = 0; + PyObject *list = NULL; + + if (!PyArg_ParseTuple(args, "|l:readlines", &size)) + return NULL; + + ACQUIRE_LOCK(self); + if (!can_read(self)) + goto error; + + list = PyList_New(0); + if (list == NULL) + goto error; + + while (self->mode != MODE_READ_EOF && (size < 0 || list_size < size)) { + PyObject *line = read_line(self, -1); + if (line == NULL) + goto error; + if (PyBytes_GET_SIZE(line) == 0) { + Py_DECREF(line); break; - case MODE_READ_EOF: - ret = PyBytes_FromStringAndSize("", 0); - goto cleanup; - case MODE_CLOSED: - PyErr_SetString(PyExc_ValueError, - "I/O operation on closed file"); - goto cleanup; - default: - PyErr_SetString(PyExc_IOError, - "file is not ready for reading"); - goto cleanup; + } + if (PyList_Append(list, line) < 0) { + Py_DECREF(line); + goto error; + } + list_size += PyBytes_GET_SIZE(line); } + RELEASE_LOCK(self); + return list; - /* refuse to mix with f.next() */ - if (check_iterbuffered(self)) +error: + Py_XDECREF(list); + RELEASE_LOCK(self); + return NULL; +} + +PyDoc_STRVAR(BZ2File_write__doc__, +"write(data) -> None\n" +"\n" +"Write bytes to the file. Note that due to buffering, the file on\n" +"disk may not reflect the data written until close() is called.\n"); + +static PyObject * +BZ2File_write(BZ2FileObject *self, PyObject *args) +{ + Py_buffer data; + PyObject *result = NULL; + + if (!PyArg_ParseTuple(args, "y*:write", &data)) + return NULL; + + ACQUIRE_LOCK(self); + if (!can_write(self)) goto cleanup; - if (sizehint == 0) - ret = PyBytes_FromStringAndSize("", 0); - else - ret = Util_GetLine(self, (sizehint < 0) ? 
0 : sizehint); + if (write_compressed(self, BZ_RUN, data.buf, data.len) < 0) + goto cleanup; + + Py_INCREF(Py_None); + result = Py_None; cleanup: RELEASE_LOCK(self); - return ret; -} - -PyDoc_STRVAR(BZ2File_readlines__doc__, -"readlines([size]) -> list\n\ -\n\ -Call readline() repeatedly and return a list of lines read.\n\ -The optional size argument, if given, is an approximate bound on the\n\ -total number of bytes in the lines returned.\n\ -"); - -/* This is a hacked version of Python's fileobject.c:file_readlines(). */ -static PyObject * -BZ2File_readlines(BZ2FileObject *self, PyObject *args) -{ - long sizehint = 0; - PyObject *list = NULL; - PyObject *line; - char small_buffer[SMALLCHUNK]; - char *buffer = small_buffer; - size_t buffersize = SMALLCHUNK; - PyObject *big_buffer = NULL; - size_t nfilled = 0; - size_t nread; - size_t totalread = 0; - char *p, *q, *end; - int err; - int shortread = 0; - int bzerror; - - if (!PyArg_ParseTuple(args, "|l:readlines", &sizehint)) - return NULL; - - ACQUIRE_LOCK(self); - switch (self->mode) { - case MODE_READ: - break; - case MODE_READ_EOF: - list = PyList_New(0); - goto cleanup; - case MODE_CLOSED: - PyErr_SetString(PyExc_ValueError, - "I/O operation on closed file"); - goto cleanup; - default: - PyErr_SetString(PyExc_IOError, - "file is not ready for reading"); - goto cleanup; - } - - /* refuse to mix with f.next() */ - if (check_iterbuffered(self)) - goto cleanup; - - if ((list = PyList_New(0)) == NULL) - goto cleanup; - - for (;;) { - Py_BEGIN_ALLOW_THREADS - nread = BZ2_bzRead(&bzerror, self->fp, - buffer+nfilled, buffersize-nfilled); - self->pos += nread; - Py_END_ALLOW_THREADS - if (bzerror == BZ_STREAM_END) { - self->size = self->pos; - self->mode = MODE_READ_EOF; - if (nread == 0) { - sizehint = 0; - break; - } - shortread = 1; - } else if (bzerror != BZ_OK) { - Util_CatchBZ2Error(bzerror); - error: - Py_DECREF(list); - list = NULL; - goto cleanup; - } - totalread += nread; - p = memchr(buffer+nfilled, '\n', 
nread); - if (!shortread && p == NULL) { - /* Need a larger buffer to fit this line */ - nfilled += nread; - buffersize *= 2; - if (buffersize > INT_MAX) { - PyErr_SetString(PyExc_OverflowError, - "line is longer than a Python string can hold"); - goto error; - } - if (big_buffer == NULL) { - /* Create the big buffer */ - big_buffer = PyBytes_FromStringAndSize( - NULL, buffersize); - if (big_buffer == NULL) - goto error; - buffer = PyBytes_AS_STRING(big_buffer); - memcpy(buffer, small_buffer, nfilled); - } - else { - /* Grow the big buffer */ - if (_PyBytes_Resize(&big_buffer, buffersize) < 0){ - big_buffer = NULL; - goto error; - } - buffer = PyBytes_AS_STRING(big_buffer); - } - continue; - } - end = buffer+nfilled+nread; - q = buffer; - while (p != NULL) { - /* Process complete lines */ - p++; - line = PyBytes_FromStringAndSize(q, p-q); - if (line == NULL) - goto error; - err = PyList_Append(list, line); - Py_DECREF(line); - if (err != 0) - goto error; - q = p; - p = memchr(q, '\n', end-q); - } - /* Move the remaining incomplete line to the start */ - nfilled = end-q; - memmove(buffer, q, nfilled); - if (sizehint > 0) - if (totalread >= (size_t)sizehint) - break; - if (shortread) { - sizehint = 0; - break; - } - } - if (nfilled != 0) { - /* Partial last line */ - line = PyBytes_FromStringAndSize(buffer, nfilled); - if (line == NULL) - goto error; - if (sizehint > 0) { - /* Need to complete the last line */ - PyObject *rest = Util_GetLine(self, 0); - if (rest == NULL) { - Py_DECREF(line); - goto error; - } - PyBytes_Concat(&line, rest); - Py_DECREF(rest); - if (line == NULL) - goto error; - } - err = PyList_Append(list, line); - Py_DECREF(line); - if (err != 0) - goto error; - } - - cleanup: - RELEASE_LOCK(self); - if (big_buffer) { - Py_DECREF(big_buffer); - } - return list; -} - -PyDoc_STRVAR(BZ2File_write__doc__, -"write(data) -> None\n\ -\n\ -Write the 'data' string to file. 
Note that due to buffering, close() may\n\ -be needed before the file on disk reflects the data written.\n\ -"); - -/* This is a hacked version of Python's fileobject.c:file_write(). */ -static PyObject * -BZ2File_write(BZ2FileObject *self, PyObject *args) -{ - PyObject *ret = NULL; - Py_buffer pbuf; - char *buf; - int len; - int bzerror; - - if (!PyArg_ParseTuple(args, "y*:write", &pbuf)) - return NULL; - buf = pbuf.buf; - len = pbuf.len; - - ACQUIRE_LOCK(self); - switch (self->mode) { - case MODE_WRITE: - break; - - case MODE_CLOSED: - PyErr_SetString(PyExc_ValueError, - "I/O operation on closed file"); - goto cleanup; - - default: - PyErr_SetString(PyExc_IOError, - "file is not ready for writing"); - goto cleanup; - } - - Py_BEGIN_ALLOW_THREADS - BZ2_bzWrite (&bzerror, self->fp, buf, len); - self->pos += len; - Py_END_ALLOW_THREADS - - if (bzerror != BZ_OK) { - Util_CatchBZ2Error(bzerror); - goto cleanup; - } - - Py_INCREF(Py_None); - ret = Py_None; - -cleanup: - PyBuffer_Release(&pbuf); - RELEASE_LOCK(self); - return ret; + PyBuffer_Release(&data); + return result; } PyDoc_STRVAR(BZ2File_writelines__doc__, -"writelines(sequence_of_strings) -> None\n\ -\n\ -Write the sequence of strings to the file. Note that newlines are not\n\ -added. The sequence can be any iterable object producing strings. This is\n\ -equivalent to calling write() for each string.\n\ -"); +"writelines(sequence) -> None\n" +"\n" +"Write a sequence of bytes objects to the file. Note that newlines are\n" +"not added. sequence can be any iterable object producing bytes.\n" +"This is equivalent to calling write() for each bytes object.\n"); -/* This is a hacked version of Python's fileobject.c:file_writelines(). 
*/ static PyObject * BZ2File_writelines(BZ2FileObject *self, PyObject *seq) { -#define CHUNKSIZE 1000 - PyObject *list = NULL; + PyObject *line; PyObject *iter = NULL; - PyObject *ret = NULL; - PyObject *line; - int i, j, index, len, islist; - int bzerror; + PyObject *result = NULL; ACQUIRE_LOCK(self); - switch (self->mode) { - case MODE_WRITE: - break; + if (!can_write(self)) + goto cleanup; - case MODE_CLOSED: - PyErr_SetString(PyExc_ValueError, - "I/O operation on closed file"); - goto error; + iter = PyObject_GetIter(seq); + if (iter == NULL) + goto cleanup; - default: - PyErr_SetString(PyExc_IOError, - "file is not ready for writing"); - goto error; + while ((line = PyIter_Next(iter)) != NULL) { + int status; + Py_buffer data; + + status = PyObject_GetBuffer(line, &data, PyBUF_SIMPLE); + Py_DECREF(line); + if (status < 0) + goto cleanup; + status = write_compressed(self, BZ_RUN, data.buf, data.len); + PyBuffer_Release(&data); + if (status < 0) + goto cleanup; } - - islist = PyList_Check(seq); - if (!islist) { - iter = PyObject_GetIter(seq); - if (iter == NULL) { - PyErr_SetString(PyExc_TypeError, - "writelines() requires an iterable argument"); - goto error; - } - list = PyList_New(CHUNKSIZE); - if (list == NULL) - goto error; - } - - /* Strategy: slurp CHUNKSIZE lines into a private list, - checking that they are all strings, then write that list - without holding the interpreter lock, then come back for more. */ - for (index = 0; ; index += CHUNKSIZE) { - if (islist) { - Py_XDECREF(list); - list = PyList_GetSlice(seq, index, index+CHUNKSIZE); - if (list == NULL) - goto error; - j = PyList_GET_SIZE(list); - } - else { - for (j = 0; j < CHUNKSIZE; j++) { - line = PyIter_Next(iter); - if (line == NULL) { - if (PyErr_Occurred()) - goto error; - break; - } - PyList_SetItem(list, j, line); - } - } - if (j == 0) - break; - - /* Check that all entries are indeed byte strings. If not, - apply the same rules as for file.write() and - convert the rets to strings. 
This is slow, but - seems to be the only way since all conversion APIs - could potentially execute Python code. */ - for (i = 0; i < j; i++) { - PyObject *v = PyList_GET_ITEM(list, i); - if (!PyBytes_Check(v)) { - const char *buffer; - Py_ssize_t len; - if (PyObject_AsCharBuffer(v, &buffer, &len)) { - PyErr_SetString(PyExc_TypeError, - "writelines() " - "argument must be " - "a sequence of " - "bytes objects"); - goto error; - } - line = PyBytes_FromStringAndSize(buffer, - len); - if (line == NULL) - goto error; - Py_DECREF(v); - PyList_SET_ITEM(list, i, line); - } - } - - /* Since we are releasing the global lock, the - following code may *not* execute Python code. */ - Py_BEGIN_ALLOW_THREADS - for (i = 0; i < j; i++) { - line = PyList_GET_ITEM(list, i); - len = PyBytes_GET_SIZE(line); - BZ2_bzWrite (&bzerror, self->fp, - PyBytes_AS_STRING(line), len); - if (bzerror != BZ_OK) { - Py_BLOCK_THREADS - Util_CatchBZ2Error(bzerror); - goto error; - } - } - Py_END_ALLOW_THREADS - - if (j < CHUNKSIZE) - break; - } + if (PyErr_Occurred()) /* Check for error in PyIter_Next() */ + goto cleanup; Py_INCREF(Py_None); - ret = Py_None; + result = Py_None; - error: +cleanup: + Py_XDECREF(iter); RELEASE_LOCK(self); - Py_XDECREF(list); - Py_XDECREF(iter); - return ret; -#undef CHUNKSIZE + return result; } PyDoc_STRVAR(BZ2File_seek__doc__, -"seek(offset [, whence]) -> None\n\ -\n\ -Move to new file position. Argument offset is a byte count. Optional\n\ -argument whence defaults to 0 (offset from start of file, offset\n\ -should be >= 0); other values are 1 (move relative to current position,\n\ -positive or negative), and 2 (move relative to end of file, usually\n\ -negative, although many platforms allow seeking beyond the end of a file).\n\ -\n\ -Note that seeking of bz2 files is emulated, and depending on the parameters\n\ -the operation may be extremely slow.\n\ -"); +"seek(offset[, whence]) -> None\n" +"\n" +"Move to a new file position. 
Argument offset is a byte count.\n" +"Optional argument whence can be:\n" +"\n" +" 0: move relative to start of file (default); offset must be >= 0\n" +" 1: move relative to current position\n" +" 2: move relative to end of file; offset must be <= 0\n" +"\n" +"Note that seeking of bz2 files is emulated, and depending on the\n" +"parameters, may be extremely slow. Also note that seeking forward\n" +"works with any type of source file, but seeking backward will fail\n" +"if the source file does not have a seek() method.\n"); static PyObject * BZ2File_seek(BZ2FileObject *self, PyObject *args) { - int where = 0; - PyObject *offobj; Py_off_t offset; - char small_buffer[SMALLCHUNK]; - char *buffer = small_buffer; - size_t buffersize = SMALLCHUNK; - Py_off_t bytesread = 0; - size_t readsize; - int chunksize; - int bzerror; + int whence = 0; + char buffer[SMALLCHUNK]; PyObject *ret = NULL; - if (!PyArg_ParseTuple(args, "O|i:seek", &offobj, &where)) - return NULL; -#if !defined(HAVE_LARGEFILE_SUPPORT) - offset = PyLong_AsLong(offobj); +#ifdef HAVE_LARGEFILE_SUPPORT + if (!PyArg_ParseTuple(args, "L|i:seek", &offset, &whence)) #else - offset = PyLong_Check(offobj) ? 
- PyLong_AsLongLong(offobj) : PyLong_AsLong(offobj); + if (!PyArg_ParseTuple(args, "l|i:seek", &offset, &whence)) #endif - if (PyErr_Occurred()) return NULL; ACQUIRE_LOCK(self); - Util_DropReadAhead(self); - switch (self->mode) { - case MODE_READ: - case MODE_READ_EOF: - break; + if (!can_seek(self)) + goto cleanup; - case MODE_CLOSED: - PyErr_SetString(PyExc_ValueError, - "I/O operation on closed file"); - goto cleanup; - - default: - PyErr_SetString(PyExc_IOError, - "seek works only while reading"); - goto cleanup; - } - - if (where == 2) { - if (self->size == -1) { - assert(self->mode != MODE_READ_EOF); - for (;;) { - Py_BEGIN_ALLOW_THREADS - chunksize = BZ2_bzRead(&bzerror, self->fp, - buffer, buffersize); - self->pos += chunksize; - Py_END_ALLOW_THREADS - - bytesread += chunksize; - if (bzerror == BZ_STREAM_END) { - break; - } else if (bzerror != BZ_OK) { - Util_CatchBZ2Error(bzerror); + if (whence == 2) { + /* Seeking relative to EOF. We need to know the file's size. */ + if (self->size == -1) + while (self->mode != MODE_READ_EOF) + if (read_decompressed(self, buffer, sizeof buffer) < 0) goto cleanup; - } - } - self->mode = MODE_READ_EOF; - self->size = self->pos; - bytesread = 0; - } offset = self->size + offset; - } else if (where == 1) { + } else if (whence == 1) { offset = self->pos + offset; } - /* Before getting here, offset must be the absolute position the file - * pointer should be set to. */ + /* At this point, offset is the absolute position to seek to. */ + if (offset >= self->pos) /* Moving forward */ + offset -= self->pos; + else /* Moving backward - have to rewind */ + if (rewind_stream(self) < 0) + goto cleanup; - if (offset >= self->pos) { - /* we can move forward */ - offset -= self->pos; - } else { - /* we cannot move back, so rewind the stream */ - BZ2_bzReadClose(&bzerror, self->fp); - if (bzerror != BZ_OK) { - Util_CatchBZ2Error(bzerror); + /* At this point, offset is the number of bytes to walk forward. 
*/ + while (offset > 0 && self->mode != MODE_READ_EOF) { + ssize_t read_size = (offset < sizeof buffer) ? offset : sizeof buffer; + ssize_t nread = read_decompressed(self, buffer, read_size); + if (nread < 0) goto cleanup; - } - rewind(self->rawfp); - self->pos = 0; - self->fp = BZ2_bzReadOpen(&bzerror, self->rawfp, - 0, 0, NULL, 0); - if (bzerror != BZ_OK) { - Util_CatchBZ2Error(bzerror); - goto cleanup; - } - self->mode = MODE_READ; + offset -= nread; } - if (offset <= 0 || self->mode == MODE_READ_EOF) - goto exit; - - /* Before getting here, offset must be set to the number of bytes - * to walk forward. */ - for (;;) { - if (offset-bytesread > buffersize) - readsize = buffersize; - else - /* offset might be wider that readsize, but the result - * of the subtraction is bound by buffersize (see the - * condition above). buffersize is 8192. */ - readsize = (size_t)(offset-bytesread); - Py_BEGIN_ALLOW_THREADS - chunksize = BZ2_bzRead(&bzerror, self->fp, buffer, readsize); - self->pos += chunksize; - Py_END_ALLOW_THREADS - bytesread += chunksize; - if (bzerror == BZ_STREAM_END) { - self->size = self->pos; - self->mode = MODE_READ_EOF; - break; - } else if (bzerror != BZ_OK) { - Util_CatchBZ2Error(bzerror); - goto cleanup; - } - if (bytesread == offset) - break; - } - -exit: Py_INCREF(Py_None); ret = Py_None; @@ -1049,108 +859,69 @@ } PyDoc_STRVAR(BZ2File_tell__doc__, -"tell() -> int\n\ -\n\ -Return the current file position, an integer (may be a long integer).\n\ -"); +"tell() -> int\n" +"\n" +"Return the current file position.\n"); static PyObject * -BZ2File_tell(BZ2FileObject *self, PyObject *args) +BZ2File_tell(BZ2FileObject *self, PyObject *noargs) +{ + if (self->mode == MODE_CLOSED) { + PyErr_SetString(PyExc_ValueError, "I/O operation on closed file"); + return NULL; + } +#ifdef HAVE_LARGEFILE_SUPPORT + return PyLong_FromLongLong(self->pos); +#else + return PyLong_FromLong(self->pos); +#endif +} + +PyDoc_STRVAR(BZ2File_close__doc__, +"close() -> None\n" +"\n" 
+"Close the file. A closed file cannot be used for further I/O.\n" +"close() may be called more than once without error.\n"); + +static PyObject * +BZ2File_close(BZ2FileObject *self, PyObject *noargs) { PyObject *ret = NULL; - if (self->mode == MODE_CLOSED) { - PyErr_SetString(PyExc_ValueError, - "I/O operation on closed file"); - goto cleanup; - } - -#if !defined(HAVE_LARGEFILE_SUPPORT) - ret = PyLong_FromLong(self->pos); -#else - ret = PyLong_FromLongLong(self->pos); -#endif - -cleanup: - return ret; -} - -PyDoc_STRVAR(BZ2File_close__doc__, -"close() -> None or (perhaps) an integer\n\ -\n\ -Close the file. Sets data attribute .closed to true. A closed file\n\ -cannot be used for further I/O operations. close() may be called more\n\ -than once without error.\n\ -"); - -static PyObject * -BZ2File_close(BZ2FileObject *self) -{ - PyObject *ret = NULL; - int bzerror = BZ_OK; - - if (self->mode == MODE_CLOSED) { - Py_RETURN_NONE; - } - ACQUIRE_LOCK(self); - switch (self->mode) { - case MODE_READ: - case MODE_READ_EOF: - BZ2_bzReadClose(&bzerror, self->fp); - break; - case MODE_WRITE: - BZ2_bzWriteClose(&bzerror, self->fp, - 0, NULL, NULL); - break; - } - self->mode = MODE_CLOSED; - fclose(self->rawfp); - self->rawfp = NULL; - if (bzerror == BZ_OK) { + if (flush_and_close(self) == 0) { Py_INCREF(Py_None); ret = Py_None; } - else { - Util_CatchBZ2Error(bzerror); - } - RELEASE_LOCK(self); return ret; } -PyDoc_STRVAR(BZ2File_enter_doc, -"__enter__() -> self."); +PyDoc_STRVAR(BZ2File_enter_doc, "__enter__() -> self"); static PyObject * -BZ2File_enter(BZ2FileObject *self) +BZ2File_enter(BZ2FileObject *self, PyObject *noargs) { if (self->mode == MODE_CLOSED) { - PyErr_SetString(PyExc_ValueError, - "I/O operation on closed file"); + PyErr_SetString(PyExc_ValueError, "I/O operation on closed file"); return NULL; } Py_INCREF(self); - return (PyObject *) self; + return (PyObject *)self; } PyDoc_STRVAR(BZ2File_exit_doc, -"__exit__(*excinfo) -> None. 
Closes the file."); +"__exit__(*excinfo) -> None\n" +"\n" +"Closes the file.\n"); static PyObject * BZ2File_exit(BZ2FileObject *self, PyObject *args) { - PyObject *ret = PyObject_CallMethod((PyObject *) self, "close", NULL); - if (!ret) - /* If error occurred, pass through */ - return NULL; - Py_DECREF(ret); - Py_RETURN_NONE; + return BZ2File_close(self, NULL); } -static PyObject *BZ2File_getiter(BZ2FileObject *self); - static PyMethodDef BZ2File_methods[] = { {"read", (PyCFunction)BZ2File_read, METH_VARARGS, BZ2File_read__doc__}, {"readline", (PyCFunction)BZ2File_readline, METH_VARARGS, BZ2File_readline__doc__}, @@ -1162,7 +933,7 @@ {"close", (PyCFunction)BZ2File_close, METH_NOARGS, BZ2File_close__doc__}, {"__enter__", (PyCFunction)BZ2File_enter, METH_NOARGS, BZ2File_enter_doc}, {"__exit__", (PyCFunction)BZ2File_exit, METH_VARARGS, BZ2File_exit_doc}, - {NULL, NULL} /* sentinel */ + {NULL, NULL} /* sentinel */ }; @@ -1176,9 +947,8 @@ } static PyGetSetDef BZ2File_getset[] = { - {"closed", (getter)BZ2File_get_closed, NULL, - "True if the file is closed"}, - {NULL} /* Sentinel */ + {"closed", (getter)BZ2File_get_closed, NULL, "True if the file is closed"}, + {NULL} /* sentinel */ }; @@ -1186,188 +956,174 @@ /* Slot definitions for BZ2File_Type. 
*/ static int +parse_mode_string(const char *string) +{ + int mode = MODE_CLOSED; /* Here, MODE_CLOSED means 'no mode set yet' */ + const char *p; + for (p = string; *p != '\0'; p++) { + if (*p == 'b') { + /* ignore */ + } else if (*p == 'r' && mode == MODE_CLOSED) { + mode = MODE_READ; + } else if (*p == 'w' && mode == MODE_CLOSED) { + mode = MODE_WRITE; + } else { + PyErr_Format(PyExc_ValueError, "Invalid mode: '%s'", string); + return MODE_CLOSED; + } + } + if (mode == MODE_CLOSED) + mode = MODE_READ; /* Default to reading mode */ + return mode; +} + +static int BZ2File_init(BZ2FileObject *self, PyObject *args, PyObject *kwargs) { static char *kwlist[] = {"filename", "mode", "buffering", - "compresslevel", 0}; - PyObject *name_obj = NULL; - char *name; - char *mode = "r"; - int buffering = -1; + "compresslevel", "fileobj", NULL}; + PyObject *filename = Py_None; + PyObject *fileobj = Py_None; + const char *mode = "r"; + int buffering = 0; /* XXX this argument is ignored */ int compresslevel = 9; int bzerror; - int mode_char = 0; self->size = -1; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O&|sii:BZ2File", - kwlist, PyUnicode_FSConverter, &name_obj, - &mode, &buffering, - &compresslevel)) + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|OsiiO:BZ2File", kwlist, + &filename, &mode, &buffering, + &compresslevel, &fileobj)) return -1; - name = PyBytes_AsString(name_obj); - if (compresslevel < 1 || compresslevel > 9) { - PyErr_SetString(PyExc_ValueError, - "compresslevel must be between 1 and 9"); - Py_DECREF(name_obj); - return -1; - } - - for (;;) { - int error = 0; - switch (*mode) { - case 'r': - case 'w': - if (mode_char) - error = 1; - mode_char = *mode; - break; - - case 'b': - break; - - default: - error = 1; - break; - } - if (error) { - PyErr_Format(PyExc_ValueError, - "invalid mode char %c", *mode); - Py_DECREF(name_obj); - return -1; - } - mode++; - if (*mode == '\0') - break; - } - - if (mode_char == 0) { - mode_char = 'r'; - } - - mode = 
(mode_char == 'r') ? "rb" : "wb"; - - self->rawfp = fopen(name, mode); - Py_DECREF(name_obj); - if (self->rawfp == NULL) { - PyErr_SetFromErrno(PyExc_IOError); - return -1; - } - /* XXX Ignore buffering */ - - /* From now on, we have stuff to dealloc, so jump to error label - * instead of returning */ - #ifdef WITH_THREAD self->lock = PyThread_allocate_lock(); - if (!self->lock) { - PyErr_SetString(PyExc_MemoryError, "unable to allocate lock"); + if (self->lock == NULL) { + PyErr_SetString(PyExc_MemoryError, "Unable to allocate lock"); goto error; } #endif - if (mode_char == 'r') - self->fp = BZ2_bzReadOpen(&bzerror, self->rawfp, - 0, 0, NULL, 0); - else - self->fp = BZ2_bzWriteOpen(&bzerror, self->rawfp, - compresslevel, 0, 0); + self->mode = parse_mode_string(mode); + if (self->mode == MODE_CLOSED) + goto error; - if (bzerror != BZ_OK) { - Util_CatchBZ2Error(bzerror); + if (filename != Py_None && fileobj == Py_None) { + self->fp = builtins_open(filename, self->mode); + if (self->fp == NULL) + goto error; + self->closefp = 1; + } else if (fileobj != Py_None && filename == Py_None) { + Py_INCREF(fileobj); + self->fp = fileobj; + self->closefp = 0; + } else { + PyErr_SetString(PyExc_ValueError, + "Must give exactly one of filename and fileobj"); goto error; } - self->mode = (mode_char == 'r') ? 
MODE_READ : MODE_WRITE; + if (compresslevel < 1 || compresslevel > 9) { + PyErr_SetString(PyExc_ValueError, + "compresslevel must be between 1 and 9"); + goto error; + } + if (self->mode == MODE_READ) + bzerror = BZ2_bzDecompressInit(&self->bzs, 0, 0); + else + bzerror = BZ2_bzCompressInit(&self->bzs, compresslevel, 0, 0); + if (catch_bz2_error(bzerror)) + goto error; return 0; error: - fclose(self->rawfp); - self->rawfp = NULL; #ifdef WITH_THREAD - if (self->lock) { + if (self->lock != NULL) { PyThread_free_lock(self->lock); self->lock = NULL; } #endif + if (self->fp != NULL) { + if (self->closefp) { + PyObject *result, *type, *value, *traceback; + + PyErr_Fetch(&type, &value, &traceback); + result = PyObject_CallMethod(self->fp, "close", NULL); + Py_XDECREF(result); + PyErr_Restore(type, value, traceback); /* In case close failed */ + } + Py_DECREF(self->fp); + self->fp = NULL; + } + self->mode = MODE_CLOSED; return -1; } static void BZ2File_dealloc(BZ2FileObject *self) { - int bzerror; + /* We can't call BZ2File_close(), since self->lock may not exist. */ + flush_and_close(self); + + Py_XDECREF(self->rawobj); + #ifdef WITH_THREAD - if (self->lock) + if (self->lock != NULL) PyThread_free_lock(self->lock); #endif - switch (self->mode) { - case MODE_READ: - case MODE_READ_EOF: - BZ2_bzReadClose(&bzerror, self->fp); - break; - case MODE_WRITE: - BZ2_bzWriteClose(&bzerror, self->fp, - 0, NULL, NULL); - break; - } - Util_DropReadAhead(self); - if (self->rawfp != NULL) - fclose(self->rawfp); + Py_TYPE(self)->tp_free((PyObject *)self); } -/* This is a hacked version of Python's fileobject.c:file_getiter(). 
*/ static PyObject * BZ2File_getiter(BZ2FileObject *self) { if (self->mode == MODE_CLOSED) { - PyErr_SetString(PyExc_ValueError, - "I/O operation on closed file"); + PyErr_SetString(PyExc_ValueError, "I/O operation on closed file"); return NULL; } - Py_INCREF((PyObject*)self); + Py_INCREF((PyObject *)self); return (PyObject *)self; } -/* This is a hacked version of Python's fileobject.c:file_iternext(). */ -#define READAHEAD_BUFSIZE 8192 static PyObject * BZ2File_iternext(BZ2FileObject *self) { - PyBytesObject* ret; + PyObject *result = NULL; + ACQUIRE_LOCK(self); - if (self->mode == MODE_CLOSED) { - RELEASE_LOCK(self); - PyErr_SetString(PyExc_ValueError, - "I/O operation on closed file"); - return NULL; + if (can_read(self)) { + PyObject *line = read_line(self, -1); + if (line != NULL) { + if (PyBytes_GET_SIZE(line) == 0) /* Check for EOF */ + Py_DECREF(line); + else + result = line; + } } - ret = Util_ReadAheadGetLineSkip(self, 0, READAHEAD_BUFSIZE); RELEASE_LOCK(self); - if (ret == NULL || PyBytes_GET_SIZE(ret) == 0) { - Py_XDECREF(ret); - return NULL; - } - return (PyObject *)ret; + return result; } + /* ===================================================================== */ /* BZ2File_Type definition. */ -PyDoc_VAR(BZ2File__doc__) = -PyDoc_STR( -"BZ2File(name [, mode='r', buffering=0, compresslevel=9]) -> file object\n\ -\n\ -Open a bz2 file. The mode can be 'r' or 'w', for reading (default) or\n\ -writing. When opened for writing, the file will be created if it doesn't\n\ -exist, and truncated otherwise. If the buffering argument is given, 0 means\n\ -unbuffered, and larger numbers specify the buffer size. If compresslevel\n\ -is given, must be a number between 1 and 9.\n\ -Data read is always returned in bytes; data written ought to be bytes.\n\ -"); +PyDoc_STRVAR(BZ2File__doc__, +"BZ2File(filename=None, mode='r', buffering=0, compresslevel=9,\n" +" fileobj=None) -> file object\n" +"\n" +"Open a bz2 file. 
Exactly one of filename and fileobj should be\n" +"provided. If fileobj is provided, it should be a file-like object;\n" +"data will be read from / written to this object. Otherwise, the\n" +"file named by filename will be opened.\n" +"\n" +"mode can be 'r' for reading/decompression (default), or 'w' for\n" +"writing/compression. If compresslevel is provided, it should be\n" +"a number between 1 and 9. The buffering argument is ignored.\n" +"\n" +"Data read is returned in bytes; data to be written must be bytes.\n"); static PyTypeObject BZ2File_Type = { PyVarObject_HEAD_INIT(NULL, 0) @@ -1398,7 +1154,7 @@ (getiterfunc)BZ2File_getiter, /*tp_iter*/ (iternextfunc)BZ2File_iternext, /*tp_iternext*/ BZ2File_methods, /*tp_methods*/ - 0, /*tp_members*/ + 0, /*tp_members*/ BZ2File_getset, /*tp_getset*/ 0, /*tp_base*/ 0, /*tp_dict*/ @@ -1470,13 +1226,13 @@ bzerror = BZ2_bzCompress(bzs, BZ_RUN); Py_END_ALLOW_THREADS if (bzerror != BZ_RUN_OK) { - Util_CatchBZ2Error(bzerror); + catch_bz2_error(bzerror); goto error; } if (bzs->avail_in == 0) break; /* no more input data */ if (bzs->avail_out == 0) { - bufsize = Util_NewBufferSize(bufsize); + bufsize = grow_buffer_size(bufsize); if (_PyBytes_Resize(&ret, bufsize) < 0) { BZ2_bzCompressEnd(bzs); goto error; @@ -1542,11 +1298,11 @@ if (bzerror == BZ_STREAM_END) { break; } else if (bzerror != BZ_FINISH_OK) { - Util_CatchBZ2Error(bzerror); + catch_bz2_error(bzerror); goto error; } if (bzs->avail_out == 0) { - bufsize = Util_NewBufferSize(bufsize); + bufsize = grow_buffer_size(bufsize); if (_PyBytes_Resize(&ret, bufsize) < 0) goto error; bzs->next_out = BUF(ret); @@ -1611,7 +1367,7 @@ memset(&self->bzs, 0, sizeof(bz_stream)); bzerror = BZ2_bzCompressInit(&self->bzs, compresslevel, 0, 0); if (bzerror != BZ_OK) { - Util_CatchBZ2Error(bzerror); + catch_bz2_error(bzerror); goto error; } @@ -1771,13 +1527,13 @@ break; } if (bzerror != BZ_OK) { - Util_CatchBZ2Error(bzerror); + catch_bz2_error(bzerror); goto error; } if (bzs->avail_in == 0) 
break; /* no more input data */ if (bzs->avail_out == 0) { - bufsize = Util_NewBufferSize(bufsize); + bufsize = grow_buffer_size(bufsize); if (_PyBytes_Resize(&ret, bufsize) < 0) { BZ2_bzDecompressEnd(bzs); goto error; @@ -1838,7 +1594,7 @@ memset(&self->bzs, 0, sizeof(bz_stream)); bzerror = BZ2_bzDecompressInit(&self->bzs, 0, 0); if (bzerror != BZ_OK) { - Util_CatchBZ2Error(bzerror); + catch_bz2_error(bzerror); goto error; } @@ -1983,7 +1739,7 @@ bzerror = BZ2_bzCompressInit(bzs, compresslevel, 0, 0); if (bzerror != BZ_OK) { - Util_CatchBZ2Error(bzerror); + catch_bz2_error(bzerror); PyBuffer_Release(&pdata); Py_DECREF(ret); return NULL; @@ -1997,13 +1753,13 @@ break; } else if (bzerror != BZ_FINISH_OK) { BZ2_bzCompressEnd(bzs); - Util_CatchBZ2Error(bzerror); + catch_bz2_error(bzerror); PyBuffer_Release(&pdata); Py_DECREF(ret); return NULL; } if (bzs->avail_out == 0) { - bufsize = Util_NewBufferSize(bufsize); + bufsize = grow_buffer_size(bufsize); if (_PyBytes_Resize(&ret, bufsize) < 0) { BZ2_bzCompressEnd(bzs); PyBuffer_Release(&pdata); @@ -2069,7 +1825,7 @@ bzerror = BZ2_bzDecompressInit(bzs, 0, 0); if (bzerror != BZ_OK) { - Util_CatchBZ2Error(bzerror); + catch_bz2_error(bzerror); Py_DECREF(ret); PyBuffer_Release(&pdata); return NULL; @@ -2083,7 +1839,7 @@ break; } else if (bzerror != BZ_OK) { BZ2_bzDecompressEnd(bzs); - Util_CatchBZ2Error(bzerror); + catch_bz2_error(bzerror); PyBuffer_Release(&pdata); Py_DECREF(ret); return NULL; @@ -2097,7 +1853,7 @@ return NULL; } if (bzs->avail_out == 0) { - bufsize = Util_NewBufferSize(bufsize); + bufsize = grow_buffer_size(bufsize); if (_PyBytes_Resize(&ret, bufsize) < 0) { BZ2_bzDecompressEnd(bzs); PyBuffer_Release(&pdata);