From f8f30fad4dfdad4a655942963c1e5fc3ae0d9cfb Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Sun, 6 Jul 2008 04:04:07 +0000 Subject: [PATCH] Backport r62627 + r62744 from trunk to fix issue 2632. - Issue #2632: Prevent socket.read(bignumber) from over allocating memory in the common case when the data is returned from the underlying socket in increments much smaller than bignumber. --- Lib/socket.py | 168 +++++++++++++++++++++++++--------------- Lib/test/test_socket.py | 27 +++++++ Misc/NEWS | 3 + 3 files changed, 136 insertions(+), 62 deletions(-) diff --git a/Lib/socket.py b/Lib/socket.py index f79b28268d2..7f5a91e22c4 100644 --- a/Lib/socket.py +++ b/Lib/socket.py @@ -55,6 +55,11 @@ except ImportError: import os, sys +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + try: from errno import EBADF except ImportError: @@ -211,6 +216,9 @@ class _fileobject(object): bufsize = self.default_bufsize self.bufsize = bufsize self.softspace = False + # _rbufsize is the suggested recv buffer size. It is *strictly* + # obeyed within readline() for recv calls. If it is larger than + # default_bufsize it will be used for recv calls within read(). if bufsize == 0: self._rbufsize = 1 elif bufsize == 1: @@ -218,7 +226,11 @@ class _fileobject(object): else: self._rbufsize = bufsize self._wbufsize = bufsize - self._rbuf = "" # A string + # We use StringIO for the read buffer to avoid holding a list + # of variously sized string objects which have been known to + # fragment the heap due to how they are malloc()ed and often + # realloc()ed down much smaller than their original allocation. + self._rbuf = StringIO() self._wbuf = [] # A list of strings self._close = close @@ -276,56 +288,85 @@ class _fileobject(object): return buf_len def read(self, size=-1): - data = self._rbuf + # Use max, disallow tiny reads in a loop as they are very inefficient. + # We never leave read() with any leftover data from a new recv() call + # in our internal buffer. + rbufsize = max(self._rbufsize, self.default_bufsize) + # Our use of StringIO rather than lists of string objects returned by + # recv() minimizes memory usage and fragmentation that occurs when + # rbufsize is large compared to the typical return value of recv(). + buf = self._rbuf + buf.seek(0, 2) # seek end if size < 0: # Read until EOF - buffers = [] - if data: - buffers.append(data) - self._rbuf = "" - if self._rbufsize <= 1: - recv_size = self.default_bufsize - else: - recv_size = self._rbufsize + self._rbuf = StringIO() # reset _rbuf. we consume it via buf. while True: - data = self._sock.recv(recv_size) + data = self._sock.recv(rbufsize) if not data: break - buffers.append(data) - return "".join(buffers) + buf.write(data) + return buf.getvalue() else: # Read until size bytes or EOF seen, whichever comes first - buf_len = len(data) + buf_len = buf.tell() if buf_len >= size: - self._rbuf = data[size:] - return data[:size] - buffers = [] - if data: - buffers.append(data) - self._rbuf = "" + # Already have size bytes in our buffer? Extract and return. + buf.seek(0) + rv = buf.read(size) + self._rbuf = StringIO() + self._rbuf.write(buf.read()) + return rv + + self._rbuf = StringIO() # reset _rbuf. we consume it via buf. while True: left = size - buf_len - recv_size = min(self._rbufsize, left) - data = self._sock.recv(recv_size) + # recv() will malloc the amount of memory given as its + # parameter even though it often returns much less data + # than that. The returned data string is short lived + # as we copy it into a StringIO and free it. This avoids + # fragmentation issues on many platforms. + data = self._sock.recv(left) if not data: break - buffers.append(data) n = len(data) - if n >= left: - self._rbuf = data[left:] - buffers[-1] = data[:left] + if n == size and not buf_len: + # Shortcut. Avoid buffer data copies when: + # - We have no data in our buffer. + # AND + # - Our call to recv returned exactly the + # number of bytes we were asked to read. + return data + if n == left: + buf.write(data) + del data # explicit free break + assert n <= left, "recv(%d) returned %d bytes" % (left, n) + buf.write(data) buf_len += n - return "".join(buffers) + del data # explicit free + #assert buf_len == buf.tell() + return buf.getvalue() def readline(self, size=-1): - data = self._rbuf + buf = self._rbuf + buf.seek(0, 2) # seek end + if buf.tell() > 0: + # check if we already have it in our buffer + buf.seek(0) + bline = buf.readline(size) + if bline.endswith('\n') or len(bline) == size: + self._rbuf = StringIO() + self._rbuf.write(buf.read()) + return bline + del bline if size < 0: # Read until \n or EOF, whichever comes first if self._rbufsize <= 1: # Speed up unbuffered case - assert data == "" - buffers = [] + buf.seek(0) + buffers = [buf.read()] + self._rbuf = StringIO() # reset _rbuf. we consume it via buf. + data = None recv = self._sock.recv while data != "\n": data = recv(1) @@ -333,61 +374,64 @@ class _fileobject(object): break buffers.append(data) return "".join(buffers) - nl = data.find('\n') - if nl >= 0: - nl += 1 - self._rbuf = data[nl:] - return data[:nl] - buffers = [] - if data: - buffers.append(data) - self._rbuf = "" + + buf.seek(0, 2) # seek end + self._rbuf = StringIO() # reset _rbuf. we consume it via buf. while True: data = self._sock.recv(self._rbufsize) if not data: break - buffers.append(data) nl = data.find('\n') if nl >= 0: nl += 1 - self._rbuf = data[nl:] - buffers[-1] = data[:nl] + buf.write(buffer(data, 0, nl)) + self._rbuf.write(buffer(data, nl)) + del data break - return "".join(buffers) + buf.write(data) + return buf.getvalue() else: # Read until size bytes or \n or EOF seen, whichever comes first - nl = data.find('\n', 0, size) - if nl >= 0: - nl += 1 - self._rbuf = data[nl:] - return data[:nl] - buf_len = len(data) + buf.seek(0, 2) # seek end + buf_len = buf.tell() if buf_len >= size: - self._rbuf = data[size:] - return data[:size] - buffers = [] - if data: - buffers.append(data) - self._rbuf = "" + buf.seek(0) + rv = buf.read(size) + self._rbuf = StringIO() + self._rbuf.write(buf.read()) + return rv + self._rbuf = StringIO() # reset _rbuf. we consume it via buf. while True: data = self._sock.recv(self._rbufsize) if not data: break - buffers.append(data) left = size - buf_len + # did we just receive a newline? nl = data.find('\n', 0, left) if nl >= 0: nl += 1 - self._rbuf = data[nl:] - buffers[-1] = data[:nl] - break + # save the excess data to _rbuf + self._rbuf.write(buffer(data, nl)) + if buf_len: + buf.write(buffer(data, 0, nl)) + break + else: + # Shortcut. Avoid data copy through buf when returning + # a substring of our first recv(). + return data[:nl] n = len(data) + if n == size and not buf_len: + # Shortcut. Avoid data copy through buf when + # returning exactly all of our first recv(). + return data if n >= left: - self._rbuf = data[left:] - buffers[-1] = data[:left] + buf.write(buffer(data, 0, left)) + self._rbuf.write(buffer(data, left)) break + buf.write(data) buf_len += n - return "".join(buffers) + #assert buf_len == buf.tell() + return buf.getvalue() def readlines(self, sizehint=0): total = 0 diff --git a/Lib/test/test_socket.py b/Lib/test/test_socket.py index b5215214d1b..f61aacc8711 100644 --- a/Lib/test/test_socket.py +++ b/Lib/test/test_socket.py @@ -762,6 +762,33 @@ class FileObjectClassTestCase(SocketConnectedTest): self.cli_file.write(MSG) self.cli_file.flush() + def testReadlineAfterRead(self): + a_baloo_is = self.serv_file.read(len("A baloo is")) + self.assertEqual("A baloo is", a_baloo_is) + _a_bear = self.serv_file.read(len(" a bear")) + self.assertEqual(" a bear", _a_bear) + line = self.serv_file.readline() + self.assertEqual("\n", line) + line = self.serv_file.readline() + self.assertEqual("A BALOO IS A BEAR.\n", line) + line = self.serv_file.readline() + self.assertEqual(MSG, line) + + def _testReadlineAfterRead(self): + self.cli_file.write("A baloo is a bear\n") + self.cli_file.write("A BALOO IS A BEAR.\n") + self.cli_file.write(MSG) + self.cli_file.flush() + + def testReadlineAfterReadNoNewline(self): + end_of_ = self.serv_file.read(len("End Of ")) + self.assertEqual("End Of ", end_of_) + line = self.serv_file.readline() + self.assertEqual("Line", line) + + def _testReadlineAfterReadNoNewline(self): + self.cli_file.write("End Of Line") + def testClosedAttr(self): self.assert_(not self.serv_file.closed) diff --git a/Misc/NEWS b/Misc/NEWS index c8fbedf8f2d..c5cf068ea5c 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -102,6 +102,9 @@ Library stdout and stderr fds rather than leaving them open until the instance is destroyed. +- Issue #2632: Prevent socket.read(bignumber) from over allocating memory + in the common case when the data is returned from the underlying socket + in increments much smaller than bignumber. Extension Modules -----------------