diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt
index d9aa92270669d..a5fca8f268d9c 100644
--- a/doc/source/whatsnew/v0.19.2.txt
+++ b/doc/source/whatsnew/v0.19.2.txt
@@ -31,6 +31,7 @@ Bug Fixes
 - Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`)
 - Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`)
 - Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when multi-char delimiters were not being respected with quotes (:issue:`14582`)
+- Bug in ``pd.read_sas`` and ``pandas.io.sas.sas7bdat.SAS7BDATReader`` where incremental reads failed when iterating with ``chunksize`` or when requesting more rows than the file contains (:issue:`14734`, :issue:`13654`)
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 65b62601c7022..03e0cae6cc83f 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -83,4 +83,3 @@ Performance Improvements
 
 Bug Fixes
 ~~~~~~~~~
-
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
index 2a82fd7a53222..91f417abc0502 100644
--- a/pandas/io/sas/sas7bdat.py
+++ b/pandas/io/sas/sas7bdat.py
@@ -225,6 +225,12 @@ def _get_properties(self):
             self.os_name = self.os_name.decode(
                 self.encoding or self.default_encoding)
 
+    def __next__(self):
+        da = self.read(nrows=self.chunksize or 1)
+        if da is None:
+            raise StopIteration
+        return da
+
     # Read a single float of the given width (4 or 8).
     def _read_float(self, offset, width):
         if width not in (4, 8):
@@ -591,6 +597,10 @@ def read(self, nrows=None):
         if self._current_row_in_file_index >= self.row_count:
             return None
 
+        m = self.row_count - self._current_row_in_file_index
+        if nrows > m:
+            nrows = m
+
         nd = (self.column_types == b'd').sum()
         ns = (self.column_types == b's').sum()
 
diff --git a/pandas/io/tests/sas/test_sas7bdat.py b/pandas/io/tests/sas/test_sas7bdat.py
index 06eb9774679b1..e20ea48247119 100644
--- a/pandas/io/tests/sas/test_sas7bdat.py
+++ b/pandas/io/tests/sas/test_sas7bdat.py
@@ -47,7 +47,9 @@ def test_from_buffer(self):
                 with open(fname, 'rb') as f:
                     byts = f.read()
                 buf = io.BytesIO(byts)
-                df = pd.read_sas(buf, format="sas7bdat", encoding='utf-8')
+                rdr = pd.read_sas(buf, format="sas7bdat",
+                                  iterator=True, encoding='utf-8')
+                df = rdr.read()
                 tm.assert_frame_equal(df, df0, check_exact=False)
 
     def test_from_iterator(self):
@@ -55,16 +57,36 @@ def test_from_iterator(self):
             df0 = self.data[j]
             for k in self.test_ix[j]:
                 fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
-                with open(fname, 'rb') as f:
-                    byts = f.read()
-                buf = io.BytesIO(byts)
-                rdr = pd.read_sas(buf, format="sas7bdat",
-                                  iterator=True, encoding='utf-8')
+                rdr = pd.read_sas(fname, iterator=True, encoding='utf-8')
                 df = rdr.read(2)
                 tm.assert_frame_equal(df, df0.iloc[0:2, :])
                 df = rdr.read(3)
                 tm.assert_frame_equal(df, df0.iloc[2:5, :])
 
+    def test_iterator_loop(self):
+        # github #13654
+        for j in 0, 1:
+            for k in self.test_ix[j]:
+                for chunksize in 3, 5, 10, 11:
+                    fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
+                    rdr = pd.read_sas(fname, chunksize=chunksize,
+                                      encoding='utf-8')
+                    y = 0
+                    for x in rdr:
+                        y += x.shape[0]
+                    self.assertTrue(y == rdr.row_count)
+
+    def test_iterator_read_too_much(self):
+        # github #14734
+        k = self.test_ix[0][0]
+        fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
+        rdr = pd.read_sas(fname, format="sas7bdat",
+                          iterator=True, encoding='utf-8')
+        d1 = rdr.read(rdr.row_count + 20)
+        rdr = pd.read_sas(fname, iterator=True, encoding="utf-8")
+        d2 = rdr.read(rdr.row_count + 20)
+        tm.assert_frame_equal(d1, d2)
+
 
 def test_encoding_options():
     dirpath = tm.get_data_path()
diff --git a/pandas/io/tests/sas/test_xport.py b/pandas/io/tests/sas/test_xport.py
index d0627a80f9604..fe2f7cb4bf4be 100644
--- a/pandas/io/tests/sas/test_xport.py
+++ b/pandas/io/tests/sas/test_xport.py
@@ -35,6 +35,13 @@ def test1_basic(self):
         # Read full file
         data = read_sas(self.file01, format="xport")
         tm.assert_frame_equal(data, data_csv)
+        num_rows = data.shape[0]
+
+        # Test reading beyond end of file
+        reader = read_sas(self.file01, format="xport", iterator=True)
+        data = reader.read(num_rows + 100)
+        self.assertTrue(data.shape[0] == num_rows)
+        reader.close()
 
         # Test incremental read with `read` method.
         reader = read_sas(self.file01, format="xport", iterator=True)
@@ -48,6 +55,14 @@ def test1_basic(self):
         reader.close()
         tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
 
+        # Test read in loop
+        m = 0
+        reader = read_sas(self.file01, format="xport", chunksize=100)
+        for x in reader:
+            m += x.shape[0]
+        reader.close()
+        self.assertTrue(m == num_rows)
+
         # Read full file with `read_sas` method
         data = read_sas(self.file01)
         tm.assert_frame_equal(data, data_csv)
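
For reference, a minimal usage sketch of the incremental-read behavior exercised by the tests above. This is not part of the patch, and ``example.sas7bdat`` is a hypothetical file path::

    import pandas as pd

    # Iterating with chunksize yields DataFrames of up to chunksize rows
    # until the file is exhausted (github #13654).
    rdr = pd.read_sas("example.sas7bdat", format="sas7bdat",
                      chunksize=10, encoding="utf-8")
    total = 0
    for chunk in rdr:
        total += chunk.shape[0]
    assert total == rdr.row_count

    # Requesting more rows than remain is clamped to the rows actually
    # left in the file (github #14734).
    rdr = pd.read_sas("example.sas7bdat", format="sas7bdat",
                      iterator=True, encoding="utf-8")
    df = rdr.read(rdr.row_count + 20)
    assert df.shape[0] == rdr.row_count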