Skip to content

Commit c5f219a

Browse files
kshedden authored and jorisvandenbossche committed
BUG: SAS chunksize / iteration issues (pandas-dev#14743)
closes pandas-dev#14734 closes pandas-dev#13654
1 parent 837db72 commit c5f219a

File tree

5 files changed

+53
-7
lines changed

5 files changed

+53
-7
lines changed

doc/source/whatsnew/v0.19.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ Bug Fixes
3131
- Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`)
3232
- Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`)
3333
- Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when multi-char delimiters were not being respected with quotes (:issue:`14582`)
34+
- Fix bugs (:issue:`14734`, :issue:`13654`) in ``pd.read_sas`` and ``pandas.io.sas.sas7bdat.SAS7BDATReader`` that caused problems when reading a SAS file incrementally.
3435

3536

3637

doc/source/whatsnew/v0.20.0.txt

-1
Original file line numberDiff line numberDiff line change
@@ -96,4 +96,3 @@ Performance Improvements
9696

9797
Bug Fixes
9898
~~~~~~~~~
99-

pandas/io/sas/sas7bdat.py

+10
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,12 @@ def _get_properties(self):
225225
self.os_name = self.os_name.decode(
226226
self.encoding or self.default_encoding)
227227

228+
def __next__(self):
    """Return the next chunk of rows obtained from ``self.read``.

    Requests ``self.chunksize`` rows per call, falling back to a single
    row when ``chunksize`` is falsy (``None`` or ``0``).

    Raises
    ------
    StopIteration
        When ``self.read`` returns ``None``.
    """
    size = self.chunksize
    rows_wanted = size if size else 1
    chunk = self.read(nrows=rows_wanted)
    if chunk is not None:
        return chunk
    raise StopIteration
233+
228234
# Read a single float of the given width (4 or 8).
229235
def _read_float(self, offset, width):
230236
if width not in (4, 8):
@@ -591,6 +597,10 @@ def read(self, nrows=None):
591597
if self._current_row_in_file_index >= self.row_count:
592598
return None
593599

600+
m = self.row_count - self._current_row_in_file_index
601+
if nrows > m:
602+
nrows = m
603+
594604
nd = (self.column_types == b'd').sum()
595605
ns = (self.column_types == b's').sum()
596606

pandas/io/tests/sas/test_sas7bdat.py

+27-6
Original file line numberDiff line numberDiff line change
@@ -47,24 +47,45 @@ def test_from_buffer(self):
4747
with open(fname, 'rb') as f:
4848
byts = f.read()
4949
buf = io.BytesIO(byts)
50-
df = pd.read_sas(buf, format="sas7bdat", encoding='utf-8')
50+
rdr = pd.read_sas(buf, format="sas7bdat",
51+
iterator=True, encoding='utf-8')
52+
df = rdr.read()
5153
tm.assert_frame_equal(df, df0, check_exact=False)
5254

5355
def test_from_iterator(self):
5456
for j in 0, 1:
5557
df0 = self.data[j]
5658
for k in self.test_ix[j]:
5759
fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
58-
with open(fname, 'rb') as f:
59-
byts = f.read()
60-
buf = io.BytesIO(byts)
61-
rdr = pd.read_sas(buf, format="sas7bdat",
62-
iterator=True, encoding='utf-8')
60+
rdr = pd.read_sas(fname, iterator=True, encoding='utf-8')
6361
df = rdr.read(2)
6462
tm.assert_frame_equal(df, df0.iloc[0:2, :])
6563
df = rdr.read(3)
6664
tm.assert_frame_equal(df, df0.iloc[2:5, :])
6765

66+
def test_iterator_loop(self):
    """Iterate over every SAS7BDAT fixture in chunks and count the rows.

    For each fixture file and each chunk size, the reader returned by
    ``pd.read_sas`` is consumed with a ``for`` loop and the summed chunk
    lengths are compared with the reader's ``row_count``.
    """
    # github #13654
    for j in 0, 1:
        for k in self.test_ix[j]:
            fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
            for chunksize in 3, 5, 10, 11:
                # Pass the loop variable through: with a hard-coded
                # chunksize the other sizes were never exercised.
                rdr = pd.read_sas(fname, chunksize=chunksize,
                                  encoding='utf-8')
                y = 0
                for x in rdr:
                    y += x.shape[0]
                self.assertTrue(y == rdr.row_count)
77+
78+
def test_iterator_read_too_much(self):
79+
# github #14734
80+
k = self.test_ix[0][0]
81+
fname = os.path.join(self.dirpath, "test%d.sas7bdat" % k)
82+
rdr = pd.read_sas(fname, format="sas7bdat",
83+
iterator=True, encoding='utf-8')
84+
d1 = rdr.read(rdr.row_count + 20)
85+
rdr = pd.read_sas(fname, iterator=True, encoding="utf-8")
86+
d2 = rdr.read(rdr.row_count + 20)
87+
tm.assert_frame_equal(d1, d2)
88+
6889

6990
def test_encoding_options():
7091
dirpath = tm.get_data_path()

pandas/io/tests/sas/test_xport.py

+15
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ def test1_basic(self):
3535
# Read full file
3636
data = read_sas(self.file01, format="xport")
3737
tm.assert_frame_equal(data, data_csv)
38+
num_rows = data.shape[0]
39+
40+
# Test reading beyond end of file
41+
reader = read_sas(self.file01, format="xport", iterator=True)
42+
data = reader.read(num_rows + 100)
43+
self.assertTrue(data.shape[0] == num_rows)
44+
reader.close()
3845

3946
# Test incremental read with `read` method.
4047
reader = read_sas(self.file01, format="xport", iterator=True)
@@ -48,6 +55,14 @@ def test1_basic(self):
4855
reader.close()
4956
tm.assert_frame_equal(data, data_csv.iloc[0:10, :])
5057

58+
# Test read in loop
59+
m = 0
60+
reader = read_sas(self.file01, format="xport", chunksize=100)
61+
for x in reader:
62+
m += x.shape[0]
63+
reader.close()
64+
self.assertTrue(m == num_rows)
65+
5166
# Read full file with `read_sas` method
5267
data = read_sas(self.file01)
5368
tm.assert_frame_equal(data, data_csv)

0 commit comments

Comments
 (0)