Skip to content

Commit 2dcaf80

Browse files
committed
BUG: Correct behavior when reading empty dta files
Correct column selection when reading empty dta files. Correct omitted dtype information in empty dta files. Closes pandas-dev#46240.
1 parent 3cfd868 commit 2dcaf80

File tree

3 files changed

+38
-9
lines changed

3 files changed

+38
-9
lines changed

doc/source/whatsnew/v2.1.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,7 @@ Missing
377377
MultiIndex
378378
^^^^^^^^^^
379379
- Bug in :meth:`MultiIndex.set_levels` not preserving dtypes for :class:`Categorical` (:issue:`52125`)
380+
- Bug in displaying a :class:`MultiIndex` with a long element (:issue:`52960`)
380381

381382
I/O
382383
^^^
@@ -386,7 +387,7 @@ I/O
386387
- Bug in :func:`read_hdf` not properly closing store after a ``IndexError`` is raised (:issue:`52781`)
387388
- Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
388389
- Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
389-
- Bug in displaying a :class:`MultiIndex` with a long element (:issue:`52960`)
390+
- Bug when reading empty Stata dta files (:issue:`46240`)
390391

391392
Period
392393
^^^^^^

pandas/io/stata.py

+18-8
Original file line numberDiff line numberDiff line change
@@ -1700,13 +1700,6 @@ def read(
17001700
order_categoricals: bool | None = None,
17011701
) -> DataFrame:
17021702
self._ensure_open()
1703-
# Handle empty file or chunk. If reading incrementally raise
1704-
# StopIteration. If reading the whole thing return an empty
1705-
# data frame.
1706-
if (self._nobs == 0) and (nrows is None):
1707-
self._can_read_value_labels = True
1708-
self._data_read = True
1709-
return DataFrame(columns=self._varlist)
17101703

17111704
# Handle options
17121705
if convert_dates is None:
@@ -1723,10 +1716,27 @@ def read(
17231716
order_categoricals = self._order_categoricals
17241717
if index_col is None:
17251718
index_col = self._index_col
1726-
17271719
if nrows is None:
17281720
nrows = self._nobs
17291721

1722+
# Handle empty file or chunk. If reading incrementally raise
1723+
# StopIteration. If reading the whole thing return an empty
1724+
# data frame.
1725+
if (self._nobs == 0) and nrows == 0:
1726+
self._can_read_value_labels = True
1727+
self._data_read = True
1728+
data = DataFrame(columns=self._varlist)
1729+
# Apply dtypes correctly
1730+
for i, col in enumerate(data.columns):
1731+
if (
1732+
isinstance(self._dtyplist[i], np.dtype)
1733+
and self._dtyplist[i].char != "S"
1734+
):
1735+
data[col] = data[col].astype(self._dtyplist[i])
1736+
if columns is not None:
1737+
data = self._do_select_columns(data, columns)
1738+
return data
1739+
17301740
if (self._format_version >= 117) and (not self._value_labels_read):
17311741
self._can_read_value_labels = True
17321742
self._read_strls()

pandas/tests/io/test_stata.py

+18
Original file line numberDiff line numberDiff line change
@@ -2274,3 +2274,21 @@ def test_nullable_support(dtype, version):
22742274
tm.assert_series_equal(df.a, reread.a)
22752275
tm.assert_series_equal(reread.b, expected_b)
22762276
tm.assert_series_equal(reread.c, expected_c)
2277+
2278+
2279+
def test_empty_frame():
2280+
# GH 46240
2281+
# create an empty DataFrame with int64 and float64 dtypes
2282+
df = DataFrame(data={"a": range(3), "b": [1.0, 2.0, 3.0]}).head(0)
2283+
with tm.ensure_clean() as path:
2284+
df.to_stata(path, write_index=False, version=117)
2285+
# Read entire dataframe
2286+
df2 = read_stata(path)
2287+
assert "b" in df2
2288+
# Dtypes don't match since an empty DataFrame gets written as float64
2289+
dtypes = Series({"a": np.dtype("float64"), "b": np.dtype("float64")})
2290+
tm.assert_series_equal(df2.dtypes, dtypes)
2291+
# read one column of empty .dta file
2292+
df3 = read_stata(path, columns=["a"])
2293+
assert "b" not in df3
2294+
tm.assert_series_equal(df3.dtypes, dtypes.loc[["a"]])

0 commit comments

Comments
 (0)