Skip to content

Commit d991eb2

Browse files
committed
BUG: Correct behavior when reading empty dta files
Correct column selection when reading empty dta files. Correct omitted dtype information in empty dta files. Closes pandas-dev#46240.
1 parent 3cfd868 commit d991eb2

File tree

3 files changed

+80
-13
lines changed

3 files changed

+80
-13
lines changed

doc/source/whatsnew/v2.1.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,7 @@ Missing
377377
MultiIndex
378378
^^^^^^^^^^
379379
- Bug in :meth:`MultiIndex.set_levels` not preserving dtypes for :class:`Categorical` (:issue:`52125`)
380+
- Bug in displaying a :class:`MultiIndex` with a long element (:issue:`52960`)
380381

381382
I/O
382383
^^^
@@ -386,7 +387,7 @@ I/O
386387
- Bug in :func:`read_hdf` not properly closing store after a ``IndexError`` is raised (:issue:`52781`)
387388
- Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
388389
- Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
389-
- Bug in displaying a :class:`MultiIndex` with a long element (:issue:`52960`)
390+
- Bug when writing and reading empty Stata dta files where dtype information was lost (:issue:`46240`)
390391

391392
Period
392393
^^^^^^

pandas/io/stata.py

+25-12
Original file line numberDiff line numberDiff line change
@@ -608,9 +608,10 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
608608
# Replace with NumPy-compatible column
609609
data[col] = data[col].astype(data[col].dtype.numpy_dtype)
610610
dtype = data[col].dtype
611+
empty_df = data.shape[0] == 0
611612
for c_data in conversion_data:
612613
if dtype == c_data[0]:
613-
if data[col].max() <= np.iinfo(c_data[1]).max:
614+
if empty_df or data[col].max() <= np.iinfo(c_data[1]).max:
614615
dtype = c_data[1]
615616
else:
616617
dtype = c_data[2]
@@ -621,14 +622,17 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
621622
data[col] = data[col].astype(dtype)
622623

623624
# Check values and upcast if necessary
624-
if dtype == np.int8:
625+
626+
if dtype == np.int8 and not empty_df:
625627
if data[col].max() > 100 or data[col].min() < -127:
626628
data[col] = data[col].astype(np.int16)
627-
elif dtype == np.int16:
629+
elif dtype == np.int16 and not empty_df:
628630
if data[col].max() > 32740 or data[col].min() < -32767:
629631
data[col] = data[col].astype(np.int32)
630632
elif dtype == np.int64:
631-
if data[col].max() <= 2147483620 and data[col].min() >= -2147483647:
633+
if empty_df or (
634+
data[col].max() <= 2147483620 and data[col].min() >= -2147483647
635+
):
632636
data[col] = data[col].astype(np.int32)
633637
else:
634638
data[col] = data[col].astype(np.float64)
@@ -1700,13 +1704,6 @@ def read(
17001704
order_categoricals: bool | None = None,
17011705
) -> DataFrame:
17021706
self._ensure_open()
1703-
# Handle empty file or chunk. If reading incrementally raise
1704-
# StopIteration. If reading the whole thing return an empty
1705-
# data frame.
1706-
if (self._nobs == 0) and (nrows is None):
1707-
self._can_read_value_labels = True
1708-
self._data_read = True
1709-
return DataFrame(columns=self._varlist)
17101707

17111708
# Handle options
17121709
if convert_dates is None:
@@ -1723,10 +1720,26 @@ def read(
17231720
order_categoricals = self._order_categoricals
17241721
if index_col is None:
17251722
index_col = self._index_col
1726-
17271723
if nrows is None:
17281724
nrows = self._nobs
17291725

1726+
# Handle empty file or chunk. If reading incrementally raise
1727+
# StopIteration. If reading the whole thing return an empty
1728+
# data frame.
1729+
if (self._nobs == 0) and nrows == 0:
1730+
self._can_read_value_labels = True
1731+
self._data_read = True
1732+
data = DataFrame(columns=self._varlist)
1733+
# Apply dtypes correctly
1734+
for i, col in enumerate(data.columns):
1735+
dt = self._dtyplist[i]
1736+
if isinstance(dt, np.dtype):
1737+
if dt.char != "S":
1738+
data[col] = data[col].astype(dt)
1739+
if columns is not None:
1740+
data = self._do_select_columns(data, columns)
1741+
return data
1742+
17301743
if (self._format_version >= 117) and (not self._value_labels_read):
17311744
self._can_read_value_labels = True
17321745
self._read_strls()

pandas/tests/io/test_stata.py

+53
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,41 @@ def test_read_empty_dta(self, version):
7171
empty_ds2 = read_stata(path)
7272
tm.assert_frame_equal(empty_ds, empty_ds2)
7373

74+
@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
75+
def test_read_empty_dta_with_dtypes(self, version):
76+
# GH 46240
77+
# Fixing above bug revealed that types are not correctly preserved when
78+
# writing empty DataFrames
79+
empty_df_typed = DataFrame(
80+
{
81+
"i8": np.array([0], dtype=np.int8),
82+
"i16": np.array([0], dtype=np.int16),
83+
"i32": np.array([0], dtype=np.int32),
84+
"i64": np.array([0], dtype=np.int64),
85+
"u8": np.array([0], dtype=np.uint8),
86+
"u16": np.array([0], dtype=np.uint16),
87+
"u32": np.array([0], dtype=np.uint32),
88+
"u64": np.array([0], dtype=np.uint64),
89+
"f32": np.array([0], dtype=np.float32),
90+
"f64": np.array([0], dtype=np.float64),
91+
}
92+
)
93+
expected = empty_df_typed.copy()
94+
# No uint# support. Downcast since values in range for int#
95+
expected["u8"] = expected["u8"].astype(np.int8)
96+
expected["u16"] = expected["u16"].astype(np.int16)
97+
expected["u32"] = expected["u32"].astype(np.int32)
98+
# No int64 supported at all. Downcast since values in range for int32
99+
expected["u64"] = expected["u64"].astype(np.int32)
100+
expected["i64"] = expected["i64"].astype(np.int32)
101+
102+
# GH 7369, make sure can read a 0-obs dta file
103+
with tm.ensure_clean() as path:
104+
empty_df_typed.to_stata(path, write_index=False, version=version)
105+
empty_reread = read_stata(path)
106+
tm.assert_frame_equal(expected, empty_reread)
107+
tm.assert_series_equal(expected.dtypes, empty_reread.dtypes)
108+
74109
@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
75110
def test_read_index_col_none(self, version):
76111
df = DataFrame({"a": range(5), "b": ["b1", "b2", "b3", "b4", "b5"]})
@@ -2274,3 +2309,21 @@ def test_nullable_support(dtype, version):
22742309
tm.assert_series_equal(df.a, reread.a)
22752310
tm.assert_series_equal(reread.b, expected_b)
22762311
tm.assert_series_equal(reread.c, expected_c)
2312+
2313+
2314+
def test_empty_frame():
2315+
# GH 46240
2316+
# create an empty DataFrame with int64 and float64 dtypes
2317+
df = DataFrame(data={"a": range(3), "b": [1.0, 2.0, 3.0]}).head(0)
2318+
with tm.ensure_clean() as path:
2319+
df.to_stata(path, write_index=False, version=117)
2320+
# Read entire dataframe
2321+
df2 = read_stata(path)
2322+
assert "b" in df2
2323+
# Dtypes don't match since no support for int64 (column "a" is downcast to int32)
2324+
dtypes = Series({"a": np.dtype("int32"), "b": np.dtype("float64")})
2325+
tm.assert_series_equal(df2.dtypes, dtypes)
2326+
# read one column of empty .dta file
2327+
df3 = read_stata(path, columns=["a"])
2328+
assert "b" not in df3
2329+
tm.assert_series_equal(df3.dtypes, dtypes.loc[["a"]])

0 commit comments

Comments
 (0)