BUG: Correct behavior when reading empty dta files

bashtage · bashtage · commit 2dcaf80d218a · 2023-05-15T10:08:21.000+01:00
Correct column selection when reading empty dta files Correct omitted dtype information in empty dta files closes pandas-dev#46240
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -377,6 +377,7 @@ Missing
 MultiIndex
 ^^^^^^^^^^
 - Bug in :meth:`MultiIndex.set_levels` not preserving dtypes for :class:`Categorical` (:issue:`52125`)
+- Bug in displaying a :class:`MultiIndex` with a long element (:issue:`52960`)
 
 I/O
 ^^^
@@ -386,7 +387,7 @@ I/O
 - Bug in :func:`read_hdf` not properly closing store after a ``IndexError`` is raised (:issue:`52781`)
 - Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`)
 - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`)
-- Bug in displaying a :class:`MultiIndex` with a long element (:issue:`52960`)
+- Bug when reading empty Stata dta files (:issue:`46240`)
 
 Period
 ^^^^^^
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -1700,13 +1700,6 @@ def read(
         order_categoricals: bool | None = None,
     ) -> DataFrame:
         self._ensure_open()
-        # Handle empty file or chunk.  If reading incrementally raise
-        # StopIteration.  If reading the whole thing return an empty
-        # data frame.
-        if (self._nobs == 0) and (nrows is None):
-            self._can_read_value_labels = True
-            self._data_read = True
-            return DataFrame(columns=self._varlist)
 
         # Handle options
         if convert_dates is None:
@@ -1723,10 +1716,27 @@ def read(
             order_categoricals = self._order_categoricals
         if index_col is None:
             index_col = self._index_col
-
         if nrows is None:
             nrows = self._nobs
 
+        # Handle empty file or chunk.  If reading incrementally raise
+        # StopIteration.  If reading the whole thing return an empty
+        # data frame.
+        if (self._nobs == 0) and nrows == 0:
+            self._can_read_value_labels = True
+            self._data_read = True
+            data = DataFrame(columns=self._varlist)
+            # Apply dtypes correctly
+            for i, col in enumerate(data.columns):
+                if (
+                    isinstance(self._dtyplist[i], np.dtype)
+                    and self._dtyplist[i].char != "S"
+                ):
+                    data[col] = data[col].astype(self._dtyplist[i])
+            if columns is not None:
+                data = self._do_select_columns(data, columns)
+            return data
+
         if (self._format_version >= 117) and (not self._value_labels_read):
             self._can_read_value_labels = True
             self._read_strls()
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -2274,3 +2274,21 @@ def test_nullable_support(dtype, version):
         tm.assert_series_equal(df.a, reread.a)
         tm.assert_series_equal(reread.b, expected_b)
         tm.assert_series_equal(reread.c, expected_c)
+
+
+def test_empty_frame():
+    # GH 46240
+    # create an empty DataFrame with int64 and float64 dtypes
+    df = DataFrame(data={"a": range(3), "b": [1.0, 2.0, 3.0]}).head(0)
+    with tm.ensure_clean() as path:
+        df.to_stata(path, write_index=False, version=117)
+        # Read entire dataframe
+        df2 = read_stata(path)
+        assert "b" in df2
+        # Dtypes don't match since empty dataframe get written as float64
+        dtypes = Series({"a": np.dtype("float64"), "b": np.dtype("float64")})
+        tm.assert_series_equal(df2.dtypes, dtypes)
+        # read one column of empty .dta file
+        df3 = read_stata(path, columns=["a"])
+        assert "b" not in df3
+        tm.assert_series_equal(df3.dtypes, dtypes.loc[["a"]])