ENH: Add support for nullable boolean and integers in Stata writers

bashtage · bashtage · commit 680ef1c5bdc0 · 2021-07-19T09:02:27.000+01:00
Add code that allows nullable arrays to be written closes #40855
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -31,10 +31,10 @@ Other enhancements
 ^^^^^^^^^^^^^^^^^^
 - Add support for assigning values to ``by`` argument in :meth:`DataFrame.plot.hist` and :meth:`DataFrame.plot.box` (:issue:`15079`)
 - :meth:`Series.sample`, :meth:`DataFrame.sample`, and :meth:`.GroupBy.sample` now accept a ``np.random.Generator`` as input to ``random_state``. A generator will be more performant, especially with ``replace=False`` (:issue:`38100`)
--  Additional options added to :meth:`.Styler.bar` to control alignment and display, with keyword only arguments (:issue:`26070`, :issue:`36419`)
+- Additional options added to :meth:`.Styler.bar` to control alignment and display, with keyword only arguments (:issue:`26070`, :issue:`36419`)
 - :meth:`Styler.bar` now validates the input argument ``width`` and ``height`` (:issue:`42511`)
 - :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`)
--
+- Added support for nullable boolean and integer types in :meth:`DataFrame.to_stata`, :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`40855`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -59,6 +59,8 @@
     to_timedelta,
 )
 from pandas.core import generic
+from pandas.core.arrays.boolean import BooleanDtype
+from pandas.core.arrays.integer import _IntegerDtype
 from pandas.core.frame import DataFrame
 from pandas.core.indexes.base import Index
 from pandas.core.series import Series
@@ -569,14 +571,24 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
         (np.uint8, np.int8, np.int16),
         (np.uint16, np.int16, np.int32),
         (np.uint32, np.int32, np.int64),
+        (np.uint64, np.int64, np.float64),
     )
 
     float32_max = struct.unpack("<f", b"\xff\xff\xff\x7e")[0]
     float64_max = struct.unpack("<d", b"\xff\xff\xff\xff\xff\xff\xdf\x7f")[0]
 
     for col in data:
-        dtype = data[col].dtype
         # Cast from unsupported types to supported types
+        is_nullable_int = isinstance(data[col].dtype, (_IntegerDtype, BooleanDtype))
+        orig = data[col]
+        if is_nullable_int:
+            missing_loc = data[col].isna()
+            if missing_loc.any():
+                # Replace with always safe value
+                data.loc[missing_loc, col] = 0
+            # Replace with NumPy-compatible column
+            data[col] = data[col].astype(data[col].dtype.numpy_dtype)
+        dtype = data[col].dtype
         for c_data in conversion_data:
             if dtype == c_data[0]:
                 if data[col].max() <= np.iinfo(c_data[1]).max:
@@ -618,7 +630,12 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
                         f"Column {col} has a maximum value ({value}) outside the range "
                         f"supported by Stata ({float64_max})"
                     )
-
+        if is_nullable_int:
+            missing = orig.isna()
+            if missing.any():
+                # Replace missing by Stata sentinel value
+                sentinel = StataMissingValue.BASE_MISSING_VALUES[data[col].dtype.name]
+                data.loc[missing, col] = sentinel
     if ws:
         warnings.warn(ws, PossiblePrecisionLoss)
 
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -2047,3 +2047,45 @@ def test_stata_compression(compression_only, read_infer, to_infer):
         df.to_stata(path, compression=to_compression)
         result = read_stata(path, compression=read_compression, index_col="index")
         tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        pd.BooleanDtype,
+        pd.Int8Dtype,
+        pd.Int16Dtype,
+        pd.Int32Dtype,
+        pd.Int64Dtype,
+        pd.UInt8Dtype,
+        pd.UInt16Dtype,
+        pd.UInt32Dtype,
+        pd.UInt64Dtype,
+    ],
+)
+def test_nullable_support(dtype, version):
+    df = DataFrame(
+        {
+            "a": Series([1.0, 2.0, 3.0]),
+            "b": Series([1, pd.NA, pd.NA], dtype=dtype.name),
+            "c": Series(["a", "b", None]),
+        }
+    )
+    dtype_name = df.b.dtype.numpy_dtype.name
+    # Only use supported names: no uint, bool or int64
+    dtype_name = dtype_name.replace("u", "")
+    if dtype_name == "int64":
+        dtype_name = "int32"
+    elif dtype_name == "bool":
+        dtype_name = "int8"
+    value = StataMissingValue.BASE_MISSING_VALUES[dtype_name]
+    smv = StataMissingValue(value)
+    expected_b = Series([1, smv, smv], dtype=object, name="b")
+    expected_c = Series(["a", "b", ""], name="c")
+    with tm.ensure_clean() as path:
+        df.to_stata(path, write_index=False, version=version)
+        reread = read_stata(path, convert_missing=True)
+        tm.assert_series_equal(df.a, reread.a)
+        tm.assert_series_equal(reread.b, expected_b)
+        tm.assert_series_equal(reread.c, expected_c)