Replace stringlike_as_fixed_length with dtype parameters in to_records

gfyoung · gfyoung · commit 83a30c9048c2 · 2018-12-30T00:58:09.000Z
The original `stringlike_as_fixed_length` parameter required a lot of
acrobatics to handle string dtypes across Python 2.x and 3.x.

The new `column_dtypes` and `index_dtypes` parameters simplify the
internal logic and leave the choice of memory-efficient dtypes
to the user.
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -411,7 +411,7 @@ Other Enhancements
 - :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed sting columns to Stata strl format (:issue:`23633`)
 - :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the ``axis`` parameter (:issue:`8839`)
 - The ``scatter_matrix``, ``andrews_curves``, ``parallel_coordinates``, ``lag_plot``, ``autocorrelation_plot``, ``bootstrap_plot``, and ``radviz`` plots from the ``pandas.plotting`` module are now accessible from calling :meth:`DataFrame.plot` (:issue:`11978`)
-- :meth:`DataFrame.to_records` now accepts a ``stringlike_as_fixed_length`` parameter to efficiently store string-likes as fixed-length string-like dtypes (e.g. ``S1``) instead of object dtype (``O``)  (:issue:`18146`)
+- :meth:`DataFrame.to_records` now accepts ``index_dtypes`` and ``column_dtypes`` parameters to specify the data types used to store the index levels and columns in the resulting record array (:issue:`18146`)
 - :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`)
 - :func:`pandas.DataFrame.to_sql` has gained the ``method`` argument to control SQL insertion clause. See the :ref:`insertion method <io.sql.method>` section in the documentation. (:issue:`8953`)
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -35,7 +35,6 @@
                            OrderedDict, PY36, raise_with_traceback,
                            string_and_binary_types)
 from pandas.compat.numpy import function as nv
-from pandas.api.types import infer_dtype
 from pandas.core.dtypes.cast import (
     maybe_upcast,
     cast_scalar_to_array,
@@ -1541,7 +1540,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
         return cls(mgr)
 
     def to_records(self, index=True, convert_datetime64=None,
-                   stringlike_as_fixed_length=False):
+                   column_dtypes=None, index_dtypes=None):
         """
         Convert DataFrame to a NumPy record array.
 
@@ -1558,11 +1557,20 @@ def to_records(self, index=True, convert_datetime64=None,
 
             Whether to convert the index to datetime.datetime if it is a
             DatetimeIndex.
-         stringlike_as_fixed_length : bool, default False
-             .. versionadded:: 0.24.0
+        column_dtypes : str, type, dict, default None
+            .. versionadded:: 0.24.0
+
+            If a string or type, the data type used to store all columns.
+            If a dictionary, a mapping of column names or positions
+            (zero-indexed) to the data type used for that column.
+        index_dtypes : str, type, dict, default None
+            .. versionadded:: 0.24.0
 
-             Store string-likes as fixed-length string-like dtypes
-             (e.g. ``S1`` dtype) instead of Python objects (``O`` dtype).
+            If a string or type, the data type used to store all index
+            levels. If a dictionary, a mapping of index level names or
+            positions (zero-indexed) to the data type used for that level.
+
+            This mapping is applied only if `index=True`.
 
         Returns
         -------
@@ -1605,26 +1613,22 @@ def to_records(self, index=True, convert_datetime64=None,
         rec.array([(1, 0.5 ), (2, 0.75)],
                   dtype=[('A', '<i8'), ('B', '<f8')])
 
-         By default, strings are recorded as dtype 'O' for object:
+        Data types can be specified for the columns:
 
-         >>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
-         ...                   index=['a', 'b'])
-         >>> df.to_records()
-         rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
-                   dtype=[('index', 'O'), ('A', '<i8'), ('B', 'O')])
+        >>> df.to_records(column_dtypes={"A": "int32"})
+        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+                  dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
 
-         This can be inefficient (e.g. for short strings, or when storing with
-         `np.save()`). They can be recorded as fix-length string-like dtypes
-         such as 'S1' for zero-terminated bytes instead:
+        As well as for the index:
 
-         >>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
-         ...                   index=['a', 'b'])
-         >>> df.to_records(stringlike_as_fixed_length=True)
-         rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
-                   dtype=[('index', '<U1'), ('A', '<i8'), ('B', '<U4')])
+        >>> df.to_records(index_dtypes="<S2")
+        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+                  dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
 
-        Notice how the 'B' column is now stored as '<U4' for length-four
-        strings ('S4' for Python 2.x) instead of the 'O' object dtype.
+        >>> index_dtypes = "<S{}".format(df.index.str.len().max())
+        >>> df.to_records(index_dtypes=index_dtypes)
+        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+                  dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
         """
 
         if convert_datetime64 is not None:
@@ -1647,59 +1651,54 @@ def to_records(self, index=True, convert_datetime64=None,
 
             count = 0
             index_names = list(self.index.names)
+
             if isinstance(self.index, MultiIndex):
                 for i, n in enumerate(index_names):
                     if n is None:
                         index_names[i] = 'level_%d' % count
                         count += 1
             elif index_names[0] is None:
                 index_names = ['index']
+
             names = (lmap(compat.text_type, index_names) +
                      lmap(compat.text_type, self.columns))
         else:
             arrays = [self[c].get_values() for c in self.columns]
             names = lmap(compat.text_type, self.columns)
+            index_names = []
 
+        index_len = len(index_names)
         formats = []
 
-        for v in arrays:
-            if not stringlike_as_fixed_length:
-                formats.append(v.dtype)
+        for i, v in enumerate(arrays):
+            index = i
+
+            if index < index_len:
+                dtype_mapping = index_dtypes
+                name = index_names[index]
             else:
-                # gh-18146
-                #
-                # For string-like arrays, set dtype as zero-terminated bytes
-                # with max length equal to that of the longest string-like.
-                dtype = infer_dtype(v)
-                symbol = None
-
-                if dtype == "string":
-                    # In Python 3.x, infer_dtype does not
-                    # differentiate string from unicode
-                    # like NumPy arrays do, so we
-                    # specify unicode to be safe.
-                    symbol = "S" if compat.PY2 else "U"
-                elif dtype == "unicode":
-                    # In Python 3.x, infer_dtype does not
-                    # differentiate string from unicode.
-                    #
-                    # Thus, we can only get this result
-                    # in Python 2.x.
-                    symbol = "U"
-                elif dtype == "bytes":
-                    # In Python 2.x, infer_dtype does not
-                    # differentiate string from bytes.
-                    #
-                    # Thus, we can only get this result
-                    # in Python 3.x. However, NumPy does
-                    # not have a fixed-length bytes dtype
-                    # and just uses string instead.
-                    symbol = "S"
-
-                if symbol is not None:
-                    formats.append("{}{}".format(symbol, max(map(len, v))))
+                index -= index_len
+                dtype_mapping = column_dtypes
+                name = self.columns[index]
+
+            if isinstance(dtype_mapping, dict):
+                if name in dtype_mapping:
+                    dtype_mapping = dtype_mapping[name]
+                elif index in dtype_mapping:
+                    dtype_mapping = dtype_mapping[index]
                 else:
-                    formats.append(v.dtype)
+                    dtype_mapping = None
+
+            if dtype_mapping is None:
+                formats.append(v.dtype)
+            elif isinstance(dtype_mapping, (type, compat.string_types)):
+                formats.append(dtype_mapping)
+            else:
+                element = "row" if i < index_len else "column"
+                msg = ("Invalid dtype {dtype} specified for "
+                       "{element} {name}").format(dtype=dtype_mapping,
+                                                  element=element, name=name)
+                raise ValueError(msg)
 
         return np.rec.fromarrays(
             arrays,
diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py
@@ -191,42 +191,127 @@ def test_to_records_with_categorical(self):
                                 dtype=[('index', '=i8'), ('0', 'O')])
         tm.assert_almost_equal(result, expected)
 
-    @pytest.mark.parametrize("fixed_length", [True, False])
-    @pytest.mark.parametrize("values,dtype_getter", [
-        # Integer --> just take the dtype.
-        ([1, 2], lambda fixed, isPY2: "<i8"),
-
-        # Mixed --> cast to object.
-        ([1, "1"], lambda fixed, isPY2: "O"),
-
-        # String --> cast to string is PY2 else unicode in PY3.
-        (["1", "2"], lambda fixed, isPY2: (
-            ("S" if isPY2 else "U") + "1") if fixed else "O"),
-
-        # String + max-length of longest string.
-        (["12", "2"], lambda fixed, isPY2: (
-            ("S" if isPY2 else "U") + "2") if fixed else "O"),
-
-        # Unicode --> cast to unicode for both PY2 and PY3.
-        ([u"\u2120b", u"456"], lambda fixed, isPY2: "U3" if fixed else "O"),
-
-        # Bytes --> cast to string for both PY2 and PY3.
-        ([b"2", b"5"], lambda fixed, isPY2: "S1" if fixed else "O"),
-    ], ids=["int", "mixed", "str", "max-len", "unicode", "bytes"])
-    def test_to_records_with_strings_as_fixed_length(self, fixed_length,
-                                                     values, dtype_getter):
-
+    @pytest.mark.parametrize("kwargs,expected", [
+        # No dtypes --> default to array dtypes.
+        (dict(),
+         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
+                      dtype=[("index", "<i8"), ("A", "<i8"),
+                             ("B", "<f8"), ("C", "O")])),
+
+        # Should have no effect in this case.
+        (dict(index=True),
+         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
+                      dtype=[("index", "<i8"), ("A", "<i8"),
+                             ("B", "<f8"), ("C", "O")])),
+
+        # Column dtype applied across the board. Index unaffected.
+        (dict(column_dtypes="<U4"),
+         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
+                      dtype=[("index", "<i8"), ("A", "<U4"),
+                             ("B", "<U4"), ("C", "<U4")])),
+
+        # Index dtype applied across the board. Columns unaffected.
+        (dict(index_dtypes="<U1"),
+         np.rec.array([("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
+                      dtype=[("index", "<U1"), ("A", "<i8"),
+                             ("B", "<f8"), ("C", "O")])),
+
+        # Pass in a type instance.
+        (dict(column_dtypes=np.unicode),
+         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
+                      dtype=[("index", "<i8"), ("A", "<U"),
+                             ("B", "<U"), ("C", "<U")])),
+
+        # Pass in a dictionary (name-only).
+        (dict(column_dtypes={"A": np.int8, "B": np.float32, "C": "<U2"}),
+         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
+                      dtype=[("index", "<i8"), ("A", "i1"),
+                             ("B", "<f4"), ("C", "<U2")])),
+
+        # Pass in a dictionary (indices-only).
+        (dict(index_dtypes={0: "int16"}),
+         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
+                      dtype=[("index", "i2"), ("A", "<i8"),
+                             ("B", "<f8"), ("C", "O")])),
+
+        # Ignore index mappings if index is not True.
+        (dict(index=False, index_dtypes="<U2"),
+         np.rec.array([(1, 0.2, "a"), (2, 1.5, "bc")],
+                      dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")])),
+
+        # Non-existent names / indices in mapping should not error.
+        (dict(index_dtypes={0: "int16", "not-there": "float32"}),
+         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
+                      dtype=[("index", "i2"), ("A", "<i8"),
+                             ("B", "<f8"), ("C", "O")])),
+
+        # Names / indices not in mapping default to array dtype.
+        (dict(column_dtypes={"A": np.int8, "B": np.float32}),
+         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
+                      dtype=[("index", "<i8"), ("A", "i1"),
+                             ("B", "<f4"), ("C", "O")])),
+
+        # Mixture of everything.
+        (dict(column_dtypes={"A": np.int8, "B": np.float32},
+              index_dtypes="<U2"),
+         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
+                      dtype=[("index", "<U2"), ("A", "i1"),
+                             ("B", "<f4"), ("C", "O")])),
+
+        # Invalid dtype values.
+        (dict(index=False, column_dtypes=list()),
+         "Invalid dtype \\[\\] specified for column A"),
+
+        (dict(index=False, column_dtypes={"A": "int32", "B": 5}),
+         "Invalid dtype 5 specified for column B"),
+    ])
+    def test_to_records_dtype(self, kwargs, expected):
         # see gh-18146
-        df = DataFrame({"values": values}, index=["a", "b"])
-        result = df.to_records(stringlike_as_fixed_length=fixed_length)
-
-        ind_dtype = ((("S" if compat.PY2 else "U") + "1")
-                     if fixed_length else "O")
-        val_dtype = dtype_getter(fixed_length, compat.PY2)
-
-        expected = np.rec.array([("a", values[0]), ("b", values[1])],
-                                dtype=[("index", ind_dtype),
-                                       ("values", val_dtype)])
+        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
+
+        if isinstance(expected, str):
+            with pytest.raises(ValueError, match=expected):
+                df.to_records(**kwargs)
+        else:
+            result = df.to_records(**kwargs)
+            tm.assert_almost_equal(result, expected)
+
+    @pytest.mark.parametrize("df,kwargs,expected", [
+        # MultiIndex in the index.
+        (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                   columns=list("abc")).set_index(["a", "b"]),
+         dict(column_dtypes="float64", index_dtypes={0: "int32", 1: "int8"}),
+         np.rec.array([(1, 2, 3.), (4, 5, 6.), (7, 8, 9.)],
+                      dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")])),
+
+        # MultiIndex in the columns.
+        (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                   columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
+                                                   ("c", "f")])),
+         dict(column_dtypes={0: "<U1", 2: "float32"}, index_dtypes="float32"),
+         np.rec.array([(0., u"1", 2, 3.), (1., u"4", 5, 6.),
+                       (2., u"7", 8, 9.)],
+                      dtype=[("index", "<f4"),
+                             ("('a', 'd')", "<U1"),
+                             ("('b', 'e')", "<i8"),
+                             ("('c', 'f')", "<f4")])),
+
+        # MultiIndex in both the columns and index.
+        (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                   columns=MultiIndex.from_tuples([
+                       ("a", "d"), ("b", "e"), ("c", "f")], names=list("ab")),
+                   index=MultiIndex.from_tuples([
+                       ("d", -4), ("d", -5), ("f", -6)], names=list("cd"))),
+         dict(column_dtypes="float64", index_dtypes={0: "<U2", 1: "int8"}),
+         np.rec.array([("d", -4, 1., 2., 3.), ("d", -5, 4., 5., 6.),
+                       ("f", -6, 7, 8, 9.)],
+                      dtype=[("c", "<U2"), ("d", "i1"),
+                             ("('a', 'd')", "<f8"), ("('b', 'e')", "<f8"),
+                             ("('c', 'f')", "<f8")]))
+    ])
+    def test_to_records_dtype_mi(self, df, kwargs, expected):
+        # see gh-18146
+        result = df.to_records(**kwargs)
         tm.assert_almost_equal(result, expected)
 
     @pytest.mark.parametrize('mapping', [