ENH: Add strings_as_fixed_length parameter for df.to_records() (#18146) (#22229)

qinghao1 · gfyoung · commit 07696887f362 · 2018-12-30T14:40:38.000-08:00
* ENH: Allow fixed-length strings in df.to_records() Adds parameter to allow string-like columns to be cast as fixed-length string-like dtypes for more efficient storage. Closes gh-18146. Originally authored by @qinghao1 but cleaned up by @gfyoung to fix merge conflicts. * Add dtype parameters instead of fix-string-like The original parameter was causing a lot of acrobatics with regards to string dtypes between 2.x and 3.x. The new parameters simplify the internal logic and pass the responsibility and motivation of memory efficiency back to the users. * MAINT: Use is_dict_like in to_records More generic than checking whether our mappings are instances of dict. Expands is_dict_like check to include whether it has a __contains__ method. * TST: Add test for is_dict_like expanded def * MAINT: Address final comments
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -411,6 +411,7 @@ Other Enhancements
 - :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed sting columns to Stata strl format (:issue:`23633`)
 - :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the ``axis`` parameter (:issue:`8839`)
 - The ``scatter_matrix``, ``andrews_curves``, ``parallel_coordinates``, ``lag_plot``, ``autocorrelation_plot``, ``bootstrap_plot``, and ``radviz`` plots from the ``pandas.plotting`` module are now accessible from calling :meth:`DataFrame.plot` (:issue:`11978`)
+- :meth:`DataFrame.to_records` now accepts ``index_dtypes`` and ``column_dtypes`` parameters to allow different data types in stored column and index records (:issue:`18146`)
 - :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`)
 - :func:`pandas.DataFrame.to_sql` has gained the ``method`` argument to control SQL insertion clause. See the :ref:`insertion method <io.sql.method>` section in the documentation. (:issue:`8953`)
 
diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py
@@ -398,8 +398,11 @@ def is_dict_like(obj):
     >>> is_dict_like([1, 2, 3])
     False
     """
+    for attr in ("__getitem__", "keys", "__contains__"):
+        if not hasattr(obj, attr):
+            return False
 
-    return hasattr(obj, '__getitem__') and hasattr(obj, 'keys')
+    return True
 
 
 def is_named_tuple(obj):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -35,7 +35,6 @@
                            OrderedDict, PY36, raise_with_traceback,
                            string_and_binary_types)
 from pandas.compat.numpy import function as nv
-
 from pandas.core.dtypes.cast import (
     maybe_upcast,
     cast_scalar_to_array,
@@ -49,6 +48,7 @@
     maybe_upcast_putmask,
     find_common_type)
 from pandas.core.dtypes.common import (
+    is_dict_like,
     is_object_dtype,
     is_extension_type,
     is_extension_array_dtype,
@@ -1540,7 +1540,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
 
         return cls(mgr)
 
-    def to_records(self, index=True, convert_datetime64=None):
+    def to_records(self, index=True, convert_datetime64=None,
+                   column_dtypes=None, index_dtypes=None):
         """
         Convert DataFrame to a NumPy record array.
 
@@ -1557,6 +1558,20 @@ def to_records(self, index=True, convert_datetime64=None):
 
             Whether to convert the index to datetime.datetime if it is a
             DatetimeIndex.
+        column_dtypes : str, type, dict, default None
+            .. versionadded:: 0.24.0
+
+            If a string or type, the data type to store all columns. If
+            a dictionary, a mapping of column names and indices (zero-indexed)
+            to specific data types.
+        index_dtypes : str, type, dict, default None
+            .. versionadded:: 0.24.0
+
+            If a string or type, the data type to store all index levels. If
+            a dictionary, a mapping of index level names and indices
+            (zero-indexed) to specific data types.
+
+            This mapping is applied only if `index=True`.
 
         Returns
         -------
@@ -1598,6 +1613,23 @@ def to_records(self, index=True, convert_datetime64=None):
         >>> df.to_records(index=False)
         rec.array([(1, 0.5 ), (2, 0.75)],
                   dtype=[('A', '<i8'), ('B', '<f8')])
+
+        Data types can be specified for the columns:
+
+        >>> df.to_records(column_dtypes={"A": "int32"})
+        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+                  dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
+
+        As well as for the index:
+
+        >>> df.to_records(index_dtypes="<S2")
+        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+                  dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
+
+        >>> index_dtypes = "<S{}".format(df.index.str.len().max())
+        >>> df.to_records(index_dtypes=index_dtypes)
+        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+                  dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
         """
 
         if convert_datetime64 is not None:
@@ -1620,20 +1652,74 @@ def to_records(self, index=True, convert_datetime64=None):
 
             count = 0
             index_names = list(self.index.names)
+
             if isinstance(self.index, MultiIndex):
                 for i, n in enumerate(index_names):
                     if n is None:
                         index_names[i] = 'level_%d' % count
                         count += 1
             elif index_names[0] is None:
                 index_names = ['index']
+
             names = (lmap(compat.text_type, index_names) +
                      lmap(compat.text_type, self.columns))
         else:
             arrays = [self[c].get_values() for c in self.columns]
             names = lmap(compat.text_type, self.columns)
+            index_names = []
+
+        index_len = len(index_names)
+        formats = []
+
+        for i, v in enumerate(arrays):
+            index = i
+
+            # When the names and arrays are collected, we
+            # first collect those in the DataFrame's index,
+            # followed by those in its columns.
+            #
+            # Thus, the total length of the array is:
+            # len(index_names) + len(DataFrame.columns).
+            #
+            # This check allows us to see whether we are
+            # handling a name / array in the index or column.
+            if index < index_len:
+                dtype_mapping = index_dtypes
+                name = index_names[index]
+            else:
+                index -= index_len
+                dtype_mapping = column_dtypes
+                name = self.columns[index]
+
+            # We have a dictionary, so we get the data type
+            # associated with the index or column (which can
+            # be denoted by its name in the DataFrame or its
+            # position in DataFrame's array of indices or
+            # columns, whichever is applicable.
+            if is_dict_like(dtype_mapping):
+                if name in dtype_mapping:
+                    dtype_mapping = dtype_mapping[name]
+                elif index in dtype_mapping:
+                    dtype_mapping = dtype_mapping[index]
+                else:
+                    dtype_mapping = None
+
+            # If no mapping can be found, use the array's
+            # dtype attribute for formatting.
+            #
+            # A valid dtype must either be a type or
+            # string naming a type.
+            if dtype_mapping is None:
+                formats.append(v.dtype)
+            elif isinstance(dtype_mapping, (type, compat.string_types)):
+                formats.append(dtype_mapping)
+            else:
+                element = "row" if i < index_len else "column"
+                msg = ("Invalid dtype {dtype} specified for "
+                       "{element} {name}").format(dtype=dtype_mapping,
+                                                  element=element, name=name)
+                raise ValueError(msg)
 
-        formats = [v.dtype for v in arrays]
         return np.rec.fromarrays(
             arrays,
             dtype={'names': names, 'formats': formats}
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
@@ -178,6 +178,33 @@ def test_is_dict_like_fails(ll):
     assert not inference.is_dict_like(ll)
 
 
+@pytest.mark.parametrize("has_keys", [True, False])
+@pytest.mark.parametrize("has_getitem", [True, False])
+@pytest.mark.parametrize("has_contains", [True, False])
+def test_is_dict_like_duck_type(has_keys, has_getitem, has_contains):
+    class DictLike(object):
+        def __init__(self, d):
+            self.d = d
+
+        if has_keys:
+            def keys(self):
+                return self.d.keys()
+
+        if has_getitem:
+            def __getitem__(self, key):
+                return self.d.__getitem__(key)
+
+        if has_contains:
+            def __contains__(self, key):
+                return self.d.__contains__(key)
+
+    d = DictLike({1: 2})
+    result = inference.is_dict_like(d)
+    expected = has_keys and has_getitem and has_contains
+
+    assert result is expected
+
+
 def test_is_file_like(mock):
     class MockFile(object):
         pass
diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py
@@ -191,6 +191,157 @@ def test_to_records_with_categorical(self):
                                 dtype=[('index', '=i8'), ('0', 'O')])
         tm.assert_almost_equal(result, expected)
 
+    @pytest.mark.parametrize("kwargs,expected", [
+        # No dtypes --> default to array dtypes.
+        (dict(),
+         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
+                      dtype=[("index", "<i8"), ("A", "<i8"),
+                             ("B", "<f8"), ("C", "O")])),
+
+        # Should have no effect in this case.
+        (dict(index=True),
+         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
+                      dtype=[("index", "<i8"), ("A", "<i8"),
+                             ("B", "<f8"), ("C", "O")])),
+
+        # Column dtype applied across the board. Index unaffected.
+        (dict(column_dtypes="<U4"),
+         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
+                      dtype=[("index", "<i8"), ("A", "<U4"),
+                             ("B", "<U4"), ("C", "<U4")])),
+
+        # Index dtype applied across the board. Columns unaffected.
+        (dict(index_dtypes="<U1"),
+         np.rec.array([("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
+                      dtype=[("index", "<U1"), ("A", "<i8"),
+                             ("B", "<f8"), ("C", "O")])),
+
+        # Pass in a type instance.
+        (dict(column_dtypes=np.unicode),
+         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
+                      dtype=[("index", "<i8"), ("A", "<U"),
+                             ("B", "<U"), ("C", "<U")])),
+
+        # Pass in a dictionary (name-only).
+        (dict(column_dtypes={"A": np.int8, "B": np.float32, "C": "<U2"}),
+         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
+                      dtype=[("index", "<i8"), ("A", "i1"),
+                             ("B", "<f4"), ("C", "<U2")])),
+
+        # Pass in a dictionary (indices-only).
+        (dict(index_dtypes={0: "int16"}),
+         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
+                      dtype=[("index", "i2"), ("A", "<i8"),
+                             ("B", "<f8"), ("C", "O")])),
+
+        # Ignore index mappings if index is not True.
+        (dict(index=False, index_dtypes="<U2"),
+         np.rec.array([(1, 0.2, "a"), (2, 1.5, "bc")],
+                      dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")])),
+
+        # Non-existent names / indices in mapping should not error.
+        (dict(index_dtypes={0: "int16", "not-there": "float32"}),
+         np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
+                      dtype=[("index", "i2"), ("A", "<i8"),
+                             ("B", "<f8"), ("C", "O")])),
+
+        # Names / indices not in mapping default to array dtype.
+        (dict(column_dtypes={"A": np.int8, "B": np.float32}),
+         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
+                      dtype=[("index", "<i8"), ("A", "i1"),
+                             ("B", "<f4"), ("C", "O")])),
+
+        # Mixture of everything.
+        (dict(column_dtypes={"A": np.int8, "B": np.float32},
+              index_dtypes="<U2"),
+         np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
+                      dtype=[("index", "<U2"), ("A", "i1"),
+                             ("B", "<f4"), ("C", "O")])),
+
+        # Invalid dype values.
+        (dict(index=False, column_dtypes=list()),
+         "Invalid dtype \\[\\] specified for column A"),
+
+        (dict(index=False, column_dtypes={"A": "int32", "B": 5}),
+         "Invalid dtype 5 specified for column B"),
+    ])
+    def test_to_records_dtype(self, kwargs, expected):
+        # see gh-18146
+        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
+
+        if isinstance(expected, str):
+            with pytest.raises(ValueError, match=expected):
+                df.to_records(**kwargs)
+        else:
+            result = df.to_records(**kwargs)
+            tm.assert_almost_equal(result, expected)
+
+    @pytest.mark.parametrize("df,kwargs,expected", [
+        # MultiIndex in the index.
+        (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                   columns=list("abc")).set_index(["a", "b"]),
+         dict(column_dtypes="float64", index_dtypes={0: "int32", 1: "int8"}),
+         np.rec.array([(1, 2, 3.), (4, 5, 6.), (7, 8, 9.)],
+                      dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")])),
+
+        # MultiIndex in the columns.
+        (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                   columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
+                                                   ("c", "f")])),
+         dict(column_dtypes={0: "<U1", 2: "float32"}, index_dtypes="float32"),
+         np.rec.array([(0., u"1", 2, 3.), (1., u"4", 5, 6.),
+                       (2., u"7", 8, 9.)],
+                      dtype=[("index", "<f4"),
+                             ("('a', 'd')", "<U1"),
+                             ("('b', 'e')", "<i8"),
+                             ("('c', 'f')", "<f4")])),
+
+        # MultiIndex in both the columns and index.
+        (DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+                   columns=MultiIndex.from_tuples([
+                       ("a", "d"), ("b", "e"), ("c", "f")], names=list("ab")),
+                   index=MultiIndex.from_tuples([
+                       ("d", -4), ("d", -5), ("f", -6)], names=list("cd"))),
+         dict(column_dtypes="float64", index_dtypes={0: "<U2", 1: "int8"}),
+         np.rec.array([("d", -4, 1., 2., 3.), ("d", -5, 4., 5., 6.),
+                       ("f", -6, 7, 8, 9.)],
+                      dtype=[("c", "<U2"), ("d", "i1"),
+                             ("('a', 'd')", "<f8"), ("('b', 'e')", "<f8"),
+                             ("('c', 'f')", "<f8")]))
+    ])
+    def test_to_records_dtype_mi(self, df, kwargs, expected):
+        # see gh-18146
+        result = df.to_records(**kwargs)
+        tm.assert_almost_equal(result, expected)
+
+    def test_to_records_dict_like(self):
+        # see gh-18146
+        class DictLike(object):
+            def __init__(self, **kwargs):
+                self.d = kwargs.copy()
+
+            def __getitem__(self, key):
+                return self.d.__getitem__(key)
+
+            def __contains__(self, key):
+                return key in self.d
+
+            def keys(self):
+                return self.d.keys()
+
+        df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
+
+        dtype_mappings = dict(column_dtypes=DictLike(**{"A": np.int8,
+                                                        "B": np.float32}),
+                              index_dtypes="<U2")
+
+        result = df.to_records(**dtype_mappings)
+        expected = np.rec.array([("0", "1", "0.2", "a"),
+                                 ("1", "2", "1.5", "bc")],
+                                dtype=[("index", "<U2"), ("A", "i1"),
+                                       ("B", "<f4"), ("C", "O")])
+        tm.assert_almost_equal(result, expected)
+
     @pytest.mark.parametrize('mapping', [
         dict,
         collections.defaultdict(list),