ENH: Allow fixed-length strings in df.to_records()

gfyoung · gfyoung · commit 0fd56e2cafba · 2018-12-19T07:19:46.000Z
Adds parameter to allow string-like columns to be cast as fixed-length string-like dtypes for more efficient storage. Closes pandas-dev/pandas#18146. Originally authored by @qinghao1 but cleaned up by @gfyoung to fix merge conflicts.
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -372,6 +372,7 @@ Other Enhancements
 - :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed sting columns to Stata strl format (:issue:`23633`)
 - :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the an ``axis`` parameter (:issue:`8839`)
 - :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`)
+- :meth:`DataFrame.to_records` now accepts a ``stringlike_as_fixed_length`` parameter to efficiently store string-likes as fixed-length string-like dtypes (e.g. ``S1``) instead of object dtype (``O``) (:issue:`18146`)
 
 .. _whatsnew_0240.api_breaking:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -35,7 +35,7 @@
                            OrderedDict, PY36, raise_with_traceback,
                            string_and_binary_types)
 from pandas.compat.numpy import function as nv
-
+from pandas.api.types import infer_dtype
 from pandas.core.dtypes.cast import (
     maybe_upcast,
     cast_scalar_to_array,
@@ -1476,7 +1476,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
 
         return cls(mgr)
 
-    def to_records(self, index=True, convert_datetime64=None):
+    def to_records(self, index=True, convert_datetime64=None,
+                   stringlike_as_fixed_length=False):
         """
         Convert DataFrame to a NumPy record array.
 
@@ -1493,6 +1494,11 @@ def to_records(self, index=True, convert_datetime64=None):
 
             Whether to convert the index to datetime.datetime if it is a
             DatetimeIndex.
+        stringlike_as_fixed_length : bool, default False
+            .. versionadded:: 0.24.0
+
+            Store string-likes as fixed-length string-like dtypes
+            (e.g. ``S1`` dtype) instead of Python objects (``O`` dtype).
 
         Returns
         -------
@@ -1534,6 +1540,27 @@ def to_records(self, index=True, convert_datetime64=None):
         >>> df.to_records(index=False)
         rec.array([(1, 0.5 ), (2, 0.75)],
                   dtype=[('A', '<i8'), ('B', '<f8')])
+
+        By default, strings are recorded as dtype 'O' for object:
+
+        >>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
+        ...                   index=['a', 'b'])
+        >>> df.to_records()
+        rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
+                  dtype=[('index', 'O'), ('A', '<i8'), ('B', 'O')])
+
+        This can be inefficient (e.g. for short strings, or when storing with
+        `np.save()`). They can be recorded as fixed-length string-like dtypes
+        such as 'S1' for zero-terminated bytes instead:
+
+        >>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
+        ...                   index=['a', 'b'])
+        >>> df.to_records(stringlike_as_fixed_length=True)
+        rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
+                  dtype=[('index', '<U1'), ('A', '<i8'), ('B', '<U4')])
+
+        Notice how the 'B' column is now stored as '<U4' for length-four
+        strings ('S4' for Python 2.x) instead of the 'O' object dtype.
         """
 
         if convert_datetime64 is not None:
@@ -1569,7 +1596,47 @@ def to_records(self, index=True, convert_datetime64=None):
             arrays = [self[c].get_values() for c in self.columns]
             names = lmap(compat.text_type, self.columns)
 
-        formats = [v.dtype for v in arrays]
+        formats = []
+
+        for v in arrays:
+            if not stringlike_as_fixed_length:
+                formats.append(v.dtype)
+            else:
+                # gh-18146
+                #
+                # For string-like arrays, use a fixed-length dtype ("S" for
+                # bytes, "U" for unicode) whose length equals that of the
+                # longest string-like element.
+                dtype = infer_dtype(v)
+                symbol = None
+
+                if dtype == "string":
+                    # In Python 3.x, infer_dtype does not
+                    # differentiate string from unicode
+                    # like NumPy arrays do, so we
+                    # specify unicode to be safe.
+                    symbol = "S" if compat.PY2 else "U"
+                elif dtype == "unicode":
+                    # In Python 3.x, infer_dtype does not
+                    # differentiate string from unicode.
+                    #
+                    # Thus, we can only get this result
+                    # in Python 2.x.
+                    symbol = "U"
+                elif dtype == "bytes":
+                    # In Python 2.x, infer_dtype does not
+                    # differentiate string from bytes.
+                    #
+                    # Thus, we can only get this result
+                    # in Python 3.x. However, NumPy does
+                    # not have a fixed-length bytes dtype
+                    # and just uses string instead.
+                    symbol = "S"
+
+                if symbol is not None:
+                    formats.append("{}{}".format(symbol, max(map(len, v))))
+                else:
+                    formats.append(v.dtype)
+
         return np.rec.fromarrays(
             arrays,
             dtype={'names': names, 'formats': formats}
diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py
@@ -191,6 +191,44 @@ def test_to_records_with_categorical(self):
                                 dtype=[('index', '=i8'), ('0', 'O')])
         tm.assert_almost_equal(result, expected)
 
+    @pytest.mark.parametrize("fixed_length", [True, False])
+    @pytest.mark.parametrize("values,dtype_getter", [
+        # Integer --> just take the dtype.
+        ([1, 2], lambda fixed, isPY2: "<i8"),
+
+        # Mixed --> cast to object.
+        ([1, "1"], lambda fixed, isPY2: "O"),
+
+        # String --> cast to string if PY2 else unicode in PY3.
+        (["1", "2"], lambda fixed, isPY2: (
+            ("S" if isPY2 else "U") + "1") if fixed else "O"),
+
+        # String + max-length of longest string.
+        (["12", "2"], lambda fixed, isPY2: (
+            ("S" if isPY2 else "U") + "2") if fixed else "O"),
+
+        # Unicode --> cast to unicode for both PY2 and PY3.
+        ([u"\u2120b", u"456"], lambda fixed, isPY2: "U3" if fixed else "O"),
+
+        # Bytes --> cast to string for both PY2 and PY3.
+        ([b"2", b"5"], lambda fixed, isPY2: "S1" if fixed else "O"),
+    ], ids=["int", "mixed", "str", "max-len", "unicode", "bytes"])
+    def test_to_records_with_strings_as_fixed_length(self, fixed_length,
+                                                     values, dtype_getter):
+
+        # see gh-18146
+        df = DataFrame({"values": values}, index=["a", "b"])
+        result = df.to_records(stringlike_as_fixed_length=fixed_length)
+
+        ind_dtype = ((("S" if compat.PY2 else "U") + "1")
+                     if fixed_length else "O")
+        val_dtype = dtype_getter(fixed_length, compat.PY2)
+
+        expected = np.rec.array([("a", values[0]), ("b", values[1])],
+                                dtype=[("index", ind_dtype),
+                                       ("values", val_dtype)])
+        tm.assert_almost_equal(result, expected)
+
     @pytest.mark.parametrize('mapping', [
         dict,
         collections.defaultdict(list),