Skip to content

Commit 0fd56e2

Browse files
committed
ENH: Allow fixed-length strings in df.to_records()
Adds a parameter to allow string-like columns to be cast as fixed-length string-like dtypes for more efficient storage. Closes pandas-dev issue gh-18146. Originally authored by @qinghao1 but cleaned up by @gfyoung to fix merge conflicts.
1 parent 6111f64 commit 0fd56e2

File tree

3 files changed

+109
-3
lines changed

3 files changed

+109
-3
lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,7 @@ Other Enhancements
372372
- :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed string columns to Stata strl format (:issue:`23633`)
373373
- :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained an ``axis`` parameter (:issue:`8839`)
374374
- :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`)
375+
- :meth:`DataFrame.to_records` now accepts a ``stringlike_as_fixed_length`` parameter to efficiently store string-likes as fixed-length string-like dtypes (e.g. ``S1``) instead of object dtype (``O``) (:issue:`18146`)
375376

376377
.. _whatsnew_0240.api_breaking:
377378

pandas/core/frame.py

+70-3
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@
3535
OrderedDict, PY36, raise_with_traceback,
3636
string_and_binary_types)
3737
from pandas.compat.numpy import function as nv
38-
38+
from pandas.api.types import infer_dtype
3939
from pandas.core.dtypes.cast import (
4040
maybe_upcast,
4141
cast_scalar_to_array,
@@ -1476,7 +1476,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
14761476

14771477
return cls(mgr)
14781478

1479-
def to_records(self, index=True, convert_datetime64=None):
1479+
def to_records(self, index=True, convert_datetime64=None,
1480+
stringlike_as_fixed_length=False):
14801481
"""
14811482
Convert DataFrame to a NumPy record array.
14821483
@@ -1493,6 +1494,11 @@ def to_records(self, index=True, convert_datetime64=None):
14931494
14941495
Whether to convert the index to datetime.datetime if it is a
14951496
DatetimeIndex.
1497+
stringlike_as_fixed_length : bool, default False
1498+
.. versionadded:: 0.24.0
1499+
1500+
Store string-likes as fixed-length string-like dtypes
1501+
(e.g. ``S1`` dtype) instead of Python objects (``O`` dtype).
14961502
14971503
Returns
14981504
-------
@@ -1534,6 +1540,27 @@ def to_records(self, index=True, convert_datetime64=None):
15341540
>>> df.to_records(index=False)
15351541
rec.array([(1, 0.5 ), (2, 0.75)],
15361542
dtype=[('A', '<i8'), ('B', '<f8')])
1543+
1544+
By default, strings are recorded as dtype 'O' for object:
1545+
1546+
>>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
1547+
... index=['a', 'b'])
1548+
>>> df.to_records()
1549+
rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
1550+
dtype=[('index', 'O'), ('A', '<i8'), ('B', 'O')])
1551+
1552+
This can be inefficient (e.g. for short strings, or when storing with
1553+
`np.save()`). They can be recorded as fixed-length string-like dtypes
1554+
such as 'S1' for zero-terminated bytes instead:
1555+
1556+
>>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
1557+
... index=['a', 'b'])
1558+
>>> df.to_records(stringlike_as_fixed_length=True)
1559+
rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
1560+
dtype=[('index', '<U1'), ('A', '<i8'), ('B', '<U4')])
1561+
1562+
Notice how the 'B' column is now stored as '<U4' for length-four
1563+
strings ('S4' for Python 2.x) instead of the 'O' object dtype.
15371564
"""
15381565

15391566
if convert_datetime64 is not None:
@@ -1569,7 +1596,47 @@ def to_records(self, index=True, convert_datetime64=None):
15691596
arrays = [self[c].get_values() for c in self.columns]
15701597
names = lmap(compat.text_type, self.columns)
15711598

1572-
formats = [v.dtype for v in arrays]
1599+
formats = []
1600+
1601+
for v in arrays:
1602+
if not stringlike_as_fixed_length:
1603+
formats.append(v.dtype)
1604+
else:
1605+
# gh-18146
1606+
#
1607+
# For string-like arrays, set dtype as zero-terminated bytes
1608+
# with max length equal to that of the longest string-like.
1609+
dtype = infer_dtype(v)
1610+
symbol = None
1611+
1612+
if dtype == "string":
1613+
# In Python 3.x, infer_dtype does not
1614+
# differentiate string from unicode
1615+
# like NumPy arrays do, so we
1616+
# specify unicode to be safe.
1617+
symbol = "S" if compat.PY2 else "U"
1618+
elif dtype == "unicode":
1619+
# In Python 3.x, infer_dtype does not
1620+
# differentiate string from unicode.
1621+
#
1622+
# Thus, we can only get this result
1623+
# in Python 2.x.
1624+
symbol = "U"
1625+
elif dtype == "bytes":
1626+
# In Python 2.x, infer_dtype does not
1627+
# differentiate string from bytes.
1628+
#
1629+
# Thus, we can only get this result
1630+
# in Python 3.x. However, NumPy does
1631+
# not have a fixed-length bytes dtype
1632+
# and just uses string instead.
1633+
symbol = "S"
1634+
1635+
if symbol is not None:
1636+
formats.append("{}{}".format(symbol, max(map(len, v))))
1637+
else:
1638+
formats.append(v.dtype)
1639+
15731640
return np.rec.fromarrays(
15741641
arrays,
15751642
dtype={'names': names, 'formats': formats}

pandas/tests/frame/test_convert_to.py

+38
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,44 @@ def test_to_records_with_categorical(self):
191191
dtype=[('index', '=i8'), ('0', 'O')])
192192
tm.assert_almost_equal(result, expected)
193193

194+
@pytest.mark.parametrize("fixed_length", [True, False])
195+
@pytest.mark.parametrize("values,dtype_getter", [
196+
# Integer --> just take the dtype.
197+
([1, 2], lambda fixed, isPY2: "<i8"),
198+
199+
# Mixed --> cast to object.
200+
([1, "1"], lambda fixed, isPY2: "O"),
201+
202+
# String --> cast to string if PY2 else unicode in PY3.
203+
(["1", "2"], lambda fixed, isPY2: (
204+
("S" if isPY2 else "U") + "1") if fixed else "O"),
205+
206+
# String + max-length of longest string.
207+
(["12", "2"], lambda fixed, isPY2: (
208+
("S" if isPY2 else "U") + "2") if fixed else "O"),
209+
210+
# Unicode --> cast to unicode for both PY2 and PY3.
211+
([u"\u2120b", u"456"], lambda fixed, isPY2: "U3" if fixed else "O"),
212+
213+
# Bytes --> cast to string for both PY2 and PY3.
214+
([b"2", b"5"], lambda fixed, isPY2: "S1" if fixed else "O"),
215+
], ids=["int", "mixed", "str", "max-len", "unicode", "bytes"])
216+
def test_to_records_with_strings_as_fixed_length(self, fixed_length,
217+
values, dtype_getter):
218+
219+
# see gh-18146
220+
df = DataFrame({"values": values}, index=["a", "b"])
221+
result = df.to_records(stringlike_as_fixed_length=fixed_length)
222+
223+
ind_dtype = ((("S" if compat.PY2 else "U") + "1")
224+
if fixed_length else "O")
225+
val_dtype = dtype_getter(fixed_length, compat.PY2)
226+
227+
expected = np.rec.array([("a", values[0]), ("b", values[1])],
228+
dtype=[("index", ind_dtype),
229+
("values", val_dtype)])
230+
tm.assert_almost_equal(result, expected)
231+
194232
@pytest.mark.parametrize('mapping', [
195233
dict,
196234
collections.defaultdict(list),

0 commit comments

Comments
 (0)