Skip to content

Commit 0769688

Browse files
qinghao1gfyoung
authored and committed
ENH: Add strings_as_fixed_length parameter for df.to_records() (#18146) (#22229)
* ENH: Allow fixed-length strings in df.to_records() Adds parameter to allow string-like columns to be cast as fixed-length string-like dtypes for more efficient storage. Closes gh-18146. Originally authored by @qinghao1 but cleaned up by @gfyoung to fix merge conflicts. * Add dtype parameters instead of fix-string-like The original parameter was causing a lot of acrobatics with regards to string dtypes between 2.x and 3.x. The new parameters simplify the internal logic and pass the responsibility and motivation of memory efficiency back to the users. * MAINT: Use is_dict_like in to_records More generic than checking whether our mappings are instances of dict. Expands is_dict_like check to include whether it has a __contains__ method. * TST: Add test for is_dict_like expanded def * MAINT: Address final comments
1 parent 43c4dcd commit 0769688

File tree

5 files changed

+272
-4
lines changed

5 files changed

+272
-4
lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,7 @@ Other Enhancements
411411
- :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed string columns to Stata strl format (:issue:`23633`)
412412
- :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the ``axis`` parameter (:issue:`8839`)
413413
- The ``scatter_matrix``, ``andrews_curves``, ``parallel_coordinates``, ``lag_plot``, ``autocorrelation_plot``, ``bootstrap_plot``, and ``radviz`` plots from the ``pandas.plotting`` module are now accessible from calling :meth:`DataFrame.plot` (:issue:`11978`)
414+
- :meth:`DataFrame.to_records` now accepts ``index_dtypes`` and ``column_dtypes`` parameters to allow different data types in stored column and index records (:issue:`18146`)
414415
- :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`)
415416
- :func:`pandas.DataFrame.to_sql` has gained the ``method`` argument to control SQL insertion clause. See the :ref:`insertion method <io.sql.method>` section in the documentation. (:issue:`8953`)
416417

pandas/core/dtypes/inference.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -398,8 +398,11 @@ def is_dict_like(obj):
398398
>>> is_dict_like([1, 2, 3])
399399
False
400400
"""
401+
for attr in ("__getitem__", "keys", "__contains__"):
402+
if not hasattr(obj, attr):
403+
return False
401404

402-
return hasattr(obj, '__getitem__') and hasattr(obj, 'keys')
405+
return True
403406

404407

405408
def is_named_tuple(obj):

pandas/core/frame.py

+89-3
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535
OrderedDict, PY36, raise_with_traceback,
3636
string_and_binary_types)
3737
from pandas.compat.numpy import function as nv
38-
3938
from pandas.core.dtypes.cast import (
4039
maybe_upcast,
4140
cast_scalar_to_array,
@@ -49,6 +48,7 @@
4948
maybe_upcast_putmask,
5049
find_common_type)
5150
from pandas.core.dtypes.common import (
51+
is_dict_like,
5252
is_object_dtype,
5353
is_extension_type,
5454
is_extension_array_dtype,
@@ -1540,7 +1540,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
15401540

15411541
return cls(mgr)
15421542

1543-
def to_records(self, index=True, convert_datetime64=None):
1543+
def to_records(self, index=True, convert_datetime64=None,
1544+
column_dtypes=None, index_dtypes=None):
15441545
"""
15451546
Convert DataFrame to a NumPy record array.
15461547
@@ -1557,6 +1558,20 @@ def to_records(self, index=True, convert_datetime64=None):
15571558
15581559
Whether to convert the index to datetime.datetime if it is a
15591560
DatetimeIndex.
1561+
column_dtypes : str, type, dict, default None
1562+
.. versionadded:: 0.24.0
1563+
1564+
If a string or type, the data type to store all columns. If
1565+
a dictionary, a mapping of column names and indices (zero-indexed)
1566+
to specific data types.
1567+
index_dtypes : str, type, dict, default None
1568+
.. versionadded:: 0.24.0
1569+
1570+
If a string or type, the data type to store all index levels. If
1571+
a dictionary, a mapping of index level names and indices
1572+
(zero-indexed) to specific data types.
1573+
1574+
This mapping is applied only if `index=True`.
15601575
15611576
Returns
15621577
-------
@@ -1598,6 +1613,23 @@ def to_records(self, index=True, convert_datetime64=None):
15981613
>>> df.to_records(index=False)
15991614
rec.array([(1, 0.5 ), (2, 0.75)],
16001615
dtype=[('A', '<i8'), ('B', '<f8')])
1616+
1617+
Data types can be specified for the columns:
1618+
1619+
>>> df.to_records(column_dtypes={"A": "int32"})
1620+
rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
1621+
dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
1622+
1623+
As well as for the index:
1624+
1625+
>>> df.to_records(index_dtypes="<S2")
1626+
rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
1627+
dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
1628+
1629+
>>> index_dtypes = "<S{}".format(df.index.str.len().max())
1630+
>>> df.to_records(index_dtypes=index_dtypes)
1631+
rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
1632+
dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
16011633
"""
16021634

16031635
if convert_datetime64 is not None:
@@ -1620,20 +1652,74 @@ def to_records(self, index=True, convert_datetime64=None):
16201652

16211653
count = 0
16221654
index_names = list(self.index.names)
1655+
16231656
if isinstance(self.index, MultiIndex):
16241657
for i, n in enumerate(index_names):
16251658
if n is None:
16261659
index_names[i] = 'level_%d' % count
16271660
count += 1
16281661
elif index_names[0] is None:
16291662
index_names = ['index']
1663+
16301664
names = (lmap(compat.text_type, index_names) +
16311665
lmap(compat.text_type, self.columns))
16321666
else:
16331667
arrays = [self[c].get_values() for c in self.columns]
16341668
names = lmap(compat.text_type, self.columns)
1669+
index_names = []
1670+
1671+
index_len = len(index_names)
1672+
formats = []
1673+
1674+
for i, v in enumerate(arrays):
1675+
index = i
1676+
1677+
# When the names and arrays are collected, we
1678+
# first collect those in the DataFrame's index,
1679+
# followed by those in its columns.
1680+
#
1681+
# Thus, the total length of the array is:
1682+
# len(index_names) + len(DataFrame.columns).
1683+
#
1684+
# This check allows us to see whether we are
1685+
# handling a name / array in the index or column.
1686+
if index < index_len:
1687+
dtype_mapping = index_dtypes
1688+
name = index_names[index]
1689+
else:
1690+
index -= index_len
1691+
dtype_mapping = column_dtypes
1692+
name = self.columns[index]
1693+
1694+
# We have a dictionary, so we get the data type
1695+
# associated with the index or column (which can
1696+
# be denoted by its name in the DataFrame or its
1697+
# position in DataFrame's array of indices or
1698+
# columns, whichever is applicable).
1699+
if is_dict_like(dtype_mapping):
1700+
if name in dtype_mapping:
1701+
dtype_mapping = dtype_mapping[name]
1702+
elif index in dtype_mapping:
1703+
dtype_mapping = dtype_mapping[index]
1704+
else:
1705+
dtype_mapping = None
1706+
1707+
# If no mapping can be found, use the array's
1708+
# dtype attribute for formatting.
1709+
#
1710+
# A valid dtype must either be a type or
1711+
# string naming a type.
1712+
if dtype_mapping is None:
1713+
formats.append(v.dtype)
1714+
elif isinstance(dtype_mapping, (type, compat.string_types)):
1715+
formats.append(dtype_mapping)
1716+
else:
1717+
element = "row" if i < index_len else "column"
1718+
msg = ("Invalid dtype {dtype} specified for "
1719+
"{element} {name}").format(dtype=dtype_mapping,
1720+
element=element, name=name)
1721+
raise ValueError(msg)
16351722

1636-
formats = [v.dtype for v in arrays]
16371723
return np.rec.fromarrays(
16381724
arrays,
16391725
dtype={'names': names, 'formats': formats}

pandas/tests/dtypes/test_inference.py

+27
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,33 @@ def test_is_dict_like_fails(ll):
178178
assert not inference.is_dict_like(ll)
179179

180180

181+
@pytest.mark.parametrize("has_keys", [True, False])
182+
@pytest.mark.parametrize("has_getitem", [True, False])
183+
@pytest.mark.parametrize("has_contains", [True, False])
184+
def test_is_dict_like_duck_type(has_keys, has_getitem, has_contains):
185+
class DictLike(object):
186+
def __init__(self, d):
187+
self.d = d
188+
189+
if has_keys:
190+
def keys(self):
191+
return self.d.keys()
192+
193+
if has_getitem:
194+
def __getitem__(self, key):
195+
return self.d.__getitem__(key)
196+
197+
if has_contains:
198+
def __contains__(self, key):
199+
return self.d.__contains__(key)
200+
201+
d = DictLike({1: 2})
202+
result = inference.is_dict_like(d)
203+
expected = has_keys and has_getitem and has_contains
204+
205+
assert result is expected
206+
207+
181208
def test_is_file_like(mock):
182209
class MockFile(object):
183210
pass

pandas/tests/frame/test_convert_to.py

+151
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,157 @@ def test_to_records_with_categorical(self):
191191
dtype=[('index', '=i8'), ('0', 'O')])
192192
tm.assert_almost_equal(result, expected)
193193

194+
@pytest.mark.parametrize("kwargs,expected", [
195+
# No dtypes --> default to array dtypes.
196+
(dict(),
197+
np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
198+
dtype=[("index", "<i8"), ("A", "<i8"),
199+
("B", "<f8"), ("C", "O")])),
200+
201+
# Should have no effect in this case.
202+
(dict(index=True),
203+
np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
204+
dtype=[("index", "<i8"), ("A", "<i8"),
205+
("B", "<f8"), ("C", "O")])),
206+
207+
# Column dtype applied across the board. Index unaffected.
208+
(dict(column_dtypes="<U4"),
209+
np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
210+
dtype=[("index", "<i8"), ("A", "<U4"),
211+
("B", "<U4"), ("C", "<U4")])),
212+
213+
# Index dtype applied across the board. Columns unaffected.
214+
(dict(index_dtypes="<U1"),
215+
np.rec.array([("0", 1, 0.2, "a"), ("1", 2, 1.5, "bc")],
216+
dtype=[("index", "<U1"), ("A", "<i8"),
217+
("B", "<f8"), ("C", "O")])),
218+
219+
# Pass in a type instance.
220+
(dict(column_dtypes=np.unicode),
221+
np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
222+
dtype=[("index", "<i8"), ("A", "<U"),
223+
("B", "<U"), ("C", "<U")])),
224+
225+
# Pass in a dictionary (name-only).
226+
(dict(column_dtypes={"A": np.int8, "B": np.float32, "C": "<U2"}),
227+
np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
228+
dtype=[("index", "<i8"), ("A", "i1"),
229+
("B", "<f4"), ("C", "<U2")])),
230+
231+
# Pass in a dictionary (indices-only).
232+
(dict(index_dtypes={0: "int16"}),
233+
np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
234+
dtype=[("index", "i2"), ("A", "<i8"),
235+
("B", "<f8"), ("C", "O")])),
236+
237+
# Ignore index mappings if index is not True.
238+
(dict(index=False, index_dtypes="<U2"),
239+
np.rec.array([(1, 0.2, "a"), (2, 1.5, "bc")],
240+
dtype=[("A", "<i8"), ("B", "<f8"), ("C", "O")])),
241+
242+
# Non-existent names / indices in mapping should not error.
243+
(dict(index_dtypes={0: "int16", "not-there": "float32"}),
244+
np.rec.array([(0, 1, 0.2, "a"), (1, 2, 1.5, "bc")],
245+
dtype=[("index", "i2"), ("A", "<i8"),
246+
("B", "<f8"), ("C", "O")])),
247+
248+
# Names / indices not in mapping default to array dtype.
249+
(dict(column_dtypes={"A": np.int8, "B": np.float32}),
250+
np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
251+
dtype=[("index", "<i8"), ("A", "i1"),
252+
("B", "<f4"), ("C", "O")])),
253+
254+
# Mixture of everything.
255+
(dict(column_dtypes={"A": np.int8, "B": np.float32},
256+
index_dtypes="<U2"),
257+
np.rec.array([("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")],
258+
dtype=[("index", "<U2"), ("A", "i1"),
259+
("B", "<f4"), ("C", "O")])),
260+
261+
# Invalid dtype values.
262+
(dict(index=False, column_dtypes=list()),
263+
"Invalid dtype \\[\\] specified for column A"),
264+
265+
(dict(index=False, column_dtypes={"A": "int32", "B": 5}),
266+
"Invalid dtype 5 specified for column B"),
267+
])
268+
def test_to_records_dtype(self, kwargs, expected):
269+
# see gh-18146
270+
df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
271+
272+
if isinstance(expected, str):
273+
with pytest.raises(ValueError, match=expected):
274+
df.to_records(**kwargs)
275+
else:
276+
result = df.to_records(**kwargs)
277+
tm.assert_almost_equal(result, expected)
278+
279+
@pytest.mark.parametrize("df,kwargs,expected", [
280+
# MultiIndex in the index.
281+
(DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
282+
columns=list("abc")).set_index(["a", "b"]),
283+
dict(column_dtypes="float64", index_dtypes={0: "int32", 1: "int8"}),
284+
np.rec.array([(1, 2, 3.), (4, 5, 6.), (7, 8, 9.)],
285+
dtype=[("a", "<i4"), ("b", "i1"), ("c", "<f8")])),
286+
287+
# MultiIndex in the columns.
288+
(DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
289+
columns=MultiIndex.from_tuples([("a", "d"), ("b", "e"),
290+
("c", "f")])),
291+
dict(column_dtypes={0: "<U1", 2: "float32"}, index_dtypes="float32"),
292+
np.rec.array([(0., u"1", 2, 3.), (1., u"4", 5, 6.),
293+
(2., u"7", 8, 9.)],
294+
dtype=[("index", "<f4"),
295+
("('a', 'd')", "<U1"),
296+
("('b', 'e')", "<i8"),
297+
("('c', 'f')", "<f4")])),
298+
299+
# MultiIndex in both the columns and index.
300+
(DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
301+
columns=MultiIndex.from_tuples([
302+
("a", "d"), ("b", "e"), ("c", "f")], names=list("ab")),
303+
index=MultiIndex.from_tuples([
304+
("d", -4), ("d", -5), ("f", -6)], names=list("cd"))),
305+
dict(column_dtypes="float64", index_dtypes={0: "<U2", 1: "int8"}),
306+
np.rec.array([("d", -4, 1., 2., 3.), ("d", -5, 4., 5., 6.),
307+
("f", -6, 7, 8, 9.)],
308+
dtype=[("c", "<U2"), ("d", "i1"),
309+
("('a', 'd')", "<f8"), ("('b', 'e')", "<f8"),
310+
("('c', 'f')", "<f8")]))
311+
])
312+
def test_to_records_dtype_mi(self, df, kwargs, expected):
313+
# see gh-18146
314+
result = df.to_records(**kwargs)
315+
tm.assert_almost_equal(result, expected)
316+
317+
def test_to_records_dict_like(self):
318+
# see gh-18146
319+
class DictLike(object):
320+
def __init__(self, **kwargs):
321+
self.d = kwargs.copy()
322+
323+
def __getitem__(self, key):
324+
return self.d.__getitem__(key)
325+
326+
def __contains__(self, key):
327+
return key in self.d
328+
329+
def keys(self):
330+
return self.d.keys()
331+
332+
df = DataFrame({"A": [1, 2], "B": [0.2, 1.5], "C": ["a", "bc"]})
333+
334+
dtype_mappings = dict(column_dtypes=DictLike(**{"A": np.int8,
335+
"B": np.float32}),
336+
index_dtypes="<U2")
337+
338+
result = df.to_records(**dtype_mappings)
339+
expected = np.rec.array([("0", "1", "0.2", "a"),
340+
("1", "2", "1.5", "bc")],
341+
dtype=[("index", "<U2"), ("A", "i1"),
342+
("B", "<f4"), ("C", "O")])
343+
tm.assert_almost_equal(result, expected)
344+
194345
@pytest.mark.parametrize('mapping', [
195346
dict,
196347
collections.defaultdict(list),

0 commit comments

Comments
 (0)