Skip to content

Commit 2f48a8f

Browse files
qinghao1Chu Qinghao
authored and
Chu Qinghao
committed
ENH: Add string_as_bytes option for df.to_records() (pandas-dev#18146)
This option records the dtype of string arrays as 'Sx', where x is the length of the longest string, instead of 'O'.
1 parent 2156431 commit 2f48a8f

File tree

2 files changed

+43
-2
lines changed

2 files changed

+43
-2
lines changed

pandas/core/frame.py

+32-2
Original file line numberDiff line numberDiff line change
@@ -1335,7 +1335,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
13351335

13361336
return cls(mgr)
13371337

1338-
def to_records(self, index=True, convert_datetime64=None):
1338+
def to_records(self, index=True, convert_datetime64=None, strings_as_bytes=False):
13391339
"""
13401340
Convert DataFrame to a NumPy record array.
13411341
@@ -1351,6 +1351,9 @@ def to_records(self, index=True, convert_datetime64=None):
13511351
13521352
Whether to convert the index to datetime.datetime if it is a
13531353
DatetimeIndex.
1354+
strings_as_bytes : boolean, default False
1355+
Store strings as bytes ('S' dtype) instead of Python objects
1356+
('O' dtype).
13541357
13551358
Returns
13561359
-------
@@ -1401,6 +1404,24 @@ def to_records(self, index=True, convert_datetime64=None):
14011404
rec.array([('2018-01-01T09:00:00.000000000', 1, 0.5 ),
14021405
('2018-01-01T09:01:00.000000000', 2, 0.75)],
14031406
dtype=[('index', '<M8[ns]'), ('A', '<i8'), ('B', '<f8')])
1407+
1408+
By default, strings are recorded as dtype `O` for object:
1409+
1410+
>>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
1411+
... index=['a', 'b'])
1412+
>>> df.to_records()
1413+
rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
1414+
dtype=[('index', 'O'), ('A', '<i8'), ('B', 'O')])
1415+
1416+
This can be inefficient (e.g. for short strings, or when storing with
1417+
`np.save()`). They can be recorded as dtype `S` for zero-terminated
1418+
bytes instead:
1419+
1420+
>>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
1421+
... index=['a', 'b'])
1422+
>>> df.to_records(strings_as_bytes=True)
1423+
rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
1424+
dtype=[('index', 'O'), ('A', '<i8'), ('B', 'S4')])
14041425
"""
14051426

14061427
if convert_datetime64 is not None:
@@ -1436,7 +1457,16 @@ def to_records(self, index=True, convert_datetime64=None):
14361457
arrays = [self[c].get_values() for c in self.columns]
14371458
names = lmap(compat.text_type, self.columns)
14381459

1439-
formats = [v.dtype for v in arrays]
1460+
if strings_as_bytes:
1461+
# GH18146
1462+
# for string arrays, set dtype as zero-terminated bytes with max
1463+
# length equal to that of the longest string
1464+
formats = ['S{}'.format(max(lmap(len, v)))
1465+
if v.dtype == '|O'
1466+
else v.dtype
1467+
for v in arrays]
1468+
else:
1469+
formats = [v.dtype for v in arrays]
14401470
return np.rec.fromarrays(
14411471
arrays,
14421472
dtype={'names': names, 'formats': formats}

pandas/tests/frame/test_convert_to.py

+11
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,17 @@ def test_to_records_with_categorical(self):
186186
dtype=[('index', '=i8'), ('0', 'O')])
187187
tm.assert_almost_equal(result, expected)
188188

189+
def test_to_records_with_strings_as_bytes(self):
190+
191+
# GH18146
192+
193+
df = DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
194+
index=['a', 'b'])
195+
result = df.to_records(strings_as_bytes=True)
196+
expected = np.rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
197+
dtype=[('index', 'O'), ('A', '<i8'), ('B', 'S4')])
198+
tm.assert_almost_equal(result, expected)
199+
189200
@pytest.mark.parametrize('mapping', [
190201
dict,
191202
collections.defaultdict(list),

0 commit comments

Comments
 (0)