@@ -1335,7 +1335,8 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
1335
1335
1336
1336
return cls (mgr )
1337
1337
1338
- def to_records (self , index = True , convert_datetime64 = None ):
1338
+ def to_records (self , index = True , convert_datetime64 = None ,
1339
+ strings_as_bytes = False ):
1339
1340
"""
1340
1341
Convert DataFrame to a NumPy record array.
1341
1342
@@ -1351,6 +1352,9 @@ def to_records(self, index=True, convert_datetime64=None):
1351
1352
1352
1353
Whether to convert the index to datetime.datetime if it is a
1353
1354
DatetimeIndex.
1355
+ strings_as_bytes : boolean, default False
1356
+ Store strings as bytes ('S' dtype) instead of Python objects
1357
+ ('O' dtype)
1354
1358
1355
1359
Returns
1356
1360
-------
@@ -1401,6 +1405,24 @@ def to_records(self, index=True, convert_datetime64=None):
1401
1405
rec.array([('2018-01-01T09:00:00.000000000', 1, 0.5 ),
1402
1406
('2018-01-01T09:01:00.000000000', 2, 0.75)],
1403
1407
dtype=[('index', '<M8[ns]'), ('A', '<i8'), ('B', '<f8')])
1408
+
1409
+ By default, strings are recorded as dtype `O` for object:
1410
+
1411
+ >>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
1412
+ ... index=['a', 'b'])
1413
+ >>> df.to_records()
1414
+ rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
1415
+ dtype=[('index', 'O'), ('A', '<i8'), ('B', 'O')])
1416
+
1417
+ This can be inefficient (e.g. for short strings, or when storing with
1418
+ `np.save()`). They can be recorded as dtype `S` for zero-terminated
1419
+ bytes instead:
1420
+
1421
+ >>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
1422
+ ... index=['a', 'b'])
1423
+ >>> df.to_records(strings_as_bytes=True)
1424
+ rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
1425
+ dtype=[('index', 'S1'), ('A', '<i8'), ('B', 'S4')])
1404
1426
"""
1405
1427
1406
1428
if convert_datetime64 is not None :
@@ -1436,7 +1458,16 @@ def to_records(self, index=True, convert_datetime64=None):
1436
1458
arrays = [self [c ].get_values () for c in self .columns ]
1437
1459
names = lmap (compat .text_type , self .columns )
1438
1460
1439
- formats = [v .dtype for v in arrays ]
1461
+ if strings_as_bytes :
1462
+ # GH18146
1463
+ # for string arrays, set dtype as zero-terminated bytes with max
1464
+ # length equal to that of the longest string
1465
+ formats = ['S{}' .format (max (map (len , v )))
1466
+ if v .dtype == '|O'
1467
+ else v .dtype
1468
+ for v in arrays ]
1469
+ else :
1470
+ formats = [v .dtype for v in arrays ]
1440
1471
return np .rec .fromarrays (
1441
1472
arrays ,
1442
1473
dtype = {'names' : names , 'formats' : formats }
0 commit comments