@@ -1335,7 +1335,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None,
1335
1335
1336
1336
return cls (mgr )
1337
1337
1338
- def to_records (self , index = True , convert_datetime64 = None ):
1338
+ def to_records (self , index = True , convert_datetime64 = None , strings_as_bytes = False ):
1339
1339
"""
1340
1340
Convert DataFrame to a NumPy record array.
1341
1341
@@ -1351,6 +1351,9 @@ def to_records(self, index=True, convert_datetime64=None):
1351
1351
1352
1352
Whether to convert the index to datetime.datetime if it is a
1353
1353
DatetimeIndex.
1354
+ strings_as_bytes : boolean, default False
1355
+ Store strings as bytes ('S' dtype) instead of Python objects
1356
+ ('O' dtype).
1354
1357
1355
1358
Returns
1356
1359
-------
@@ -1401,6 +1404,24 @@ def to_records(self, index=True, convert_datetime64=None):
1401
1404
rec.array([('2018-01-01T09:00:00.000000000', 1, 0.5 ),
1402
1405
('2018-01-01T09:01:00.000000000', 2, 0.75)],
1403
1406
dtype=[('index', '<M8[ns]'), ('A', '<i8'), ('B', '<f8')])
1407
+
1408
+ By default, strings are recorded as dtype `O` for object:
1409
+
1410
+ >>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
1411
+ ... index=['a', 'b'])
1412
+ >>> df.to_records()
1413
+ rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
1414
+ dtype=[('index', 'O'), ('A', '<i8'), ('B', 'O')])
1415
+
1416
+ This can be inefficient (e.g. for short strings, or when storing with
1417
+ `np.save()`). They can be recorded as dtype `S` for zero-terminated
1418
+ bytes instead:
1419
+
1420
+ >>> df = pd.DataFrame({'A': [1, 2], 'B': ['abc', 'defg']},
1421
+ ... index=['a', 'b'])
1422
+ >>> df.to_records(strings_as_bytes=True)
1423
+ rec.array([('a', 1, 'abc'), ('b', 2, 'defg')],
1424
+ dtype=[('index', 'O'), ('A', '<i8'), ('B', 'S4')])
1404
1425
"""
1405
1426
1406
1427
if convert_datetime64 is not None :
@@ -1436,7 +1457,16 @@ def to_records(self, index=True, convert_datetime64=None):
1436
1457
arrays = [self [c ].get_values () for c in self .columns ]
1437
1458
names = lmap (compat .text_type , self .columns )
1438
1459
1439
- formats = [v .dtype for v in arrays ]
1460
+ if strings_as_bytes :
1461
+ # GH18146
1462
+ # for string arrays, set dtype as zero-terminated bytes with max
1463
+ # length equal to that of the longest string
1464
+ formats = ['S{}' .format (max (lmap (len , v )))
1465
+ if v .dtype == '|O'
1466
+ else v .dtype
1467
+ for v in arrays ]
1468
+ else :
1469
+ formats = [v .dtype for v in arrays ]
1440
1470
return np .rec .fromarrays (
1441
1471
arrays ,
1442
1472
dtype = {'names' : names , 'formats' : formats }
0 commit comments