@@ -266,21 +266,29 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
266
266
267
267
assert buffers ["offsets" ], "String buffers must contain offsets"
268
268
# Retrieve the data buffer containing the UTF-8 code units
269
- data_buff , protocol_data_dtype = buffers ["data" ]
270
- # We're going to reinterpret the buffer as uint8, so make sure we can do it safely
271
- assert protocol_data_dtype [1 ] == 8
272
- assert protocol_data_dtype [2 ] in (
273
- ArrowCTypes .STRING ,
274
- ArrowCTypes .LARGE_STRING ,
275
- ) # format_str == utf-8
276
- # Convert the buffers to NumPy arrays. In order to go from STRING to
277
- # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
278
- data_dtype = (
279
- DtypeKind .UINT ,
280
- 8 ,
281
- ArrowCTypes .UINT8 ,
282
- Endianness .NATIVE ,
283
- )
269
+ data_buff , data_dtype = buffers ["data" ]
270
+
271
+ if (data_dtype [1 ] == 8 ) and (
272
+ data_dtype [2 ]
273
+ in (
274
+ ArrowCTypes .STRING ,
275
+ ArrowCTypes .LARGE_STRING ,
276
+ )
277
+ ): # format_str == utf-8
278
+ # temporary workaround to keep backwards compatibility due to
279
+ # https://github.com/pandas-dev/pandas/issues/54781
280
+
281
+ # We're going to reinterpret the buffer as uint8, so make sure we can do it
282
+ # safely
283
+
284
+ # Convert the buffers to NumPy arrays. In order to go from STRING to
285
+ # an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array)
286
+ data_dtype = (
287
+ DtypeKind .UINT ,
288
+ 8 ,
289
+ ArrowCTypes .UINT8 ,
290
+ Endianness .NATIVE ,
291
+ )
284
292
# Specify zero offset as we don't want to chunk the string data
285
293
data = buffer_to_ndarray (data_buff , data_dtype , offset = 0 , length = data_buff .bufsize )
286
294
@@ -378,16 +386,22 @@ def datetime_column_to_ndarray(col: Column) -> tuple[np.ndarray | pd.Series, Any
378
386
buffers = col .get_buffers ()
379
387
380
388
_ , _ , format_str , _ = col .dtype
381
- dbuf , dtype = buffers ["data" ]
382
- # Consider dtype being `uint` to get number of units passed since the 01.01.1970
383
- data = buffer_to_ndarray (
384
- dbuf ,
385
- (
389
+ dbuf , data_dtype = buffers ["data" ]
390
+
391
+ if data_dtype [0 ] == DtypeKind .DATETIME :
392
+ # temporary workaround to keep backwards compatibility due to
393
+ # https://github.com/pandas-dev/pandas/issues/54781
394
+ # Treat the dtype as `uint` to get the number of units elapsed since 1970-01-01
395
+ data_dtype = (
386
396
DtypeKind .UINT ,
387
- dtype [1 ],
388
- getattr (ArrowCTypes , f"UINT{ dtype [1 ]} " ),
397
+ data_dtype [1 ],
398
+ getattr (ArrowCTypes , f"UINT{ data_dtype [1 ]} " ),
389
399
Endianness .NATIVE ,
390
- ),
400
+ )
401
+
402
+ data = buffer_to_ndarray (
403
+ dbuf ,
404
+ data_dtype ,
391
405
offset = col .offset ,
392
406
length = col .size (),
393
407
)
0 commit comments