|
15 | 15 | is_bool_dtype,
|
16 | 16 | is_datetime64_any_dtype,
|
17 | 17 | is_numeric_dtype,
|
18 |
| - is_string_dtype, |
19 | 18 | is_timedelta64_dtype,
|
20 | 19 | )
|
21 | 20 | from tlz import first, merge, partition_all, remove, unique
|
@@ -323,72 +322,6 @@ def _scalar_binary(op, self, other, inv=False):
|
323 | 322 | return Scalar(graph, name, meta)
|
324 | 323 |
|
325 | 324 |
|
326 |
| -def _is_pyarrow_string(dtype): |
327 |
| - if not PANDAS_GT_130: |
328 |
| - return False |
329 |
| - |
330 |
| - if PANDAS_GT_150: |
331 |
| - import pyarrow as pa |
332 |
| - |
333 |
| - types = [pd.StringDtype("pyarrow"), pd.ArrowDtype(pa.string())] |
334 |
| - else: |
335 |
| - types = [pd.StringDtype("pyarrow")] |
336 |
| - if dtype in types: |
337 |
| - return True |
338 |
| - return False |
339 |
| - |
340 |
| - |
341 |
| -def _is_object_string_dtype(dtype): |
342 |
| - """Determine if input is a non-pyarrow string dtype""" |
343 |
| - return is_string_dtype(dtype) and not _is_pyarrow_string(dtype) |
344 |
| - |
345 |
| - |
346 |
| -def _index_check(x): |
347 |
| - return ( |
348 |
| - is_index_like(x) |
349 |
| - and _is_object_string_dtype(x.dtype) |
350 |
| - and not isinstance( |
351 |
| - x, pd.MultiIndex |
352 |
| - ) # MultiIndex don't support non-object dtypes |
353 |
| - ) |
354 |
| - |
355 |
| - |
356 |
| -def _series_check(x): |
357 |
| - return is_series_like(x) and ( |
358 |
| - _is_object_string_dtype(x.dtype) or _index_check(x.index) |
359 |
| - ) |
360 |
| - |
361 |
| - |
362 |
| -def _dataframe_check(x): |
363 |
| - return is_dataframe_like(x) and ( |
364 |
| - any(_series_check(s) for _, s in x.items()) or _index_check(x.index) |
365 |
| - ) |
366 |
| - |
367 |
| - |
368 |
| -def to_pyarrow_string(df): |
369 |
| - if not (is_dataframe_like(df) or is_series_like(df) or is_index_like(df)): |
370 |
| - return df |
371 |
| - |
372 |
| - # Possibly convert DataFrame/Series/Index to `string[pyarrow]` |
373 |
| - dtypes = None |
374 |
| - if is_dataframe_like(df): |
375 |
| - dtypes = { |
376 |
| - col: pd.StringDtype("pyarrow") |
377 |
| - for col, s in df.items() |
378 |
| - if _is_object_string_dtype(s.dtype) |
379 |
| - } |
380 |
| - elif _is_object_string_dtype(df.dtype): |
381 |
| - dtypes = pd.StringDtype("pyarrow") |
382 |
| - |
383 |
| - if dtypes is not None: |
384 |
| - df = df.astype(dtypes) |
385 |
| - |
386 |
| - # Convert DataFrame/Series index too |
387 |
| - if (is_dataframe_like(df) or is_series_like(df)) and _index_check(df.index): |
388 |
| - df.index = df.index.astype(pd.StringDtype("pyarrow")) |
389 |
| - return df |
390 |
| - |
391 |
| - |
392 | 325 | class _Frame(DaskMethodsMixin, OperatorMethodMixin):
|
393 | 326 | """Superclass for DataFrame and Series
|
394 | 327 |
|
@@ -436,7 +369,18 @@ def __init__(self, dsk, name, meta, divisions):
|
436 | 369 | f"pandas={str(PANDAS_VERSION)} is currently using used."
|
437 | 370 | )
|
438 | 371 |
|
439 |
| - if _dataframe_check(meta) or _series_check(meta) or _index_check(meta): |
| 372 | + from dask.dataframe._pyarrow_utils import ( |
| 373 | + is_object_string_dataframe, |
| 374 | + is_object_string_index, |
| 375 | + is_object_string_series, |
| 376 | + to_pyarrow_string, |
| 377 | + ) |
| 378 | + |
| 379 | + if ( |
| 380 | + is_object_string_dataframe(meta) |
| 381 | + or is_object_string_series(meta) |
| 382 | + or is_object_string_index(meta) |
| 383 | + ): |
440 | 384 | result = self.map_partitions(to_pyarrow_string)
|
441 | 385 | self.dask = result.dask
|
442 | 386 | self._name = result._name
|
|
0 commit comments