Move into dedicated module

jrbourbeau · j-bennet · commit 1cb49d9e003a · 2023-02-17T09:08:42.000-08:00
diff --git a/dask/dataframe/_pyarrow_utils.py b/dask/dataframe/_pyarrow_utils.py
@@ -0,0 +1,79 @@
+import pandas as pd
+
+from dask.dataframe._compat import PANDAS_GT_130, PANDAS_GT_150
+from dask.dataframe.utils import is_dataframe_like, is_index_like, is_series_like
+
+try:
+    import pyarrow as pa
+except ImportError:
+    pa = None
+
+
+PYARROW_STRINGS_AVAILABLE: bool = pa is not None and PANDAS_GT_130
+
+
+def is_pyarrow_string_dtype(dtype):
+    if not PYARROW_STRINGS_AVAILABLE:
+        return False
+
+    if PANDAS_GT_150:
+        types = [pd.StringDtype("pyarrow"), pd.ArrowDtype(pa.string())]
+    else:
+        types = [pd.StringDtype("pyarrow")]
+    if dtype in types:
+        return True
+    return False
+
+
+def is_object_string_dtype(dtype):
+    """Determine if input is a non-pyarrow string dtype"""
+    return pd.api.types.is_string_dtype(dtype) and not is_pyarrow_string_dtype(dtype)
+
+
+def is_object_string_index(x):
+    return (
+        is_index_like(x)
+        and is_object_string_dtype(x.dtype)
+        and not isinstance(
+            x, pd.MultiIndex
+        )  # MultiIndex don't support non-object dtypes
+    )
+
+
+def is_object_string_series(x):
+    return is_series_like(x) and (
+        is_object_string_dtype(x.dtype) or is_object_string_index(x.index)
+    )
+
+
+def is_object_string_dataframe(x):
+    return is_dataframe_like(x) and (
+        any(is_object_string_series(s) for _, s in x.items())
+        or is_object_string_index(x.index)
+    )
+
+
+def to_pyarrow_string(df):
+    if not (is_dataframe_like(df) or is_series_like(df) or is_index_like(df)):
+        return df
+
+    # Possibly convert DataFrame/Series/Index to `string[pyarrow]`
+    dtypes = None
+    if is_dataframe_like(df):
+        dtypes = {
+            col: pd.StringDtype("pyarrow")
+            for col, s in df.items()
+            if is_object_string_dtype(s.dtype)
+        }
+    elif is_object_string_dtype(df.dtype):
+        dtypes = pd.StringDtype("pyarrow")
+
+    if dtypes is not None:
+        df = df.astype(dtypes)
+
+    # Convert DataFrame/Series index too
+    if (is_dataframe_like(df) or is_series_like(df)) and is_object_string_index(
+        df.index
+    ):
+        df.index = df.index.astype(pd.StringDtype("pyarrow"))
+    return df
diff --git a/dask/dataframe/core.py b/dask/dataframe/core.py
@@ -15,7 +15,6 @@
     is_bool_dtype,
     is_datetime64_any_dtype,
     is_numeric_dtype,
-    is_string_dtype,
     is_timedelta64_dtype,
 )
 from tlz import first, merge, partition_all, remove, unique
@@ -323,72 +322,6 @@ def _scalar_binary(op, self, other, inv=False):
         return Scalar(graph, name, meta)
 
 
-def _is_pyarrow_string(dtype):
-    if not PANDAS_GT_130:
-        return False
-
-    if PANDAS_GT_150:
-        import pyarrow as pa
-
-        types = [pd.StringDtype("pyarrow"), pd.ArrowDtype(pa.string())]
-    else:
-        types = [pd.StringDtype("pyarrow")]
-    if dtype in types:
-        return True
-    return False
-
-
-def _is_object_string_dtype(dtype):
-    """Determine if input is a non-pyarrow string dtype"""
-    return is_string_dtype(dtype) and not _is_pyarrow_string(dtype)
-
-
-def _index_check(x):
-    return (
-        is_index_like(x)
-        and _is_object_string_dtype(x.dtype)
-        and not isinstance(
-            x, pd.MultiIndex
-        )  # MultiIndex don't support non-object dtypes
-    )
-
-
-def _series_check(x):
-    return is_series_like(x) and (
-        _is_object_string_dtype(x.dtype) or _index_check(x.index)
-    )
-
-
-def _dataframe_check(x):
-    return is_dataframe_like(x) and (
-        any(_series_check(s) for _, s in x.items()) or _index_check(x.index)
-    )
-
-
-def to_pyarrow_string(df):
-    if not (is_dataframe_like(df) or is_series_like(df) or is_index_like(df)):
-        return df
-
-    # Possibly convert DataFrame/Series/Index to `string[pyarrow]`
-    dtypes = None
-    if is_dataframe_like(df):
-        dtypes = {
-            col: pd.StringDtype("pyarrow")
-            for col, s in df.items()
-            if _is_object_string_dtype(s.dtype)
-        }
-    elif _is_object_string_dtype(df.dtype):
-        dtypes = pd.StringDtype("pyarrow")
-
-    if dtypes is not None:
-        df = df.astype(dtypes)
-
-    # Convert DataFrame/Series index too
-    if (is_dataframe_like(df) or is_series_like(df)) and _index_check(df.index):
-        df.index = df.index.astype(pd.StringDtype("pyarrow"))
-    return df
-
-
 class _Frame(DaskMethodsMixin, OperatorMethodMixin):
     """Superclass for DataFrame and Series
 
@@ -436,7 +369,18 @@ def __init__(self, dsk, name, meta, divisions):
                     f"pandas={str(PANDAS_VERSION)} is currently using used."
                 )
 
-            if _dataframe_check(meta) or _series_check(meta) or _index_check(meta):
+            from dask.dataframe._pyarrow_utils import (
+                is_object_string_dataframe,
+                is_object_string_index,
+                is_object_string_series,
+                to_pyarrow_string,
+            )
+
+            if (
+                is_object_string_dataframe(meta)
+                or is_object_string_series(meta)
+                or is_object_string_index(meta)
+            ):
                 result = self.map_partitions(to_pyarrow_string)
                 self.dask = result.dask
                 self._name = result._name
diff --git a/dask/dataframe/utils.py b/dask/dataframe/utils.py
@@ -545,7 +545,7 @@ def assert_eq(
     import dask
 
     if dask.config.get("dataframe.object_as_pyarrow_string"):
-        from dask.dataframe.core import to_pyarrow_string
+        from dask.dataframe._pyarrow_utils import to_pyarrow_string
 
         if not is_dask_collection(a):
             a = to_pyarrow_string(a)