Switch dataframe constructor to use dispatch

saulshanabrook · saulshanabrook · commit 6fe0831fdf3d · 2020-03-19T17:39:52.000-04:00
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -14,6 +14,7 @@
 import datetime
 from io import StringIO
 import itertools
+import functools
 from textwrap import dedent
 from typing import (
     IO,
@@ -36,6 +37,7 @@
 
 import numpy as np
 import numpy.ma as ma
+import numpy.ma.mrecords as mrecords
 
 from pandas._config import get_option
 
@@ -427,97 +429,9 @@ def __init__(
         dtype: Optional[Dtype] = None,
         copy: bool = False,
     ):
-        if data is None:
-            data = {}
         if dtype is not None:
             dtype = self._validate_dtype(dtype)
-
-        if isinstance(data, DataFrame):
-            data = data._data
-
-        if isinstance(data, BlockManager):
-            mgr = self._init_mgr(
-                data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
-            )
-        elif isinstance(data, dict):
-            mgr = init_dict(data, index, columns, dtype=dtype)
-        elif isinstance(data, ma.MaskedArray):
-            import numpy.ma.mrecords as mrecords
-
-            # masked recarray
-            if isinstance(data, mrecords.MaskedRecords):
-                mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)
-
-            # a masked array
-            else:
-                mask = ma.getmaskarray(data)
-                if mask.any():
-                    data, fill_value = maybe_upcast(data, copy=True)
-                    data.soften_mask()  # set hardmask False if it was True
-                    data[mask] = fill_value
-                else:
-                    data = data.copy()
-                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
-
-        elif isinstance(data, (np.ndarray, Series, Index)):
-            if data.dtype.names:
-                data_columns = list(data.dtype.names)
-                data = {k: data[k] for k in data_columns}
-                if columns is None:
-                    columns = data_columns
-                mgr = init_dict(data, index, columns, dtype=dtype)
-            elif getattr(data, "name", None) is not None:
-                mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
-            else:
-                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
-
-        # For data is list-like, or Iterable (will consume into list)
-        elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
-            if not isinstance(data, (abc.Sequence, ExtensionArray)):
-                data = list(data)
-            if len(data) > 0:
-                if is_dataclass(data[0]):
-                    data = dataclasses_to_dicts(data)
-                if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
-                    if is_named_tuple(data[0]) and columns is None:
-                        columns = data[0]._fields
-                    arrays, columns = to_arrays(data, columns, dtype=dtype)
-                    columns = ensure_index(columns)
-
-                    # set the index
-                    if index is None:
-                        if isinstance(data[0], Series):
-                            index = get_names_from_index(data)
-                        elif isinstance(data[0], Categorical):
-                            index = ibase.default_index(len(data[0]))
-                        else:
-                            index = ibase.default_index(len(data))
-
-                    mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
-                else:
-                    mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
-            else:
-                mgr = init_dict({}, index, columns, dtype=dtype)
-        else:
-            try:
-                arr = np.array(data, dtype=dtype, copy=copy)
-            except (ValueError, TypeError) as err:
-                exc = TypeError(
-                    "DataFrame constructor called with "
-                    f"incompatible data and dtype: {err}"
-                )
-                raise exc from err
-
-            if arr.ndim == 0 and index is not None and columns is not None:
-                values = cast_scalar_to_array(
-                    (len(index), len(columns)), data, dtype=dtype
-                )
-                mgr = init_ndarray(
-                    values, index, columns, dtype=values.dtype, copy=False
-                )
-            else:
-                raise ValueError("DataFrame constructor not properly called!")
-
+        mgr = create_block_manager(data, self, index, columns, dtype, copy)
         NDFrame.__init__(self, mgr)
 
     # ----------------------------------------------------------------------
@@ -8548,6 +8462,133 @@ def isin(self, values) -> "DataFrame":
 ops.add_special_arithmetic_methods(DataFrame)
 
 
+@functools.singledispatch
+def create_block_manager(
+    data: Any,
+    df: DataFrame,
+    index: Optional[Axes],
+    columns: Optional[Axes],
+    dtype: Optional[Dtype],
+    copy: bool,
+) -> BlockManager:
+    """
+    Convert ``data`` into a BlockManager for the DataFrame constructor.
+
+    This is the ``functools.singledispatch`` fallback, used when no implementation
+    is registered for the type of ``data``. To convert a custom object, register
+    an implementation for its type with ``create_block_manager.register``.
+
+    The fallback only accepts ``data`` that ``np.array`` turns into a 0-dimensional
+    array, and only when both ``index`` and ``columns`` are given.
+
+    Raises TypeError if ``np.array`` raises ValueError or TypeError for ``data``
+    and ``dtype``; raises ValueError for every other unsupported call.
+    """
+    try:
+        as_array = np.array(data, dtype=dtype, copy=copy)
+    except (ValueError, TypeError) as err:
+        msg = f"DataFrame constructor called with incompatible data and dtype: {err}"
+        raise TypeError(msg) from err
+
+    if as_array.ndim != 0 or index is None or columns is None:
+        raise ValueError("DataFrame constructor not properly called!")
+
+    values = cast_scalar_to_array((len(index), len(columns)), data, dtype=dtype)
+    return init_ndarray(values, index, columns, dtype=values.dtype, copy=False)
+
+@create_block_manager.register(type(None))  # explicit type, no annotation lookup
+def _create_block_manager_none(data: None, *args, **kwargs):
+    return create_block_manager({}, *args, **kwargs)  # None is handled as an empty dict
+
+@create_block_manager.register
+def _create_block_manager_dataframe(data: DataFrame, *args, **kwargs):
+    return create_block_manager(data._data, *args, **kwargs)  # re-dispatch on ._data
+
+
+@create_block_manager.register
+def _create_block_manager_block_manager(data: BlockManager, df, index, columns, dtype, copy):
+    """Build the manager from an existing BlockManager via ``df._init_mgr``."""
+    return df._init_mgr(
+        data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
+    )
+
+@create_block_manager.register
+def _create_block_manager_dict(data: dict, df, index, columns, dtype, copy):
+    return init_dict(data, index, columns, dtype=dtype)  # df and copy are not used
+
+
+@create_block_manager.register
+def _create_block_manager_masked_array(data: ma.MaskedArray, df, index, columns, dtype, copy):
+    mask = ma.getmaskarray(data)  # full boolean mask, even when nothing is masked
+    if mask.any():
+        data, fill_value = maybe_upcast(data, copy=True)  # rebinds data; copy requested
+        data.soften_mask()  # set hardmask False if it was True
+        data[mask] = fill_value  # overwrite every masked entry with the fill value
+    else:
+        data = data.copy()  # nothing masked: continue with a copy of the input
+    return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+
+
+@create_block_manager.register
+def _create_block_manager_masked_record(data: mrecords.MaskedRecords, df, index, columns, dtype, copy):
+    return masked_rec_array_to_mgr(data, index, columns, dtype, copy)  # df is not used
+
+@create_block_manager.register(np.ndarray)
+@create_block_manager.register(Series)
+@create_block_manager.register(Index)
+def _create_block_manager_array_series_index(data: Union[np.ndarray, Series, Index], df, index, columns, dtype, copy):
+    """Build a manager from an ndarray, Series or Index (df is not used)."""
+    field_names = data.dtype.names
+    if field_names:  # dtype has named fields: one dict entry per field, in order
+        by_field = {field: data[field] for field in field_names}
+        labels = list(field_names) if columns is None else columns
+        return init_dict(by_field, index, labels, dtype=dtype)
+    if getattr(data, "name", None) is not None:
+        return init_dict({data.name: data}, index, columns, dtype=dtype)
+    return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+
+class _IterableExceptStringOrBytesMeta(type):
+    """Metaclass whose subclass check accepts iterable types other than str/bytes."""
+
+    def __subclasscheck__(cls, sub: Type) -> bool:
+        # consulted by issubclass(sub, <class that uses this metaclass>)
+        return issubclass(sub, abc.Iterable) and not issubclass(sub, (str, bytes))
+
+class _IterableExceptStringOrBytes(metaclass=_IterableExceptStringOrBytesMeta):
+    """
+    Marker type for singledispatch registration: issubclass() against this class
+    is true for iterable types that are not subclasses of str or bytes.
+    """
+    pass
+
+
+@create_block_manager.register
+def _create_block_manager_iterable(data: _IterableExceptStringOrBytes, df, index, columns, dtype, copy):
+    if not isinstance(data, (abc.Sequence, ExtensionArray)):
+        data = list(data)  # materialize so len() and data[0] can be used below
+    if len(data) > 0:
+        if is_dataclass(data[0]):
+            data = dataclasses_to_dicts(data)
+        if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
+            if is_named_tuple(data[0]) and columns is None:
+                columns = data[0]._fields  # default column labels: namedtuple fields
+            arrays, columns = to_arrays(data, columns, dtype=dtype)
+            columns = ensure_index(columns)
+
+            # set the index; the rule is chosen from the type of the first element
+            if index is None:
+                if isinstance(data[0], Series):
+                    index = get_names_from_index(data)
+                elif isinstance(data[0], Categorical):
+                    index = ibase.default_index(len(data[0]))
+                else:
+                    index = ibase.default_index(len(data))
+
+            return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
+        return init_ndarray(data, index, columns, dtype=dtype, copy=copy)  # data[0] not 1-d
+    return init_dict({}, index, columns, dtype=dtype)  # empty input
+
+
 def _from_nested_dict(data):
     # TODO: this should be seriously cythonized
     new_data = collections.defaultdict(dict)