Skip to content

Switch dataframe constructor to use dispatch #32844

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 14 commits into from
228 changes: 139 additions & 89 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import datetime
from io import StringIO
import itertools
import functools
from textwrap import dedent
from typing import (
IO,
Expand All @@ -36,6 +37,7 @@

import numpy as np
import numpy.ma as ma
import numpy.ma.mrecords as mrecords
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we import this lazily to speed up import time.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought maybe that was the case. However, looking at the PR that added the import (https://github.com/pandas-dev/pandas/pull/5579/files) I didn't see any discussion about import time. Do you know of any benchmarks I could run to back this up?

If it's a deal breaker, I can move the mrecords check into the ma check like it was in the original code.


from pandas._config import get_option

Expand Down Expand Up @@ -427,97 +429,9 @@ def __init__(
dtype: Optional[Dtype] = None,
copy: bool = False,
):
if data is None:
data = {}
if dtype is not None:
dtype = self._validate_dtype(dtype)

if isinstance(data, DataFrame):
data = data._data

if isinstance(data, BlockManager):
mgr = self._init_mgr(
data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
)
elif isinstance(data, dict):
mgr = init_dict(data, index, columns, dtype=dtype)
elif isinstance(data, ma.MaskedArray):
import numpy.ma.mrecords as mrecords

# masked recarray
if isinstance(data, mrecords.MaskedRecords):
mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)

# a masked array
else:
mask = ma.getmaskarray(data)
if mask.any():
data, fill_value = maybe_upcast(data, copy=True)
data.soften_mask() # set hardmask False if it was True
data[mask] = fill_value
else:
data = data.copy()
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

elif isinstance(data, (np.ndarray, Series, Index)):
if data.dtype.names:
data_columns = list(data.dtype.names)
data = {k: data[k] for k in data_columns}
if columns is None:
columns = data_columns
mgr = init_dict(data, index, columns, dtype=dtype)
elif getattr(data, "name", None) is not None:
mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
else:
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)

# For data is list-like, or Iterable (will consume into list)
elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
if not isinstance(data, (abc.Sequence, ExtensionArray)):
data = list(data)
if len(data) > 0:
if is_dataclass(data[0]):
data = dataclasses_to_dicts(data)
if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
if is_named_tuple(data[0]) and columns is None:
columns = data[0]._fields
arrays, columns = to_arrays(data, columns, dtype=dtype)
columns = ensure_index(columns)

# set the index
if index is None:
if isinstance(data[0], Series):
index = get_names_from_index(data)
elif isinstance(data[0], Categorical):
index = ibase.default_index(len(data[0]))
else:
index = ibase.default_index(len(data))

mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
else:
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
else:
mgr = init_dict({}, index, columns, dtype=dtype)
else:
try:
arr = np.array(data, dtype=dtype, copy=copy)
except (ValueError, TypeError) as err:
exc = TypeError(
"DataFrame constructor called with "
f"incompatible data and dtype: {err}"
)
raise exc from err

if arr.ndim == 0 and index is not None and columns is not None:
values = cast_scalar_to_array(
(len(index), len(columns)), data, dtype=dtype
)
mgr = init_ndarray(
values, index, columns, dtype=values.dtype, copy=False
)
else:
raise ValueError("DataFrame constructor not properly called!")

mgr = create_block_manager(data, self, index, columns, dtype, copy)
NDFrame.__init__(self, mgr)

# ----------------------------------------------------------------------
Expand Down Expand Up @@ -8548,6 +8462,142 @@ def isin(self, values) -> "DataFrame":
ops.add_special_arithmetic_methods(DataFrame)


@functools.singledispatch
def create_block_manager(
    data: Any,
    df: "DataFrame",
    index: "Optional[Axes]",
    columns: "Optional[Axes]",
    dtype: "Optional[Dtype]",
    copy: bool,
) -> "BlockManager":
    """
    Convert an object into a BlockManager.

    Used inside the DataFrame constructor, so if you want to provide a custom
    way to convert from your object to a DataFrame you can register a dispatch
    on this function.

    Parameters
    ----------
    data : Any
        The object to convert; this fallback coerces via ``np.array``.
    df : DataFrame
        The DataFrame under construction (used by some registered dispatches).
    index, columns : Axes, optional
        Axis labels for the resulting frame.
    dtype : Dtype, optional
        Data type to force; otherwise inferred.
    copy : bool
        Whether to copy the input data.

    Raises
    ------
    TypeError
        If ``data`` cannot be coerced to an ndarray with ``dtype``.
    ValueError
        If ``data`` coerces to a 0-d array but ``index`` and ``columns`` are
        not both provided.
    """
    # Base case is to try to cast to NumPy array
    try:
        arr = np.array(data, dtype=dtype, copy=copy)
    except (ValueError, TypeError) as err:
        raise TypeError(
            f"DataFrame constructor called with incompatible data and dtype: {err}"
        ) from err

    if arr.ndim == 0 and index is not None and columns is not None:
        # Scalar input: broadcast it across the requested (index, columns) shape.
        values = cast_scalar_to_array((len(index), len(columns)), data, dtype=dtype)
        return init_ndarray(values, index, columns, dtype=values.dtype, copy=False)
    else:
        raise ValueError("DataFrame constructor not properly called!")


@create_block_manager.register
def _create_block_manager_none(data: None, *args, **kwargs):
    """Handle ``data=None`` by building the manager from an empty mapping."""
    # An empty dict re-dispatches to the dict handler, yielding an empty frame.
    empty_data: dict = {}
    return create_block_manager(empty_data, *args, **kwargs)


@create_block_manager.register
def _create_block_manager_dataframe(data: DataFrame, *args, **kwargs):
    """Handle an existing DataFrame by dispatching on its internal BlockManager."""
    internal_mgr = data._data
    return create_block_manager(internal_mgr, *args, **kwargs)


@create_block_manager.register
def _create_block_manager_blockmanager(
    data: BlockManager, df, index, columns, dtype, copy
):
    """Re-use an existing BlockManager, reindexing/casting via ``df._init_mgr``."""
    requested_axes = dict(index=index, columns=columns)
    return df._init_mgr(data, axes=requested_axes, dtype=dtype, copy=copy)


@create_block_manager.register
def _create_block_manager_dict(data: dict, df, index, columns, dtype, copy):
    """Handle a mapping of column labels to values."""
    # NOTE: ``copy`` is not forwarded here, matching the pre-dispatch constructor.
    mgr = init_dict(data, index, columns, dtype=dtype)
    return mgr


@create_block_manager.register
def _create_block_manager_masked_array(
    data: ma.MaskedArray, df, index, columns, dtype, copy
):
    """Handle a (non-record) masked array, filling masked entries."""
    mask = ma.getmaskarray(data)
    if not mask.any():
        # Nothing masked: a plain copy suffices.
        values = data.copy()
    else:
        # Upcast so the fill value is representable, then fill the masked slots.
        values, fill_value = maybe_upcast(data, copy=True)
        values.soften_mask()  # set hardmask False if it was True
        values[mask] = fill_value
    return init_ndarray(values, index, columns, dtype=dtype, copy=copy)


@create_block_manager.register
def _create_block_manager_masked_record(
    data: mrecords.MaskedRecords, df, index, columns, dtype, copy
):
    """Handle a masked recarray (``numpy.ma.mrecords.MaskedRecords``)."""
    # MaskedRecords is a MaskedArray subclass; singledispatch picks this
    # more specific handler over the MaskedArray one.
    mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)
    return mgr


@create_block_manager.register(np.ndarray)
@create_block_manager.register(Series)
@create_block_manager.register(Index)
def _create_block_manager_array_series_index(
    data: Union[np.ndarray, Series, Index], df, index, columns, dtype, copy
):
    """Handle an ndarray, Series, or Index."""
    field_names = data.dtype.names
    if field_names:
        # Structured/record array: each named field becomes a column.
        field_data = {name: data[name] for name in field_names}
        if columns is None:
            columns = list(field_names)
        return init_dict(field_data, index, columns, dtype=dtype)
    if getattr(data, "name", None) is not None:
        # A named Series/Index yields a single-column frame keyed by that name.
        return init_dict({data.name: data}, index, columns, dtype=dtype)
    return init_ndarray(data, index, columns, dtype=dtype, copy=copy)


class _IterableExceptStringOrBytesMeta(type):
def __subclasscheck__(cls, sub: Type) -> bool:
return not issubclass(sub, (str, bytes)) and issubclass(sub, abc.Iterable)


class _IterableExceptStringOrBytes(metaclass=_IterableExceptStringOrBytesMeta):
"""
Class that is subclass of iterable but not of str or bytes to use for singledispatch
registration
"""

pass


@create_block_manager.register
def _create_block_manager_iterable(
    data: _IterableExceptStringOrBytes, df, index, columns, dtype, copy
):
    """
    Handle a list-like or generic iterable (str/bytes are excluded by the
    registration target's metaclass).

    Mirrors the list-handling branch of the pre-dispatch DataFrame
    constructor: non-sequence iterables are consumed into a list, rows of
    1-D list-likes go through ``to_arrays``/``arrays_to_mgr``, and anything
    else falls back to ``init_ndarray``.
    """
    # Consume generators/iterators into a list; Sequences and ExtensionArrays
    # already support len() and indexing, so they are used as-is.
    if not isinstance(data, (abc.Sequence, ExtensionArray)):
        data = list(data)
    if len(data) > 0:
        # Dataclass rows are converted to dicts before inspecting data[0],
        # so the list-like check below sees the converted form.
        if is_dataclass(data[0]):
            data = dataclasses_to_dicts(data)
        if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
            # Named tuples supply column labels via their _fields.
            if is_named_tuple(data[0]) and columns is None:
                columns = data[0]._fields
            arrays, columns = to_arrays(data, columns, dtype=dtype)
            columns = ensure_index(columns)

            # set the index
            if index is None:
                if isinstance(data[0], Series):
                    index = get_names_from_index(data)
                elif isinstance(data[0], Categorical):
                    index = ibase.default_index(len(data[0]))
                else:
                    index = ibase.default_index(len(data))

            # NOTE(review): ``columns`` is passed both as arr_columns and as
            # columns, matching the original constructor code.
            return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
        # Rows are scalars or >1-D: treat the whole thing as an ndarray.
        return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
    # Empty iterable: build an empty frame.
    return init_dict({}, index, columns, dtype=dtype)


def _from_nested_dict(data):
# TODO: this should be seriously cythonized
new_data = collections.defaultdict(dict)
Expand Down