Skip to content

Commit 6fe0831

Browse files
Switch dataframe constructor to use dispatch
1 parent dbd24ad commit 6fe0831

File tree

1 file changed

+130
-89
lines changed

1 file changed

+130
-89
lines changed

pandas/core/frame.py

Lines changed: 130 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import datetime
1515
from io import StringIO
1616
import itertools
17+
import functools
1718
from textwrap import dedent
1819
from typing import (
1920
IO,
@@ -36,6 +37,7 @@
3637

3738
import numpy as np
3839
import numpy.ma as ma
40+
import numpy.ma.mrecords as mrecords
3941

4042
from pandas._config import get_option
4143

@@ -427,97 +429,9 @@ def __init__(
427429
dtype: Optional[Dtype] = None,
428430
copy: bool = False,
429431
):
430-
if data is None:
431-
data = {}
432432
if dtype is not None:
433433
dtype = self._validate_dtype(dtype)
434-
435-
if isinstance(data, DataFrame):
436-
data = data._data
437-
438-
if isinstance(data, BlockManager):
439-
mgr = self._init_mgr(
440-
data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
441-
)
442-
elif isinstance(data, dict):
443-
mgr = init_dict(data, index, columns, dtype=dtype)
444-
elif isinstance(data, ma.MaskedArray):
445-
import numpy.ma.mrecords as mrecords
446-
447-
# masked recarray
448-
if isinstance(data, mrecords.MaskedRecords):
449-
mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy)
450-
451-
# a masked array
452-
else:
453-
mask = ma.getmaskarray(data)
454-
if mask.any():
455-
data, fill_value = maybe_upcast(data, copy=True)
456-
data.soften_mask() # set hardmask False if it was True
457-
data[mask] = fill_value
458-
else:
459-
data = data.copy()
460-
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
461-
462-
elif isinstance(data, (np.ndarray, Series, Index)):
463-
if data.dtype.names:
464-
data_columns = list(data.dtype.names)
465-
data = {k: data[k] for k in data_columns}
466-
if columns is None:
467-
columns = data_columns
468-
mgr = init_dict(data, index, columns, dtype=dtype)
469-
elif getattr(data, "name", None) is not None:
470-
mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
471-
else:
472-
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
473-
474-
# For data is list-like, or Iterable (will consume into list)
475-
elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
476-
if not isinstance(data, (abc.Sequence, ExtensionArray)):
477-
data = list(data)
478-
if len(data) > 0:
479-
if is_dataclass(data[0]):
480-
data = dataclasses_to_dicts(data)
481-
if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1:
482-
if is_named_tuple(data[0]) and columns is None:
483-
columns = data[0]._fields
484-
arrays, columns = to_arrays(data, columns, dtype=dtype)
485-
columns = ensure_index(columns)
486-
487-
# set the index
488-
if index is None:
489-
if isinstance(data[0], Series):
490-
index = get_names_from_index(data)
491-
elif isinstance(data[0], Categorical):
492-
index = ibase.default_index(len(data[0]))
493-
else:
494-
index = ibase.default_index(len(data))
495-
496-
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
497-
else:
498-
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
499-
else:
500-
mgr = init_dict({}, index, columns, dtype=dtype)
501-
else:
502-
try:
503-
arr = np.array(data, dtype=dtype, copy=copy)
504-
except (ValueError, TypeError) as err:
505-
exc = TypeError(
506-
"DataFrame constructor called with "
507-
f"incompatible data and dtype: {err}"
508-
)
509-
raise exc from err
510-
511-
if arr.ndim == 0 and index is not None and columns is not None:
512-
values = cast_scalar_to_array(
513-
(len(index), len(columns)), data, dtype=dtype
514-
)
515-
mgr = init_ndarray(
516-
values, index, columns, dtype=values.dtype, copy=False
517-
)
518-
else:
519-
raise ValueError("DataFrame constructor not properly called!")
520-
434+
mgr = create_block_manager(data, self, index, columns, dtype, copy)
521435
NDFrame.__init__(self, mgr)
522436

523437
# ----------------------------------------------------------------------
@@ -8548,6 +8462,133 @@ def isin(self, values) -> "DataFrame":
85488462
ops.add_special_arithmetic_methods(DataFrame)
85498463

85508464

8465+
@functools.singledispatch
def create_block_manager(
    data: Any,
    df: DataFrame,
    index: Optional[Axes],
    columns: Optional[Axes],
    dtype: Optional[Dtype],
    copy: bool,
) -> BlockManager:
    """
    Convert an arbitrary object into a BlockManager.

    This is the generic (fallback) implementation of a
    ``functools.singledispatch`` function that is called from the DataFrame
    constructor. To teach the constructor how to handle a custom type,
    register an implementation for that type on this function.

    Parameters
    ----------
    data : object
        The object handed to the DataFrame constructor.
    df : DataFrame
        The DataFrame being constructed. Not used by this fallback; it is
        part of the signature so registered implementations can use it.
    index, columns : Axes or None
        Requested row and column labels.
    dtype : Dtype or None
        Requested dtype.
    copy : bool
        Forwarded to ``np.array``.

    Raises
    ------
    TypeError
        If ``np.array(data, dtype=dtype, copy=copy)`` raises ``ValueError``
        or ``TypeError``.
    ValueError
        If the converted array is not zero-dimensional, or if ``index`` or
        ``columns`` is missing.
    """
    # Fallback strategy: see whether NumPy can interpret the object at all.
    try:
        arr = np.array(data, dtype=dtype, copy=copy)
    except (ValueError, TypeError) as err:
        raise TypeError(
            "DataFrame constructor called with "
            f"incompatible data and dtype: {err}"
        ) from err

    # Only a scalar accompanied by both axes can be handled here; every
    # other kind of input is expected to have a registered implementation.
    scalar_with_axes = arr.ndim == 0 and index is not None and columns is not None
    if not scalar_with_axes:
        raise ValueError("DataFrame constructor not properly called!")

    shape = (len(index), len(columns))
    values = cast_scalar_to_array(shape, data, dtype=dtype)
    return init_ndarray(values, index, columns, dtype=values.dtype, copy=False)
8499+
@create_block_manager.register(type(None))
def _create_block_manager_none(data: None, *args, **kwargs):
    """
    Handle ``data=None`` by dispatching again with an empty dict in its place.

    All remaining arguments are forwarded unchanged.
    """
    empty: dict = {}
    return create_block_manager(empty, *args, **kwargs)
8502+
8503+
@create_block_manager.register(DataFrame)
def _create_block_manager_dataframe(data: DataFrame, *args, **kwargs):
    """
    Handle a DataFrame by dispatching again on its ``_data`` attribute.

    All remaining arguments are forwarded unchanged.
    """
    manager = data._data
    return create_block_manager(manager, *args, **kwargs)
8506+
8507+
8508+
@create_block_manager.register(BlockManager)
def _create_block_manager_block_manager(
    data: BlockManager, df, index, columns, dtype, copy
):
    """
    Handle an existing BlockManager by passing it through ``df._init_mgr``.

    This function was previously also named ``_create_block_manager_dataframe``,
    which rebound the module-level name of the DataFrame implementation.
    It now has its own name so both implementations stay addressable.

    Parameters
    ----------
    data : BlockManager
        The manager handed to the DataFrame constructor.
    df : DataFrame
        The DataFrame being constructed; its ``_init_mgr`` method is called.
    index, columns
        Requested axes, passed to ``_init_mgr`` as
        ``axes=dict(index=index, columns=columns)``.
    dtype, copy
        Forwarded to ``_init_mgr`` unchanged.

    Returns
    -------
    The result of ``df._init_mgr``.
    """
    return df._init_mgr(
        data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
    )
8514+
8515+
@create_block_manager.register(dict)
def _create_block_manager_dict(data: dict, df, index, columns, dtype, copy):
    """
    Handle a dict by delegating to ``init_dict``.

    ``df`` and ``copy`` are accepted to match the dispatch signature but are
    not used here.
    """
    manager = init_dict(data, index, columns, dtype=dtype)
    return manager
8518+
8519+
8520+
@create_block_manager.register(ma.MaskedArray)
def _create_block_manager_masked_array(
    data: ma.MaskedArray, df, index, columns, dtype, copy
):
    """
    Handle a NumPy masked array.

    When at least one element is masked, the array is passed through
    ``maybe_upcast`` and the masked positions are overwritten with the fill
    value it returns. Otherwise a plain copy is taken. The result is handed
    to ``init_ndarray``.

    ``df`` is accepted to match the dispatch signature but is not used here.
    """
    mask = ma.getmaskarray(data)
    if not mask.any():
        # Nothing is masked: work on a copy of the input.
        values = data.copy()
    else:
        values, fill_value = maybe_upcast(data, copy=True)
        values.soften_mask()  # set hardmask False if it was True
        values[mask] = fill_value
    return init_ndarray(values, index, columns, dtype=dtype, copy=copy)
8530+
8531+
8532+
@create_block_manager.register(mrecords.MaskedRecords)
def _create_block_manager_masked_record(
    data: mrecords.MaskedRecords, df, index, columns, dtype, copy
):
    """
    Handle a masked record array by delegating to ``masked_rec_array_to_mgr``.

    ``df`` is accepted to match the dispatch signature but is not used here.
    """
    manager = masked_rec_array_to_mgr(data, index, columns, dtype, copy)
    return manager
8535+
8536+
@create_block_manager.register(np.ndarray)
@create_block_manager.register(Series)
@create_block_manager.register(Index)
def _create_block_manager_array_series_index(
    data: Union[np.ndarray, Series, Index], df, index, columns, dtype, copy
):
    """
    Handle an ndarray, Series or Index.

    Three cases are distinguished, in this order:

    1. ``data.dtype.names`` is truthy: each named field becomes a dict entry
       and the field names are used as ``columns`` if none were given.
    2. ``data`` has a ``name`` attribute that is not None: it becomes a
       single-entry dict keyed by that name.
    3. Otherwise ``data`` is passed to ``init_ndarray`` as is.

    ``df`` is accepted to match the dispatch signature but is not used here.
    """
    field_names = data.dtype.names
    if field_names:
        data_columns = list(field_names)
        by_field = {field: data[field] for field in data_columns}
        if columns is None:
            columns = data_columns
        return init_dict(by_field, index, columns, dtype=dtype)

    if getattr(data, "name", None) is not None:
        return init_dict({data.name: data}, index, columns, dtype=dtype)

    return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
8549+
8550+
class _IterableExceptStringOrBytesMeta(type):
    """
    Metaclass whose classes treat every iterable type except ``str`` and
    ``bytes`` (and their subclasses) as a virtual subclass.
    """

    def __subclasscheck__(cls, sub: type) -> bool:
        # Reject str/bytes first, then accept anything that is iterable.
        return not issubclass(sub, (str, bytes)) and issubclass(sub, abc.Iterable)

    def __instancecheck__(cls, instance: object) -> bool:
        # Keep isinstance() consistent with issubclass(): without this hook
        # the default instance check ignores __subclasscheck__, so
        # issubclass(list, X) would be True while isinstance([], X) is False.
        return cls.__subclasscheck__(type(instance))


class _IterableExceptStringOrBytes(metaclass=_IterableExceptStringOrBytesMeta):
    """
    Marker class that every iterable type other than ``str`` and ``bytes``
    is considered a subclass of.

    It exists so that "any iterable except str or bytes" can be used as a
    registration target for ``functools.singledispatch``.
    """
8563+
8564+
8565+
@create_block_manager.register(_IterableExceptStringOrBytes)
def _create_block_manager_iterable(
    data: _IterableExceptStringOrBytes, df, index, columns, dtype, copy
):
    """
    Handle list-like data or a generic iterable (other than str/bytes).

    Anything that is neither an ``abc.Sequence`` nor an ``ExtensionArray`` is
    first consumed into a list. Then:

    * empty data is built through ``init_dict`` with an empty dict;
    * if the first element is list-like with ``ndim`` 1 (or has no ``ndim``),
      the data is converted with ``to_arrays`` and assembled by
      ``arrays_to_mgr``;
    * anything else is passed to ``init_ndarray``.

    ``df`` is accepted to match the dispatch signature but is not used here.
    """
    # Materialise iterables that are not already a sequence or extension array.
    if not isinstance(data, (abc.Sequence, ExtensionArray)):
        data = list(data)

    if len(data) == 0:
        return init_dict({}, index, columns, dtype=dtype)

    if is_dataclass(data[0]):
        data = dataclasses_to_dicts(data)

    first = data[0]
    if not (is_list_like(first) and getattr(first, "ndim", 1) == 1):
        return init_ndarray(data, index, columns, dtype=dtype, copy=copy)

    # Named tuples supply their field names as column labels when the caller
    # gave none.
    if is_named_tuple(first) and columns is None:
        columns = first._fields
    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    # set the index
    if index is None:
        if isinstance(first, Series):
            index = get_names_from_index(data)
        else:
            length = len(first) if isinstance(first, Categorical) else len(data)
            index = ibase.default_index(length)

    return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
8590+
8591+
85518592
def _from_nested_dict(data):
85528593
# TODO: this should be seriously cythonized
85538594
new_data = collections.defaultdict(dict)

0 commit comments

Comments
 (0)