|
14 | 14 | import datetime
|
15 | 15 | from io import StringIO
|
16 | 16 | import itertools
|
| 17 | +import functools |
17 | 18 | from textwrap import dedent
|
18 | 19 | from typing import (
|
19 | 20 | IO,
|
|
36 | 37 |
|
37 | 38 | import numpy as np
|
38 | 39 | import numpy.ma as ma
|
| 40 | +import numpy.ma.mrecords as mrecords |
39 | 41 |
|
40 | 42 | from pandas._config import get_option
|
41 | 43 |
|
@@ -427,97 +429,9 @@ def __init__(
|
427 | 429 | dtype: Optional[Dtype] = None,
|
428 | 430 | copy: bool = False,
|
429 | 431 | ):
|
430 |
| - if data is None: |
431 |
| - data = {} |
432 | 432 | if dtype is not None:
|
433 | 433 | dtype = self._validate_dtype(dtype)
|
434 |
| - |
435 |
| - if isinstance(data, DataFrame): |
436 |
| - data = data._data |
437 |
| - |
438 |
| - if isinstance(data, BlockManager): |
439 |
| - mgr = self._init_mgr( |
440 |
| - data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy |
441 |
| - ) |
442 |
| - elif isinstance(data, dict): |
443 |
| - mgr = init_dict(data, index, columns, dtype=dtype) |
444 |
| - elif isinstance(data, ma.MaskedArray): |
445 |
| - import numpy.ma.mrecords as mrecords |
446 |
| - |
447 |
| - # masked recarray |
448 |
| - if isinstance(data, mrecords.MaskedRecords): |
449 |
| - mgr = masked_rec_array_to_mgr(data, index, columns, dtype, copy) |
450 |
| - |
451 |
| - # a masked array |
452 |
| - else: |
453 |
| - mask = ma.getmaskarray(data) |
454 |
| - if mask.any(): |
455 |
| - data, fill_value = maybe_upcast(data, copy=True) |
456 |
| - data.soften_mask() # set hardmask False if it was True |
457 |
| - data[mask] = fill_value |
458 |
| - else: |
459 |
| - data = data.copy() |
460 |
| - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) |
461 |
| - |
462 |
| - elif isinstance(data, (np.ndarray, Series, Index)): |
463 |
| - if data.dtype.names: |
464 |
| - data_columns = list(data.dtype.names) |
465 |
| - data = {k: data[k] for k in data_columns} |
466 |
| - if columns is None: |
467 |
| - columns = data_columns |
468 |
| - mgr = init_dict(data, index, columns, dtype=dtype) |
469 |
| - elif getattr(data, "name", None) is not None: |
470 |
| - mgr = init_dict({data.name: data}, index, columns, dtype=dtype) |
471 |
| - else: |
472 |
| - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) |
473 |
| - |
474 |
| - # For data is list-like, or Iterable (will consume into list) |
475 |
| - elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)): |
476 |
| - if not isinstance(data, (abc.Sequence, ExtensionArray)): |
477 |
| - data = list(data) |
478 |
| - if len(data) > 0: |
479 |
| - if is_dataclass(data[0]): |
480 |
| - data = dataclasses_to_dicts(data) |
481 |
| - if is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1: |
482 |
| - if is_named_tuple(data[0]) and columns is None: |
483 |
| - columns = data[0]._fields |
484 |
| - arrays, columns = to_arrays(data, columns, dtype=dtype) |
485 |
| - columns = ensure_index(columns) |
486 |
| - |
487 |
| - # set the index |
488 |
| - if index is None: |
489 |
| - if isinstance(data[0], Series): |
490 |
| - index = get_names_from_index(data) |
491 |
| - elif isinstance(data[0], Categorical): |
492 |
| - index = ibase.default_index(len(data[0])) |
493 |
| - else: |
494 |
| - index = ibase.default_index(len(data)) |
495 |
| - |
496 |
| - mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype) |
497 |
| - else: |
498 |
| - mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) |
499 |
| - else: |
500 |
| - mgr = init_dict({}, index, columns, dtype=dtype) |
501 |
| - else: |
502 |
| - try: |
503 |
| - arr = np.array(data, dtype=dtype, copy=copy) |
504 |
| - except (ValueError, TypeError) as err: |
505 |
| - exc = TypeError( |
506 |
| - "DataFrame constructor called with " |
507 |
| - f"incompatible data and dtype: {err}" |
508 |
| - ) |
509 |
| - raise exc from err |
510 |
| - |
511 |
| - if arr.ndim == 0 and index is not None and columns is not None: |
512 |
| - values = cast_scalar_to_array( |
513 |
| - (len(index), len(columns)), data, dtype=dtype |
514 |
| - ) |
515 |
| - mgr = init_ndarray( |
516 |
| - values, index, columns, dtype=values.dtype, copy=False |
517 |
| - ) |
518 |
| - else: |
519 |
| - raise ValueError("DataFrame constructor not properly called!") |
520 |
| - |
| 434 | + mgr = create_block_manager(data, self, index, columns, dtype, copy) |
521 | 435 | NDFrame.__init__(self, mgr)
|
522 | 436 |
|
523 | 437 | # ----------------------------------------------------------------------
|
@@ -8548,6 +8462,133 @@ def isin(self, values) -> "DataFrame":
|
8548 | 8462 | ops.add_special_arithmetic_methods(DataFrame)
|
8549 | 8463 |
|
8550 | 8464 |
|
@functools.singledispatch
def create_block_manager(
    data: Any,
    df: DataFrame,
    index: Optional[Axes],
    columns: Optional[Axes],
    dtype: Optional[Dtype],
    copy: bool,
) -> BlockManager:
    """
    Convert ``data`` into a BlockManager for the DataFrame constructor.

    This is the generic fallback of a ``functools.singledispatch`` function:
    it only runs when no implementation has been registered for the type of
    ``data``. To give a custom object its own conversion, register an
    implementation on this function.

    Parameters
    ----------
    data : Any
        Object to convert. The fallback coerces it with ``np.array``.
    df : DataFrame
        The DataFrame under construction. Not used by the fallback.
    index, columns : Axes, optional
        Requested row and column labels.
    dtype : Dtype, optional
        Requested dtype, forwarded to ``np.array`` and the scalar cast.
    copy : bool
        Forwarded to ``np.array``.

    Returns
    -------
    BlockManager

    Raises
    ------
    TypeError
        If ``np.array`` rejects ``data`` with a ValueError or TypeError.
    ValueError
        If ``data`` is not a scalar, or ``index`` or ``columns`` is missing.
    """
    # Fallback: see whether NumPy can interpret the object at all.
    try:
        coerced = np.array(data, dtype=dtype, copy=copy)
    except (ValueError, TypeError) as err:
        raise TypeError(
            "DataFrame constructor called with "
            f"incompatible data and dtype: {err}"
        ) from err

    # Only a 0-d value with both axes supplied can be expanded into a frame.
    if coerced.ndim != 0 or index is None or columns is None:
        raise ValueError("DataFrame constructor not properly called!")

    shape = (len(index), len(columns))
    values = cast_scalar_to_array(shape, data, dtype=dtype)
    return init_ndarray(values, index, columns, dtype=values.dtype, copy=False)
| 8498 | + |
# Register with an explicit class: annotation-driven ``register`` needs
# Python 3.7+, and the ndarray/Series/Index handler in this module already
# registers explicitly.
@create_block_manager.register(type(None))
def _create_block_manager_none(data: None, *args, **kwargs):
    """
    Handle ``data=None`` by dispatching again with an empty dict.

    All remaining arguments are forwarded unchanged to
    ``create_block_manager``.
    """
    return create_block_manager({}, *args, **kwargs)
| 8502 | + |
# Register with an explicit class: annotation-driven ``register`` needs
# Python 3.7+, and the ndarray/Series/Index handler in this module already
# registers explicitly.
@create_block_manager.register(DataFrame)
def _create_block_manager_dataframe(data: DataFrame, *args, **kwargs):
    """
    Handle a DataFrame by dispatching again on its ``_data`` attribute.

    All remaining arguments are forwarded unchanged to
    ``create_block_manager``.
    """
    return create_block_manager(data._data, *args, **kwargs)
| 8506 | + |
| 8507 | + |
# Named distinctly from the DataFrame handler (the original reused
# ``_create_block_manager_dataframe`` and shadowed it at module level), and
# registered with an explicit class because annotation-driven ``register``
# needs Python 3.7+.
@create_block_manager.register(BlockManager)
def _create_block_manager_block_manager(
    data: BlockManager, df, index, columns, dtype, copy
):
    """
    Handle an existing BlockManager by delegating to ``df._init_mgr``.

    ``index`` and ``columns`` are passed as the ``axes`` mapping; ``dtype``
    and ``copy`` are forwarded as keywords. Returns whatever ``_init_mgr``
    returns.
    """
    return df._init_mgr(
        data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
    )
| 8514 | + |
# Register with an explicit class: annotation-driven ``register`` needs
# Python 3.7+, and the ndarray/Series/Index handler in this module already
# registers explicitly.
@create_block_manager.register(dict)
def _create_block_manager_dict(data: dict, df, index, columns, dtype, copy):
    """
    Handle a dict by delegating to ``init_dict``.

    ``df`` and ``copy`` are accepted for signature compatibility with the
    other handlers but are not used here.
    """
    return init_dict(data, index, columns, dtype=dtype)
| 8518 | + |
| 8519 | + |
# Register with an explicit class: annotation-driven ``register`` needs
# Python 3.7+, and the ndarray/Series/Index handler in this module already
# registers explicitly.
@create_block_manager.register(ma.MaskedArray)
def _create_block_manager_masked_array(
    data: ma.MaskedArray, df, index, columns, dtype, copy
):
    """
    Handle a NumPy masked array.

    If any element is masked, the array is upcast via ``maybe_upcast`` and
    the masked positions are overwritten with the fill value it returns.
    Otherwise the array is copied. Either way the result is handed to
    ``init_ndarray``. ``df`` is not used.
    """
    mask = ma.getmaskarray(data)
    if mask.any():
        data, fill_value = maybe_upcast(data, copy=True)
        data.soften_mask()  # set hardmask False if it was True
        data[mask] = fill_value
    else:
        data = data.copy()
    return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
| 8530 | + |
| 8531 | + |
# Register with an explicit class: annotation-driven ``register`` needs
# Python 3.7+, and the ndarray/Series/Index handler in this module already
# registers explicitly.
@create_block_manager.register(mrecords.MaskedRecords)
def _create_block_manager_masked_record(
    data: mrecords.MaskedRecords, df, index, columns, dtype, copy
):
    """
    Handle a masked record array by delegating to ``masked_rec_array_to_mgr``.

    ``df`` is accepted for signature compatibility and is not used.
    """
    return masked_rec_array_to_mgr(data, index, columns, dtype, copy)
| 8535 | + |
@create_block_manager.register(np.ndarray)
@create_block_manager.register(Series)
@create_block_manager.register(Index)
def _create_block_manager_array_series_index(
    data: Union[np.ndarray, Series, Index], df, index, columns, dtype, copy
):
    """
    Handle ndarray, Series and Index inputs.

    Three cases, checked in order:

    * the dtype has named fields: each field becomes a dict entry and, when
      ``columns`` was not given, the field names become the columns;
    * the object has a non-None ``name``: it becomes a single dict entry
      keyed by that name;
    * otherwise the object is passed to ``init_ndarray`` as-is.

    ``df`` is not used.
    """
    if data.dtype.names:
        field_names = list(data.dtype.names)
        by_field = {field: data[field] for field in field_names}
        if columns is None:
            columns = field_names
        return init_dict(by_field, index, columns, dtype=dtype)

    if getattr(data, "name", None) is not None:
        return init_dict({data.name: data}, index, columns, dtype=dtype)

    return init_ndarray(data, index, columns, dtype=dtype, copy=copy)
| 8549 | + |
| 8550 | +class _IterableExceptStringOrBytesMeta(type): |
| 8551 | + def __subclasscheck__(cls, sub: Type) -> bool: |
| 8552 | + return ( |
| 8553 | + not issubclass(sub, (str, bytes)) |
| 8554 | + and issubclass(sub, abc.Iterable) |
| 8555 | + ) |
| 8556 | + |
class _IterableExceptStringOrBytes(metaclass=_IterableExceptStringOrBytesMeta):
    """
    Marker type for ``functools.singledispatch`` registration.

    Through its metaclass, ``issubclass`` reports every iterable type except
    ``str`` and ``bytes`` as a subclass of this class.
    """
| 8563 | + |
| 8564 | + |
# Register with an explicit class: annotation-driven ``register`` needs
# Python 3.7+, and the ndarray/Series/Index handler in this module already
# registers explicitly.
@create_block_manager.register(_IterableExceptStringOrBytes)
def _create_block_manager_iterable(
    data: _IterableExceptStringOrBytes, df, index, columns, dtype, copy
):
    """
    Handle list-likes and general iterables (anything but ``str``/``bytes``).

    Inputs that are neither an ``abc.Sequence`` nor an ``ExtensionArray`` are
    consumed into a list first. After that:

    * empty input is built through ``init_dict`` with an empty dict;
    * if the first element is list-like with ``ndim`` 1 (``ndim`` defaults
      to 1 when absent), the input is treated as a sequence of rows and
      converted with ``to_arrays`` / ``arrays_to_mgr``;
    * anything else is passed to ``init_ndarray``.

    ``df`` is not used.
    """
    # One-shot iterables must be materialized so they can be indexed and
    # measured below.
    if not isinstance(data, (abc.Sequence, ExtensionArray)):
        data = list(data)

    # Keep the explicit len() check from the original rather than relying on
    # truthiness.
    if len(data) == 0:
        return init_dict({}, index, columns, dtype=dtype)

    if is_dataclass(data[0]):
        data = dataclasses_to_dicts(data)

    first = data[0]
    if not (is_list_like(first) and getattr(first, "ndim", 1) == 1):
        return init_ndarray(data, index, columns, dtype=dtype, copy=copy)

    # Named tuples supply column names when the caller gave none.
    if is_named_tuple(first) and columns is None:
        columns = first._fields
    arrays, columns = to_arrays(data, columns, dtype=dtype)
    columns = ensure_index(columns)

    # set the index
    if index is None:
        if isinstance(first, Series):
            index = get_names_from_index(data)
        elif isinstance(first, Categorical):
            index = ibase.default_index(len(first))
        else:
            index = ibase.default_index(len(data))

    return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
| 8590 | + |
| 8591 | + |
8551 | 8592 | def _from_nested_dict(data):
|
8552 | 8593 | # TODO: this should be seriously cythonized
|
8553 | 8594 | new_data = collections.defaultdict(dict)
|
|
0 commit comments