Skip to content

Commit a6c744d

Browse files
jreback authored and jorisvandenbossche committed
API/PERF: add policy argument to constructors, pandas-dev#10556
- closes pandas-dev#10556, add policy argument to constructors
- closes pandas-dev#9216, allow passing of dict with view directly to the API
- closes pandas-dev#5902
1 parent 64859ec commit a6c744d

File tree

7 files changed

+209
-59
lines changed

7 files changed

+209
-59
lines changed

pandas/core/config_init.py

+22-9
Original file line numberDiff line numberDiff line change
@@ -471,21 +471,15 @@ def use_inf_as_na_cb(key):
471471
)
472472

473473

474-
# user warnings
474+
#
475+
# options from the "mode" namespace
476+
475477
chained_assignment = """
476478
: string
477479
Raise an exception, warn, or no action if trying to use chained assignment,
478480
The default is warn
479481
"""
480482

481-
with cf.config_prefix("mode"):
482-
cf.register_option(
483-
"chained_assignment",
484-
"warn",
485-
chained_assignment,
486-
validator=is_one_of_factory([None, "warn", "raise"]),
487-
)
488-
489483

490484
# Set up the io.excel specific reader configuration.
491485
reader_engine_doc = """
@@ -499,6 +493,25 @@ def use_inf_as_na_cb(key):
499493
_xlsx_options = ["xlrd", "openpyxl"]
500494
_ods_options = ["odf"]
501495
_xlsb_options = ["pyxlsb"]
496+
policy = """
497+
: string
498+
Default policy for construction of objects,
499+
The default is 'block'
500+
"""
501+
502+
with cf.config_prefix("mode"):
503+
cf.register_option(
504+
"chained_assignment",
505+
"warn",
506+
chained_assignment,
507+
validator=is_one_of_factory([None, "warn", "raise"]),
508+
)
509+
cf.register_option(
510+
"policy",
511+
"block",
512+
policy,
513+
validator=is_one_of_factory(["block", "column", "split"]),
514+
)
502515

503516

504517
with cf.config_prefix("io.excel.xls"):

pandas/core/frame.py

+39-11
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,12 @@ class DataFrame(NDFrame):
361361
Data type to force. Only a single dtype is allowed. If None, infer.
362362
copy : bool, default False
363363
Copy data from inputs. Only affects DataFrame / 2d ndarray input.
364+
policy : string, default None
365+
Provide consolidation policy
366+
- None : use default policy
367+
- block : consolidate into blocks by dtype
368+
- column : don't consolidate, but don't split blocks
369+
- split : don't consolidate, force splitting of input
364370
365371
See Also
366372
--------
@@ -437,6 +443,7 @@ def __init__(
437443
columns: Optional[Axes] = None,
438444
dtype: Optional[Dtype] = None,
439445
copy: bool = False,
446+
policy=None,
440447
):
441448
if data is None:
442449
data = {}
@@ -453,11 +460,15 @@ def __init__(
453460
return
454461

455462
mgr = self._init_mgr(
456-
data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
463+
data,
464+
axes=dict(index=index, columns=columns),
465+
dtype=dtype,
466+
copy=copy,
467+
policy=policy,
457468
)
458469

459470
elif isinstance(data, dict):
460-
mgr = init_dict(data, index, columns, dtype=dtype)
471+
mgr = init_dict(data, index, columns, dtype=dtype, policy=policy)
461472
elif isinstance(data, ma.MaskedArray):
462473
import numpy.ma.mrecords as mrecords
463474

@@ -474,19 +485,25 @@ def __init__(
474485
data[mask] = fill_value
475486
else:
476487
data = data.copy()
477-
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
488+
mgr = init_ndarray(
489+
data, index, columns, dtype=dtype, copy=copy, policy=policy
490+
)
478491

479492
elif isinstance(data, (np.ndarray, Series, Index)):
480493
if data.dtype.names:
481494
data_columns = list(data.dtype.names)
482495
data = {k: data[k] for k in data_columns}
483496
if columns is None:
484497
columns = data_columns
485-
mgr = init_dict(data, index, columns, dtype=dtype)
498+
mgr = init_dict(data, index, columns, dtype=dtype, policy=policy)
486499
elif getattr(data, "name", None) is not None:
487-
mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
500+
mgr = init_dict(
501+
{data.name: data}, index, columns, dtype=dtype, policy=policy
502+
)
488503
else:
489-
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
504+
mgr = init_ndarray(
505+
data, index, columns, dtype=dtype, copy=copy, policy=policy
506+
)
490507

491508
# For data is list-like, or Iterable (will consume into list)
492509
elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
@@ -510,11 +527,15 @@ def __init__(
510527
else:
511528
index = ibase.default_index(len(data))
512529

513-
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
530+
mgr = arrays_to_mgr(
531+
arrays, columns, index, columns, dtype=dtype, policy=policy
532+
)
514533
else:
515-
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
534+
mgr = init_ndarray(
535+
data, index, columns, dtype=dtype, copy=copy, policy=policy
536+
)
516537
else:
517-
mgr = init_dict({}, index, columns, dtype=dtype)
538+
mgr = init_dict({}, index, columns, dtype=dtype, policy=policy)
518539
else:
519540
try:
520541
arr = np.array(data, dtype=dtype, copy=copy)
@@ -530,7 +551,12 @@ def __init__(
530551
(len(index), len(columns)), data, dtype=dtype
531552
)
532553
mgr = init_ndarray(
533-
values, index, columns, dtype=values.dtype, copy=False
554+
values,
555+
index,
556+
columns,
557+
dtype=values.dtype,
558+
copy=False,
559+
policy=policy,
534560
)
535561
else:
536562
raise ValueError("DataFrame constructor not properly called!")
@@ -592,7 +618,7 @@ def _is_homogeneous_type(self) -> bool:
592618
Index._is_homogeneous_type : Whether the object has a single
593619
dtype.
594620
MultiIndex._is_homogeneous_type : Whether all the levels of a
595-
MultiIndex have the same dtype.
621+
MultiIndex have the same dtype.
596622
597623
Examples
598624
--------
@@ -1977,6 +2003,7 @@ def _from_arrays(
19772003
index,
19782004
dtype: Optional[Dtype] = None,
19792005
verify_integrity: bool = True,
2006+
policy=None,
19802007
) -> "DataFrame":
19812008
"""
19822009
Create DataFrame from a list of arrays corresponding to the columns.
@@ -2012,6 +2039,7 @@ def _from_arrays(
20122039
columns,
20132040
dtype=dtype,
20142041
verify_integrity=verify_integrity,
2042+
policy=policy,
20152043
)
20162044
return cls(mgr)
20172045

pandas/core/generic.py

+20-2
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,9 @@ def __init__(
216216
object.__setattr__(self, "_attrs", attrs)
217217

218218
@classmethod
219-
def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager:
219+
def _init_mgr(
220+
cls, mgr, axes, dtype=None, copy: bool = False, policy=None
221+
) -> BlockManager:
220222
""" passed a manager and a axes dict """
221223
for a, axe in axes.items():
222224
if axe is not None:
@@ -252,6 +254,19 @@ def attrs(self) -> Dict[Optional[Hashable], Any]:
252254
def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None:
253255
self._attrs = dict(value)
254256

257+
@property
258+
def _policy(self):
259+
""" return my policy for internal implementation """
260+
return self._mgr.policy
261+
262+
@_policy.setter
263+
def _policy(self, value):
264+
"""
265+
set my policy for internal implementation
266+
should only set the property for state purposes
267+
"""
268+
self._mgr.policy = value
269+
255270
@classmethod
256271
def _validate_dtype(cls, dtype):
257272
""" validate the passed dtype """
@@ -1832,6 +1847,7 @@ def __getstate__(self) -> Dict[str, Any]:
18321847
_typ=self._typ,
18331848
_metadata=self._metadata,
18341849
attrs=self.attrs,
1850+
_policy=self._policy,
18351851
**meta,
18361852
)
18371853

@@ -5752,7 +5768,9 @@ def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries:
57525768
"""
57535769
data = self._mgr.copy(deep=deep)
57545770
self._clear_item_cache()
5755-
return self._constructor(data).__finalize__(self, method="copy")
5771+
return self._constructor(data, policy=self._policy).__finalize__(
5772+
self, method="copy"
5773+
)
57565774

57575775
def __copy__(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries:
57585776
return self.copy(deep=deep)

pandas/core/internals/blocks.py

+4
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,10 @@ def getitem_block(self, slicer, new_mgr_locs=None):
298298

299299
return self.make_block_same_class(new_values, new_mgr_locs)
300300

301+
@property
302+
def base(self):
303+
return self.values.base
304+
301305
@property
302306
def shape(self):
303307
return self.values.shape

pandas/core/internals/construction.py

+16-7
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ def arrays_to_mgr(
6464
columns,
6565
dtype: Optional[DtypeObj] = None,
6666
verify_integrity: bool = True,
67+
policy=None,
6768
):
6869
"""
6970
Segregate Series based on type and coerce into matrices.
@@ -90,7 +91,7 @@ def arrays_to_mgr(
9091
# from BlockManager perspective
9192
axes = [columns, index]
9293

93-
return create_block_manager_from_arrays(arrays, arr_names, axes)
94+
return create_block_manager_from_arrays(arrays, arr_names, axes, policy=policy)
9495

9596

9697
def masked_rec_array_to_mgr(
@@ -140,7 +141,9 @@ def masked_rec_array_to_mgr(
140141
# DataFrame Constructor Interface
141142

142143

143-
def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
144+
def init_ndarray(
145+
values, index, columns, dtype: Optional[DtypeObj], copy: bool, policy=None
146+
):
144147
# input must be a ndarray, list, Series, index
145148

146149
if isinstance(values, ABCSeries):
@@ -169,7 +172,9 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
169172
values = values.copy()
170173

171174
index, columns = _get_axes(len(values), 1, index, columns)
172-
return arrays_to_mgr([values], columns, index, columns, dtype=dtype)
175+
return arrays_to_mgr(
176+
[values], columns, index, columns, dtype=dtype, policy=policy
177+
)
173178
elif is_extension_array_dtype(values) or is_extension_array_dtype(dtype):
174179
# GH#19157
175180

@@ -183,7 +188,9 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
183188
if columns is None:
184189
columns = Index(range(len(values)))
185190

186-
return arrays_to_mgr(values, columns, index, columns, dtype=dtype)
191+
return arrays_to_mgr(
192+
values, columns, index, columns, dtype=dtype, policy=policy
193+
)
187194

188195
# by definition an array here
189196
# the dtypes will be coerced to a single dtype
@@ -231,10 +238,12 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
231238
else:
232239
block_values = [values]
233240

234-
return create_block_manager_from_blocks(block_values, [columns, index])
241+
return create_block_manager_from_blocks(block_values, [columns, index], policy)
235242

236243

237-
def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
244+
def init_dict(
245+
data: Dict, index, columns, dtype: Optional[DtypeObj] = None, policy=None
246+
):
238247
"""
239248
Segregate Series based on type and coerce into matrices.
240249
Needs to handle a lot of exceptional cases.
@@ -280,7 +289,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
280289
arrays = [
281290
arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
282291
]
283-
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
292+
return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, policy=policy)
284293

285294

286295
# ---------------------------------------------------------------------

0 commit comments

Comments
 (0)