Skip to content

Commit afaadc2

Browse files
jreback authored and TomAugspurger committed
API/PERF: add policy argument to constructors, pandas-dev#10556
- closes pandas-dev#10556, add policy argument to constructors - closes pandas-dev#9216, allow passing of dict with view directly to the API - closes pandas-dev#5902
1 parent 81149fb commit afaadc2

File tree

9 files changed

+8661
-43
lines changed

9 files changed

+8661
-43
lines changed

pandas/core/config_init.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,9 @@ def use_inf_as_na_cb(key):
472472
)
473473

474474

475-
# user warnings
475+
#
476+
# options from the "mode" namespace
477+
476478
chained_assignment = """
477479
: string
478480
Raise an exception, warn, or no action if trying to use chained assignment,
@@ -500,6 +502,17 @@ def use_inf_as_na_cb(key):
500502
_xlsx_options = ["xlrd", "openpyxl"]
501503
_ods_options = ["odf"]
502504
_xlsb_options = ["pyxlsb"]
505+
policy = """
506+
: string
507+
Default policy for construction of objects,
508+
The default is 'block'
509+
"""
510+
511+
with cf.config_prefix('mode'):
512+
cf.register_option('chained_assignment', 'warn', chained_assignment,
513+
validator=is_one_of_factory([None, 'warn', 'raise']))
514+
cf.register_option('policy', 'block', policy,
515+
validator=is_one_of_factory(['block', 'column', 'split']))
503516

504517

505518
with cf.config_prefix("io.excel.xls"):

pandas/core/frame.py

+22-11
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,12 @@ class DataFrame(NDFrame):
355355
Data type to force. Only a single dtype is allowed. If None, infer.
356356
copy : bool, default False
357357
Copy data from inputs. Only affects DataFrame / 2d ndarray input.
358+
policy : string, default None
359+
Provide consolidation policy
360+
- None : use default policy
361+
- block : consolidate into blocks by dtype
362+
- column : don't consolidate, but don't split blocks
363+
- split : don't consolidate, force splitting of input
358364
359365
See Also
360366
--------
@@ -363,6 +369,9 @@ class DataFrame(NDFrame):
363369
read_csv : Read a comma-separated values (csv) file into DataFrame.
364370
read_table : Read general delimited file into DataFrame.
365371
read_clipboard : Read text from clipboard into DataFrame.
372+
Data type to force. Only a single dtype is allowed. If None, infer
373+
copy : boolean, default False
374+
Copy data from inputs. Only affects DataFrame / 2d ndarray input
366375
367376
Examples
368377
--------
@@ -426,6 +435,7 @@ def __init__(
426435
columns: Optional[Axes] = None,
427436
dtype: Optional[Dtype] = None,
428437
copy: bool = False,
438+
policy=None,
429439
):
430440
if data is None:
431441
data = {}
@@ -437,10 +447,11 @@ def __init__(
437447

438448
if isinstance(data, BlockManager):
439449
mgr = self._init_mgr(
440-
data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
450+
data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy,
451+
policy=policy,
441452
)
442453
elif isinstance(data, dict):
443-
mgr = init_dict(data, index, columns, dtype=dtype)
454+
mgr = init_dict(data, index, columns, dtype=dtype, policy=policy)
444455
elif isinstance(data, ma.MaskedArray):
445456
import numpy.ma.mrecords as mrecords
446457

@@ -457,19 +468,19 @@ def __init__(
457468
data[mask] = fill_value
458469
else:
459470
data = data.copy()
460-
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
471+
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy, policy=policy)
461472

462473
elif isinstance(data, (np.ndarray, Series, Index)):
463474
if data.dtype.names:
464475
data_columns = list(data.dtype.names)
465476
data = {k: data[k] for k in data_columns}
466477
if columns is None:
467478
columns = data_columns
468-
mgr = init_dict(data, index, columns, dtype=dtype)
479+
mgr = init_dict(data, index, columns, dtype=dtype, policy=policy)
469480
elif getattr(data, "name", None) is not None:
470-
mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
481+
mgr = init_dict({data.name: data}, index, columns, dtype=dtype, policy=policy)
471482
else:
472-
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
483+
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy, policy=policy)
473484

474485
# For data is list-like, or Iterable (will consume into list)
475486
elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
@@ -493,11 +504,11 @@ def __init__(
493504
else:
494505
index = ibase.default_index(len(data))
495506

496-
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
507+
mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype, policy=policy)
497508
else:
498-
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
509+
mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy, policy=policy)
499510
else:
500-
mgr = init_dict({}, index, columns, dtype=dtype)
511+
mgr = init_dict({}, index, columns, dtype=dtype, policy=policy)
501512
else:
502513
try:
503514
arr = np.array(data, dtype=dtype, copy=copy)
@@ -513,7 +524,7 @@ def __init__(
513524
(len(index), len(columns)), data, dtype=dtype
514525
)
515526
mgr = init_ndarray(
516-
values, index, columns, dtype=values.dtype, copy=False
527+
values, index, columns, dtype=values.dtype, copy=False, policy=policy,
517528
)
518529
else:
519530
raise ValueError("DataFrame constructor not properly called!")
@@ -575,7 +586,7 @@ def _is_homogeneous_type(self) -> bool:
575586
Index._is_homogeneous_type : Whether the object has a single
576587
dtype.
577588
MultiIndex._is_homogeneous_type : Whether all the levels of a
578-
MultiIndex have the same dtype.
589+
MultiIndex have the same dtype.
579590
580591
Examples
581592
--------

pandas/core/generic.py

+142-1
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,21 @@ def attrs(self) -> Dict[Optional[Hashable], Any]:
248248
@attrs.setter
249249
def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None:
250250
self._attrs = dict(value)
251+
@property
252+
def _policy(self):
253+
""" return my policy for internal implementation """
254+
return self._data.policy
255+
256+
@_policy.setter
257+
def _policy(self, value):
258+
"""
259+
set my policy for internal implementation
260+
should only set the property for state purposes
261+
"""
262+
self._data.policy = value
263+
264+
def _validate_dtype(self, dtype):
265+
""" validate the passed dtype """
251266

252267
@classmethod
253268
def _validate_dtype(cls, dtype):
@@ -1822,6 +1837,7 @@ def __getstate__(self) -> Dict[str, Any]:
18221837
_typ=self._typ,
18231838
_metadata=self._metadata,
18241839
attrs=self.attrs,
1840+
_policy=self._policy,
18251841
**meta,
18261842
)
18271843

@@ -3259,8 +3275,133 @@ def _maybe_update_cacher(
32593275
if clear:
32603276
self._clear_item_cache()
32613277

3278+
<<<<<<< HEAD
32623279
def _clear_item_cache(self) -> None:
32633280
self._item_cache.clear()
3281+
=======
3282+
def _clear_item_cache(self, i=None):
3283+
if i is not None:
3284+
self._item_cache.pop(i, None)
3285+
else:
3286+
self._item_cache.clear()
3287+
3288+
def _slice(self, slobj, axis=0, kind=None):
3289+
"""
3290+
Construct a slice of this container.
3291+
3292+
kind parameter is maintained for compatibility with Series slicing.
3293+
"""
3294+
axis = self._get_block_manager_axis(axis)
3295+
result = self._constructor(self._data.get_slice(slobj, axis=axis))
3296+
result = result.__finalize__(self)
3297+
3298+
# this could be a view
3299+
# but only in a single-dtyped view slicable case
3300+
is_copy = axis != 0 or result._is_view
3301+
result._set_is_copy(self, copy=is_copy)
3302+
return result
3303+
3304+
def _set_item(self, key, value):
3305+
self._data.set(key, value)
3306+
self._clear_item_cache()
3307+
3308+
def _set_is_copy(self, ref=None, copy=True):
3309+
if not copy:
3310+
self._is_copy = None
3311+
else:
3312+
if ref is not None:
3313+
self._is_copy = weakref.ref(ref)
3314+
else:
3315+
self._is_copy = None
3316+
3317+
def _check_is_chained_assignment_possible(self):
3318+
"""
3319+
Check if we are a view, have a cacher, and are of mixed type.
3320+
If so, then force a setitem_copy check.
3321+
3322+
Should be called just near setting a value
3323+
3324+
Will return a boolean if we are a view and are cached, but a
3325+
single-dtype meaning that the cacher should be updated following
3326+
setting.
3327+
"""
3328+
if self._is_view and self._is_cached:
3329+
ref = self._get_cacher()
3330+
if ref is not None:
3331+
3332+
# TODO: fix me!
3333+
# if we are a single block, then we don't need to check
3334+
# anything here if we are column and are actually a block,
3335+
# may be a bit tricky
3336+
if ref._policy in ['column', 'split']:
3337+
return True
3338+
if ref._is_mixed_type:
3339+
self._check_setitem_copy(stacklevel=4, t='referant',
3340+
force=True)
3341+
return True
3342+
elif self._is_copy:
3343+
self._check_setitem_copy(stacklevel=4, t='referant')
3344+
return False
3345+
3346+
def _check_setitem_copy(self, stacklevel=4, t='setting', force=False):
3347+
"""
3348+
3349+
Parameters
3350+
----------
3351+
stacklevel : integer, default 4
3352+
the level to show of the stack when the error is output
3353+
t : string, the type of setting error
3354+
force : boolean, default False
3355+
if True, then force showing an error
3356+
3357+
validate if we are doing a setitem on a chained copy.
3358+
3359+
If you call this function, be sure to set the stacklevel such that the
3360+
user will see the error *at the level of setting*
3361+
3362+
It is technically possible to figure out that we are setting on
3363+
a copy even WITH a multi-dtyped pandas object. In other words, some
3364+
blocks may be views while others are not. Currently _is_view will ALWAYS
3365+
return False for multi-blocks to avoid having to handle this case.
3366+
3367+
df = DataFrame(np.arange(0,9), columns=['count'])
3368+
df['group'] = 'b'
3369+
3370+
# This technically need not raise SettingWithCopy if both are views
3371+
# (which is not generally guaranteed but is usually True). However,
3372+
# this is in general not a good practice and we recommend using .loc.
3373+
df.iloc[0:5]['group'] = 'a'
3374+
3375+
"""
3376+
3377+
if force or self._is_copy:
3378+
3379+
value = config.get_option('mode.chained_assignment')
3380+
if value is None:
3381+
return
3382+
3383+
# see if the copy is not actually referred; if so, then dissolve
3384+
# the copy weakref
3385+
try:
3386+
gc.collect(2)
3387+
if not gc.get_referents(self._is_copy()):
3388+
self._is_copy = None
3389+
return
3390+
except Exception:
3391+
pass
3392+
3393+
# we might be a false positive
3394+
try:
3395+
if self._is_copy().shape == self.shape:
3396+
self._is_copy = None
3397+
return
3398+
except Exception:
3399+
pass
3400+
3401+
# a custom message
3402+
if isinstance(self._is_copy, string_types):
3403+
t = self._is_copy
3404+
>>>>>>> API/PERF: add policy argument to constructors, #10556
32643405

32653406
# ----------------------------------------------------------------------
32663407
# Indexing Methods
@@ -5682,7 +5823,7 @@ def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries:
56825823
dtype: object
56835824
"""
56845825
data = self._data.copy(deep=deep)
5685-
return self._constructor(data).__finalize__(self)
5826+
return self._constructor(data, policy=self._policy).__finalize__(self)
56865827

56875828
def __copy__(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries:
56885829
return self.copy(deep=deep)

0 commit comments

Comments
 (0)