API/PERF: add policy argument to constructors, pandas-dev#10556

jreback · jorisvandenbossche · commit a6c744d38405 · 2020-05-25T20:36:09.000+02:00
- closes pandas-dev#10556, add policy argument to constructors - closes pandas-dev#9216, allow passing of dict with view directly to the API - closes pandas-dev#5902
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -471,21 +471,15 @@ def use_inf_as_na_cb(key):
 )
 
 
-# user warnings
+#
+# options from the "mode" namespace
+
 chained_assignment = """
 : string
     Raise an exception, warn, or no action if trying to use chained assignment,
     The default is warn
 """
 
-with cf.config_prefix("mode"):
-    cf.register_option(
-        "chained_assignment",
-        "warn",
-        chained_assignment,
-        validator=is_one_of_factory([None, "warn", "raise"]),
-    )
-
 
 # Set up the io.excel specific reader configuration.
 reader_engine_doc = """
@@ -499,6 +493,25 @@ def use_inf_as_na_cb(key):
 _xlsx_options = ["xlrd", "openpyxl"]
 _ods_options = ["odf"]
 _xlsb_options = ["pyxlsb"]
+policy = """
+: string
+    Default policy for construction of objects,
+    The default is 'block'
+"""
+
+with cf.config_prefix("mode"):
+    cf.register_option(
+        "chained_assignment",
+        "warn",
+        chained_assignment,
+        validator=is_one_of_factory([None, "warn", "raise"]),
+    )
+    cf.register_option(
+        "policy",
+        "block",
+        policy,
+        validator=is_one_of_factory(["block", "column", "split"]),
+    )
 
 
 with cf.config_prefix("io.excel.xls"):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -361,6 +361,12 @@ class DataFrame(NDFrame):
         Data type to force. Only a single dtype is allowed. If None, infer.
     copy : bool, default False
         Copy data from inputs. Only affects DataFrame / 2d ndarray input.
+    policy : {None, 'block', 'column', 'split'}, default None
+        Consolidation policy to use when constructing the object.
+          - None : use default policy
+          - block : consolidate into blocks by dtype
+          - column : don't consolidate, but don't split blocks
+          - split : don't consolidate, force splitting of input
 
     See Also
     --------
@@ -437,6 +443,7 @@ def __init__(
         columns: Optional[Axes] = None,
         dtype: Optional[Dtype] = None,
         copy: bool = False,
+        policy=None,
     ):
         if data is None:
             data = {}
@@ -453,11 +460,15 @@ def __init__(
                 return
 
             mgr = self._init_mgr(
-                data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
+                data,
+                axes=dict(index=index, columns=columns),
+                dtype=dtype,
+                copy=copy,
+                policy=policy,
             )
 
         elif isinstance(data, dict):
-            mgr = init_dict(data, index, columns, dtype=dtype)
+            mgr = init_dict(data, index, columns, dtype=dtype, policy=policy)
         elif isinstance(data, ma.MaskedArray):
             import numpy.ma.mrecords as mrecords
 
@@ -474,19 +485,25 @@ def __init__(
                     data[mask] = fill_value
                 else:
                     data = data.copy()
-                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+                mgr = init_ndarray(
+                    data, index, columns, dtype=dtype, copy=copy, policy=policy
+                )
 
         elif isinstance(data, (np.ndarray, Series, Index)):
             if data.dtype.names:
                 data_columns = list(data.dtype.names)
                 data = {k: data[k] for k in data_columns}
                 if columns is None:
                     columns = data_columns
-                mgr = init_dict(data, index, columns, dtype=dtype)
+                mgr = init_dict(data, index, columns, dtype=dtype, policy=policy)
             elif getattr(data, "name", None) is not None:
-                mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
+                mgr = init_dict(
+                    {data.name: data}, index, columns, dtype=dtype, policy=policy
+                )
             else:
-                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+                mgr = init_ndarray(
+                    data, index, columns, dtype=dtype, copy=copy, policy=policy
+                )
 
         # For data is list-like, or Iterable (will consume into list)
         elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
@@ -510,11 +527,15 @@ def __init__(
                         else:
                             index = ibase.default_index(len(data))
 
-                    mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
+                    mgr = arrays_to_mgr(
+                        arrays, columns, index, columns, dtype=dtype, policy=policy
+                    )
                 else:
-                    mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+                    mgr = init_ndarray(
+                        data, index, columns, dtype=dtype, copy=copy, policy=policy
+                    )
             else:
-                mgr = init_dict({}, index, columns, dtype=dtype)
+                mgr = init_dict({}, index, columns, dtype=dtype, policy=policy)
         else:
             try:
                 arr = np.array(data, dtype=dtype, copy=copy)
@@ -530,7 +551,12 @@ def __init__(
                     (len(index), len(columns)), data, dtype=dtype
                 )
                 mgr = init_ndarray(
-                    values, index, columns, dtype=values.dtype, copy=False
+                    values,
+                    index,
+                    columns,
+                    dtype=values.dtype,
+                    copy=False,
+                    policy=policy,
                 )
             else:
                 raise ValueError("DataFrame constructor not properly called!")
@@ -592,7 +618,7 @@ def _is_homogeneous_type(self) -> bool:
         Index._is_homogeneous_type : Whether the object has a single
             dtype.
         MultiIndex._is_homogeneous_type : Whether all the levels of a
-            MultiIndex have the same dtype.
+            MultiIndex have the same dtype.
 
         Examples
         --------
@@ -1977,6 +2003,7 @@ def _from_arrays(
         index,
         dtype: Optional[Dtype] = None,
         verify_integrity: bool = True,
+        policy=None,
     ) -> "DataFrame":
         """
         Create DataFrame from a list of arrays corresponding to the columns.
@@ -2012,6 +2039,7 @@ def _from_arrays(
             columns,
             dtype=dtype,
             verify_integrity=verify_integrity,
+            policy=policy,
         )
         return cls(mgr)
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -216,7 +216,9 @@ def __init__(
         object.__setattr__(self, "_attrs", attrs)
 
     @classmethod
-    def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager:
+    def _init_mgr(
+        cls, mgr, axes, dtype=None, copy: bool = False, policy=None
+    ) -> BlockManager:
         """ passed a manager and a axes dict """
         for a, axe in axes.items():
             if axe is not None:
@@ -252,6 +254,19 @@ def attrs(self) -> Dict[Optional[Hashable], Any]:
     def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None:
         self._attrs = dict(value)
 
+    @property
+    def _policy(self):
+        """ return my policy for internal implementation """
+        return self._mgr.policy
+
+    @_policy.setter
+    def _policy(self, value):
+        """
+        set my policy for internal implementation
+        should only set the property for state purposes
+        """
+        self._mgr.policy = value
+
     @classmethod
     def _validate_dtype(cls, dtype):
         """ validate the passed dtype """
@@ -1832,6 +1847,7 @@ def __getstate__(self) -> Dict[str, Any]:
             _typ=self._typ,
             _metadata=self._metadata,
             attrs=self.attrs,
+            _policy=self._policy,
             **meta,
         )
 
@@ -5752,7 +5768,9 @@ def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries:
         """
         data = self._mgr.copy(deep=deep)
         self._clear_item_cache()
-        return self._constructor(data).__finalize__(self, method="copy")
+        return self._constructor(data, policy=self._policy).__finalize__(
+            self, method="copy"
+        )
 
     def __copy__(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries:
         return self.copy(deep=deep)
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -298,6 +298,10 @@ def getitem_block(self, slicer, new_mgr_locs=None):
 
         return self.make_block_same_class(new_values, new_mgr_locs)
 
+    @property
+    def base(self):
+        return self.values.base
+
     @property
     def shape(self):
         return self.values.shape
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -64,6 +64,7 @@ def arrays_to_mgr(
     columns,
     dtype: Optional[DtypeObj] = None,
     verify_integrity: bool = True,
+    policy=None,
 ):
     """
     Segregate Series based on type and coerce into matrices.
@@ -90,7 +91,7 @@ def arrays_to_mgr(
     # from BlockManager perspective
     axes = [columns, index]
 
-    return create_block_manager_from_arrays(arrays, arr_names, axes)
+    return create_block_manager_from_arrays(arrays, arr_names, axes, policy=policy)
 
 
 def masked_rec_array_to_mgr(
@@ -140,7 +141,9 @@ def masked_rec_array_to_mgr(
 # DataFrame Constructor Interface
 
 
-def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
+def init_ndarray(
+    values, index, columns, dtype: Optional[DtypeObj], copy: bool, policy=None
+):
     # input must be a ndarray, list, Series, index
 
     if isinstance(values, ABCSeries):
@@ -169,7 +172,9 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
             values = values.copy()
 
         index, columns = _get_axes(len(values), 1, index, columns)
-        return arrays_to_mgr([values], columns, index, columns, dtype=dtype)
+        return arrays_to_mgr(
+            [values], columns, index, columns, dtype=dtype, policy=policy
+        )
     elif is_extension_array_dtype(values) or is_extension_array_dtype(dtype):
         # GH#19157
 
@@ -183,7 +188,9 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
         if columns is None:
             columns = Index(range(len(values)))
 
-        return arrays_to_mgr(values, columns, index, columns, dtype=dtype)
+        return arrays_to_mgr(
+            values, columns, index, columns, dtype=dtype, policy=policy
+        )
 
     # by definition an array here
     # the dtypes will be coerced to a single dtype
@@ -231,10 +238,12 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
     else:
         block_values = [values]
 
-    return create_block_manager_from_blocks(block_values, [columns, index])
+    return create_block_manager_from_blocks(block_values, [columns, index], policy)
 
 
-def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
+def init_dict(
+    data: Dict, index, columns, dtype: Optional[DtypeObj] = None, policy=None
+):
     """
     Segregate Series based on type and coerce into matrices.
     Needs to handle a lot of exceptional cases.
@@ -280,7 +289,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None):
         arrays = [
             arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
         ]
-    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
+    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, policy=policy)
 
 
 # ---------------------------------------------------------------------
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
diff --git a/pandas/core/series.py b/pandas/core/series.py