API/PERF: add policy argument to constructors, pandas-dev#10556

jreback · TomAugspurger · commit c5396f64c5fa · 2020-03-17T08:12:45.000-05:00
- closes pandas-dev#10556, add policy argument to constructors - closes pandas-dev#9216, allow passing of dict with view directly to the API - closes pandas-dev#5902
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -472,21 +472,15 @@ def use_inf_as_na_cb(key):
 )
 
 
-# user warnings
+#
+# options from the "mode" namespace
+
 chained_assignment = """
 : string
     Raise an exception, warn, or no action if trying to use chained assignment,
     The default is warn
 """
 
-with cf.config_prefix("mode"):
-    cf.register_option(
-        "chained_assignment",
-        "warn",
-        chained_assignment,
-        validator=is_one_of_factory([None, "warn", "raise"]),
-    )
-
 
 # Set up the io.excel specific reader configuration.
 reader_engine_doc = """
@@ -500,6 +494,25 @@ def use_inf_as_na_cb(key):
 _xlsx_options = ["xlrd", "openpyxl"]
 _ods_options = ["odf"]
 _xlsb_options = ["pyxlsb"]
+policy = """
+: string
+    Default policy for construction of objects,
+    The default is 'block'
+"""
+
+with cf.config_prefix("mode"):
+    cf.register_option(
+        "chained_assignment",
+        "warn",
+        chained_assignment,
+        validator=is_one_of_factory([None, "warn", "raise"]),
+    )
+    cf.register_option(
+        "policy",
+        "block",
+        policy,
+        validator=is_one_of_factory(["block", "column", "split"]),
+    )
 
 
 with cf.config_prefix("io.excel.xls"):
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -355,6 +355,12 @@ class DataFrame(NDFrame):
         Data type to force. Only a single dtype is allowed. If None, infer.
     copy : bool, default False
         Copy data from inputs. Only affects DataFrame / 2d ndarray input.
+    policy : str, default None
+        Consolidation policy to use when constructing the frame. One of:
+          - None : use default policy
+          - block : consolidate into blocks by dtype
+          - column : don't consolidate, but don't split blocks
+          - split : don't consolidate, force splitting of input
 
     See Also
     --------
@@ -363,6 +369,9 @@ class DataFrame(NDFrame):
     read_csv : Read a comma-separated values (csv) file into DataFrame.
     read_table : Read general delimited file into DataFrame.
     read_clipboard : Read text from clipboard into DataFrame.
+        Data type to force. Only a single dtype is allowed. If None, infer
+    copy : boolean, default False
+        Copy data from inputs. Only affects DataFrame / 2d ndarray input
 
     Examples
     --------
@@ -426,6 +435,7 @@ def __init__(
         columns: Optional[Axes] = None,
         dtype: Optional[Dtype] = None,
         copy: bool = False,
+        policy=None,
     ):
         if data is None:
             data = {}
@@ -437,10 +447,11 @@ def __init__(
 
         if isinstance(data, BlockManager):
             mgr = self._init_mgr(
-                data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy
+                data, axes=dict(index=index, columns=columns), dtype=dtype, copy=copy,
+                policy=policy,
             )
         elif isinstance(data, dict):
-            mgr = init_dict(data, index, columns, dtype=dtype)
+            mgr = init_dict(data, index, columns, dtype=dtype, policy=policy)
         elif isinstance(data, ma.MaskedArray):
             import numpy.ma.mrecords as mrecords
 
@@ -457,19 +468,19 @@ def __init__(
                     data[mask] = fill_value
                 else:
                     data = data.copy()
-                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy, policy=policy)
 
         elif isinstance(data, (np.ndarray, Series, Index)):
             if data.dtype.names:
                 data_columns = list(data.dtype.names)
                 data = {k: data[k] for k in data_columns}
                 if columns is None:
                     columns = data_columns
-                mgr = init_dict(data, index, columns, dtype=dtype)
+                mgr = init_dict(data, index, columns, dtype=dtype, policy=policy)
             elif getattr(data, "name", None) is not None:
-                mgr = init_dict({data.name: data}, index, columns, dtype=dtype)
+                mgr = init_dict({data.name: data}, index, columns, dtype=dtype, policy=policy)
             else:
-                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+                mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy, policy=policy)
 
         # For data is list-like, or Iterable (will consume into list)
         elif isinstance(data, abc.Iterable) and not isinstance(data, (str, bytes)):
@@ -493,11 +504,11 @@ def __init__(
                         else:
                             index = ibase.default_index(len(data))
 
-                    mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
+                    mgr = arrays_to_mgr(arrays, columns, index, columns, dtype=dtype, policy=policy)
                 else:
-                    mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy)
+                    mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy, policy=policy)
             else:
-                mgr = init_dict({}, index, columns, dtype=dtype)
+                mgr = init_dict({}, index, columns, dtype=dtype, policy=policy)
         else:
             try:
                 arr = np.array(data, dtype=dtype, copy=copy)
@@ -513,7 +524,7 @@ def __init__(
                     (len(index), len(columns)), data, dtype=dtype
                 )
                 mgr = init_ndarray(
-                    values, index, columns, dtype=values.dtype, copy=False
+                    values, index, columns, dtype=values.dtype, copy=False, policy=policy,
                 )
             else:
                 raise ValueError("DataFrame constructor not properly called!")
@@ -575,7 +586,7 @@ def _is_homogeneous_type(self) -> bool:
         Index._is_homogeneous_type : Whether the object has a single
             dtype.
         MultiIndex._is_homogeneous_type : Whether all the levels of a
-            MultiIndex have the same dtype.
+            MultiIndex have the same dtype.
 
         Examples
         --------
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -249,6 +249,19 @@ def attrs(self) -> Dict[Optional[Hashable], Any]:
     def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None:
         self._attrs = dict(value)
 
+    @property
+    def _policy(self):
+        """ return my policy for internal implementation """
+        return self._data.policy
+
+    @_policy.setter
+    def _policy(self, value):
+        """
+        set my policy for internal implementation
+        should only set the property for state purposes
+        """
+        self._data.policy = value
+
     @classmethod
     def _validate_dtype(cls, dtype):
         """ validate the passed dtype """
@@ -1822,6 +1835,7 @@ def __getstate__(self) -> Dict[str, Any]:
             _typ=self._typ,
             _metadata=self._metadata,
             attrs=self.attrs,
+            _policy=self._policy,
             **meta,
         )
 
@@ -5682,7 +5696,7 @@ def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries:
         dtype: object
         """
         data = self._data.copy(deep=deep)
-        return self._constructor(data).__finalize__(self)
+        return self._constructor(data, policy=self._policy).__finalize__(self)
 
     def __copy__(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries:
         return self.copy(deep=deep)
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -314,6 +314,10 @@ def getitem_block(self, slicer, new_mgr_locs=None):
 
         return self.make_block_same_class(new_values, new_mgr_locs)
 
+    @property
+    def base(self):
+        return self.values.base
+
     @property
     def shape(self):
         return self.values.shape
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -53,7 +53,7 @@
 # BlockManager Interface
 
 
-def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
+def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None, policy=None):
     """
     Segregate Series based on type and coerce into matrices.
 
@@ -71,7 +71,7 @@ def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
     # from BlockManager perspective
     axes = [ensure_index(columns), index]
 
-    return create_block_manager_from_arrays(arrays, arr_names, axes)
+    return create_block_manager_from_arrays(arrays, arr_names, axes, policy=policy)
 
 
 def masked_rec_array_to_mgr(data, index, columns, dtype, copy: bool):
@@ -209,7 +209,7 @@ def init_ndarray(values, index, columns, dtype=None, copy=False):
     return create_block_manager_from_blocks(block_values, [columns, index])
 
 
-def init_dict(data, index, columns, dtype=None):
+def init_dict(data, index, columns, dtype=None, policy=None):
     """
     Segregate Series based on type and coerce into matrices.
     Needs to handle a lot of exceptional cases.
@@ -250,7 +250,7 @@ def init_dict(data, index, columns, dtype=None):
         arrays = [
             arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
         ]
-    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
+    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, policy=policy)
 
 
 # ---------------------------------------------------------------------
diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
diff --git a/pandas/core/series.py b/pandas/core/series.py