Merge remote-tracking branch 'upstream/master' into excel-read-shared-init-to-baseclass

tdamsma · tdamsma · commit ddcaad80d44e · 2019-04-29T09:16:58.000+02:00
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -365,6 +365,7 @@ I/O
 - Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`)
 - Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested work-arounds (:issue:`25772`)
 - Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`)
+- Improved the ``col_space`` parameter in :meth:`DataFrame.to_html` to accept a string so CSS length values can be set correctly (:issue:`25941`)
 - Fixed bug in loading objects from S3 that contain ``#`` characters in the URL (:issue:`25945`)
 - Adds ``use_bqstorage_api`` parameter to :func:`read_gbq` to speed up downloads of large data frames. This feature requires version 0.10.0 of the ``pandas-gbq`` library as well as the ``google-cloud-bigquery-storage`` and ``fastavro`` libraries. (:issue:`26104`)
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -658,7 +658,9 @@ def _repr_html_(self):
 
     @Substitution(header='Write out the column names. If a list of strings '
                          'is given, it is assumed to be aliases for the '
-                         'column names')
+                         'column names',
+                  col_space_type='int',
+                  col_space='The minimum width of each column')
     @Substitution(shared_params=fmt.common_docstring,
                   returns=fmt.return_docstring)
     def to_string(self, buf=None, columns=None, col_space=None, header=True,
@@ -2138,7 +2140,12 @@ def to_parquet(self, fname, engine='auto', compression='snappy',
                    compression=compression, index=index,
                    partition_cols=partition_cols, **kwargs)
 
-    @Substitution(header='Whether to print column labels, default True')
+    @Substitution(header='Whether to print column labels, default True',
+                  col_space_type='str or int',
+                  col_space='The minimum width of each column in CSS length '
+                            'units.  An int is assumed to be px units.\n\n'
+                            '            .. versionadded:: 0.25.0\n'
+                            '                Ability to use str')
     @Substitution(shared_params=fmt.common_docstring,
                   returns=fmt.return_docstring)
     def to_html(self, buf=None, columns=None, col_space=None, header=True,
diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py
@@ -522,21 +522,17 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
     any_arraylike = any(isinstance(g, (list, tuple, Series, Index, np.ndarray))
                         for g in keys)
 
-    try:
+    # is this an index replacement?
+    if (not any_callable and not any_arraylike and not any_groupers and
+            match_axis_length and level is None):
         if isinstance(obj, DataFrame):
-            all_in_columns_index = all(g in obj.columns or g in obj.index.names
-                                       for g in keys)
+            all_in_columns_index = all(g in obj.columns or g in
+                                       obj.index.names for g in keys)
         elif isinstance(obj, Series):
             all_in_columns_index = all(g in obj.index.names for g in keys)
-        else:
-            all_in_columns_index = False
-    except Exception:
-        all_in_columns_index = False
 
-    if (not any_callable and not all_in_columns_index and
-            not any_arraylike and not any_groupers and
-            match_axis_length and level is None):
-        keys = [com.asarray_tuplesafe(keys)]
+        if not all_in_columns_index:
+            keys = [com.asarray_tuplesafe(keys)]
 
     if isinstance(level, (tuple, list)):
         if key is None:
diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py
@@ -27,13 +27,13 @@
     CategoricalDtype, ExtensionDtype, PandasExtensionDtype)
 from pandas.core.dtypes.generic import (
     ABCDataFrame, ABCDatetimeIndex, ABCExtensionArray, ABCIndexClass,
-    ABCSeries)
+    ABCPandasArray, ABCSeries)
 from pandas.core.dtypes.missing import (
     _isna_compat, array_equivalent, isna, notna)
 
 import pandas.core.algorithms as algos
 from pandas.core.arrays import (
-    Categorical, DatetimeArray, ExtensionArray, TimedeltaArray)
+    Categorical, DatetimeArray, ExtensionArray, PandasDtype, TimedeltaArray)
 from pandas.core.base import PandasObject
 import pandas.core.common as com
 from pandas.core.indexes.datetimes import DatetimeIndex
@@ -576,23 +576,14 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
 
             return self.make_block(Categorical(self.values, dtype=dtype))
 
-        # convert dtypes if needed
         dtype = pandas_dtype(dtype)
+
         # astype processing
         if is_dtype_equal(self.dtype, dtype):
             if copy:
                 return self.copy()
             return self
 
-        klass = None
-        if is_sparse(self.values):
-            # special case sparse, Series[Sparse].astype(object) is sparse
-            klass = ExtensionBlock
-        elif is_object_dtype(dtype):
-            klass = ObjectBlock
-        elif is_extension_array_dtype(dtype):
-            klass = ExtensionBlock
-
         try:
             # force the copy here
             if values is None:
@@ -624,7 +615,7 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
                     pass
 
             newb = make_block(values, placement=self.mgr_locs,
-                              klass=klass, ndim=self.ndim)
+                              ndim=self.ndim)
         except Exception:  # noqa: E722
             if errors == 'raise':
                 raise
@@ -3041,6 +3032,13 @@ def get_block_type(values, dtype=None):
 
 def make_block(values, placement, klass=None, ndim=None, dtype=None,
                fastpath=None):
+    # Ensure that we don't allow PandasArray / PandasDtype in internals.
+    # For now, blocks should be backed by ndarrays when possible.
+    if isinstance(values, ABCPandasArray):
+        values = values.to_numpy()
+    if isinstance(dtype, PandasDtype):
+        dtype = dtype.numpy_dtype
+
     if fastpath is not None:
         # GH#19265 pyarrow is passing this
         warnings.warn("fastpath argument is deprecated, will be removed "
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
@@ -41,8 +41,8 @@
             Buffer to write to.
         columns : sequence, optional, default None
             The subset of columns to write. Writes all columns by default.
-        col_space : int, optional
-            The minimum width of each column.
+        col_space : %(col_space_type)s, optional
+            %(col_space)s.
         header : bool, optional
             %(header)s.
         index : bool, optional, default True
diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py
@@ -45,6 +45,9 @@ def __init__(self, formatter, classes=None, border=None):
         self.border = border
         self.table_id = self.fmt.table_id
         self.render_links = self.fmt.render_links
+        if isinstance(self.fmt.col_space, int):
+            self.fmt.col_space = ('{colspace}px'
+                                  .format(colspace=self.fmt.col_space))
 
     @property
     def show_row_idx_names(self):
@@ -84,8 +87,30 @@ def write(self, s, indent=0):
         rs = pprint_thing(s)
         self.elements.append(' ' * indent + rs)
 
-    def write_th(self, s, indent=0, tags=None):
-        if self.fmt.col_space is not None and self.fmt.col_space > 0:
+    def write_th(self, s, header=False, indent=0, tags=None):
+        """
+        Method for writing a formatted <th> cell.
+
+        If col_space is set on the formatter then that is used for
+        the value of min-width.
+
+        Parameters
+        ----------
+        s : object
+            The data to be written inside the cell.
+        header : boolean, default False
+            Set to True if the <th> is for use inside <thead>.  This will
+            cause min-width to be set if there is one.
+        indent : int, default 0
+            The indentation level of the cell.
+        tags : string, default None
+            Tags to include in the cell.
+
+        Returns
+        -------
+        A written <th> cell.
+        """
+        if header and self.fmt.col_space is not None:
             tags = (tags or "")
             tags += ('style="min-width: {colspace};"'
                      .format(colspace=self.fmt.col_space))
@@ -136,7 +161,7 @@ def write_tr(self, line, indent=0, indent_delta=0, header=False,
         for i, s in enumerate(line):
             val_tag = tags.get(i, None)
             if header or (self.bold_rows and i < nindex_levels):
-                self.write_th(s, indent, tags=val_tag)
+                self.write_th(s, indent=indent, header=header, tags=val_tag)
             else:
                 self.write_td(s, indent, tags=val_tag)
 
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
@@ -1291,3 +1291,23 @@ def test_block_shape():
 
     assert (a._data.blocks[0].mgr_locs.indexer ==
             b._data.blocks[0].mgr_locs.indexer)
+
+
+def test_make_block_no_pandas_array():
+    # https://github.com/pandas-dev/pandas/pull/24866
+    arr = pd.array([1, 2])
+
+    # PandasArray, no dtype
+    result = make_block(arr, slice(len(arr)))
+    assert result.is_integer is True
+    assert result.is_extension is False
+
+    # PandasArray, PandasDtype
+    result = make_block(arr, slice(len(arr)), dtype=arr.dtype)
+    assert result.is_integer is True
+    assert result.is_extension is False
+
+    # ndarray, PandasDtype
+    result = make_block(arr.to_numpy(), slice(len(arr)), dtype=arr.dtype)
+    assert result.is_integer is True
+    assert result.is_extension is False
diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py
@@ -641,3 +641,17 @@ def test_to_html_round_column_headers():
         notebook = df.to_html(notebook=True)
     assert "0.55555" in html
     assert "0.556" in notebook
+
+
+@pytest.mark.parametrize("unit", ['100px', '10%', '5em', 150])
+def test_to_html_with_col_space_units(unit):
+    # GH 25941
+    df = DataFrame(np.random.random(size=(1, 3)))
+    result = df.to_html(col_space=unit)
+    result = result.split('tbody')[0]
+    hdrs = [x for x in result.split("\n") if re.search(r"<th[>\s]", x)]
+    if isinstance(unit, int):
+        unit = str(unit) + 'px'
+    for h in hdrs:
+        expected = '<th style="min-width: {unit};">'.format(unit=unit)
+        assert expected in h
diff --git a/pandas/tests/series/test_internals.py b/pandas/tests/series/test_internals.py
@@ -312,6 +312,14 @@ def test_constructor_no_pandas_array(self):
         tm.assert_series_equal(ser, result)
         assert isinstance(result._data.blocks[0], IntBlock)
 
+    def test_astype_no_pandas_dtype(self):
+        # https://github.com/pandas-dev/pandas/pull/24866
+        ser = pd.Series([1, 2], dtype="int64")
+        # Don't have PandasDtype in the public API, so we use `.array.dtype`,
+        # which is a PandasDtype.
+        result = ser.astype(ser.array.dtype)
+        tm.assert_series_equal(result, ser)
+
     def test_from_array(self):
         result = pd.Series(pd.array(['1H', '2H'], dtype='timedelta64[ns]'))
         assert result._data.blocks[0].is_extension is False