From 71e8b3169b77149923afd39e70fb52c4277d7ebb Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 4 Apr 2019 00:02:54 +0100 Subject: [PATCH 1/9] to_html formatter not called for float values in a mixed-type column --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/io/formats/format.py | 17 +++++++++++------ pandas/tests/io/formats/test_to_html.py | 10 ++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 1ef05ae5f9c6b..e01185b54f49d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -344,6 +344,7 @@ I/O ^^^ - Bug in :func:`DataFrame.to_html()` where values were truncated using display options instead of outputting the full content (:issue:`17004`) +- Bug in :meth:`DataFrame.to_html` that would ignore ``formatters`` argument for float values in a column with ``dtype=object`` (:issue:`13021`) - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b658c8a53dc8b..e2f01e03e3cef 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -980,6 +980,17 @@ def _format(x): if leading_space is None: leading_space = is_float_type.any() + if leading_space is False: + # False specifically, so that the default is + # to include a space if we get here. + tpl = '{v}' + else: + tpl = ' {v}' + + # shortcut + if self.formatter is not None: + return [tpl.format(v=self.formatter(x)) for x in self.values] + fmt_values = [] for i, v in enumerate(vals): if not is_float_type[i] and leading_space: @@ -987,12 +998,6 @@ def _format(x): elif is_float_type[i]: fmt_values.append(float_format(v)) else: - if leading_space is False: - # False specifically, so that the default is - # to include a space if we get here. - tpl = '{v}' - else: - tpl = ' {v}' fmt_values.append(tpl.format(v=_format(v))) return fmt_values diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index d146e9c16e114..436dcff0aed60 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -633,3 +633,13 @@ def test_to_html_invalid_classes_type(classes): with pytest.raises(TypeError, match=msg): df.to_html(classes=classes) + + +def test_to_html_formatters_object_type(): + # GH 13021 + def f(x): + return x if type(x) is str else '${:,.0f}'.format(x) + + df = pd.DataFrame([['a'], [0], [10.4], [3]], columns=['x']) + result = df.to_html(formatters=dict(x=f)) + assert '$10' in result From 2a2bb57a9ee71c6686838df2ed47733865b015b0 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 4 Apr 2019 20:30:28 +0100 Subject: [PATCH 2/9] changes to test as requested --- .../data/html/gh13021_expected_output.html | 26 +++++++++++++++++++ pandas/tests/io/formats/test_to_html.py | 7 ++--- 2 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 pandas/tests/io/formats/data/html/gh13021_expected_output.html diff --git a/pandas/tests/io/formats/data/html/gh13021_expected_output.html b/pandas/tests/io/formats/data/html/gh13021_expected_output.html new file mode 100644 index 0000000000000..55d864d8a0d3d --- /dev/null +++ b/pandas/tests/io/formats/data/html/gh13021_expected_output.html @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + +
x
0a
1$0
2$10
3$3
diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 436dcff0aed60..29da4de36bf66 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -635,11 +635,12 @@ def test_to_html_invalid_classes_type(classes): df.to_html(classes=classes) -def test_to_html_formatters_object_type(): +def test_to_html_formatters_object_type(datapath): # GH 13021 def f(x): - return x if type(x) is str else '${:,.0f}'.format(x) + return x if isinstance(x, str) else '${:,.0f}'.format(x) df = pd.DataFrame([['a'], [0], [10.4], [3]], columns=['x']) result = df.to_html(formatters=dict(x=f)) - assert '$10' in result + expected = expected_html(datapath, 'gh13021_expected_output') + assert result == expected From 4b60e4bbb84f024cb026075bbafffbfecfb4c29a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 5 Apr 2019 22:06:43 +0100 Subject: [PATCH 3/9] shortcut format_array --- pandas/io/formats/format.py | 34 ++++++++++++++---------- pandas/tests/io/formats/test_format.py | 8 +++--- pandas/tests/io/formats/test_to_latex.py | 8 +++--- 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index e8fcae809f1d6..b58d3b6de3306 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -717,9 +717,20 @@ def _format_col(self, i): frame = self.tr_frame formatter = self._get_formatter(i) values_to_format = frame.iloc[:, i]._formatting_values() - return format_array(values_to_format, formatter, - float_format=self.float_format, na_rep=self.na_rep, - space=self.col_space, decimal=self.decimal) + if formatter: + try: + fmt_values = [formatter(x) for x in values_to_format] + except AttributeError: + # assume we have np.datetime64 array + values_to_format = DatetimeIndex(values_to_format) + fmt_values = [formatter(x) for x in values_to_format] + return _make_fixed_width(fmt_values, self.justify) + + else: + return format_array( + values_to_format, formatter=None, + float_format=self.float_format, na_rep=self.na_rep, + space=self.col_space, decimal=self.decimal) def to_html(self, classes=None, notebook=False, border=None): """ @@ -981,17 +992,6 @@ def _format(x): if leading_space is None: leading_space = is_float_type.any() - if leading_space is False: - # False specifically, so that the default is - # to include a space if we get here. - tpl = '{v}' - else: - tpl = ' {v}' - - # shortcut - if self.formatter is not None: - return [tpl.format(v=self.formatter(x)) for x in self.values] - fmt_values = [] for i, v in enumerate(vals): if not is_float_type[i] and leading_space: @@ -999,6 +999,12 @@ def _format(x): elif is_float_type[i]: fmt_values.append(float_format(v)) else: + if leading_space is False: + # False specifically, so that the default is + # to include a space if we get here. + tpl = '{v}' + else: + tpl = ' {v}' fmt_values.append(tpl.format(v=_format(v))) return fmt_values diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index a960dcb9d164a..bf0d539ff616c 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -511,10 +511,10 @@ def test_to_string_with_formatters(self): ('object', lambda x: '-{x!s}-'.format(x=x))] result = df.to_string(formatters=dict(formatters)) result2 = df.to_string(formatters=lzip(*formatters)[1]) - assert result == (' int float object\n' - '0 0x1 [ 1.0] -(1, 2)-\n' - '1 0x2 [ 2.0] -True-\n' - '2 0x3 [ 3.0] -False-') + assert result == (' int float object\n' + '0 0x1 [ 1.0] -(1, 2)-\n' + '1 0x2 [ 2.0] -True-\n' + '2 0x3 [ 3.0] -False-') assert result == result2 def test_to_string_with_datetime64_monthformatter(self): diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index a2b65dab9a0a2..4e76fdf845619 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -125,11 +125,11 @@ def test_to_latex_with_formatters(self): expected = r"""\begin{tabular}{llrrl} \toprule -{} & datetime64 & float & int & object \\ +{} & datetime64 & float & int & object \\ \midrule -index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ -index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ -index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ +index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ +index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ +index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ \bottomrule \end{tabular} """ From 8a64459ba2aae22337c7c9df0e76b3e05c833e69 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 5 Apr 2019 23:12:50 +0100 Subject: [PATCH 4/9] add shortcut parameter to format_array --- pandas/io/formats/format.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b58d3b6de3306..865f0f5457336 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -717,20 +717,11 @@ def _format_col(self, i): frame = self.tr_frame formatter = self._get_formatter(i) values_to_format = frame.iloc[:, i]._formatting_values() - if formatter: - try: - fmt_values = [formatter(x) for x in values_to_format] - except AttributeError: - # assume we have np.datetime64 array - values_to_format = DatetimeIndex(values_to_format) - fmt_values = [formatter(x) for x in values_to_format] - return _make_fixed_width(fmt_values, self.justify) - - else: - return format_array( - values_to_format, formatter=None, - float_format=self.float_format, na_rep=self.na_rep, - space=self.col_space, decimal=self.decimal) + shortcut = formatter is not None + return format_array(values_to_format, formatter, + float_format=self.float_format, na_rep=self.na_rep, + space=self.col_space, decimal=self.decimal, + shortcut=shortcut) def to_html(self, classes=None, notebook=False, border=None): """ @@ -867,7 +858,7 @@ def _get_column_name_list(self): def format_array(values, formatter, float_format=None, na_rep='NaN', digits=None, space=None, justify='right', decimal='.', - leading_space=None): + leading_space=None, shortcut=False): """ Format an array for printing. @@ -889,6 +880,9 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', When formatting an Index subclass (e.g. IntervalIndex._format_native_types), we don't want the leading space since it should be left-aligned. + shortcut : bool, optional, default False + Whether to shortcut the formatting options. Used when specifying + custom formatters in to_string, to_latex and to_html Returns ------- @@ -922,7 +916,7 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', fmt_obj = fmt_klass(values, digits=digits, na_rep=na_rep, float_format=float_format, formatter=formatter, space=space, justify=justify, decimal=decimal, - leading_space=leading_space) + leading_space=leading_space, shortcut=shortcut) return fmt_obj.get_result() @@ -931,7 +925,8 @@ class GenericArrayFormatter(object): def __init__(self, values, digits=7, formatter=None, na_rep='NaN', space=12, float_format=None, justify='right', decimal='.', - quoting=None, fixed_width=True, leading_space=None): + quoting=None, fixed_width=True, leading_space=None, + shortcut=False): self.values = values self.digits = digits self.na_rep = na_rep @@ -943,12 +938,17 @@ def __init__(self, values, digits=7, formatter=None, na_rep='NaN', self.quoting = quoting self.fixed_width = fixed_width self.leading_space = leading_space + self.shortcut = shortcut def get_result(self): fmt_values = self._format_strings() return _make_fixed_width(fmt_values, self.justify) def _format_strings(self): + # shortcut + if self.formatter is not None and self.shortcut: + return [self.formatter(x) for x in self.values] + if self.float_format is None: float_format = get_option("display.float_format") if float_format is None: From 9c1354c7bcfa18047277ef3fb408f90eb0fbf854 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 5 Apr 2019 23:42:27 +0100 Subject: [PATCH 5/9] add whatsnew for #26002 --- doc/source/whatsnew/v0.25.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index f4d236487eaa9..547c0785bba3e 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -348,6 +348,7 @@ I/O - Bug in :func:`DataFrame.to_html()` where values were truncated using display options instead of outputting the full content (:issue:`17004`) - Bug in :meth:`DataFrame.to_html` that would ignore ``formatters`` argument for float values in a column with ``dtype=object`` (:issue:`13021`) +- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would add a leading space when using ``formatters`` on a column with ``dtype=object`` (:issue:`26002`) - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) From d0df1d63d9596bcedf13a2bd820152b9eb1391fe Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 6 Apr 2019 23:48:57 +0100 Subject: [PATCH 6/9] remove shortcut parameter from format_array --- pandas/core/indexes/interval.py | 1 + pandas/io/formats/format.py | 24 ++++++++++-------------- pandas/tests/io/formats/test_format.py | 8 ++++---- pandas/tests/io/formats/test_to_latex.py | 8 ++++---- 4 files changed, 19 insertions(+), 22 deletions(-) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index ffbed7ab2006d..37b3432268ff8 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1019,6 +1019,7 @@ def _format_native_types(self, na_rep='NaN', quoting=None, **kwargs): """ actually format my specific types """ from pandas.io.formats.format import ExtensionArrayFormatter return ExtensionArrayFormatter(values=self, + formatter=False, na_rep=na_rep, justify='all', leading_space=False).get_result() diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 865f0f5457336..9a0ec2342f57e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -717,11 +717,9 @@ def _format_col(self, i): frame = self.tr_frame formatter = self._get_formatter(i) values_to_format = frame.iloc[:, i]._formatting_values() - shortcut = formatter is not None return format_array(values_to_format, formatter, float_format=self.float_format, na_rep=self.na_rep, - space=self.col_space, decimal=self.decimal, - shortcut=shortcut) + space=self.col_space, decimal=self.decimal) def to_html(self, classes=None, notebook=False, border=None): """ @@ -858,7 +856,7 @@ def _get_column_name_list(self): def format_array(values, formatter, float_format=None, na_rep='NaN', digits=None, space=None, justify='right', decimal='.', - leading_space=None, shortcut=False): + leading_space=None): """ Format an array for printing. @@ -880,9 +878,6 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', When formatting an Index subclass (e.g. IntervalIndex._format_native_types), we don't want the leading space since it should be left-aligned. - shortcut : bool, optional, default False - Whether to shortcut the formatting options. Used when specifying - custom formatters in to_string, to_latex and to_html Returns ------- @@ -916,7 +911,7 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', fmt_obj = fmt_klass(values, digits=digits, na_rep=na_rep, float_format=float_format, formatter=formatter, space=space, justify=justify, decimal=decimal, - leading_space=leading_space, shortcut=shortcut) + leading_space=leading_space) return fmt_obj.get_result() @@ -925,8 +920,7 @@ class GenericArrayFormatter(object): def __init__(self, values, digits=7, formatter=None, na_rep='NaN', space=12, float_format=None, justify='right', decimal='.', - quoting=None, fixed_width=True, leading_space=None, - shortcut=False): + quoting=None, fixed_width=True, leading_space=None): self.values = values self.digits = digits self.na_rep = na_rep @@ -938,7 +932,6 @@ def __init__(self, values, digits=7, formatter=None, na_rep='NaN', self.quoting = quoting self.fixed_width = fixed_width self.leading_space = leading_space - self.shortcut = shortcut def get_result(self): fmt_values = self._format_strings() @@ -946,8 +939,8 @@ def get_result(self): def _format_strings(self): # shortcut - if self.formatter is not None and self.shortcut: - return [self.formatter(x) for x in self.values] + if self.formatter is not None: + return [' {}'.format(self.formatter(x)) for x in self.values] if self.float_format is None: float_format = get_option("display.float_format") @@ -1199,7 +1192,10 @@ def _format_strings(self): if isinstance(values, (ABCIndexClass, ABCSeries)): values = values._values - formatter = values._formatter(boxed=True) + if self.formatter is None: + formatter = values._formatter(boxed=True) + elif self.formatter is False: + formatter = None if is_categorical_dtype(values.dtype): # Categorical is special for now, so that we can preserve tzinfo diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index bf0d539ff616c..a960dcb9d164a 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -511,10 +511,10 @@ def test_to_string_with_formatters(self): ('object', lambda x: '-{x!s}-'.format(x=x))] result = df.to_string(formatters=dict(formatters)) result2 = df.to_string(formatters=lzip(*formatters)[1]) - assert result == (' int float object\n' - '0 0x1 [ 1.0] -(1, 2)-\n' - '1 0x2 [ 2.0] -True-\n' - '2 0x3 [ 3.0] -False-') + assert result == (' int float object\n' + '0 0x1 [ 1.0] -(1, 2)-\n' + '1 0x2 [ 2.0] -True-\n' + '2 0x3 [ 3.0] -False-') assert result == result2 def test_to_string_with_datetime64_monthformatter(self): diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 4e76fdf845619..a2b65dab9a0a2 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -125,11 +125,11 @@ def test_to_latex_with_formatters(self): expected = r"""\begin{tabular}{llrrl} \toprule -{} & datetime64 & float & int & object \\ +{} & datetime64 & float & int & object \\ \midrule -index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ -index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ -index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ +index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ +index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ +index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ \bottomrule \end{tabular} """ From 4262113959cc42e3dbefc11d2230ce0d4a202c33 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 6 Apr 2019 23:51:02 +0100 Subject: [PATCH 7/9] remove whatsnew for #26002 --- doc/source/whatsnew/v0.25.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 547c0785bba3e..f4d236487eaa9 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -348,7 +348,6 @@ I/O - Bug in :func:`DataFrame.to_html()` where values were truncated using display options instead of outputting the full content (:issue:`17004`) - Bug in :meth:`DataFrame.to_html` that would ignore ``formatters`` argument for float values in a column with ``dtype=object`` (:issue:`13021`) -- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` that would add a leading space when using ``formatters`` on a column with ``dtype=object`` (:issue:`26002`) - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) From f0cf9b7f2b5f40f94f3dd4999cc4ee940d9c775e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 7 Apr 2019 01:13:10 +0100 Subject: [PATCH 8/9] defer to GenericArrayFormatter for IntervalArray --- pandas/core/arrays/interval.py | 4 ++++ pandas/core/indexes/interval.py | 1 - pandas/io/formats/format.py | 5 +---- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 7b20c7e1b6336..f800a261ff7ed 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -804,6 +804,10 @@ def value_counts(self, dropna=True): # Formatting + def _formatter(self, boxed=False): + # Defer to GenericArrayFormatter's formatter. + return None + def _format_data(self): # TODO: integrate with categorical and make generic diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 37b3432268ff8..ffbed7ab2006d 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1019,7 +1019,6 @@ def _format_native_types(self, na_rep='NaN', quoting=None, **kwargs): """ actually format my specific types """ from pandas.io.formats.format import ExtensionArrayFormatter return ExtensionArrayFormatter(values=self, - formatter=False, na_rep=na_rep, justify='all', leading_space=False).get_result() diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 9a0ec2342f57e..421093a017414 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1192,10 +1192,7 @@ def _format_strings(self): if isinstance(values, (ABCIndexClass, ABCSeries)): values = values._values - if self.formatter is None: - formatter = values._formatter(boxed=True) - elif self.formatter is False: - formatter = None + formatter = values._formatter(boxed=True) if is_categorical_dtype(values.dtype): # Categorical is special for now, so that we can preserve tzinfo From 5ecf91a79d9a46ff3d32750b57b31e6d1bbd0d8d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 18 Jun 2019 16:00:45 +0100 Subject: [PATCH 9/9] pre-format instead of shortcut --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/arrays/categorical.py | 6 +-- pandas/core/arrays/integer.py | 2 +- pandas/core/indexes/base.py | 2 +- pandas/io/formats/format.py | 54 +++++++++++------------- pandas/tests/arrays/test_integer.py | 8 ++++ pandas/tests/io/formats/test_format.py | 38 ++++++++--------- pandas/tests/io/formats/test_to_latex.py | 8 ++-- 8 files changed, 60 insertions(+), 59 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d01979f5874db..a4a82db3566cf 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -654,6 +654,7 @@ I/O - Bug in :func:`DataFrame.to_html()` where values were truncated using display options instead of outputting the full content (:issue:`17004`) - Bug in :meth:`DataFrame.to_html` that would ignore ``formatters`` argument for float values in a column with ``dtype=object`` (:issue:`13021`) +- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` resulting in incorrect column spacing using ``formatters`` on a column with ``dtype=object`` (:issue:`26002`) - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c079b860bb924..ce0ef607ce01d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1973,11 +1973,11 @@ def _repr_categories(self): from pandas.io.formats import format as fmt if len(self.categories) > max_categories: num = max_categories // 2 - head = fmt.format_array(self.categories[:num], None) - tail = fmt.format_array(self.categories[-num:], None) + head = fmt.format_array(self.categories[:num]) + tail = fmt.format_array(self.categories[-num:]) category_strs = head + ["..."] + tail else: - category_strs = fmt.format_array(self.categories, None) + category_strs = fmt.format_array(self.categories) # Strip all leading spaces, which format_array adds for columns... category_strs = [x.strip() for x in category_strs] diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 589e98f016f69..40cecb9ce0d4f 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -315,7 +315,7 @@ def _from_factorized(cls, values, original): def _formatter(self, boxed=False): def fmt(x): if isna(x): - return 'NaN' + return np.nan if boxed else 'NaN' return str(x) return fmt diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4601d63f2d27e..3e3569e0bd2c7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1041,7 +1041,7 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): result = result.tolist() else: - result = _trim_front(format_array(values, None, justify='left')) + result = _trim_front(format_array(values, justify='left')) return header + result def to_native_types(self, slicer=None, **kwargs): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 3425a751508d2..87567844ffab5 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -129,7 +129,7 @@ def _get_footer(self): return str(footer) def _get_formatted_values(self): - return format_array(self.categorical.get_values(), None, + return format_array(self.categorical.get_values(), float_format=None, na_rep=self.na_rep) def to_string(self): @@ -249,7 +249,7 @@ def _get_formatted_index(self): def _get_formatted_values(self): values_to_format = self.tr_series._formatting_values() - return format_array(values_to_format, None, + return format_array(values_to_format, float_format=self.float_format, na_rep=self.na_rep) def to_string(self): @@ -853,7 +853,7 @@ def _get_column_name_list(self): # Array formatters -def format_array(values, formatter, float_format=None, na_rep='NaN', +def format_array(values, formatter=None, float_format=None, na_rep='NaN', digits=None, space=None, justify='right', decimal='.', leading_space=None): """ @@ -883,6 +883,17 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', List[str] """ + if is_datetime64_dtype(values.dtype): + if not isinstance(values, DatetimeIndex): + values = DatetimeIndex(values) + + # we apply an optional formatter function passed as a parameter before + # applying additional formatting parameters. This allows EA to control + # formatting and also honour additional formatting options. + # We specify dtype and dispatch to GenericArrayFormatter. + if formatter is not None and callable(formatter): + values = np.array([formatter(x) for x in values], dtype=object) + if is_datetime64_dtype(values.dtype): fmt_klass = Datetime64Formatter elif is_datetime64tz_dtype(values): @@ -908,7 +919,7 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', digits = get_option("display.precision") fmt_obj = fmt_klass(values, digits=digits, na_rep=na_rep, - float_format=float_format, formatter=formatter, + float_format=float_format, space=space, justify=justify, decimal=decimal, leading_space=leading_space) @@ -917,14 +928,13 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', class GenericArrayFormatter: - def __init__(self, values, digits=7, formatter=None, na_rep='NaN', + def __init__(self, values, digits=7, na_rep='NaN', space=12, float_format=None, justify='right', decimal='.', quoting=None, fixed_width=True, leading_space=None): self.values = values self.digits = digits self.na_rep = na_rep self.space = space - self.formatter = formatter self.float_format = float_format self.justify = justify self.decimal = decimal @@ -937,10 +947,6 @@ def get_result(self): return _make_fixed_width(fmt_values, self.justify) def _format_strings(self): - # shortcut - if self.formatter is not None: - return [' {}'.format(self.formatter(x)) for x in self.values] - if self.float_format is None: float_format = get_option("display.float_format") if float_format is None: @@ -950,9 +956,7 @@ def _format_strings(self): else: float_format = self.float_format - formatter = ( - self.formatter if self.formatter is not None else - (lambda x: pprint_thing(x, escape_chars=('\t', '\r', '\n')))) + formatter = lambda x: pprint_thing(x, escape_chars=('\t', '\r', '\n')) def _format(x): if self.na_rep is not None and is_scalar(x) and isna(x): @@ -1008,7 +1012,8 @@ class FloatArrayFormatter(GenericArrayFormatter): """ def __init__(self, *args, **kwargs): - GenericArrayFormatter.__init__(self, *args, **kwargs) + super().__init__(*args, **kwargs) + self.formatter = None # float_format is expected to be a string # formatter should be used to pass a function @@ -1066,9 +1071,6 @@ def get_result_as_array(self): the parameters given at initialisation, as a numpy array """ - if self.formatter is not None: - return np.array([self.formatter(x) for x in self.values]) - if self.fixed_width: threshold = get_option("display.chop_threshold") else: @@ -1158,7 +1160,7 @@ def _format_strings(self): class IntArrayFormatter(GenericArrayFormatter): def _format_strings(self): - formatter = self.formatter or (lambda x: '{x: d}'.format(x=x)) + formatter = lambda x: '{x: d}'.format(x=x) fmt_values = [formatter(x) for x in self.values] return fmt_values @@ -1175,12 +1177,6 @@ def _format_strings(self): values = self.values - if not isinstance(values, DatetimeIndex): - values = DatetimeIndex(values) - - if self.formatter is not None and callable(self.formatter): - return [self.formatter(x) for x in values] - fmt_values = format_array_from_datetime( values.asi8.ravel(), format=_get_format_datetime64_from_values(values, @@ -1341,9 +1337,8 @@ def _format_strings(self): values = self.values.astype(object) is_dates_only = _is_dates_only(values) - formatter = (self.formatter or - _get_format_datetime64(is_dates_only, - date_format=self.date_format)) + formatter = _get_format_datetime64(is_dates_only, + date_format=self.date_format) fmt_values = [formatter(x) for x in values] return fmt_values @@ -1357,9 +1352,8 @@ def __init__(self, values, nat_rep='NaT', box=False, **kwargs): self.box = box def _format_strings(self): - formatter = (self.formatter or - _get_format_timedelta64(self.values, nat_rep=self.nat_rep, - box=self.box)) + formatter = _get_format_timedelta64(self.values, nat_rep=self.nat_rep, + box=self.box) fmt_values = np.array([formatter(x) for x in self.values]) return fmt_values diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 066eadc9b68bc..62e625c20915d 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -489,6 +489,14 @@ def test_frame_repr(data_missing): assert result == expected +def test_frame_to_string_na_rep(data_missing): + + df = pd.DataFrame({'A': data_missing}) + result = df.to_string(na_rep='foo') + expected = ' A\n0 foo\n1 1' + assert result == expected + + def test_conversions(data_missing): # astype to object series diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index edb7c2136825d..8714e7dab3f79 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -498,10 +498,10 @@ def test_to_string_with_formatters(self): ('object', lambda x: '-{x!s}-'.format(x=x))] result = df.to_string(formatters=dict(formatters)) result2 = df.to_string(formatters=list(zip(*formatters))[1]) - assert result == (' int float object\n' - '0 0x1 [ 1.0] -(1, 2)-\n' - '1 0x2 [ 2.0] -True-\n' - '2 0x3 [ 3.0] -False-') + assert result == (' int float object\n' + '0 0x1 [ 1.0] -(1, 2)-\n' + '1 0x2 [ 2.0] -True-\n' + '2 0x3 [ 3.0] -False-') assert result == result2 def test_to_string_with_datetime64_monthformatter(self): @@ -511,7 +511,7 @@ def test_to_string_with_datetime64_monthformatter(self): def format_func(x): return x.strftime('%Y-%m') result = x.to_string(formatters={'months': format_func}) - expected = 'months\n0 2016-01\n1 2016-02' + expected = 'months\n0 2016-01\n1 2016-02' assert result.strip() == expected def test_to_string_with_datetime64_hourformatter(self): @@ -523,7 +523,7 @@ def format_func(x): return x.strftime('%H:%M') result = x.to_string(formatters={'hod': format_func}) - expected = 'hod\n0 10:10\n1 12:12' + expected = 'hod\n0 10:10\n1 12:12' assert result.strip() == expected def test_to_string_with_formatters_unicode(self): @@ -2547,19 +2547,19 @@ class TestDatetime64Formatter: def test_mixed(self): x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), pd.NaT]) - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "2013-01-01 00:00:00" assert result[1].strip() == "2013-01-01 12:00:00" def test_dates(self): x = Series([datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT]) - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "2013-01-01" assert result[1].strip() == "2013-01-02" def test_date_nanos(self): x = Series([Timestamp(200)]) - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "1970-01-01 00:00:00.000000200" def test_dates_display(self): @@ -2568,35 +2568,35 @@ def test_dates_display(self): # make sure that we are consistently display date formatting x = Series(date_range('20130101 09:00:00', periods=5, freq='D')) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-05 09:00:00" x = Series(date_range('20130101 09:00:00', periods=5, freq='s')) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:04" x = Series(date_range('20130101 09:00:00', periods=5, freq='ms')) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "2013-01-01 09:00:00.000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.004" x = Series(date_range('20130101 09:00:00', periods=5, freq='us')) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "2013-01-01 09:00:00.000000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.000004" x = Series(date_range('20130101 09:00:00', periods=5, freq='N')) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "2013-01-01 09:00:00.000000000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.000000004" @@ -2607,9 +2607,8 @@ def test_datetime64formatter_yearmonth(self): def format_func(x): return x.strftime('%Y-%m') - formatter = fmt.Datetime64Formatter(x, formatter=format_func) - result = formatter.get_result() - assert result == ['2016-01', '2016-02'] + result = fmt.format_array(x, formatter=format_func) + assert result == [' 2016-01', ' 2016-02'] def test_datetime64formatter_hoursecond(self): @@ -2619,9 +2618,8 @@ def test_datetime64formatter_hoursecond(self): def format_func(x): return x.strftime('%H:%M') - formatter = fmt.Datetime64Formatter(x, formatter=format_func) - result = formatter.get_result() - assert result == ['10:10', '12:12'] + result = fmt.format_array(x, formatter=format_func) + assert result == [' 10:10', ' 12:12'] class TestNaTFormatting: diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index b9f28ec36d021..280bf8b84feca 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -120,11 +120,11 @@ def test_to_latex_with_formatters(self): expected = r"""\begin{tabular}{llrrl} \toprule -{} & datetime64 & float & int & object \\ +{} & datetime64 & float & int & object \\ \midrule -index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ -index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ -index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ +index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ +index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ +index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ \bottomrule \end{tabular} """