diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 2b1a61186dca6..a4a82db3566cf 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -653,6 +653,8 @@ I/O ^^^ - Bug in :func:`DataFrame.to_html()` where values were truncated using display options instead of outputting the full content (:issue:`17004`) +- Bug in :meth:`DataFrame.to_html` that would ignore ``formatters`` argument for float values in a column with ``dtype=object`` (:issue:`13021`) +- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` resulting in incorrect column spacing using ``formatters`` on a column with ``dtype=object`` (:issue:`26002`) - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c079b860bb924..ce0ef607ce01d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1973,11 +1973,11 @@ def _repr_categories(self): from pandas.io.formats import format as fmt if len(self.categories) > max_categories: num = max_categories // 2 - head = fmt.format_array(self.categories[:num], None) - tail = fmt.format_array(self.categories[-num:], None) + head = fmt.format_array(self.categories[:num]) + tail = fmt.format_array(self.categories[-num:]) category_strs = head + ["..."] + tail else: - category_strs = fmt.format_array(self.categories, None) + category_strs = fmt.format_array(self.categories) # Strip all leading spaces, which format_array adds for columns... category_strs = [x.strip() for x in category_strs] diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 589e98f016f69..40cecb9ce0d4f 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -315,7 +315,7 @@ def _from_factorized(cls, values, original): def _formatter(self, boxed=False): def fmt(x): if isna(x): - return 'NaN' + return np.nan if boxed else 'NaN' return str(x) return fmt diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 4f628eff43167..01ab450cd5bf2 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -809,6 +809,10 @@ def value_counts(self, dropna=True): # Formatting + def _formatter(self, boxed=False): + # Defer to GenericArrayFormatter's formatter. + return None + def _format_data(self): # TODO: integrate with categorical and make generic diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4601d63f2d27e..3e3569e0bd2c7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1041,7 +1041,7 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): result = result.tolist() else: - result = _trim_front(format_array(values, None, justify='left')) + result = _trim_front(format_array(values, justify='left')) return header + result def to_native_types(self, slicer=None, **kwargs): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 8655fb05f34e2..87567844ffab5 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -129,7 +129,7 @@ def _get_footer(self): return str(footer) def _get_formatted_values(self): - return format_array(self.categorical.get_values(), None, + return format_array(self.categorical.get_values(), float_format=None, na_rep=self.na_rep) def to_string(self): @@ -249,7 +249,7 @@ def _get_formatted_index(self): def _get_formatted_values(self): values_to_format = self.tr_series._formatting_values() - return format_array(values_to_format, None, + return format_array(values_to_format, float_format=self.float_format, na_rep=self.na_rep) def to_string(self): @@ -853,7 +853,7 @@ def _get_column_name_list(self): # Array formatters -def format_array(values, formatter, float_format=None, na_rep='NaN', +def format_array(values, formatter=None, float_format=None, na_rep='NaN', digits=None, space=None, justify='right', decimal='.', leading_space=None): """ @@ -883,6 +883,17 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', List[str] """ + if is_datetime64_dtype(values.dtype): + if not isinstance(values, DatetimeIndex): + values = DatetimeIndex(values) + + # we apply an optional formatter function passed as a parameter before + # applying additional formatting parameters. This allows EA to control + # formatting and also honour additional formatting options. + # We specify dtype and dispatch to GenericArrayFormatter. + if formatter is not None and callable(formatter): + values = np.array([formatter(x) for x in values], dtype=object) + if is_datetime64_dtype(values.dtype): fmt_klass = Datetime64Formatter elif is_datetime64tz_dtype(values): @@ -908,7 +919,7 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', digits = get_option("display.precision") fmt_obj = fmt_klass(values, digits=digits, na_rep=na_rep, - float_format=float_format, formatter=formatter, + float_format=float_format, space=space, justify=justify, decimal=decimal, leading_space=leading_space) @@ -917,14 +928,13 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', class GenericArrayFormatter: - def __init__(self, values, digits=7, formatter=None, na_rep='NaN', + def __init__(self, values, digits=7, na_rep='NaN', space=12, float_format=None, justify='right', decimal='.', quoting=None, fixed_width=True, leading_space=None): self.values = values self.digits = digits self.na_rep = na_rep self.space = space - self.formatter = formatter self.float_format = float_format self.justify = justify self.decimal = decimal @@ -946,9 +956,7 @@ def _format_strings(self): else: float_format = self.float_format - formatter = ( - self.formatter if self.formatter is not None else - (lambda x: pprint_thing(x, escape_chars=('\t', '\r', '\n')))) + formatter = lambda x: pprint_thing(x, escape_chars=('\t', '\r', '\n')) def _format(x): if self.na_rep is not None and is_scalar(x) and isna(x): @@ -1004,7 +1012,8 @@ class FloatArrayFormatter(GenericArrayFormatter): """ def __init__(self, *args, **kwargs): - GenericArrayFormatter.__init__(self, *args, **kwargs) + super().__init__(*args, **kwargs) + self.formatter = None # float_format is expected to be a string # formatter should be used to pass a function @@ -1062,9 +1071,6 @@ def get_result_as_array(self): the parameters given at initialisation, as a numpy array """ - if self.formatter is not None: - return np.array([self.formatter(x) for x in self.values]) - if self.fixed_width: threshold = get_option("display.chop_threshold") else: @@ -1154,7 +1160,7 @@ def _format_strings(self): class IntArrayFormatter(GenericArrayFormatter): def _format_strings(self): - formatter = self.formatter or (lambda x: '{x: d}'.format(x=x)) + formatter = lambda x: '{x: d}'.format(x=x) fmt_values = [formatter(x) for x in self.values] return fmt_values @@ -1171,12 +1177,6 @@ def _format_strings(self): values = self.values - if not isinstance(values, DatetimeIndex): - values = DatetimeIndex(values) - - if self.formatter is not None and callable(self.formatter): - return [self.formatter(x) for x in values] - fmt_values = format_array_from_datetime( values.asi8.ravel(), format=_get_format_datetime64_from_values(values, @@ -1337,9 +1337,8 @@ def _format_strings(self): values = self.values.astype(object) is_dates_only = _is_dates_only(values) - formatter = (self.formatter or - _get_format_datetime64(is_dates_only, - date_format=self.date_format)) + formatter = _get_format_datetime64(is_dates_only, + date_format=self.date_format) fmt_values = [formatter(x) for x in values] return fmt_values @@ -1353,9 +1352,8 @@ def __init__(self, values, nat_rep='NaT', box=False, **kwargs): self.box = box def _format_strings(self): - formatter = (self.formatter or - _get_format_timedelta64(self.values, nat_rep=self.nat_rep, - box=self.box)) + formatter = _get_format_timedelta64(self.values, nat_rep=self.nat_rep, + box=self.box) fmt_values = np.array([formatter(x) for x in self.values]) return fmt_values diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 066eadc9b68bc..62e625c20915d 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -489,6 +489,14 @@ def test_frame_repr(data_missing): assert result == expected +def test_frame_to_string_na_rep(data_missing): + + df = pd.DataFrame({'A': data_missing}) + result = df.to_string(na_rep='foo') + expected = ' A\n0 foo\n1 1' + assert result == expected + + def test_conversions(data_missing): # astype to object series diff --git a/pandas/tests/io/formats/data/html/gh13021_expected_output.html b/pandas/tests/io/formats/data/html/gh13021_expected_output.html new file mode 100644 index 0000000000000..55d864d8a0d3d --- /dev/null +++ b/pandas/tests/io/formats/data/html/gh13021_expected_output.html @@ -0,0 +1,26 @@ +
+ | x | +
---|---|
0 | +a | +
1 | +$0 | +
2 | +$10 | +
3 | +$3 | +