diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 2b1a61186dca6..a4a82db3566cf 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -653,6 +653,8 @@ I/O ^^^ - Bug in :func:`DataFrame.to_html()` where values were truncated using display options instead of outputting the full content (:issue:`17004`) +- Bug in :meth:`DataFrame.to_html` that would ignore ``formatters`` argument for float values in a column with ``dtype=object`` (:issue:`13021`) +- Bug in :meth:`DataFrame.to_string` and :meth:`DataFrame.to_latex` resulting in incorrect column spacing using ``formatters`` on a column with ``dtype=object`` (:issue:`26002`) - Fixed bug in missing text when using :meth:`to_clipboard` if copying utf-16 characters in Python 3 on Windows (:issue:`25040`) - Bug in :func:`read_json` for ``orient='table'`` when it tries to infer dtypes by default, which is not applicable as dtypes are already defined in the JSON schema (:issue:`21345`) - Bug in :func:`read_json` for ``orient='table'`` and float index, as it infers index dtype by default, which is not applicable because index dtype is already defined in the JSON schema (:issue:`25433`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c079b860bb924..ce0ef607ce01d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1973,11 +1973,11 @@ def _repr_categories(self): from pandas.io.formats import format as fmt if len(self.categories) > max_categories: num = max_categories // 2 - head = fmt.format_array(self.categories[:num], None) - tail = fmt.format_array(self.categories[-num:], None) + head = fmt.format_array(self.categories[:num]) + tail = fmt.format_array(self.categories[-num:]) category_strs = head + ["..."] + tail else: - category_strs = fmt.format_array(self.categories, None) + category_strs = fmt.format_array(self.categories) # Strip all leading spaces, which format_array adds for columns... category_strs = [x.strip() for x in category_strs] diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 589e98f016f69..40cecb9ce0d4f 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -315,7 +315,7 @@ def _from_factorized(cls, values, original): def _formatter(self, boxed=False): def fmt(x): if isna(x): - return 'NaN' + return np.nan if boxed else 'NaN' return str(x) return fmt diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 4f628eff43167..01ab450cd5bf2 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -809,6 +809,10 @@ def value_counts(self, dropna=True): # Formatting + def _formatter(self, boxed=False): + # Defer to GenericArrayFormatter's formatter. + return None + def _format_data(self): # TODO: integrate with categorical and make generic diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4601d63f2d27e..3e3569e0bd2c7 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1041,7 +1041,7 @@ def _format_with_header(self, header, na_rep='NaN', **kwargs): result = result.tolist() else: - result = _trim_front(format_array(values, None, justify='left')) + result = _trim_front(format_array(values, justify='left')) return header + result def to_native_types(self, slicer=None, **kwargs): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 8655fb05f34e2..87567844ffab5 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -129,7 +129,7 @@ def _get_footer(self): return str(footer) def _get_formatted_values(self): - return format_array(self.categorical.get_values(), None, + return format_array(self.categorical.get_values(), float_format=None, na_rep=self.na_rep) def to_string(self): @@ -249,7 +249,7 @@ def _get_formatted_index(self): def _get_formatted_values(self): values_to_format = self.tr_series._formatting_values() - return format_array(values_to_format, None, + return format_array(values_to_format, float_format=self.float_format, na_rep=self.na_rep) def to_string(self): @@ -853,7 +853,7 @@ def _get_column_name_list(self): # Array formatters -def format_array(values, formatter, float_format=None, na_rep='NaN', +def format_array(values, formatter=None, float_format=None, na_rep='NaN', digits=None, space=None, justify='right', decimal='.', leading_space=None): """ @@ -883,6 +883,17 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', List[str] """ + if is_datetime64_dtype(values.dtype): + if not isinstance(values, DatetimeIndex): + values = DatetimeIndex(values) + + # we apply an optional formatter function passed as a parameter before + # applying additional formatting parameters. This allows EA to control + # formatting and also honour additional formatting options. + # We specify dtype and dispatch to GenericArrayFormatter. + if formatter is not None and callable(formatter): + values = np.array([formatter(x) for x in values], dtype=object) + if is_datetime64_dtype(values.dtype): fmt_klass = Datetime64Formatter elif is_datetime64tz_dtype(values): @@ -908,7 +919,7 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', digits = get_option("display.precision") fmt_obj = fmt_klass(values, digits=digits, na_rep=na_rep, - float_format=float_format, formatter=formatter, + float_format=float_format, space=space, justify=justify, decimal=decimal, leading_space=leading_space) @@ -917,14 +928,13 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', class GenericArrayFormatter: - def __init__(self, values, digits=7, formatter=None, na_rep='NaN', + def __init__(self, values, digits=7, na_rep='NaN', space=12, float_format=None, justify='right', decimal='.', quoting=None, fixed_width=True, leading_space=None): self.values = values self.digits = digits self.na_rep = na_rep self.space = space - self.formatter = formatter self.float_format = float_format self.justify = justify self.decimal = decimal @@ -946,9 +956,7 @@ def _format_strings(self): else: float_format = self.float_format - formatter = ( - self.formatter if self.formatter is not None else - (lambda x: pprint_thing(x, escape_chars=('\t', '\r', '\n')))) + formatter = lambda x: pprint_thing(x, escape_chars=('\t', '\r', '\n')) def _format(x): if self.na_rep is not None and is_scalar(x) and isna(x): @@ -1004,7 +1012,8 @@ class FloatArrayFormatter(GenericArrayFormatter): """ def __init__(self, *args, **kwargs): - GenericArrayFormatter.__init__(self, *args, **kwargs) + super().__init__(*args, **kwargs) + self.formatter = None # float_format is expected to be a string # formatter should be used to pass a function @@ -1062,9 +1071,6 @@ def get_result_as_array(self): the parameters given at initialisation, as a numpy array """ - if self.formatter is not None: - return np.array([self.formatter(x) for x in self.values]) - if self.fixed_width: threshold = get_option("display.chop_threshold") else: @@ -1154,7 +1160,7 @@ def _format_strings(self): class IntArrayFormatter(GenericArrayFormatter): def _format_strings(self): - formatter = self.formatter or (lambda x: '{x: d}'.format(x=x)) + formatter = lambda x: '{x: d}'.format(x=x) fmt_values = [formatter(x) for x in self.values] return fmt_values @@ -1171,12 +1177,6 @@ def _format_strings(self): values = self.values - if not isinstance(values, DatetimeIndex): - values = DatetimeIndex(values) - - if self.formatter is not None and callable(self.formatter): - return [self.formatter(x) for x in values] - fmt_values = format_array_from_datetime( values.asi8.ravel(), format=_get_format_datetime64_from_values(values, @@ -1337,9 +1337,8 @@ def _format_strings(self): values = self.values.astype(object) is_dates_only = _is_dates_only(values) - formatter = (self.formatter or - _get_format_datetime64(is_dates_only, - date_format=self.date_format)) + formatter = _get_format_datetime64(is_dates_only, + date_format=self.date_format) fmt_values = [formatter(x) for x in values] return fmt_values @@ -1353,9 +1352,8 @@ def __init__(self, values, nat_rep='NaT', box=False, **kwargs): self.box = box def _format_strings(self): - formatter = (self.formatter or - _get_format_timedelta64(self.values, nat_rep=self.nat_rep, - box=self.box)) + formatter = _get_format_timedelta64(self.values, nat_rep=self.nat_rep, + box=self.box) fmt_values = np.array([formatter(x) for x in self.values]) return fmt_values diff --git a/pandas/tests/arrays/test_integer.py b/pandas/tests/arrays/test_integer.py index 066eadc9b68bc..62e625c20915d 100644 --- a/pandas/tests/arrays/test_integer.py +++ b/pandas/tests/arrays/test_integer.py @@ -489,6 +489,14 @@ def test_frame_repr(data_missing): assert result == expected +def test_frame_to_string_na_rep(data_missing): + + df = pd.DataFrame({'A': data_missing}) + result = df.to_string(na_rep='foo') + expected = ' A\n0 foo\n1 1' + assert result == expected + + def test_conversions(data_missing): # astype to object series diff --git a/pandas/tests/io/formats/data/html/gh13021_expected_output.html b/pandas/tests/io/formats/data/html/gh13021_expected_output.html new file mode 100644 index 0000000000000..55d864d8a0d3d --- /dev/null +++ b/pandas/tests/io/formats/data/html/gh13021_expected_output.html @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + +
x
0a
1$0
2$10
3$3
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index edb7c2136825d..8714e7dab3f79 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -498,10 +498,10 @@ def test_to_string_with_formatters(self): ('object', lambda x: '-{x!s}-'.format(x=x))] result = df.to_string(formatters=dict(formatters)) result2 = df.to_string(formatters=list(zip(*formatters))[1]) - assert result == (' int float object\n' - '0 0x1 [ 1.0] -(1, 2)-\n' - '1 0x2 [ 2.0] -True-\n' - '2 0x3 [ 3.0] -False-') + assert result == (' int float object\n' + '0 0x1 [ 1.0] -(1, 2)-\n' + '1 0x2 [ 2.0] -True-\n' + '2 0x3 [ 3.0] -False-') assert result == result2 def test_to_string_with_datetime64_monthformatter(self): @@ -511,7 +511,7 @@ def test_to_string_with_datetime64_monthformatter(self): def format_func(x): return x.strftime('%Y-%m') result = x.to_string(formatters={'months': format_func}) - expected = 'months\n0 2016-01\n1 2016-02' + expected = 'months\n0 2016-01\n1 2016-02' assert result.strip() == expected def test_to_string_with_datetime64_hourformatter(self): @@ -523,7 +523,7 @@ def format_func(x): return x.strftime('%H:%M') result = x.to_string(formatters={'hod': format_func}) - expected = 'hod\n0 10:10\n1 12:12' + expected = 'hod\n0 10:10\n1 12:12' assert result.strip() == expected def test_to_string_with_formatters_unicode(self): @@ -2547,19 +2547,19 @@ class TestDatetime64Formatter: def test_mixed(self): x = Series([datetime(2013, 1, 1), datetime(2013, 1, 1, 12), pd.NaT]) - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "2013-01-01 00:00:00" assert result[1].strip() == "2013-01-01 12:00:00" def test_dates(self): x = Series([datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT]) - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "2013-01-01" assert result[1].strip() == "2013-01-02" def test_date_nanos(self): x = Series([Timestamp(200)]) - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "1970-01-01 00:00:00.000000200" def test_dates_display(self): @@ -2568,35 +2568,35 @@ def test_dates_display(self): # make sure that we are consistently display date formatting x = Series(date_range('20130101 09:00:00', periods=5, freq='D')) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-05 09:00:00" x = Series(date_range('20130101 09:00:00', periods=5, freq='s')) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "2013-01-01 09:00:00" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:04" x = Series(date_range('20130101 09:00:00', periods=5, freq='ms')) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "2013-01-01 09:00:00.000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.004" x = Series(date_range('20130101 09:00:00', periods=5, freq='us')) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "2013-01-01 09:00:00.000000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.000004" x = Series(date_range('20130101 09:00:00', periods=5, freq='N')) x.iloc[1] = np.nan - result = fmt.Datetime64Formatter(x).get_result() + result = fmt.format_array(x) assert result[0].strip() == "2013-01-01 09:00:00.000000000" assert result[1].strip() == "NaT" assert result[4].strip() == "2013-01-01 09:00:00.000000004" @@ -2607,9 +2607,8 @@ def test_datetime64formatter_yearmonth(self): def format_func(x): return x.strftime('%Y-%m') - formatter = fmt.Datetime64Formatter(x, formatter=format_func) - result = formatter.get_result() - assert result == ['2016-01', '2016-02'] + result = fmt.format_array(x, formatter=format_func) + assert result == [' 2016-01', ' 2016-02'] def test_datetime64formatter_hoursecond(self): @@ -2619,9 +2618,8 @@ def test_datetime64formatter_hoursecond(self): def format_func(x): return x.strftime('%H:%M') - formatter = fmt.Datetime64Formatter(x, formatter=format_func) - result = formatter.get_result() - assert result == ['10:10', '12:12'] + result = fmt.format_array(x, formatter=format_func) + assert result == [' 10:10', ' 12:12'] class TestNaTFormatting: diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 97d51f079fb2d..1a95cd06f8e11 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -624,6 +624,17 @@ def test_to_html_invalid_classes_type(classes): df.to_html(classes=classes) +def test_to_html_formatters_object_type(datapath): + # GH 13021 + def f(x): + return x if isinstance(x, str) else '${:,.0f}'.format(x) + + df = pd.DataFrame([['a'], [0], [10.4], [3]], columns=['x']) + result = df.to_html(formatters=dict(x=f)) + expected = expected_html(datapath, 'gh13021_expected_output') + assert result == expected + + def test_to_html_round_column_headers(): # GH 17280 df = DataFrame([1], columns=[0.55555]) diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index b9f28ec36d021..280bf8b84feca 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -120,11 +120,11 @@ def test_to_latex_with_formatters(self): expected = r"""\begin{tabular}{llrrl} \toprule -{} & datetime64 & float & int & object \\ +{} & datetime64 & float & int & object \\ \midrule -index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ -index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ -index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ +index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ +index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ +index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ \bottomrule \end{tabular} """