From 7e461a18d9f6928132afec6f48ce968b3e989ba6 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Mon, 3 Dec 2018 17:43:52 +0100 Subject: [PATCH 1/6] remove \n from docstring --- pandas/core/arrays/datetimes.py | 26 +++++++++++++------------- pandas/core/arrays/timedeltas.py | 16 ++++++++-------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index cfe3afcf3730a..b3df505d56d78 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -82,7 +82,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -1072,19 +1072,19 @@ def date(self): return tslib.ints_to_pydatetime(timestamps, box="date") - year = _field_accessor('year', 'Y', "\n The year of the datetime\n") + year = _field_accessor('year', 'Y', "The year of the datetime") month = _field_accessor('month', 'M', - "\n The month as January=1, December=12 \n") - day = _field_accessor('day', 'D', "\nThe days of the datetime\n") - hour = _field_accessor('hour', 'h', "\nThe hours of the datetime\n") - minute = _field_accessor('minute', 'm', "\nThe minutes of the datetime\n") - second = _field_accessor('second', 's', "\nThe seconds of the datetime\n") + "The month as January=1, December=12") + day = _field_accessor('day', 'D', "The days of the datetime") + hour = _field_accessor('hour', 'h', "The hours of the datetime") + minute = _field_accessor('minute', 'm', "The minutes of the datetime") + second = _field_accessor('second', 's', "The seconds of the datetime") microsecond = _field_accessor('microsecond', 'us', - "\nThe microseconds of the datetime\n") + "The microseconds of the datetime") nanosecond = _field_accessor('nanosecond', 'ns', - "\nThe nanoseconds of the datetime\n") + "The nanoseconds of the datetime") weekofyear = _field_accessor('weekofyear', 'woy', - "\nThe week ordinal of the year\n") + "The week ordinal of the year") 
week = weekofyear _dayofweek_doc = """ The day of the week with Monday=0, Sunday=6. @@ -1129,12 +1129,12 @@ def date(self): "The name of day in a week (ex: Friday)\n\n.. deprecated:: 0.23.0") dayofyear = _field_accessor('dayofyear', 'doy', - "\nThe ordinal day of the year\n") - quarter = _field_accessor('quarter', 'q', "\nThe quarter of the date\n") + "The ordinal day of the year") + quarter = _field_accessor('quarter', 'q', "The quarter of the date") days_in_month = _field_accessor( 'days_in_month', 'dim', - "\nThe number of days in the month\n") + "The number of days in the month") daysinmonth = days_in_month _is_month_doc = """ Indicates whether the date is the {first_or_last} day of the month. diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 830283d31a929..4afc9f5483c2a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -59,7 +59,7 @@ def f(self): return result f.__name__ = name - f.__doc__ = docstring + f.__doc__ = "\n{}\n".format(docstring) return property(f) @@ -684,16 +684,16 @@ def to_pytimedelta(self): return tslibs.ints_to_pytimedelta(self.asi8) days = _field_accessor("days", "days", - "\nNumber of days for each element.\n") + "Number of days for each element.") seconds = _field_accessor("seconds", "seconds", - "\nNumber of seconds (>= 0 and less than 1 day) " - "for each element.\n") + "Number of seconds (>= 0 and less than 1 day) " + "for each element.") microseconds = _field_accessor("microseconds", "microseconds", - "\nNumber of microseconds (>= 0 and less " - "than 1 second) for each element.\n") + "Number of microseconds (>= 0 and less " + "than 1 second) for each element.") nanoseconds = _field_accessor("nanoseconds", "nanoseconds", - "\nNumber of nanoseconds (>= 0 and less " - "than 1 microsecond) for each element.\n") + "Number of nanoseconds (>= 0 and less " + "than 1 microsecond) for each element.") @property def components(self): From 
ab7618e49208f0ca954c34fd1f0c724d29157ad6 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 13 Jan 2020 20:00:37 +0100 Subject: [PATCH 2/6] fix issue 18321 --- pandas/core/reshape/pivot.py | 9 +++++- pandas/tests/reshape/test_pivot.py | 47 ++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index b443ba142369c..8ea45fd59b6de 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -579,6 +579,8 @@ def crosstab( from pandas import DataFrame df = DataFrame(data, index=common_idx) + original_df_cols = df.columns + if values is None: df["__dummy__"] = 0 kwargs = {"aggfunc": len, "fill_value": 0} @@ -587,7 +589,7 @@ def crosstab( kwargs = {"aggfunc": aggfunc} table = df.pivot_table( - "__dummy__", + ["__dummy__"], index=rownames, columns=colnames, margins=margins, @@ -596,6 +598,11 @@ def crosstab( **kwargs, ) + # Remove extra level from `[__dummy__]` pivoting + if not table.empty: + cols_diff = df.columns.difference(original_df_cols)[0] + table = table[cols_diff] + # Post-process if normalize is not False: table = _normalize( diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 743fc50c87e96..36ec45709a692 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2549,6 +2549,53 @@ def test_crosstab_tuple_name(self, names): result = pd.crosstab(s1, s2) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "s1_data, s1_name, s2_data, s2_name, " + "expected_index, expected_column, expected_data", + [ + ( + [1, 2, 3], + ("a", "b"), + [1, 2, 3], + ("c", "d"), + [1, 2, 3], + [1, 2, 3], + np.eye(3, dtype=int), + ), + ([1, 1, 1], ("a", "b"), [0, 1, 2], ("c", "d"), [1], [0, 1, 2], [[1, 1, 1]]), + ( + [0, 1, 2], + ("a", "b"), + [1, 1, 1], + ("c", "d"), + [0, 1, 2], + [1], + [[1], [1], [1]], + ), + ], + ) + def test_crosstab_both_tuple_names( + self, + s1_data, + s1_name, 
+ s2_data, + s2_name, + expected_index, + expected_column, + expected_data, + ): + # GH 18321 + s1 = pd.Series(s1_data, name=s1_name) + s2 = pd.Series(s2_data, name=s2_name) + + expected = pd.DataFrame( + expected_data, + index=pd.Index(expected_index, name=s1_name), + columns=pd.Index(expected_column, name=s2_name), + ) + result = crosstab(s1, s2) + tm.assert_frame_equal(result, expected) + def test_crosstab_unsorted_order(self): df = pd.DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"]) result = pd.crosstab(df.index, [df.b, df.a]) From 6fd7abe241bf53e3d964d3a4082ca896c1f98468 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Mon, 13 Jan 2020 22:15:09 +0100 Subject: [PATCH 3/6] address type --- pandas/tests/reshape/test_pivot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 36ec45709a692..0fed700c5cb84 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2560,7 +2560,7 @@ def test_crosstab_tuple_name(self, names): ("c", "d"), [1, 2, 3], [1, 2, 3], - np.eye(3, dtype=int), + np.eye(3, dtype="int64"), ), ([1, 1, 1], ("a", "b"), [0, 1, 2], ("c", "d"), [1], [0, 1, 2], [[1, 1, 1]]), ( From aabad97b15c3758c4c694e916490d8458fcd5356 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 14 Jan 2020 08:20:02 +0100 Subject: [PATCH 4/6] remove parametrize --- pandas/tests/reshape/test_pivot.py | 46 ++++-------------------------- 1 file changed, 6 insertions(+), 40 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 0fed700c5cb84..1ef1a814d703c 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2549,49 +2549,15 @@ def test_crosstab_tuple_name(self, names): result = pd.crosstab(s1, s2) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "s1_data, s1_name, s2_data, s2_name, " - "expected_index, expected_column, expected_data", - [ 
- ( - [1, 2, 3], - ("a", "b"), - [1, 2, 3], - ("c", "d"), - [1, 2, 3], - [1, 2, 3], - np.eye(3, dtype="int64"), - ), - ([1, 1, 1], ("a", "b"), [0, 1, 2], ("c", "d"), [1], [0, 1, 2], [[1, 1, 1]]), - ( - [0, 1, 2], - ("a", "b"), - [1, 1, 1], - ("c", "d"), - [0, 1, 2], - [1], - [[1], [1], [1]], - ), - ], - ) - def test_crosstab_both_tuple_names( - self, - s1_data, - s1_name, - s2_data, - s2_name, - expected_index, - expected_column, - expected_data, - ): + def test_crosstab_both_tuple_names(self): # GH 18321 - s1 = pd.Series(s1_data, name=s1_name) - s2 = pd.Series(s2_data, name=s2_name) + s1 = pd.Series(range(3), name=("a", "b")) + s2 = pd.Series(range(3), name=("c", "d")) expected = pd.DataFrame( - expected_data, - index=pd.Index(expected_index, name=s1_name), - columns=pd.Index(expected_column, name=s2_name), + np.eye(3, dtype="int64"), + index=pd.Index(range(3), name=("a", "b")), + columns=pd.Index(range(3), name=("c", "d")), ) result = crosstab(s1, s2) tm.assert_frame_equal(result, expected) From 6cbf0c204f141178f86bae9472cbac14434f4194 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Tue, 14 Jan 2020 08:25:29 +0100 Subject: [PATCH 5/6] add whatsnew --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c3ee72f6442fc..5f000825da55b 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -140,7 +140,7 @@ Reshaping ^^^^^^^^^ - -- +- Bug in :func:`crosstab` where the output kept a dummy MultiIndex as columns when both inputs were Series with tuple names 
(:issue:`18321`) Sparse ^^^^^^ From 768c4992e7d2da3359152bfd763e0b9ec5f71795 Mon Sep 17 00:00:00 2001 From: Kaiqi Date: Wed, 15 Jan 2020 08:47:43 +0100 Subject: [PATCH 6/6] update comment --- pandas/core/reshape/pivot.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index df84de7cc09b7..13df39cc0011b 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -600,7 +600,8 @@ def crosstab( **kwargs, ) - # Remove extra level from `[__dummy__]` pivoting + # GH18321, after pivoting, an extra top level of column index of `__dummy__` is + # created, and this extra level should not be included in the further steps if not table.empty: cols_diff = df.columns.difference(original_df_cols)[0] table = table[cols_diff]