From 5169f5c218ad7885dcbc740df7667d424b82beca Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 17 Jan 2021 16:26:07 -0500 Subject: [PATCH 1/8] Includes the reading from Excel example as suggested via #38990 --- .../comparison_with_spreadsheets.rst | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst index 13029173b2e65..aecd90a071129 100644 --- a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst +++ b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst @@ -89,13 +89,6 @@ Both `Excel ` can import data from various sources in various formats. -Excel files -''''''''''' - -Excel opens `various Excel file formats `_ -by double-clicking them, or using `the Open menu `_. -In pandas, you use :ref:`special methods for reading and writing from/to Excel files `. - CSV ''' @@ -125,6 +118,27 @@ would be: # alternatively, read_table is an alias to read_csv with tab delimiter tips = pd.read_table("tips.csv", header=None) +Excel files +''''''''''' + +Excel opens `various Excel file formats `_ +by double-clicking them, or using `the Open menu `_. +In pandas, you use :ref:`special methods for reading and writing from/to Excel files `. + +Let's first :ref:`create a new Excel file ` based on the ``tips`` dataframe in the above example: + +.. code-block:: python + + tips.to_excel("./tips.xlsx") + +Should you wish to subsequently access the data in the ``tips.xlsx`` file, you can read it into your module using: + +.. code-block:: python + + tips_df = read_excel("./tips.xlsx", header=None) + +You have just read in an Excel file using pandas! 
+ Limiting output ~~~~~~~~~~~~~~~ From 7037920d1307fa77fe48836260627f15a3124f90 Mon Sep 17 00:00:00 2001 From: Brian Date: Sun, 17 Jan 2021 17:01:07 -0500 Subject: [PATCH 2/8] Updated Comparison to Excel documentation with examples of reading Excel file, and fixed typo via #38990 --- .../getting_started/comparison/comparison_with_spreadsheets.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst index aecd90a071129..7f864ce80fe4e 100644 --- a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst +++ b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst @@ -135,7 +135,7 @@ Should you wish to subsequently access the data in the ``tips.xlsx`` file, you c .. code-block:: python - tips_df = read_excel("./tips.xlsx", header=None) + tips_df = pd.read_excel("./tips.xlsx", header=None) You have just read in an Excel file using pandas! From d8df1fc6d5d0b773e4687b04b1543b3c01df40b5 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 21 Jan 2021 12:37:50 -0500 Subject: [PATCH 3/8] Update doc/source/getting_started/comparison/comparison_with_spreadsheets.rst Co-authored-by: Aidan Feldman --- .../getting_started/comparison/comparison_with_spreadsheets.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst index 7f864ce80fe4e..55f999c099e23 100644 --- a/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst +++ b/doc/source/getting_started/comparison/comparison_with_spreadsheets.rst @@ -135,7 +135,7 @@ Should you wish to subsequently access the data in the ``tips.xlsx`` file, you c .. 
code-block:: python - tips_df = pd.read_excel("./tips.xlsx", header=None) + tips_df = pd.read_excel("./tips.xlsx", index_col=0) You have just read in an Excel file using pandas! From 51b4353855843e3fe51835d84854223dfc95c171 Mon Sep 17 00:00:00 2001 From: Brian Date: Sat, 3 Apr 2021 15:50:26 -0400 Subject: [PATCH 4/8] TST: added new test to test_reductions.py::TestDataFrameAnalytics::test_idxmax_mixed_dtype for #40346 --- pandas/tests/frame/test_reductions.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 1304e861f948e..9e4519a5cf913 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1025,6 +1025,17 @@ def test_idxmax_mixed_dtype(self): expected = Series([0, 2, 1, 2], index=[1, 2, 3, 4]) tm.assert_series_equal(result, expected) + # with a column of NaNs? + df[5] = [0] + [np.NaN] * 2 + + result = df.idxmax() + expected = Series([1, 0, 2, 0, 0], index=[1, 2, 3, 4, 5]) + tm.assert_series_equal(result, expected) + + result = df.idxmin() + expected = Series([0, 2, 1, 2, 0], index=[1, 2, 3, 4, 5]) + tm.assert_series_equal(result, expected) + def test_idxmax_dt64_multicolumn_axis1(self): dti = date_range("2016-01-01", periods=3) df = DataFrame({3: dti, 4: dti[::-1]}) From e5056afa509e22d4b2622371bc71de19dbe9780b Mon Sep 17 00:00:00 2001 From: Brian Date: Tue, 6 Apr 2021 00:20:54 -0400 Subject: [PATCH 5/8] BUG: included new testcase to ensure resolution of bug highlighted in #40346 --- pandas/tests/frame/test_reductions.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 9e4519a5cf913..cd0da3e6d7341 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1025,16 +1025,26 @@ def test_idxmax_mixed_dtype(self): expected = Series([0, 2, 1, 2], index=[1, 2, 3, 4]) 
tm.assert_series_equal(result, expected) - # with a column of NaNs? - df[5] = [0] + [np.NaN] * 2 + # with convert dtypes + df2 = DataFrame( + { + "teamId": [100, 100, 100, 200, 200, 200], + "value": [0, 0, 0, 1, 2, 0], + } + ) + df2 = df2.convert_dtypes() - result = df.idxmax() - expected = Series([1, 0, 2, 0, 0], index=[1, 2, 3, 4, 5]) - tm.assert_series_equal(result, expected) + result = df2.groupby("teamId").idxmax() + expected = DataFrame( + {"value": [0, 4]}, index=Index([100, 200], dtype="object", name="teamId") + ) + tm.assert_frame_equal(result, expected) - result = df.idxmin() - expected = Series([0, 2, 1, 2, 0], index=[1, 2, 3, 4, 5]) - tm.assert_series_equal(result, expected) + result = df2.groupby("teamId").idxmin() + expected = DataFrame( + {"value": [0, 5]}, index=Index([100, 200], dtype="object", name="teamId") + ) + tm.assert_frame_equal(result, expected) def test_idxmax_dt64_multicolumn_axis1(self): dti = date_range("2016-01-01", periods=3) From 9314476fe8787f3a176e02e71e34b8c515da58c1 Mon Sep 17 00:00:00 2001 From: Brian Date: Tue, 6 Apr 2021 18:52:52 -0400 Subject: [PATCH 6/8] wrote new parametrized tests for #40346 --- pandas/tests/frame/test_reductions.py | 33 +++++++++++++++++++++------ 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index cd0da3e6d7341..f25e53afc0d49 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1025,24 +1025,43 @@ def test_idxmax_mixed_dtype(self): expected = Series([0, 2, 1, 2], index=[1, 2, 3, 4]) tm.assert_series_equal(result, expected) - # with convert dtypes + @pytest.mark.parametrize( + "ID, value, expected_index, expected_value", + [([100, 100, 100, 200, 200, 200], [0, 0, 0, 1, 2, 0], [100, 200], [0, 4])], + ) + def test_idxmax_convert_dtypes(self, ID, value, expected_index, expected_value): df2 = DataFrame( { - "teamId": [100, 100, 100, 200, 200, 200], - "value": [0, 0, 0, 
1, 2, 0], + "ID": ID, + "value": value, } ) df2 = df2.convert_dtypes() - result = df2.groupby("teamId").idxmax() + result = df2.groupby("ID").idxmax() expected = DataFrame( - {"value": [0, 4]}, index=Index([100, 200], dtype="object", name="teamId") + {"value": expected_value}, + index=Index(expected_index, dtype="object", name="ID"), ) tm.assert_frame_equal(result, expected) - result = df2.groupby("teamId").idxmin() + @pytest.mark.parametrize( + "ID, value, expected_index, expected_value", + [([100, 100, 100, 200, 200, 200], [0, 0, 0, 1, 2, 0], [100, 200], [0, 5])], + ) + def test_idxmin_convert_dtypes(self, ID, value, expected_index, expected_value): + df2 = DataFrame( + { + "ID": ID, + "value": value, + } + ) + df2 = df2.convert_dtypes() + + result = df2.groupby("ID").idxmin() expected = DataFrame( - {"value": [0, 5]}, index=Index([100, 200], dtype="object", name="teamId") + {"value": expected_value}, + index=Index(expected_index, dtype="object", name="ID"), ) tm.assert_frame_equal(result, expected) From f5eb1bb52d8b181345bd49d8515b159ea6250742 Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 7 Apr 2021 20:12:50 -0400 Subject: [PATCH 7/8] TST adjusted parametrization of test as discussed for #40346 --- pandas/tests/frame/test_reductions.py | 38 +++++++-------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index f25e53afc0d49..1775dc54176ad 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1026,42 +1026,24 @@ def test_idxmax_mixed_dtype(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "ID, value, expected_index, expected_value", - [([100, 100, 100, 200, 200, 200], [0, 0, 0, 1, 2, 0], [100, 200], [0, 4])], + "func_name, expected_value", + [("idxmax", [0, 4]), ("idxmin", [0, 5])], ) - def test_idxmax_convert_dtypes(self, ID, value, expected_index, expected_value): - df2 = DataFrame( + 
def test_idxmax_idxmin_convert_dtypes(self, func_name, expected_value): + df = DataFrame( { - "ID": ID, - "value": value, + "ID": [100, 100, 100, 200, 200, 200], + "value": [0, 0, 0, 1, 2, 0], } ) - df2 = df2.convert_dtypes() - - result = df2.groupby("ID").idxmax() - expected = DataFrame( - {"value": expected_value}, - index=Index(expected_index, dtype="object", name="ID"), - ) - tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "ID, value, expected_index, expected_value", - [([100, 100, 100, 200, 200, 200], [0, 0, 0, 1, 2, 0], [100, 200], [0, 5])], - ) - def test_idxmin_convert_dtypes(self, ID, value, expected_index, expected_value): - df2 = DataFrame( - { - "ID": ID, - "value": value, - } - ) - df2 = df2.convert_dtypes() + df = df.convert_dtypes().groupby("ID") + func = getattr(df, func_name) - result = df2.groupby("ID").idxmin() + result = func() expected = DataFrame( {"value": expected_value}, - index=Index(expected_index, dtype="object", name="ID"), + index=Index([100, 200], dtype="object", name="ID"), ) tm.assert_frame_equal(result, expected) From 1403c586da1504210d91eda50fe7d73cbab9cda7 Mon Sep 17 00:00:00 2001 From: Brian Date: Tue, 13 Apr 2021 20:52:57 -0400 Subject: [PATCH 8/8] TST adjustment made to replace convert_dtypes in test #40346 --- pandas/tests/frame/test_reductions.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 1775dc54176ad..93ebe0aba3bc6 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1026,21 +1026,21 @@ def test_idxmax_mixed_dtype(self): tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "func_name, expected_value", + "op, expected_value", [("idxmax", [0, 4]), ("idxmin", [0, 5])], ) - def test_idxmax_idxmin_convert_dtypes(self, func_name, expected_value): + def test_idxmax_idxmin_convert_dtypes(self, op, expected_value): + # GH 40346 
df = DataFrame( { "ID": [100, 100, 100, 200, 200, 200], "value": [0, 0, 0, 1, 2, 0], - } + }, + dtype="Int64", ) + df = df.groupby("ID") - df = df.convert_dtypes().groupby("ID") - func = getattr(df, func_name) - - result = func() + result = getattr(df, op)() expected = DataFrame( {"value": expected_value}, index=Index([100, 200], dtype="object", name="ID"),