diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 2f8cb346935a9..684ab0fa38d22 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -101,6 +101,7 @@ Other enhancements - :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`42273`) - :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`) - :meth:`read_table` now supports the argument ``storage_options`` (:issue:`39167`) +- Methods that rely on hashmap-based algorithms, such as :meth:`DataFrameGroupBy.value_counts`, :meth:`DataFrameGroupBy.count` and :func:`factorize`, no longer ignore the imaginary component of complex numbers (:issue:`17927`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index 60279395724ff..fdec60a84a708 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -150,6 +150,8 @@ def diff_2d( ) -> None: ... def ensure_platform_int(arr: object) -> npt.NDArray[np.intp]: ... def ensure_object(arr: object) -> npt.NDArray[np.object_]: ... +def ensure_complex64(arr: object, copy=True) -> npt.NDArray[np.complex64]: ... +def ensure_complex128(arr: object, copy=True) -> npt.NDArray[np.complex128]: ... def ensure_float64(arr: object, copy=True) -> npt.NDArray[np.float64]: ... def ensure_float32(arr: object, copy=True) -> npt.NDArray[np.float32]: ... def ensure_int8(arr: object, copy=True) -> npt.NDArray[np.int8]: ... 
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 167fac257075c..b4200456e2c3d 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -15,6 +15,8 @@ import numpy as np cimport numpy as cnp from numpy cimport ( + NPY_COMPLEX64, + NPY_COMPLEX128, NPY_FLOAT32, NPY_FLOAT64, NPY_INT8, diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index 64e8bdea4672c..87130906ef28b 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -47,6 +47,8 @@ dtypes = [('float64', 'FLOAT64', 'float64'), ('uint16', 'UINT16', 'uint16'), ('uint32', 'UINT32', 'uint32'), ('uint64', 'UINT64', 'uint64'), + ('complex64', 'COMPLEX64', 'complex64'), + ('complex128', 'COMPLEX128', 'complex128') # ('platform_int', 'INT', 'int_'), # ('object', 'OBJECT', 'object_'), ] diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 393fe08f7277c..a9c2b31849425 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -97,6 +97,8 @@ def ensure_float(arr): ensure_int32 = algos.ensure_int32 ensure_int16 = algos.ensure_int16 ensure_int8 = algos.ensure_int8 +ensure_complex64 = algos.ensure_complex64 +ensure_complex128 = algos.ensure_complex128 ensure_platform_int = algos.ensure_platform_int ensure_object = algos.ensure_object diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index a714abd461461..f26f18c9c20a0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1009,6 +1009,30 @@ def test_groupby_complex(): tm.assert_series_equal(result, expected) +def test_groupby_complex_numbers(): + # GH 17927 + df = DataFrame( + [ + {"a": 1, "b": 1 + 1j}, + {"a": 1, "b": 1 + 2j}, + {"a": 4, "b": 1}, + ] + ) + expected = DataFrame( + np.array([1, 1, 1], dtype=np.int64), + index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], dtype="object", name="b"), + columns=Index(["a"], dtype="object"), + ) + 
result = df.groupby("b", sort=False).count() + tm.assert_frame_equal(result, expected) + + # Sorted by real part, then imaginary part (all real parts are equal here) + # Complex Index dtype is cast to object + expected.index = Index([(1 + 0j), (1 + 1j), (1 + 2j)], dtype="object", name="b") + result = df.groupby("b", sort=True).count() + tm.assert_frame_equal(result, expected) + + def test_groupby_series_indexed_differently(): s1 = Series( [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7], diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index 1fd8b0f8b837a..ee517a667d832 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -8,6 +8,7 @@ from pandas import ( DatetimeIndex, MultiIndex, + Series, ) import pandas._testing as tm @@ -299,6 +300,37 @@ def test_duplicated_drop_duplicates(): tm.assert_index_equal(idx.drop_duplicates(keep=False), expected) +@pytest.mark.parametrize( + "dtype", + [ + np.complex64, + np.complex128, + ], +) +def test_duplicated_series_complex_numbers(dtype): + # GH 17927 + expected = Series( + [False, False, False, True, False, False, False, True, False, True], + dtype=bool, + ) + result = Series( + [ + np.nan + np.nan * 1j, + 0, + 1j, + 1j, + 1, + 1 + 1j, + 1 + 2j, + 1 + 1j, + np.nan, + np.nan + np.nan * 1j, + ], + dtype=dtype, + ).duplicated() + tm.assert_series_equal(result, expected) + + def test_multi_drop_duplicates_pos_args_deprecation(): # GH#41485 idx = MultiIndex.from_arrays([[1, 2, 3, 1], [1, 2, 3, 1]]) diff --git a/pandas/tests/indexes/period/methods/test_factorize.py b/pandas/tests/indexes/period/methods/test_factorize.py index 7c9367a1011a2..9e297d6caca27 100644 --- a/pandas/tests/indexes/period/methods/test_factorize.py +++ b/pandas/tests/indexes/period/methods/test_factorize.py @@ -1,6 +1,9 @@ import numpy as np -from pandas import PeriodIndex +from pandas import ( + PeriodIndex, + factorize, +) import pandas._testing as tm @@ -35,3 +38,15 @@ def 
test_factorize(self): arr, idx = idx2.factorize() tm.assert_numpy_array_equal(arr, exp_arr) tm.assert_index_equal(idx, exp_idx) + + def test_factorize_complex(self): + # GH 17927 + array = [1, 2, 2 + 1j] + labels, uniques = factorize(array) + + expected_labels = np.array([0, 1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(labels, expected_labels) + + # Should return a complex dtype in the future + expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=object) + tm.assert_numpy_array_equal(uniques, expected_uniques) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index c0c1c2f057c96..513b9af18d2b6 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1487,3 +1487,50 @@ def test_mode_boolean_with_na(self): result = ser.mode() expected = Series({0: True}, dtype="boolean") tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "array,expected,dtype", + [ + ( + [0, 1j, 1, 1, 1 + 1j, 1 + 2j], + Series([1], dtype=np.complex128), + np.complex128, + ), + ( + [0, 1j, 1, 1, 1 + 1j, 1 + 2j], + Series([1], dtype=np.complex64), + np.complex64, + ), + ( + [1 + 1j, 2j, 1 + 1j], + Series([1 + 1j], dtype=np.complex128), + np.complex128, + ), + ], + ) + def test_single_mode_value_complex(self, array, expected, dtype): + result = Series(array, dtype=dtype).mode() + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "array,expected,dtype", + [ + ( + # no modes + [0, 1j, 1, 1 + 1j, 1 + 2j], + Series([0j, 1j, 1 + 0j, 1 + 1j, 1 + 2j], dtype=np.complex128), + np.complex128, + ), + ( + [1 + 1j, 2j, 1 + 1j, 2j, 3], + Series([2j, 1 + 1j], dtype=np.complex64), + np.complex64, + ), + ], + ) + def test_multimode_complex(self, array, expected, dtype): + # GH 17927 + # mode tries to sort multimodal series. 
+ # Complex numbers are sorted by real part, then imaginary part + result = Series(array, dtype=dtype).mode() + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index d3a3434872826..f769c08a512ef 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -186,3 +186,18 @@ def test_isin_large_series_mixed_dtypes_and_nan(): result = ser.isin({"foo", "bar"}) expected = Series([False] * 3 * 1_000_000) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "array,expected", + [ + ( + [0, 1j, 1j, 1, 1 + 1j, 1 + 2j, 1 + 1j], + Series([False, True, True, False, True, True, True], dtype=bool), + ) + ], +) +def test_isin_complex_numbers(array, expected): + # GH 17927 + result = Series(array).isin([1j, 1 + 1j, 1 + 2j]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_value_counts.py b/pandas/tests/series/methods/test_value_counts.py index e707c3f4023df..c914dba75dc35 100644 --- a/pandas/tests/series/methods/test_value_counts.py +++ b/pandas/tests/series/methods/test_value_counts.py @@ -207,3 +207,22 @@ def test_value_counts_bool_with_nan(self, ser, dropna, exp): # GH32146 out = ser.value_counts(dropna=dropna) tm.assert_series_equal(out, exp) + + @pytest.mark.parametrize( + "input_array,expected", + [ + ( + [1 + 1j, 1 + 1j, 1, 3j, 3j, 3j], + Series([3, 2, 1], index=pd.Index([3j, 1 + 1j, 1], dtype=np.complex128)), + ), + ( + [1 + 1j, 1 + 1j, 1, 3j, 3j, 3j], + Series([3, 2, 1], index=pd.Index([3j, 1 + 1j, 1], dtype=np.complex64)), + ), + ], + ) + def test_value_counts_complex_numbers(self, input_array, expected): + # GH 17927 + # Complex Index dtype is cast to object + result = Series(input_array).value_counts() + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ba587e28b8c3d..5488c076554fd 100644 --- a/pandas/tests/test_algos.py +++ 
b/pandas/tests/test_algos.py @@ -1513,6 +1513,21 @@ def test_unique_tuples(self, arr, uniques): result = pd.unique(arr) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.parametrize( + "array,expected", + [ + ( + [1 + 1j, 0, 1, 1j, 1 + 2j, 1 + 2j], + # Should return a complex dtype in the future + np.array([(1 + 1j), 0j, (1 + 0j), 1j, (1 + 2j)], dtype=object), + ) + ], + ) + def test_unique_complex_numbers(self, array, expected): + # GH 17927 + result = pd.unique(array) + tm.assert_numpy_array_equal(result, expected) + class TestHashTable: def test_string_hashtable_set_item_signature(self):