Skip to content

Commit c182d21

Browse files
authored
TST/CLN: misplaced factorize tests (#37411)
1 parent 8da70fc commit c182d21

File tree

2 files changed

+73
-70
lines changed

2 files changed

+73
-70
lines changed

pandas/tests/base/test_factorize.py

-41
This file was deleted.

pandas/tests/test_algos.py

+73 -29
Original file line number | Diff line number | Diff line change
@@ -23,11 +23,21 @@
2323
from pandas import (
2424
Categorical,
2525
CategoricalIndex,
26+
DataFrame,
2627
DatetimeIndex,
2728
Index,
2829
IntervalIndex,
30+
MultiIndex,
31+
NaT,
32+
Period,
33+
PeriodIndex,
2934
Series,
35+
Timedelta,
3036
Timestamp,
37+
date_range,
38+
timedelta_range,
39+
to_datetime,
40+
to_timedelta,
3141
)
3242
import pandas._testing as tm
3343
import pandas.core.algorithms as algos
@@ -36,6 +46,40 @@
3646

3747

3848
class TestFactorize:
49+
@pytest.mark.parametrize("sort", [True, False])
50+
def test_factorize(self, index_or_series_obj, sort):
51+
obj = index_or_series_obj
52+
result_codes, result_uniques = obj.factorize(sort=sort)
53+
54+
constructor = Index
55+
if isinstance(obj, MultiIndex):
56+
constructor = MultiIndex.from_tuples
57+
expected_uniques = constructor(obj.unique())
58+
59+
if sort:
60+
expected_uniques = expected_uniques.sort_values()
61+
62+
# construct an integer ndarray so that
63+
# `expected_uniques.take(expected_codes)` is equal to `obj`
64+
expected_uniques_list = list(expected_uniques)
65+
expected_codes = [expected_uniques_list.index(val) for val in obj]
66+
expected_codes = np.asarray(expected_codes, dtype=np.intp)
67+
68+
tm.assert_numpy_array_equal(result_codes, expected_codes)
69+
tm.assert_index_equal(result_uniques, expected_uniques)
70+
71+
def test_series_factorize_na_sentinel_none(self):
72+
# GH#35667
73+
values = np.array([1, 2, 1, np.nan])
74+
ser = Series(values)
75+
codes, uniques = ser.factorize(na_sentinel=None)
76+
77+
expected_codes = np.array([0, 1, 0, 2], dtype=np.intp)
78+
expected_uniques = Index([1.0, 2.0, np.nan])
79+
80+
tm.assert_numpy_array_equal(codes, expected_codes)
81+
tm.assert_index_equal(uniques, expected_uniques)
82+
3983
def test_basic(self):
4084

4185
codes, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"])
@@ -111,34 +155,34 @@ def test_datelike(self):
111155
tm.assert_index_equal(uniques, exp)
112156

113157
# period
114-
v1 = pd.Period("201302", freq="M")
115-
v2 = pd.Period("201303", freq="M")
158+
v1 = Period("201302", freq="M")
159+
v2 = Period("201303", freq="M")
116160
x = Series([v1, v1, v1, v2, v2, v1])
117161

118162
# periods are not 'sorted' as they are converted back into an index
119163
codes, uniques = algos.factorize(x)
120164
exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
121165
tm.assert_numpy_array_equal(codes, exp)
122-
tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))
166+
tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))
123167

124168
codes, uniques = algos.factorize(x, sort=True)
125169
exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
126170
tm.assert_numpy_array_equal(codes, exp)
127-
tm.assert_index_equal(uniques, pd.PeriodIndex([v1, v2]))
171+
tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))
128172

129173
# GH 5986
130-
v1 = pd.to_timedelta("1 day 1 min")
131-
v2 = pd.to_timedelta("1 day")
174+
v1 = to_timedelta("1 day 1 min")
175+
v2 = to_timedelta("1 day")
132176
x = Series([v1, v2, v1, v1, v2, v2, v1])
133177
codes, uniques = algos.factorize(x)
134178
exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp)
135179
tm.assert_numpy_array_equal(codes, exp)
136-
tm.assert_index_equal(uniques, pd.to_timedelta([v1, v2]))
180+
tm.assert_index_equal(uniques, to_timedelta([v1, v2]))
137181

138182
codes, uniques = algos.factorize(x, sort=True)
139183
exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp)
140184
tm.assert_numpy_array_equal(codes, exp)
141-
tm.assert_index_equal(uniques, pd.to_timedelta([v2, v1]))
185+
tm.assert_index_equal(uniques, to_timedelta([v2, v1]))
142186

143187
def test_factorize_nan(self):
144188
# nan should map to na_sentinel, not reverse_indexer[na_sentinel]
@@ -241,7 +285,7 @@ def test_string_factorize(self, writable):
241285
tm.assert_numpy_array_equal(uniques, expected_uniques)
242286

243287
def test_object_factorize(self, writable):
244-
data = np.array(["a", "c", None, np.nan, "a", "b", pd.NaT, "c"], dtype=object)
288+
data = np.array(["a", "c", None, np.nan, "a", "b", NaT, "c"], dtype=object)
245289
data.setflags(write=writable)
246290
expected_codes = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
247291
expected_uniques = np.array(["a", "c", "b"], dtype=object)
@@ -404,7 +448,7 @@ def test_object_refcount_bug(self):
404448

405449
def test_on_index_object(self):
406450

407-
mindex = pd.MultiIndex.from_arrays(
451+
mindex = MultiIndex.from_arrays(
408452
[np.arange(5).repeat(5), np.tile(np.arange(5), 5)]
409453
)
410454
expected = mindex.values
@@ -456,7 +500,7 @@ def test_datetime64_dtype_array_returned(self):
456500
dtype="M8[ns]",
457501
)
458502

459-
dt_index = pd.to_datetime(
503+
dt_index = to_datetime(
460504
[
461505
"2015-01-03T00:00:00.000000000",
462506
"2015-01-01T00:00:00.000000000",
@@ -493,7 +537,7 @@ def test_timedelta64_dtype_array_returned(self):
493537
# GH 9431
494538
expected = np.array([31200, 45678, 10000], dtype="m8[ns]")
495539

496-
td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
540+
td_index = to_timedelta([31200, 45678, 31200, 10000, 45678])
497541
result = algos.unique(td_index)
498542
tm.assert_numpy_array_equal(result, expected)
499543
assert result.dtype == expected.dtype
@@ -772,7 +816,7 @@ def test_basic(self):
772816

773817
def test_i8(self):
774818

775-
arr = pd.date_range("20130101", periods=3).values
819+
arr = date_range("20130101", periods=3).values
776820
result = algos.isin(arr, [arr[0]])
777821
expected = np.array([True, False, False])
778822
tm.assert_numpy_array_equal(result, expected)
@@ -785,7 +829,7 @@ def test_i8(self):
785829
expected = np.array([True, True, False])
786830
tm.assert_numpy_array_equal(result, expected)
787831

788-
arr = pd.timedelta_range("1 day", periods=3).values
832+
arr = timedelta_range("1 day", periods=3).values
789833
result = algos.isin(arr, [arr[0]])
790834
expected = np.array([True, False, False])
791835
tm.assert_numpy_array_equal(result, expected)
@@ -799,7 +843,7 @@ def test_i8(self):
799843
tm.assert_numpy_array_equal(result, expected)
800844

801845
def test_large(self):
802-
s = pd.date_range("20000101", periods=2000000, freq="s").values
846+
s = date_range("20000101", periods=2000000, freq="s").values
803847
result = algos.isin(s, s[0:2])
804848
expected = np.zeros(len(s), dtype=bool)
805849
expected[0] = True
@@ -950,27 +994,27 @@ def test_different_nans_as_float64(self):
950994
def test_isin_int_df_string_search(self):
951995
"""Comparing df with int`s (1,2) with a string at isin() ("1")
952996
-> should not match values because int 1 is not equal str 1"""
953-
df = pd.DataFrame({"values": [1, 2]})
997+
df = DataFrame({"values": [1, 2]})
954998
result = df.isin(["1"])
955-
expected_false = pd.DataFrame({"values": [False, False]})
999+
expected_false = DataFrame({"values": [False, False]})
9561000
tm.assert_frame_equal(result, expected_false)
9571001

9581002
@pytest.mark.xfail(reason="problem related with issue #34125")
9591003
def test_isin_nan_df_string_search(self):
9601004
"""Comparing df with nan value (np.nan,2) with a string at isin() ("NaN")
9611005
-> should not match values because np.nan is not equal str NaN"""
962-
df = pd.DataFrame({"values": [np.nan, 2]})
1006+
df = DataFrame({"values": [np.nan, 2]})
9631007
result = df.isin(["NaN"])
964-
expected_false = pd.DataFrame({"values": [False, False]})
1008+
expected_false = DataFrame({"values": [False, False]})
9651009
tm.assert_frame_equal(result, expected_false)
9661010

9671011
@pytest.mark.xfail(reason="problem related with issue #34125")
9681012
def test_isin_float_df_string_search(self):
9691013
"""Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245")
9701014
-> should not match values because float 1.4245 is not equal str 1.4245"""
971-
df = pd.DataFrame({"values": [1.4245, 2.32441]})
1015+
df = DataFrame({"values": [1.4245, 2.32441]})
9721016
result = df.isin(["1.4245"])
973-
expected_false = pd.DataFrame({"values": [False, False]})
1017+
expected_false = DataFrame({"values": [False, False]})
9741018
tm.assert_frame_equal(result, expected_false)
9751019

9761020

@@ -1016,8 +1060,8 @@ def test_value_counts_dtypes(self):
10161060
algos.value_counts(["1", 1], bins=1)
10171061

10181062
def test_value_counts_nat(self):
1019-
td = Series([np.timedelta64(10000), pd.NaT], dtype="timedelta64[ns]")
1020-
dt = pd.to_datetime(["NaT", "2014-01-01"])
1063+
td = Series([np.timedelta64(10000), NaT], dtype="timedelta64[ns]")
1064+
dt = to_datetime(["NaT", "2014-01-01"])
10211065

10221066
for s in [td, dt]:
10231067
vc = algos.value_counts(s)
@@ -1051,7 +1095,7 @@ def test_value_counts_datetime_outofbounds(self):
10511095
tm.assert_series_equal(res, exp)
10521096

10531097
# GH 12424
1054-
res = pd.to_datetime(Series(["2362-01-01", np.nan]), errors="ignore")
1098+
res = to_datetime(Series(["2362-01-01", np.nan]), errors="ignore")
10551099
exp = Series(["2362-01-01", np.nan], dtype=object)
10561100
tm.assert_series_equal(res, exp)
10571101

@@ -1323,9 +1367,9 @@ def test_datetime_likes(self):
13231367
cases = [
13241368
np.array([Timestamp(d) for d in dt]),
13251369
np.array([Timestamp(d, tz="US/Eastern") for d in dt]),
1326-
np.array([pd.Period(d, freq="D") for d in dt]),
1370+
np.array([Period(d, freq="D") for d in dt]),
13271371
np.array([np.datetime64(d) for d in dt]),
1328-
np.array([pd.Timedelta(d) for d in td]),
1372+
np.array([Timedelta(d) for d in td]),
13291373
]
13301374

13311375
exp_first = np.array(
@@ -1530,7 +1574,7 @@ def test_hashtable_unique(self, htable, tm_dtype, writable):
15301574
s.loc[500] = np.nan
15311575
elif htable == ht.PyObjectHashTable:
15321576
# use different NaN types for object column
1533-
s.loc[500:502] = [np.nan, None, pd.NaT]
1577+
s.loc[500:502] = [np.nan, None, NaT]
15341578

15351579
# create duplicated selection
15361580
s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
@@ -1570,7 +1614,7 @@ def test_hashtable_factorize(self, htable, tm_dtype, writable):
15701614
s.loc[500] = np.nan
15711615
elif htable == ht.PyObjectHashTable:
15721616
# use different NaN types for object column
1573-
s.loc[500:502] = [np.nan, None, pd.NaT]
1617+
s.loc[500:502] = [np.nan, None, NaT]
15741618

15751619
# create duplicated selection
15761620
s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
@@ -2307,7 +2351,7 @@ def test_diff_datetimelike_nat(self, dtype):
23072351
tm.assert_numpy_array_equal(result, expected.T)
23082352

23092353
def test_diff_ea_axis(self):
2310-
dta = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")._data
2354+
dta = date_range("2016-01-01", periods=3, tz="US/Pacific")._data
23112355

23122356
msg = "cannot diff DatetimeArray on axis=1"
23132357
with pytest.raises(ValueError, match=msg):

0 commit comments

Comments (0)