@@ -1,3 +1,4 @@
+import collections
 from datetime import datetime, timedelta
 from io import StringIO
 import sys
@@ -15,7 +16,6 @@
     is_datetime64_dtype,
     is_datetime64tz_dtype,
     is_object_dtype,
-    is_period_dtype,
     needs_i8_conversion,
 )
 
@@ -26,11 +26,9 @@
     Index,
     Interval,
     IntervalIndex,
-    PeriodIndex,
     Series,
     Timedelta,
     TimedeltaIndex,
-    Timestamp,
 )
 import pandas._testing as tm
 
@@ -207,180 +205,152 @@ def test_ndarray_compat_properties(self, index_or_series_obj):
         assert Index([1]).item() == 1
         assert Series([1]).item() == 1
 
-    def test_value_counts_unique_nunique(self, index_or_series_obj):
-        orig = index_or_series_obj
-        obj = orig.copy()
-        klass = type(obj)
-        values = obj._values
-
-        if orig.duplicated().any():
-            pytest.xfail(
-                "The test implementation isn't flexible enough to deal "
-                "with duplicated values. This isn't a bug in the "
-                "application code, but in the test code."
-            )
+    def test_unique(self, index_or_series_obj):
+        obj = index_or_series_obj
+        obj = np.repeat(obj, range(1, len(obj) + 1))
+        result = obj.unique()
 
-        # create repeated values, 'n'th element is repeated by n+1 times
-        if isinstance(obj, Index):
-            expected_index = Index(obj[::-1])
-            expected_index.name = None
-            obj = obj.repeat(range(1, len(obj) + 1))
+        # dict.fromkeys preserves the order
+        unique_values = list(dict.fromkeys(obj.values))
+        if isinstance(obj, pd.MultiIndex):
+            expected = pd.MultiIndex.from_tuples(unique_values)
+            expected.names = obj.names
+            tm.assert_index_equal(result, expected)
+        elif isinstance(obj, pd.Index):
+            expected = pd.Index(unique_values, dtype=obj.dtype)
+            if is_datetime64tz_dtype(obj):
+                expected = expected.normalize()
+            tm.assert_index_equal(result, expected)
         else:
-            expected_index = Index(values[::-1])
-            idx = obj.index.repeat(range(1, len(obj) + 1))
-            # take-based repeat
-            indices = np.repeat(np.arange(len(obj)), range(1, len(obj) + 1))
-            rep = values.take(indices)
-            obj = klass(rep, index=idx)
-
-        # check values has the same dtype as the original
-        assert obj.dtype == orig.dtype
-
-        expected_s = Series(
-            range(len(orig), 0, -1), index=expected_index, dtype="int64"
-        )
+            expected = np.array(unique_values)
+            tm.assert_numpy_array_equal(result, expected)
 
-        result = obj.value_counts()
-        tm.assert_series_equal(result, expected_s)
-        assert result.index.name is None
+    @pytest.mark.parametrize("null_obj", [np.nan, None])
+    def test_unique_null(self, null_obj, index_or_series_obj):
+        obj = index_or_series_obj
+
+        if not allow_na_ops(obj):
+            pytest.skip("type doesn't allow for NA operations")
+        elif len(obj) < 1:
+            pytest.skip("Test doesn't make sense on empty data")
+        elif isinstance(obj, pd.MultiIndex):
+            pytest.skip(f"MultiIndex can't hold '{null_obj}'")
+
+        values = obj.values
+        if needs_i8_conversion(obj):
+            values[0:2] = iNaT
+        else:
+            values[0:2] = null_obj
 
+        klass = type(obj)
+        repeated_values = np.repeat(values, range(1, len(values) + 1))
+        obj = klass(repeated_values, dtype=obj.dtype)
         result = obj.unique()
-        if isinstance(obj, Index):
-            assert isinstance(result, type(obj))
-            tm.assert_index_equal(result, orig)
-            assert result.dtype == orig.dtype
-        elif is_datetime64tz_dtype(obj):
-            # datetimetz Series returns array of Timestamp
-            assert result[0] == orig[0]
-            for r in result:
-                assert isinstance(r, Timestamp)
-
-            tm.assert_numpy_array_equal(
-                result.astype(object), orig._values.astype(object)
-            )
+
+        unique_values_raw = dict.fromkeys(obj.values)
+        # because np.nan == np.nan is False, but None == None is True
+        # np.nan would be duplicated, whereas None wouldn't
+        unique_values_not_null = [
+            val for val in unique_values_raw if not pd.isnull(val)
+        ]
+        unique_values = [null_obj] + unique_values_not_null
+
+        if isinstance(obj, pd.Index):
+            expected = pd.Index(unique_values, dtype=obj.dtype)
+            if is_datetime64tz_dtype(obj):
+                result = result.normalize()
+                expected = expected.normalize()
+            elif isinstance(obj, pd.CategoricalIndex):
+                expected = expected.set_categories(unique_values_not_null)
+            tm.assert_index_equal(result, expected)
         else:
-            tm.assert_numpy_array_equal(result, orig.values)
-            assert result.dtype == orig.dtype
+            expected = np.array(unique_values, dtype=obj.dtype)
+            tm.assert_numpy_array_equal(result, expected)
 
-        # dropna=True would break for MultiIndex
-        assert obj.nunique(dropna=False) == len(np.unique(obj.values))
+    def test_nunique(self, index_or_series_obj):
+        obj = index_or_series_obj
+        obj = np.repeat(obj, range(1, len(obj) + 1))
+        expected = len(obj.unique())
+        assert obj.nunique(dropna=False) == expected
 
     @pytest.mark.parametrize("null_obj", [np.nan, None])
-    def test_value_counts_unique_nunique_null(self, null_obj, index_or_series_obj):
-        orig = index_or_series_obj
-        obj = orig.copy()
-        klass = type(obj)
-        values = obj._ndarray_values
-        num_values = len(orig)
+    def test_nunique_null(self, null_obj, index_or_series_obj):
+        obj = index_or_series_obj
 
         if not allow_na_ops(obj):
             pytest.skip("type doesn't allow for NA operations")
-        elif isinstance(orig, (pd.CategoricalIndex, pd.IntervalIndex)):
-            pytest.skip(f"values of {klass} cannot be changed")
-        elif isinstance(orig, pd.MultiIndex):
-            pytest.skip("MultiIndex doesn't support isna")
-        elif orig.duplicated().any():
-            pytest.xfail(
-                "The test implementation isn't flexible enough to deal "
-                "with duplicated values. This isn't a bug in the "
-                "application code, but in the test code."
-            )
-
-        # special assign to the numpy array
-        if is_datetime64tz_dtype(obj):
-            if isinstance(obj, DatetimeIndex):
-                v = obj.asi8
-                v[0:2] = iNaT
-                values = obj._shallow_copy(v)
-            else:
-                obj = obj.copy()
-                obj[0:2] = pd.NaT
-                values = obj._values
+        elif isinstance(obj, pd.MultiIndex):
+            pytest.skip(f"MultiIndex can't hold '{null_obj}'")
 
-        elif is_period_dtype(obj):
-            values[0:2] = iNaT
-            parr = type(obj._data)(values, dtype=obj.dtype)
-            values = obj._shallow_copy(parr)
-        elif needs_i8_conversion(obj):
+        values = obj.values
+        if needs_i8_conversion(obj):
             values[0:2] = iNaT
-            values = obj._shallow_copy(values)
         else:
             values[0:2] = null_obj
 
-        # check values has the same dtype as the original
-        assert values.dtype == obj.dtype
-
-        # create repeated values, 'n'th element is repeated by n+1
-        # times
-        if isinstance(obj, (DatetimeIndex, PeriodIndex)):
-            expected_index = obj.copy()
-            expected_index.name = None
+        klass = type(obj)
+        repeated_values = np.repeat(values, range(1, len(values) + 1))
+        obj = klass(repeated_values, dtype=obj.dtype)
 
-            # attach name to klass
-            obj = klass(values.repeat(range(1, len(obj) + 1)))
-            obj.name = "a"
-        else:
-            if isinstance(obj, DatetimeIndex):
-                expected_index = orig._values._shallow_copy(values)
-            else:
-                expected_index = Index(values)
-            expected_index.name = None
-            obj = obj.repeat(range(1, len(obj) + 1))
-            obj.name = "a"
-
-        # check values has the same dtype as the original
-        assert obj.dtype == orig.dtype
-
-        # check values correctly have NaN
-        nanloc = np.zeros(len(obj), dtype=np.bool)
-        nanloc[:3] = True
-        if isinstance(obj, Index):
-            tm.assert_numpy_array_equal(pd.isna(obj), nanloc)
+        if isinstance(obj, pd.CategoricalIndex):
+            assert obj.nunique() == len(obj.categories)
+            assert obj.nunique(dropna=False) == len(obj.categories) + 1
         else:
-            exp = Series(nanloc, obj.index, name="a")
-            tm.assert_series_equal(pd.isna(obj), exp)
-
-        expected_data = list(range(num_values, 2, -1))
-        expected_data_na = expected_data.copy()
-        if expected_data_na:
-            expected_data_na.append(3)
-        expected_s_na = Series(
-            expected_data_na,
-            index=expected_index[num_values - 1 : 0 : -1],
-            dtype="int64",
-            name="a",
-        )
-        expected_s = Series(
-            expected_data,
-            index=expected_index[num_values - 1 : 1 : -1],
-            dtype="int64",
-            name="a",
-        )
+            num_unique_values = len(obj.unique())
+            assert obj.nunique() == max(0, num_unique_values - 1)
+            assert obj.nunique(dropna=False) == max(0, num_unique_values)
 
-        result_s_na = obj.value_counts(dropna=False)
-        tm.assert_series_equal(result_s_na, expected_s_na)
-        assert result_s_na.index.name is None
-        assert result_s_na.name == "a"
-        result_s = obj.value_counts()
-        tm.assert_series_equal(obj.value_counts(), expected_s)
-        assert result_s.index.name is None
-        assert result_s.name == "a"
+    def test_value_counts(self, index_or_series_obj):
+        obj = index_or_series_obj
+        obj = np.repeat(obj, range(1, len(obj) + 1))
+        result = obj.value_counts()
 
-        result = obj.unique()
-        if isinstance(obj, Index):
-            tm.assert_index_equal(result, Index(values[1:], name="a"))
-        elif is_datetime64tz_dtype(obj):
-            # unable to compare NaT / nan
-            tm.assert_extension_array_equal(result[1:], values[2:])
-            assert result[0] is pd.NaT
-        elif len(obj) > 0:
-            tm.assert_numpy_array_equal(result[1:], values[2:])
-
-            assert pd.isna(result[0])
-            assert result.dtype == orig.dtype
-
-        assert obj.nunique() == max(0, num_values - 2)
-        assert obj.nunique(dropna=False) == max(0, num_values - 1)
+        counter = collections.Counter(obj)
+        expected = pd.Series(dict(counter.most_common()), dtype=np.int64, name=obj.name)
+        expected.index = expected.index.astype(obj.dtype)
+        if isinstance(obj, pd.MultiIndex):
+            expected.index = pd.Index(expected.index)
+
+        # sort_index to avoid switched order when values share the same count
+        result = result.sort_index()
+        expected = expected.sort_index()
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("null_obj", [np.nan, None])
+    def test_value_counts_null(self, null_obj, index_or_series_obj):
+        orig = index_or_series_obj
+        obj = orig.copy()
+
+        if not allow_na_ops(obj):
+            pytest.skip("type doesn't allow for NA operations")
+        elif len(obj) < 1:
+            pytest.skip("Test doesn't make sense on empty data")
+        elif isinstance(orig, pd.MultiIndex):
+            pytest.skip(f"MultiIndex can't hold '{null_obj}'")
+
+        values = obj.values
+        if needs_i8_conversion(obj):
+            values[0:2] = iNaT
+        else:
+            values[0:2] = null_obj
+
+        klass = type(obj)
+        repeated_values = np.repeat(values, range(1, len(values) + 1))
+        obj = klass(repeated_values, dtype=obj.dtype)
+
+        # because np.nan == np.nan is False, but None == None is True
+        # np.nan would be duplicated, whereas None wouldn't
+        counter = collections.Counter(obj.dropna())
+        expected = pd.Series(dict(counter.most_common()), dtype=np.int64)
+        expected.index = expected.index.astype(obj.dtype)
+
+        tm.assert_series_equal(obj.value_counts(), expected)
+
+        # can't use expected[null_obj] = 3 as
+        # IntervalIndex doesn't allow assignment
+        new_entry = pd.Series({np.nan: 3}, dtype=np.int64)
+        expected = expected.append(new_entry)
+        tm.assert_series_equal(obj.value_counts(dropna=False), expected)
 
     def test_value_counts_inferred(self, index_or_series):
         klass = index_or_series
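
Note (not part of the commit): the new test_value_counts builds its expected Series with collections.Counter instead of hand-computed slices. A minimal standalone sketch of that pattern, using a small unnamed integer Index as assumed, illustrative data:

    import collections

    import numpy as np
    import pandas as pd

    # Assumed input: a small integer Index with unique values.
    obj = pd.Index([10, 20, 30])

    # As in the tests, the n-th element is repeated n + 1 times:
    # 10 once, 20 twice, 30 three times.
    obj = np.repeat(obj, range(1, len(obj) + 1))

    result = obj.value_counts()

    # Build the expected counts independently of value_counts.
    counter = collections.Counter(obj)
    expected = pd.Series(dict(counter.most_common()), dtype=np.int64)
    expected.index = expected.index.astype(obj.dtype)

    # Sort both sides so ties in the counts cannot change the order.
    pd.testing.assert_series_equal(
        result.sort_index(), expected.sort_index(), check_names=False
    )

check_names=False is used here only to keep the sketch version-agnostic; the test itself passes name=obj.name and compares names as usual.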
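
A second hedged sketch, with assumed data, of the arithmetic behind the dropna=False expectation in test_value_counts_null: the first two positions are nulled and the n-th element is then repeated n + 1 times, so the null appears 1 + 2 = 3 times:

    import numpy as np
    import pandas as pd

    values = np.array([1.0, 2.0, 3.0])
    values[0:2] = np.nan
    # n-th element repeated n + 1 times -> [nan, nan, nan, 3.0, 3.0, 3.0]
    repeated = np.repeat(values, range(1, len(values) + 1))

    ser = pd.Series(repeated)
    assert ser.value_counts().to_dict() == {3.0: 3}           # NaN dropped by default
    assert ser.value_counts(dropna=False).sum() == len(ser)   # NaN counted as well
    assert ser.isna().sum() == 3

This is why the test appends new_entry = pd.Series({np.nan: 3}, dtype=np.int64) before comparing against value_counts(dropna=False).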