-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
TST/CLN: series.duplicated; parametrisation; fix warning #21899
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
4ca9dee
c06442b
92d708c
12d2888
c9a3f71
d34b7d1
e640040
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
# coding=utf-8 | ||
# pylint: disable-msg=E1101,W0612 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What are you disabling? |
||
|
||
import pytest | ||
|
||
import numpy as np | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can remove the space between the pandas imports. |
||
import pandas as pd | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Don't import pd; import the needed names directly instead. |
||
|
||
from pandas import Series | ||
|
||
from pandas.util.testing import assert_series_equal | ||
import pandas.util.testing as tm | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use tm; don't import assert_series_equal separately. |
||
from .common import TestData | ||
|
||
|
||
class TestSeriesDuplicates(TestData): | ||
|
||
def test_value_counts_nunique(self): | ||
|
||
# basics.rst doc example | ||
series = Series(np.random.randn(500)) | ||
series[20:500] = np.nan | ||
series[10:20] = 5000 | ||
result = series.nunique() | ||
assert result == 11 | ||
|
||
# GH 18051 | ||
s = pd.Series(pd.Categorical([])) | ||
assert s.nunique() == 0 | ||
s = pd.Series(pd.Categorical([np.nan])) | ||
assert s.nunique() == 0 | ||
|
||
def test_unique(self): | ||
|
||
# 714 also, dtype=float | ||
s = Series([1.2345] * 100) | ||
s[::2] = np.nan | ||
result = s.unique() | ||
assert len(result) == 2 | ||
|
||
s = Series([1.2345] * 100, dtype='f4') | ||
s[::2] = np.nan | ||
result = s.unique() | ||
assert len(result) == 2 | ||
|
||
# NAs in object arrays #714 | ||
s = Series(['foo'] * 100, dtype='O') | ||
s[::2] = np.nan | ||
result = s.unique() | ||
assert len(result) == 2 | ||
|
||
# decision about None | ||
s = Series([1, 2, 3, None, None, None], dtype=object) | ||
result = s.unique() | ||
expected = np.array([1, 2, 3, None], dtype=object) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
# GH 18051 | ||
s = pd.Series(pd.Categorical([])) | ||
tm.assert_categorical_equal(s.unique(), pd.Categorical([]), | ||
check_dtype=False) | ||
s = pd.Series(pd.Categorical([np.nan])) | ||
tm.assert_categorical_equal(s.unique(), pd.Categorical([np.nan]), | ||
check_dtype=False) | ||
|
||
def test_unique_data_ownership(self): | ||
# it works! #1807 | ||
Series(Series(["a", "c", "b"]).unique()).sort_values() | ||
|
||
def test_is_unique(self): | ||
# GH11946 | ||
s = Series(np.random.randint(0, 10, size=1000)) | ||
assert not s.is_unique | ||
s = Series(np.arange(1000)) | ||
assert s.is_unique | ||
|
||
def test_is_unique_class_ne(self, capsys): | ||
# GH 20661 | ||
class Foo(object): | ||
def __init__(self, val): | ||
self._value = val | ||
|
||
def __ne__(self, other): | ||
raise Exception("NEQ not supported") | ||
|
||
li = [Foo(i) for i in range(5)] | ||
s = pd.Series(li, index=[i for i in range(5)]) | ||
_, err = capsys.readouterr() | ||
s.is_unique | ||
_, err = capsys.readouterr() | ||
assert len(err) == 0 | ||
|
||
@pytest.mark.parametrize(
    'keep, expected',
    [
        ('first', Series([False, False, False, False, True, True, False])),
        ('last', Series([False, True, True, False, False, False, False])),
        (False, Series([False, True, True, False, True, True, False]))
    ])
# NOTE(review): reviewers asked that tests be written as plain functions
# unless a class marks a logical separation (the same applies to the
# frame tests); the older class-based format is considered less readable.
def test_drop_duplicates_non_bool(self, any_numpy_dtype, keep, expected):
    """Check ``duplicated`` / ``drop_duplicates`` for non-bool dtypes.

    Parameters
    ----------
    any_numpy_dtype : fixture
        Dtype specifier passed to ``np.dtype`` to build the test Series.
    keep : {'first', 'last', False}
        Forwarded to ``duplicated`` and ``drop_duplicates``.
    expected : Series
        Boolean mask that ``duplicated(keep=keep)`` should return.
    """
    tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(any_numpy_dtype))

    # Cast to bool, every value above becomes True, so the expected
    # masks no longer apply.  The fixture is defined outside this file;
    # guard in case it yields bool -- TODO confirm against conftest.
    if tc.dtype == bool:
        pytest.skip("tested separately in test_drop_duplicates_bool")

    tm.assert_series_equal(tc.duplicated(keep=keep), expected)

    # Dropping duplicates must keep exactly the rows not flagged above.
    remaining = tc[~expected]
    tm.assert_series_equal(tc.drop_duplicates(keep=keep), remaining)

    # The in-place variant must give the same result on a copy.
    sc = tc.copy()
    sc.drop_duplicates(keep=keep, inplace=True)
    tm.assert_series_equal(sc, remaining)
|
||
@pytest.mark.parametrize('keep, expected',
                         [('first', Series([False, False, True, True])),
                          ('last', Series([True, True, False, False])),
                          (False, Series([True, True, True, True]))])
def test_drop_duplicates_bool(self, keep, expected):
    """Check ``duplicated`` / ``drop_duplicates`` on boolean data.

    Parameters
    ----------
    keep : {'first', 'last', False}
        Forwarded to ``duplicated`` and ``drop_duplicates``.
    expected : Series
        Boolean mask that ``duplicated(keep=keep)`` should return.
    """
    tc = Series([True, False, True, False])

    tm.assert_series_equal(tc.duplicated(keep=keep), expected)

    # Dropping duplicates must keep exactly the rows not flagged above.
    remaining = tc[~expected]
    tm.assert_series_equal(tc.drop_duplicates(keep=keep), remaining)

    # The in-place variant must give the same result on a copy.
    sc = tc.copy()
    sc.drop_duplicates(keep=keep, inplace=True)
    tm.assert_series_equal(sc, remaining)
|
||
@pytest.mark.parametrize(
    'keep, expected',
    [
        ('first', Series([False, False, True, False, True], name='name')),
        ('last', Series([True, True, False, False, False], name='name')),
        (False, Series([True, True, True, False, True], name='name')),
    ])
def test_duplicated_keep(self, keep, expected):
    """Check ``duplicated`` on a named string Series for each ``keep``."""
    labelled = Series(['a', 'b', 'b', 'c', 'a'], name='name')

    # Every expected mask above carries the same name as the input.
    tm.assert_series_equal(labelled.duplicated(keep=keep), expected)
|
||
@pytest.mark.parametrize(
    'keep, expected',
    [
        ('first', Series([False, False, True, False, True])),
        ('last', Series([True, True, False, False, False])),
        (False, Series([True, True, True, False, True])),
    ])
def test_duplicated_nan_none(self, keep, expected):
    """Check ``duplicated`` on object data mixing NaN and ``None``."""
    # Per the expected masks, the two NaN entries count as duplicates
    # of each other while the single None does not.
    mixed = Series([np.nan, 3, 3, None, np.nan], dtype=object)

    tm.assert_series_equal(mixed.duplicated(keep=keep), expected)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Make sure that these lists aren't duplicated in the file.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@gfyoung, what do you mean? I removed
ALL_REAL_DTYPES
from above and brought it closer to where it's used. Or do you just want one block of all-caps definitions? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One block of all-caps definitions, used everywhere in the file. For example, search for:
float, "float32", "float64"
in your version of
conftest.py
. You'll see it exists twice in the file. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please put these with the other definitions.