Skip to content

TST/CLN: series.duplicated; parametrisation; fix warning #21899

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jul 16, 2018
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,6 @@ def complex_dtype(request):
# Dtype specifiers grouped by kind. Lists such as these are passed as
# ``params`` to the parametrized dtype fixtures defined in this file.
UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"]
SIGNED_INT_DTYPES = [int, "int8", "int16", "int32", "int64"]
# Every integer dtype specifier: unsigned ones first, then signed ones.
ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES
# Float specifiers followed by every integer specifier.
# NOTE(review): FLOAT_DTYPES is not defined in the lines visible here --
# presumably it is defined earlier in the file; confirm.
ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES


@pytest.fixture(params=SIGNED_INT_DTYPES)
Expand Down Expand Up @@ -338,6 +337,13 @@ def any_int_dtype(request):
return request.param


# Dtype specifiers grouped by kind; each list mixes the Python builtin type
# with string spellings of the dtype.
FLOAT_DTYPES = [float, "float32", "float64"]
COMPLEX_DTYPES = [complex, "complex64", "complex128"]
STRING_DTYPES = [str, 'str', 'U']
# Float specifiers followed by every integer specifier.
# NOTE(review): ALL_INT_DTYPES must already be defined above this point.
ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES
# Real, then string, then complex specifiers; used as the ``params`` of the
# ``any_numpy_dtype`` fixture.
ALL_NUMPY_DTYPES = ALL_REAL_DTYPES + STRING_DTYPES + COMPLEX_DTYPES


Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make sure that these lists aren't duplicated in the file.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gfyoung, what do you mean? I removed ALL_REAL_DTYPES from above and brought it closer to where it's used. Or do you just want one block of all-caps definitions?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One block of all-caps definitions and use it everywhere in the file. For example, search for :

float, "float32", "float64"

in your version of conftest.py . You'll see it exists twice in the file.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please put these with the other definitions.

@pytest.fixture(params=ALL_REAL_DTYPES)
def any_real_dtype(request):
"""
Expand All @@ -358,6 +364,31 @@ def any_real_dtype(request):
return request.param


@pytest.fixture(params=ALL_NUMPY_DTYPES)
def any_numpy_dtype(request):
    """
    Parameterized fixture for the real, string and complex numpy dtypes.

    Yields every entry of ``ALL_NUMPY_DTYPES`` in turn:

    * float
    * float32
    * float64
    * uint8
    * uint16
    * uint32
    * uint64
    * int
    * int8
    * int16
    * int32
    * int64
    * str
    * 'str'
    * 'U'
    * complex
    * complex64
    * complex128
    """

    # pytest exposes the current parametrization value as ``request.param``
    return request.param


@pytest.fixture
def mock():
"""
Expand Down
168 changes: 2 additions & 166 deletions pandas/tests/series/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -907,144 +907,6 @@ def test_matmul(self):
pytest.raises(Exception, a.dot, a.values[:3])
pytest.raises(ValueError, a.dot, b.T)

def test_value_counts_nunique(self):

# basics.rst doc example
series = Series(np.random.randn(500))
series[20:500] = np.nan
series[10:20] = 5000
result = series.nunique()
assert result == 11

# GH 18051
s = pd.Series(pd.Categorical([]))
assert s.nunique() == 0
s = pd.Series(pd.Categorical([np.nan]))
assert s.nunique() == 0

def test_unique(self):
    """Check ``Series.unique`` for float, object and categorical data."""
    # 714 also, dtype=float
    for extra in ({}, {'dtype': 'f4'}):
        floats = Series([1.2345] * 100, **extra)
        floats[::2] = np.nan
        assert len(floats.unique()) == 2

    # NAs in object arrays #714
    strings = Series(['foo'] * 100, dtype='O')
    strings[::2] = np.nan
    assert len(strings.unique()) == 2

    # decision about None
    with_none = Series([1, 2, 3, None, None, None], dtype=object)
    tm.assert_numpy_array_equal(
        with_none.unique(),
        np.array([1, 2, 3, None], dtype=object))

    # GH 18051
    for values in ([], [np.nan]):
        categorical = pd.Series(pd.Categorical(values))
        tm.assert_categorical_equal(categorical.unique(),
                                    pd.Categorical(values),
                                    check_dtype=False)

@pytest.mark.parametrize(
"tc1, tc2",
[
(
Series([1, 2, 3, 3], dtype=np.dtype('int_')),
Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('int_'))
),
(
Series([1, 2, 3, 3], dtype=np.dtype('uint')),
Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('uint'))
),
(
Series([1, 2, 3, 3], dtype=np.dtype('float_')),
Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('float_'))
),
(
Series([1, 2, 3, 3], dtype=np.dtype('unicode_')),
Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('unicode_'))
)
]
)
def test_drop_duplicates_non_bool(self, tc1, tc2):
# Test case 1
expected = Series([False, False, False, True])
assert_series_equal(tc1.duplicated(), expected)
assert_series_equal(tc1.drop_duplicates(), tc1[~expected])
sc = tc1.copy()
sc.drop_duplicates(inplace=True)
assert_series_equal(sc, tc1[~expected])

expected = Series([False, False, True, False])
assert_series_equal(tc1.duplicated(keep='last'), expected)
assert_series_equal(tc1.drop_duplicates(keep='last'), tc1[~expected])
sc = tc1.copy()
sc.drop_duplicates(keep='last', inplace=True)
assert_series_equal(sc, tc1[~expected])

expected = Series([False, False, True, True])
assert_series_equal(tc1.duplicated(keep=False), expected)
assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected])
sc = tc1.copy()
sc.drop_duplicates(keep=False, inplace=True)
assert_series_equal(sc, tc1[~expected])

# Test case 2
expected = Series([False, False, False, False, True, True, False])
assert_series_equal(tc2.duplicated(), expected)
assert_series_equal(tc2.drop_duplicates(), tc2[~expected])
sc = tc2.copy()
sc.drop_duplicates(inplace=True)
assert_series_equal(sc, tc2[~expected])

expected = Series([False, True, True, False, False, False, False])
assert_series_equal(tc2.duplicated(keep='last'), expected)
assert_series_equal(tc2.drop_duplicates(keep='last'), tc2[~expected])
sc = tc2.copy()
sc.drop_duplicates(keep='last', inplace=True)
assert_series_equal(sc, tc2[~expected])

expected = Series([False, True, True, False, True, True, False])
assert_series_equal(tc2.duplicated(keep=False), expected)
assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected])
sc = tc2.copy()
sc.drop_duplicates(keep=False, inplace=True)
assert_series_equal(sc, tc2[~expected])

def test_drop_duplicates_bool(self):
tc = Series([True, False, True, False])

expected = Series([False, False, True, True])
assert_series_equal(tc.duplicated(), expected)
assert_series_equal(tc.drop_duplicates(), tc[~expected])
sc = tc.copy()
sc.drop_duplicates(inplace=True)
assert_series_equal(sc, tc[~expected])

expected = Series([True, True, False, False])
assert_series_equal(tc.duplicated(keep='last'), expected)
assert_series_equal(tc.drop_duplicates(keep='last'), tc[~expected])
sc = tc.copy()
sc.drop_duplicates(keep='last', inplace=True)
assert_series_equal(sc, tc[~expected])

expected = Series([True, True, True, True])
assert_series_equal(tc.duplicated(keep=False), expected)
assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected])
sc = tc.copy()
sc.drop_duplicates(keep=False, inplace=True)
assert_series_equal(sc, tc[~expected])

def test_clip(self):
val = self.ts.median()

Expand Down Expand Up @@ -1416,7 +1278,8 @@ def test_ptp(self):
N = 1000
arr = np.random.randn(N)
ser = Series(arr)
assert np.ptp(ser) == np.ptp(arr)
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
assert np.ptp(ser) == np.ptp(arr)

# GH11163
s = Series([3, 5, np.nan, -3, 10])
Expand Down Expand Up @@ -1457,10 +1320,6 @@ def test_empty_timeseries_redections_return_nat(self):
assert Series([], dtype=dtype).min() is pd.NaT
assert Series([], dtype=dtype).max() is pd.NaT

def test_unique_data_ownership(self):
# it works! #1807
Series(Series(["a", "c", "b"]).unique()).sort_values()

def test_repeat(self):
s = Series(np.random.randn(3), index=['a', 'b', 'c'])

Expand Down Expand Up @@ -1537,29 +1396,6 @@ def test_searchsorted_sorter(self):
e = np.array([0, 2], dtype=np.intp)
tm.assert_numpy_array_equal(r, e)

def test_is_unique(self):
# GH11946
s = Series(np.random.randint(0, 10, size=1000))
assert not s.is_unique
s = Series(np.arange(1000))
assert s.is_unique

def test_is_unique_class_ne(self, capsys):
    """``is_unique`` must leave stderr empty when ``__ne__`` raises."""
    # GH 20661
    class Foo(object):
        def __init__(self, val):
            self._value = val

        def __ne__(self, other):
            raise Exception("NEQ not supported")

    objects = [Foo(num) for num in range(5)]
    ser = pd.Series(objects, index=list(range(5)))
    # discard whatever was captured so far
    capsys.readouterr()
    ser.is_unique
    captured = capsys.readouterr()
    # element 1 of the captured pair is stderr
    assert len(captured[1]) == 0

def test_is_monotonic(self):

s = Series(np.random.randint(0, 10, size=1000))
Expand Down
142 changes: 142 additions & 0 deletions pandas/tests/series/test_duplicates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# coding=utf-8
# pylint: disable-msg=E1101,W0612
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what are you disabling?


import pytest

import numpy as np
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can remove space between pandas imports

import pandas as pd
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don’t import pd; import the names you need directly instead.


from pandas import Series

from pandas.util.testing import assert_series_equal
import pandas.util.testing as tm
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use tm; don’t import assert_series_equal

from .common import TestData


class TestSeriesDuplicates(TestData):

def test_value_counts_nunique(self):

# basics.rst doc example
series = Series(np.random.randn(500))
series[20:500] = np.nan
series[10:20] = 5000
result = series.nunique()
assert result == 11

# GH 18051
s = pd.Series(pd.Categorical([]))
assert s.nunique() == 0
s = pd.Series(pd.Categorical([np.nan]))
assert s.nunique() == 0

def test_unique(self):
    """Check ``Series.unique`` for float, object and categorical data."""
    # 714 also, dtype=float
    for extra in ({}, {'dtype': 'f4'}):
        float_series = Series([1.2345] * 100, **extra)
        float_series[::2] = np.nan
        assert len(float_series.unique()) == 2

    # NAs in object arrays #714
    object_series = Series(['foo'] * 100, dtype='O')
    object_series[::2] = np.nan
    assert len(object_series.unique()) == 2

    # decision about None
    none_series = Series([1, 2, 3, None, None, None], dtype=object)
    tm.assert_numpy_array_equal(
        none_series.unique(),
        np.array([1, 2, 3, None], dtype=object))

    # GH 18051
    for raw in ([], [np.nan]):
        cat_series = pd.Series(pd.Categorical(raw))
        tm.assert_categorical_equal(cat_series.unique(),
                                    pd.Categorical(raw),
                                    check_dtype=False)

def test_unique_data_ownership(self):
# it works! #1807
Series(Series(["a", "c", "b"]).unique()).sort_values()

def test_is_unique(self):
# GH11946
s = Series(np.random.randint(0, 10, size=1000))
assert not s.is_unique
s = Series(np.arange(1000))
assert s.is_unique

def test_is_unique_class_ne(self, capsys):
    """``is_unique`` must leave stderr empty when ``__ne__`` raises."""
    # GH 20661
    class Foo(object):
        def __init__(self, val):
            self._value = val

        def __ne__(self, other):
            raise Exception("NEQ not supported")

    instances = [Foo(pos) for pos in range(5)]
    series = pd.Series(instances, index=list(range(5)))
    # discard whatever was captured so far
    capsys.readouterr()
    series.is_unique
    outcome = capsys.readouterr()
    # element 1 of the captured pair is stderr
    assert len(outcome[1]) == 0

@pytest.mark.parametrize(
'keep, expected',
[
('first', Series([False, False, False, False, True, True, False])),
('last', Series([False, True, True, False, False, False, False])),
(False, Series([False, True, True, False, True, True, False]))
])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either use two classes for the test definitions or don’t use classes at all (preferred).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same for frame

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Classes of tests? I just wrote them in the same manner as in test_analytics and then transferred them here. You want me to get rid of TestSeriesDuplicates and do class-less tests - do I get that correctly?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes; that is the old format, which is much less readable.

We only use classes if they are logical separations.

def test_drop_duplicates_non_bool(self, any_numpy_dtype, keep, expected):
tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(any_numpy_dtype))

assert_series_equal(tc.duplicated(keep=keep), expected)
assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
sc = tc.copy()
sc.drop_duplicates(keep=keep, inplace=True)
assert_series_equal(sc, tc[~expected])

@pytest.mark.parametrize('keep, expected',
[('first', Series([False, False, True, True])),
('last', Series([True, True, False, False])),
(False, Series([True, True, True, True]))])
def test_drop_duplicates_bool(self, keep, expected):
tc = Series([True, False, True, False])

assert_series_equal(tc.duplicated(keep=keep), expected)
assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
sc = tc.copy()
sc.drop_duplicates(keep=keep, inplace=True)
assert_series_equal(sc, tc[~expected])

@pytest.mark.parametrize('keep, expected', [
('first', Series([False, False, True, False, True], name='name')),
('last', Series([True, True, False, False, False], name='name')),
(False, Series([True, True, True, False, True], name='name'))
])
def test_duplicated_keep(self, keep, expected):
s = Series(['a', 'b', 'b', 'c', 'a'], name='name')

result = s.duplicated(keep=keep)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize('keep, expected', [
('first', Series([False, False, True, False, True])),
('last', Series([True, True, False, False, False])),
(False, Series([True, True, True, False, True]))
])
def test_duplicated_nan_none(self, keep, expected):
s = Series([np.nan, 3, 3, None, np.nan], dtype=object)

result = s.duplicated(keep=keep)
tm.assert_series_equal(result, expected)