Skip to content

BUG: Fix value_counts name handling #10419

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 27, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ Performance Improvements

Bug Fixes
~~~~~~~~~

- Bug in ``DataFrame.apply`` when function returns categorical series. (:issue:`9573`)


Expand Down Expand Up @@ -100,3 +101,6 @@ Bug Fixes


- Bug that caused segfault when resampling an empty Series (:issue:`10228`)
- Bug in ``DatetimeIndex.value_counts`` and ``PeriodIndex.value_counts``: the name was dropped from the result ``Series`` but incorrectly retained on the result's ``Index``. (:issue:`10150`)


5 changes: 3 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
from pandas.tools.tile import cut
from pandas.tseries.period import PeriodIndex

name = getattr(values, 'name', None)
values = Series(values).values

if bins is not None:
Expand All @@ -222,7 +223,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
if com.is_datetime_or_timedelta_dtype(dtype) or is_period:

if is_period:
values = PeriodIndex(values)
values = PeriodIndex(values, name=name)

values = values.view(np.int64)
keys, counts = htable.value_count_int64(values)
Expand All @@ -247,7 +248,7 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
keys = np.insert(keys, 0, np.NaN)
counts = np.insert(counts, 0, mask.sum())

result = Series(counts, index=com._values_from_object(keys))
result = Series(counts, index=com._values_from_object(keys), name=name)

if bins is not None:
# TODO: This next line should be more efficient
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,10 +431,10 @@ def value_counts(self, normalize=False, sort=True, ascending=False,

if isinstance(self, PeriodIndex):
# preserve freq
result.index = self._simple_new(result.index.values, self.name,
result.index = self._simple_new(result.index.values,
freq=self.freq)
elif isinstance(self, DatetimeIndex):
result.index = self._simple_new(result.index.values, self.name,
result.index = self._simple_new(result.index.values,
tz=getattr(self, 'tz', None))
return result

Expand Down
122 changes: 72 additions & 50 deletions pandas/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,23 +181,24 @@ def f():

class Ops(tm.TestCase):
def setUp(self):
    """Build one named index/series fixture per dtype family for the Ops tests.

    Every fixture is given name 'a' so the tests can verify that
    ``value_counts``/``unique`` handle the ``name`` attribute correctly.
    """
    # indexes, one per dtype family, all named 'a'
    self.bool_index = tm.makeBoolIndex(10, name='a')
    self.int_index = tm.makeIntIndex(10, name='a')
    self.float_index = tm.makeFloatIndex(10, name='a')
    self.dt_index = tm.makeDateIndex(10, name='a')
    self.dt_tz_index = tm.makeDateIndex(10, name='a').tz_localize(tz='US/Eastern')
    self.period_index = tm.makePeriodIndex(10, name='a')
    self.string_index = tm.makeStringIndex(10, name='a')
    self.unicode_index = tm.makeUnicodeIndex(10, name='a')

    # matching series over the same values, also named 'a'
    arr = np.random.randn(10)
    self.int_series = Series(arr, index=self.int_index, name='a')
    self.float_series = Series(arr, index=self.float_index, name='a')
    self.dt_series = Series(arr, index=self.dt_index, name='a')
    self.dt_tz_series = self.dt_tz_index.to_series(keep_tz=True)
    self.period_series = Series(arr, index=self.period_index, name='a')
    self.string_series = Series(arr, index=self.string_index, name='a')

    # collect every existing fixture; missing combinations (e.g. no
    # unicode_series is built above) are filtered out by the getattr default
    types = ['bool', 'int', 'float', 'dt', 'dt_tz', 'period', 'string', 'unicode']
    fmts = ["{0}_{1}".format(t, f) for t in types for f in ['index', 'series']]
    self.objs = [getattr(self, f) for f in fmts if getattr(self, f, None) is not None]

Expand All @@ -213,9 +214,9 @@ def check_ops_properties(self, props, filter=None, ignore_failures=False):

try:
if isinstance(o, Series):
expected = Series(getattr(o.index,op),index=o.index)
expected = Series(getattr(o.index,op), index=o.index, name='a')
else:
expected = getattr(o,op)
expected = getattr(o, op)
except (AttributeError):
if ignore_failures:
continue
Expand Down Expand Up @@ -361,21 +362,28 @@ def test_value_counts_unique_nunique(self):
# create repeated values, 'n'th element is repeated by n+1 times
if isinstance(o, PeriodIndex):
# freq must be specified because repeat makes freq ambiguous
expected_index = o[::-1]
o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq)

# resets name from Index
expected_index = pd.Index(o[::-1], name=None)

# attach name to klass
o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq, name='a')
# don't test boolean
elif isinstance(o,Index) and o.is_boolean():
continue
elif isinstance(o, Index):
expected_index = values[::-1]
o = klass(np.repeat(values, range(1, len(o) + 1)))
expected_index = pd.Index(values[::-1], name=None)
o = klass(np.repeat(values, range(1, len(o) + 1)), name='a')
else:
expected_index = values[::-1]
expected_index = pd.Index(values[::-1], name=None)
idx = np.repeat(o.index.values, range(1, len(o) + 1))
o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx)
o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx, name='a')

expected_s = Series(range(10, 0, -1), index=expected_index, dtype='int64')
tm.assert_series_equal(o.value_counts(), expected_s)
expected_s = Series(range(10, 0, -1), index=expected_index, dtype='int64', name='a')
result = o.value_counts()
tm.assert_series_equal(result, expected_s)
self.assertTrue(result.index.name is None)
self.assertEqual(result.name, 'a')

result = o.unique()
if isinstance(o, (DatetimeIndex, PeriodIndex)):
Expand Down Expand Up @@ -410,21 +418,34 @@ def test_value_counts_unique_nunique(self):
# create repeated values, 'n'th element is repeated by n+1 times
if isinstance(o, PeriodIndex):
# freq must be specified because repeat makes freq ambiguous
expected_index = o
o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq)

# resets name from Index
expected_index = pd.Index(o, name=None)
# attach name to klass
o = klass(np.repeat(values, range(1, len(o) + 1)), freq=o.freq, name='a')
elif isinstance(o, Index):
expected_index = values
o = klass(np.repeat(values, range(1, len(o) + 1)))
expected_index = pd.Index(values, name=None)
o = klass(np.repeat(values, range(1, len(o) + 1)), name='a')
else:
expected_index = values
expected_index = pd.Index(values, name=None)
idx = np.repeat(o.index.values, range(1, len(o) + 1))
o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx)

expected_s_na = Series(list(range(10, 2, -1)) +[3], index=expected_index[9:0:-1], dtype='int64')
expected_s = Series(list(range(10, 2, -1)), index=expected_index[9:1:-1], dtype='int64')

tm.assert_series_equal(o.value_counts(dropna=False), expected_s_na)
o = klass(np.repeat(values, range(1, len(o) + 1)), index=idx, name='a')

expected_s_na = Series(list(range(10, 2, -1)) +[3],
index=expected_index[9:0:-1],
dtype='int64', name='a')
expected_s = Series(list(range(10, 2, -1)),
index=expected_index[9:1:-1],
dtype='int64', name='a')

result_s_na = o.value_counts(dropna=False)
tm.assert_series_equal(result_s_na, expected_s_na)
self.assertTrue(result_s_na.index.name is None)
self.assertEqual(result_s_na.name, 'a')
result_s = o.value_counts()
tm.assert_series_equal(o.value_counts(), expected_s)
self.assertTrue(result_s.index.name is None)
self.assertEqual(result_s.name, 'a')

# numpy_array_equal cannot compare arrays includes nan
result = o.unique()
Expand Down Expand Up @@ -508,14 +529,15 @@ def test_value_counts_inferred(self):
df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"],
parse_dates=["dt"])

s = klass(df['dt'].copy())
s = klass(df['dt'].copy(), name='dt')

idx = pd.to_datetime(['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X'])
expected_s = Series([3, 2, 1], index=idx)
idx = pd.to_datetime(['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z',
'2009-01-01 00:00:00X'])
expected_s = Series([3, 2, 1], index=idx, name='dt')
tm.assert_series_equal(s.value_counts(), expected_s)

expected = np.array(['2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z'],
dtype='datetime64[ns]')
expected = np.array(['2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z',
'2008-09-09 00:00:00Z'], dtype='datetime64[ns]')
if isinstance(s, DatetimeIndex):
expected = DatetimeIndex(expected)
self.assertTrue(s.unique().equals(expected))
Expand All @@ -526,7 +548,7 @@ def test_value_counts_inferred(self):

# with NaT
s = df['dt'].copy()
s = klass([v for v in s.values] + [pd.NaT])
s = klass([v for v in s.values] + [pd.NaT], name='dt')

result = s.value_counts()
self.assertEqual(result.index.dtype, 'datetime64[ns]')
Expand All @@ -547,10 +569,10 @@ def test_value_counts_inferred(self):

# timedelta64[ns]
td = df.dt - df.dt + timedelta(1)
td = klass(td)
td = klass(td, name='dt')

result = td.value_counts()
expected_s = Series([6], index=[Timedelta('1day')])
expected_s = Series([6], index=[Timedelta('1day')], name='dt')
tm.assert_series_equal(result, expected_s)

expected = TimedeltaIndex(['1 days'])
Expand All @@ -560,9 +582,8 @@ def test_value_counts_inferred(self):
self.assert_numpy_array_equal(td.unique(), expected.values)

td2 = timedelta(1) + (df.dt - df.dt)
td2 = klass(td2)
td2 = klass(td2, name='dt')
result2 = td2.value_counts()

tm.assert_series_equal(result2, expected_s)

def test_factorize(self):
Expand Down Expand Up @@ -629,7 +650,7 @@ def test_duplicated_drop_duplicates(self):
# special case
if original.is_boolean():
result = original.drop_duplicates()
expected = Index([False,True])
expected = Index([False,True], name='a')
tm.assert_index_equal(result, expected)
continue

Expand Down Expand Up @@ -668,25 +689,26 @@ def test_duplicated_drop_duplicates(self):
idx.drop_duplicates(inplace=True)

else:
expected = Series([False] * len(original), index=original.index)
expected = Series([False] * len(original),
index=original.index, name='a')
tm.assert_series_equal(original.duplicated(), expected)
result = original.drop_duplicates()
tm.assert_series_equal(result, original)
self.assertFalse(result is original)

idx = original.index[list(range(len(original))) + [5, 3]]
values = original.values[list(range(len(original))) + [5, 3]]
s = Series(values, index=idx)
s = Series(values, index=idx, name='a')

expected = Series([False] * len(original) + [True, True], index=idx)
expected = Series([False] * len(original) + [True, True],
index=idx, name='a')
tm.assert_series_equal(s.duplicated(), expected)
tm.assert_series_equal(s.drop_duplicates(), original)

last_base = [False] * len(idx)
last_base[3] = True
last_base[5] = True
expected = Series(last_base, index=idx)
expected
expected = Series(last_base, index=idx, name='a')
tm.assert_series_equal(s.duplicated(take_last=True), expected)
tm.assert_series_equal(s.drop_duplicates(take_last=True),
s[~np.array(last_base)])
Expand Down
60 changes: 30 additions & 30 deletions pandas/util/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -817,43 +817,43 @@ def getArangeMat():


# make index
def makeStringIndex(k=10, name=None):
    """Return an Index of k random 10-character strings, optionally named."""
    return Index(rands_array(nchars=10, size=k), name=name)


def makeUnicodeIndex(k=10, name=None):
    """Return an Index of k random 10-character unicode strings, optionally named."""
    # BUG FIX: ``name`` was accepted but never forwarded to Index, so the
    # returned index was always unnamed (inconsistent with makeStringIndex etc.)
    return Index(randu_array(nchars=10, size=k), name=name)

def makeCategoricalIndex(k=10, n=3, name=None):
    """Make a length-k CategoricalIndex drawn from n random categories."""
    x = rands_array(nchars=4, size=n)
    return CategoricalIndex(np.random.choice(x, k), name=name)

def makeBoolIndex(k=10, name=None):
    """Return a boolean Index of length k, optionally named.

    k == 1 -> [True]; k == 2 -> [False, True];
    otherwise [False, True] padded with False to length k.
    """
    if k == 1:
        return Index([True], name=name)
    elif k == 2:
        return Index([False, True], name=name)
    return Index([False, True] + [False] * (k - 2), name=name)

def makeIntIndex(k=10, name=None):
    """Return an integer Index 0..k-1, optionally named."""
    return Index(lrange(k), name=name)

def makeFloatIndex(k=10, name=None):
    """Return a float Index of k sorted random values, optionally named."""
    # shift so values may be negative, then scale by a random power of 10
    values = sorted(np.random.random_sample(k)) - np.random.random_sample(1)
    return Index(values * (10 ** np.random.randint(0, 9)), name=name)

def makeDateIndex(k=10, freq='B', name=None):
    """Return a DatetimeIndex of k periods from 2000-01-01, optionally named."""
    dt = datetime(2000, 1, 1)
    dr = bdate_range(dt, periods=k, freq=freq, name=name)
    return DatetimeIndex(dr, name=name)

def makeTimedeltaIndex(k=10, freq='D'):
return TimedeltaIndex(start='1 day',periods=k,freq=freq)
def makeTimedeltaIndex(k=10, freq='D', name=None):
    """Return a TimedeltaIndex of k periods starting at '1 day', optionally named."""
    return TimedeltaIndex(start='1 day', periods=k, freq=freq, name=name)

def makePeriodIndex(k=10, name=None):
    """Return a PeriodIndex of k business periods from 2000-01-01, optionally named."""
    dt = datetime(2000, 1, 1)
    dr = PeriodIndex(start=dt, periods=k, freq='B', name=name)
    return dr

def all_index_generator(k=10):
Expand Down Expand Up @@ -885,38 +885,38 @@ def all_timeseries_index_generator(k=10):


# make series
def makeFloatSeries(name=None):
    """Return a Series of N random floats over a random string index."""
    index = makeStringIndex(N)
    return Series(randn(N), index=index, name=name)


def makeStringSeries(name=None):
    """Return a Series of N random floats over a random string index."""
    index = makeStringIndex(N)
    return Series(randn(N), index=index, name=name)


def makeObjectSeries(name=None):
    """Return an object-dtype Series of dates over a random string index."""
    dateIndex = makeDateIndex(N)
    # force object dtype so the values stay as scalars rather than datetime64
    dateIndex = Index(dateIndex, dtype=object)
    index = makeStringIndex(N)
    return Series(dateIndex, index=index, name=name)


def getSeriesData():
    """Return a dict mapping each of the K column names to a random Series,
    all sharing one random string index."""
    idx = makeStringIndex(N)
    return dict((col, Series(randn(N), index=idx)) for col in getCols(K))


def makeTimeSeries(nper=None, freq='B', name=None):
    """Return a Series of nper (default N) random values over a date index."""
    if nper is None:
        nper = N
    return Series(randn(nper), index=makeDateIndex(nper, freq=freq), name=name)


def makePeriodSeries(nper=None, name=None):
    """Return a Series of nper (default N) random values over a period index."""
    if nper is None:
        nper = N
    return Series(randn(nper), index=makePeriodIndex(nper), name=name)


def getTimeSeriesData(nper=None, freq='B'):
Expand Down