Skip to content

BUG: Pivot table drops column/index names when the name is NaN and dropna=False #16142

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 12 additions & 10 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ cdef class {{name}}HashTable(HashTable):
@cython.boundscheck(False)
def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques,
Py_ssize_t count_prior, Py_ssize_t na_sentinel,
bint check_null=True):
bint check_null=True, bint dropna=True):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
Expand All @@ -347,9 +347,10 @@ cdef class {{name}}HashTable(HashTable):
for i in range(n):
val = values[i]

if check_null and {{null_condition}}:
labels[i] = na_sentinel
continue
if dropna:
if check_null and {{null_condition}}:
labels[i] = na_sentinel
continue

k = kh_get_{{dtype}}(self.table, val)

Expand Down Expand Up @@ -642,7 +643,7 @@ cdef class StringHashTable(HashTable):
@cython.boundscheck(False)
def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior, int64_t na_sentinel,
bint check_null=1):
bint check_null=1, bint dropna=True):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
Expand Down Expand Up @@ -815,7 +816,7 @@ cdef class PyObjectHashTable(HashTable):

def get_labels(self, ndarray[object] values, ObjectVector uniques,
Py_ssize_t count_prior, int64_t na_sentinel,
bint check_null=True):
bint check_null=True, bint dropna=True):
cdef:
Py_ssize_t i, n = len(values)
int64_t[:] labels
Expand All @@ -830,9 +831,10 @@ cdef class PyObjectHashTable(HashTable):
val = values[i]
hash(val)

if check_null and val != val or val is None:
labels[i] = na_sentinel
continue
if dropna:
if (check_null and val != val) or val is None:
labels[i] = na_sentinel
continue

k = kh_get_pymap(self.table, <PyObject*>val)
if k != self.table.n_buckets:
Expand Down Expand Up @@ -968,5 +970,5 @@ cdef class MultiIndexHashTable(HashTable):

def get_labels(self, object mi, ObjectVector uniques,
Py_ssize_t count_prior, int64_t na_sentinel,
bint check_null=True):
bint check_null=True, bint dropna=True):
raise NotImplementedError
6 changes: 4 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,8 @@ def sort_mixed(values):
return ordered, _ensure_platform_int(new_labels)


def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None,
dropna=True):
"""
Encode input values as an enumerated type or categorical variable

Expand Down Expand Up @@ -552,7 +553,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
table = hash_klass(size_hint or len(values))
uniques = vec_klass()
check_nulls = not is_integer_dtype(original)
labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)
labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls,
dropna)

labels = _ensure_platform_int(labels)
uniques = uniques.to_array()
Expand Down
21 changes: 10 additions & 11 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,8 @@ class Categorical(PandasObject):
__array_priority__ = 1000
_typ = 'categorical'

def __init__(self, values, categories=None, ordered=False, fastpath=False):
def __init__(self, values, categories=None, ordered=False, fastpath=False,
dropna=True):

self._validate_ordered(ordered)

Expand Down Expand Up @@ -281,9 +282,10 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False):

if categories is None:
try:
codes, categories = factorize(values, sort=True)
codes, categories = factorize(values, sort=True, dropna=dropna)
except TypeError:
codes, categories = factorize(values, sort=False)
codes, categories = factorize(values, sort=False,
dropna=dropna)
if ordered:
# raise, as we don't have a sortable data structure and so
# the user should give us one by specifying categories
Expand Down Expand Up @@ -548,10 +550,6 @@ def _validate_categories(cls, categories, fastpath=False):

if not fastpath:

# Categories cannot contain NaN.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have some unintentional changes in here? This shouldn't be removed.

Copy link
Contributor Author

@OXPHOS OXPHOS Apr 27, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like the Index — Index([None, u'A', u'B'], dtype='object') — needs to be passed to Categorical when building a MultiIndex, since when dropna=False, None can also be an index/column name. Or did I misunderstand this?

if categories.hasnans:
raise ValueError('Categorial categories cannot be null')

# Categories must be unique.
if not categories.is_unique:
raise ValueError('Categorical categories must be unique')
Expand Down Expand Up @@ -2110,7 +2108,7 @@ def _convert_to_list_like(list_like):
return [list_like]


def _factorize_from_iterable(values):
def _factorize_from_iterable(values, dropna=True):
"""
Factorize an input `values` into `categories` and `codes`. Preserves
categorical dtype in `categories`.
Expand Down Expand Up @@ -2141,13 +2139,13 @@ def _factorize_from_iterable(values):
ordered=values.ordered)
codes = values.codes
else:
cat = Categorical(values, ordered=True)
cat = Categorical(values, ordered=True, dropna=dropna)
categories = cat.categories
codes = cat.codes
return codes, categories


def _factorize_from_iterables(iterables):
def _factorize_from_iterables(iterables, dropna=True):
"""
A higher-level wrapper over `_factorize_from_iterable`.

Expand All @@ -2169,4 +2167,5 @@ def _factorize_from_iterables(iterables):
if len(iterables) == 0:
# For consistency, it should return a list of 2 lists.
return [[], []]
return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables]))
return map(list, lzip(*[_factorize_from_iterable(it, dropna)
for it in iterables]))
4 changes: 2 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4217,7 +4217,7 @@ def clip_lower(self, threshold, axis=None):
return self.where(subset, threshold, axis=axis)

def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
group_keys=True, squeeze=False, **kwargs):
group_keys=True, squeeze=False, dropna=True, **kwargs):
"""
Group series using mapper (dict or key function, apply given function
to group, return result as series) or by a series of columns.
Expand Down Expand Up @@ -4273,7 +4273,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
axis = self._get_axis_number(axis)
return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
sort=sort, group_keys=group_keys, squeeze=squeeze,
**kwargs)
dropna=dropna, **kwargs)

def asfreq(self, freq, method=None, how=None, normalize=False,
fill_value=None):
Expand Down
20 changes: 12 additions & 8 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,8 @@ class _GroupBy(PandasObject, SelectionMixin):

def __init__(self, obj, keys=None, axis=0, level=None,
grouper=None, exclusions=None, selection=None, as_index=True,
sort=True, group_keys=True, squeeze=False, **kwargs):
sort=True, group_keys=True, squeeze=False, dropna=True,
**kwargs):

self._selection = selection

Expand All @@ -388,7 +389,8 @@ def __init__(self, obj, keys=None, axis=0, level=None,
axis=axis,
level=level,
sort=sort,
mutated=self.mutated)
mutated=self.mutated,
dropna=dropna)

self.obj = obj
self.axis = obj._get_axis_number(axis)
Expand Down Expand Up @@ -1614,15 +1616,15 @@ def tail(self, n=5):


@Appender(GroupBy.__doc__)
def groupby(obj, by, **kwds):
def groupby(obj, by, dropna=True, **kwds):
if isinstance(obj, Series):
klass = SeriesGroupBy
elif isinstance(obj, DataFrame):
klass = DataFrameGroupBy
else: # pragma: no cover
raise TypeError('invalid type: %s' % type(obj))

return klass(obj, by, **kwds)
return klass(obj, by, dropna=dropna, **kwds)


def _get_axes(group):
Expand Down Expand Up @@ -2339,7 +2341,7 @@ class Grouping(object):
"""

def __init__(self, index, grouper=None, obj=None, name=None, level=None,
sort=True, in_axis=False):
sort=True, in_axis=False, dropna=True):

self.name = name
self.level = level
Expand All @@ -2348,6 +2350,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
self.sort = sort
self.obj = obj
self.in_axis = in_axis
self.dropna = dropna

# right place for this?
if isinstance(grouper, (Series, Index)) and name is None:
Expand Down Expand Up @@ -2468,7 +2471,7 @@ def group_index(self):
def _make_labels(self):
if self._labels is None or self._group_index is None:
labels, uniques = algorithms.factorize(
self.grouper, sort=self.sort)
self.grouper, sort=self.sort, dropna=self.dropna)
uniques = Index(uniques, name=self.name)
self._labels = labels
self._group_index = uniques
Expand All @@ -2480,7 +2483,7 @@ def groups(self):


def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
mutated=False):
mutated=False, dropna=True):
"""
create and return a BaseGrouper, which is an internal
mapping of how to create the grouper indexers.
Expand Down Expand Up @@ -2633,7 +2636,8 @@ def is_in_obj(gpr):
name=name,
level=level,
sort=sort,
in_axis=in_axis) \
in_axis=in_axis,
dropna=dropna) \
if not isinstance(gpr, Grouping) else gpr

groupings.append(ping)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1043,7 +1043,7 @@ def lexsort_depth(self):
return 0

@classmethod
def from_arrays(cls, arrays, sortorder=None, names=None):
def from_arrays(cls, arrays, sortorder=None, names=None, dropna=False):
"""
Convert arrays to MultiIndex

Expand Down Expand Up @@ -1083,7 +1083,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None):

from pandas.core.categorical import _factorize_from_iterables

labels, levels = _factorize_from_iterables(arrays)
labels, levels = _factorize_from_iterables(arrays, dropna=dropna)
if names is None:
names = [getattr(arr, "name", None) for arr in arrays]

Expand Down
24 changes: 13 additions & 11 deletions pandas/core/reshape/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
pass
values = list(values)

grouped = data.groupby(keys)
grouped = data.groupby(keys, dropna=dropna)
agged = grouped.agg(aggfunc)

table = agged
Expand All @@ -159,15 +159,15 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
if isinstance(table, DataFrame):
table = table.sort_index(axis=1)

if fill_value is not None:
table = table.fillna(value=fill_value, downcast='infer')

if margins:
if dropna:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I remove this `if dropna` check, most of the tests pass (including a fix for this one), except for the following:

        df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan],
                           'b': [3, 3, 4, 4, 4, 4]})
        actual = pd.crosstab(df.a, df.b, margins=True, dropna=False)
        expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]])
        expected.index = Index([1.0, 2.0, 'All'], name='a')
        expected.columns = Index([3, 4, 'All'], name='b')

Here's the result and expected

(Pdb) pp actual
b    3  4  All
a
1.0  1  0    1
2.0  1  3    4
All  2  3    5
(Pdb) pp expected
b    3  4  All
a
1.0  1  0    1
2.0  1  3    4
All  2  4    6

You have more experience with this section of the code than I do, but the margins in the expected output look incorrect to me.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're definitely right. I think it should be (if dropna=False):

b 3 4 All
a
1.0 1 0 1
2.0 1 3 4
np.nan 0 1 1
All 2 4 6
  • I need to fix the handling of np.nan, as it is still being ignored even with the current fix when dropna=False (i.e. the fix currently only works for None)
  • Actually I didn't get why removing dropna here would help yet. I'll check closer.

data = data[data.notnull().all(axis=1)]
table = _add_margins(table, data, values, rows=index,
cols=columns, aggfunc=aggfunc,
margins_name=margins_name)
margins_name=margins_name, dropna=dropna)

if fill_value is not None:
table = table.fillna(value=fill_value, downcast='infer')

# discard the top level
if values_passed and not values_multi and not table.empty and \
Expand All @@ -188,7 +188,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',


def _add_margins(table, data, values, rows, cols, aggfunc,
margins_name='All'):
margins_name='All', dropna=True):
if not isinstance(margins_name, compat.string_types):
raise ValueError('margins_name argument must be a string')

Expand Down Expand Up @@ -219,7 +219,8 @@ def _add_margins(table, data, values, rows, cols, aggfunc,
marginal_result_set = _generate_marginal_results(table, data, values,
rows, cols, aggfunc,
grand_margin,
margins_name)
margins_name,
dropna=dropna)
if not isinstance(marginal_result_set, tuple):
return marginal_result_set
result, margin_keys, row_margin = marginal_result_set
Expand Down Expand Up @@ -277,8 +278,7 @@ def _compute_grand_margin(data, values, aggfunc,


def _generate_marginal_results(table, data, values, rows, cols, aggfunc,
grand_margin,
margins_name='All'):
grand_margin, margins_name='All', dropna=True):
if len(cols) > 0:
# need to "interleave" the margins
table_pieces = []
Expand All @@ -288,7 +288,8 @@ def _all_key(key):
return (key, margins_name) + ('',) * (len(cols) - 1)

if len(rows) > 0:
margin = data[rows + values].groupby(rows).agg(aggfunc)
margin = data[rows +
values].groupby(rows, dropna=dropna).agg(aggfunc)
cat_axis = 1

for key, piece in table.groupby(level=0, axis=cat_axis):
Expand Down Expand Up @@ -325,7 +326,8 @@ def _all_key(key):
margin_keys = table.columns

if len(cols) > 0:
row_margin = data[cols + values].groupby(cols).agg(aggfunc)
row_margin = data[cols +
values].groupby(cols, dropna=dropna).agg(aggfunc)
row_margin = row_margin.stack()

# slight hack
Expand Down
Loading