diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
index 3ce82dace40a9..e545c66e90c66 100644
--- a/pandas/_libs/hashtable_class_helper.pxi.in
+++ b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -330,7 +330,7 @@ cdef class {{name}}HashTable(HashTable):
     @cython.boundscheck(False)
     def get_labels(self, {{dtype}}_t[:] values, {{name}}Vector uniques,
                    Py_ssize_t count_prior, Py_ssize_t na_sentinel,
-                   bint check_null=True):
+                   bint check_null=True, bint dropna=True):
         cdef:
             Py_ssize_t i, n = len(values)
             int64_t[:] labels
@@ -347,9 +347,10 @@ cdef class {{name}}HashTable(HashTable):
             for i in range(n):
                 val = values[i]

-                if check_null and {{null_condition}}:
-                    labels[i] = na_sentinel
-                    continue
+                if dropna:
+                    if check_null and {{null_condition}}:
+                        labels[i] = na_sentinel
+                        continue

                 k = kh_get_{{dtype}}(self.table, val)
@@ -642,7 +643,7 @@ cdef class StringHashTable(HashTable):
     @cython.boundscheck(False)
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior, int64_t na_sentinel,
-                   bint check_null=1):
+                   bint check_null=1, bint dropna=True):
         cdef:
             Py_ssize_t i, n = len(values)
             int64_t[:] labels
@@ -815,7 +816,7 @@ cdef class PyObjectHashTable(HashTable):

     def get_labels(self, ndarray[object] values, ObjectVector uniques,
                    Py_ssize_t count_prior, int64_t na_sentinel,
-                   bint check_null=True):
+                   bint check_null=True, bint dropna=True):
         cdef:
             Py_ssize_t i, n = len(values)
             int64_t[:] labels
@@ -830,9 +831,10 @@ cdef class PyObjectHashTable(HashTable):
             val = values[i]
             hash(val)

-            if check_null and val != val or val is None:
-                labels[i] = na_sentinel
-                continue
+            if dropna:
+                if (check_null and val != val) or val is None:
+                    labels[i] = na_sentinel
+                    continue

             k = kh_get_pymap(self.table, val)
             if k != self.table.n_buckets:
@@ -968,5 +970,5 @@ cdef class MultiIndexHashTable(HashTable):

     def get_labels(self, object mi, ObjectVector uniques,
                    Py_ssize_t count_prior, int64_t na_sentinel,
-                   bint check_null=True):
+                   bint check_null=True, bint dropna=True):
         raise NotImplementedError
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index a745ec616eda8..89462f9c9df5a 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -519,7 +519,8 @@ def sort_mixed(values):
     return ordered, _ensure_platform_int(new_labels)


-def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
+def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None,
+              dropna=True):
     """
     Encode input values as an enumerated type or categorical variable

@@ -552,7 +553,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
     table = hash_klass(size_hint or len(values))
     uniques = vec_klass()
     check_nulls = not is_integer_dtype(original)
-    labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls)
+    labels = table.get_labels(values, uniques, 0, na_sentinel, check_nulls,
+                              dropna)

     labels = _ensure_platform_int(labels)
     uniques = uniques.to_array()
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index a3667e9322959..27565661afceb 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -234,7 +234,8 @@ class Categorical(PandasObject):
     __array_priority__ = 1000
     _typ = 'categorical'

-    def __init__(self, values, categories=None, ordered=False, fastpath=False):
+    def __init__(self, values, categories=None, ordered=False, fastpath=False,
+                 dropna=True):

         self._validate_ordered(ordered)

@@ -281,9 +282,10 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False):
         if categories is None:
             try:
-                codes, categories = factorize(values, sort=True)
+                codes, categories = factorize(values, sort=True, dropna=dropna)
             except TypeError:
-                codes, categories = factorize(values, sort=False)
+                codes, categories = factorize(values, sort=False,
+                                              dropna=dropna)
                 if ordered:
                     # raise, as we don't have a sortable data structure and so
                     # the user should give us one by specifying categories
@@ -548,10 +550,6 @@ def _validate_categories(cls, categories, fastpath=False):

         if not fastpath:

-            # Categories cannot contain NaN.
-            if categories.hasnans:
-                raise ValueError('Categorial categories cannot be null')
-
             # Categories must be unique.
             if not categories.is_unique:
                 raise ValueError('Categorical categories must be unique')
@@ -2110,7 +2108,7 @@ def _convert_to_list_like(list_like):
     return [list_like]


-def _factorize_from_iterable(values):
+def _factorize_from_iterable(values, dropna=True):
     """
     Factorize an input `values` into `categories` and `codes`. Preserves
     categorical dtype in `categories`.
@@ -2141,13 +2139,13 @@ def _factorize_from_iterable(values):
                                       ordered=values.ordered)
         codes = values.codes
     else:
-        cat = Categorical(values, ordered=True)
+        cat = Categorical(values, ordered=True, dropna=dropna)
         categories = cat.categories
         codes = cat.codes
     return codes, categories


-def _factorize_from_iterables(iterables):
+def _factorize_from_iterables(iterables, dropna=True):
     """
     A higher-level wrapper over `_factorize_from_iterable`.

@@ -2169,4 +2167,5 @@ def _factorize_from_iterables(iterables):
     if len(iterables) == 0:
         # For consistency, it should return a list of 2 lists.
         return [[], []]
-    return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables]))
+    return map(list, lzip(*[_factorize_from_iterable(it, dropna)
+                            for it in iterables]))
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 70862015dff5b..50937c480fb4a 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -4217,7 +4217,7 @@ def clip_lower(self, threshold, axis=None):
         return self.where(subset, threshold, axis=axis)

     def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
-                group_keys=True, squeeze=False, **kwargs):
+                group_keys=True, squeeze=False, dropna=True, **kwargs):
         """
         Group series using mapper (dict or key function, apply given function
         to group, return result as series) or by a series of columns.
@@ -4273,7 +4273,7 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
         axis = self._get_axis_number(axis)
         return groupby(self, by=by, axis=axis, level=level, as_index=as_index,
                        sort=sort, group_keys=group_keys, squeeze=squeeze,
-                       **kwargs)
+                       dropna=dropna, **kwargs)

     def asfreq(self, freq, method=None, how=None, normalize=False,
                fill_value=None):
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 1f715c685c27e..85ff19d424871 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -361,7 +361,8 @@ class _GroupBy(PandasObject, SelectionMixin):

     def __init__(self, obj, keys=None, axis=0, level=None,
                  grouper=None, exclusions=None, selection=None, as_index=True,
-                 sort=True, group_keys=True, squeeze=False, **kwargs):
+                 sort=True, group_keys=True, squeeze=False, dropna=True,
+                 **kwargs):

         self._selection = selection

@@ -388,7 +389,8 @@ def __init__(self, obj, keys=None, axis=0, level=None,
                                                     axis=axis,
                                                     level=level,
                                                     sort=sort,
-                                                    mutated=self.mutated)
+                                                    mutated=self.mutated,
+                                                    dropna=dropna)

         self.obj = obj
         self.axis = obj._get_axis_number(axis)
@@ -1614,7 +1616,7 @@ def tail(self, n=5):


 @Appender(GroupBy.__doc__)
-def groupby(obj, by, **kwds):
+def groupby(obj, by, dropna=True, **kwds):
     if isinstance(obj, Series):
         klass = SeriesGroupBy
     elif isinstance(obj, DataFrame):
@@ -1622,7 +1624,7 @@ def groupby(obj, by, **kwds):
     else:  # pragma: no cover
         raise TypeError('invalid type: %s' % type(obj))

-    return klass(obj, by, **kwds)
+    return klass(obj, by, dropna=dropna, **kwds)


 def _get_axes(group):
@@ -2339,7 +2341,7 @@ class Grouping(object):
     """

     def __init__(self, index, grouper=None, obj=None, name=None, level=None,
-                 sort=True, in_axis=False):
+                 sort=True, in_axis=False, dropna=True):

         self.name = name
         self.level = level
@@ -2348,6 +2350,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
         self.sort = sort
         self.obj = obj
         self.in_axis = in_axis
+        self.dropna = dropna

         # right place for this?
         if isinstance(grouper, (Series, Index)) and name is None:
@@ -2468,7 +2471,7 @@ def group_index(self):
     def _make_labels(self):
         if self._labels is None or self._group_index is None:
             labels, uniques = algorithms.factorize(
-                self.grouper, sort=self.sort)
+                self.grouper, sort=self.sort, dropna=self.dropna)
             uniques = Index(uniques, name=self.name)
             self._labels = labels
             self._group_index = uniques
@@ -2480,7 +2483,7 @@ def groups(self):


 def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
-                 mutated=False):
+                 mutated=False, dropna=True):
     """
     create and return a BaseGrouper, which is an internal
     mapping of how to create the grouper indexers.
@@ -2633,7 +2636,8 @@ def is_in_obj(gpr):
                         name=name,
                         level=level,
                         sort=sort,
-                        in_axis=in_axis) \
+                        in_axis=in_axis,
+                        dropna=dropna) \
             if not isinstance(gpr, Grouping) else gpr

         groupings.append(ping)
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index c760d2943b823..ac4021b3de606 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -1043,7 +1043,7 @@ def lexsort_depth(self):
         return 0

     @classmethod
-    def from_arrays(cls, arrays, sortorder=None, names=None):
+    def from_arrays(cls, arrays, sortorder=None, names=None, dropna=False):
         """
         Convert arrays to MultiIndex

@@ -1083,7 +1083,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
         from pandas.core.categorical import _factorize_from_iterables

-        labels, levels = _factorize_from_iterables(arrays)
+        labels, levels = _factorize_from_iterables(arrays, dropna=dropna)
         if names is None:
             names = [getattr(arr, "name", None) for arr in arrays]

diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index 74dbbfc00cb11..c390f1d91624f 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -132,7 +132,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
             pass
         values = list(values)

-    grouped = data.groupby(keys)
+    grouped = data.groupby(keys, dropna=dropna)
     agged = grouped.agg(aggfunc)

     table = agged
@@ -159,15 +159,15 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
     if isinstance(table, DataFrame):
         table = table.sort_index(axis=1)

-    if fill_value is not None:
-        table = table.fillna(value=fill_value, downcast='infer')
-
     if margins:
         if dropna:
             data = data[data.notnull().all(axis=1)]
         table = _add_margins(table, data, values, rows=index,
                              cols=columns, aggfunc=aggfunc,
-                             margins_name=margins_name)
+                             margins_name=margins_name, dropna=dropna)
+
+    if fill_value is not None:
+        table = table.fillna(value=fill_value, downcast='infer')

     # discard the top level
     if values_passed and not values_multi and not table.empty and \
@@ -188,7 +188,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',


 def _add_margins(table, data, values, rows, cols, aggfunc,
-                 margins_name='All'):
+                 margins_name='All', dropna=True):
     if not isinstance(margins_name, compat.string_types):
         raise ValueError('margins_name argument must be a string')

@@ -219,7 +219,8 @@ def _add_margins(table, data, values, rows, cols, aggfunc,
         marginal_result_set = _generate_marginal_results(table, data, values,
                                                          rows, cols, aggfunc,
                                                          grand_margin,
-                                                         margins_name)
+                                                         margins_name,
+                                                         dropna=dropna)
         if not isinstance(marginal_result_set, tuple):
             return marginal_result_set
         result, margin_keys, row_margin = marginal_result_set
@@ -277,8 +278,7 @@ def _compute_grand_margin(data, values, aggfunc,


 def _generate_marginal_results(table, data, values, rows, cols, aggfunc,
-                               grand_margin,
-                               margins_name='All'):
+                               grand_margin, margins_name='All', dropna=True):
     if len(cols) > 0:
         # need to "interleave" the margins
         table_pieces = []
@@ -288,7 +288,8 @@ def _all_key(key):
             return (key, margins_name) + ('',) * (len(cols) - 1)

         if len(rows) > 0:
-            margin = data[rows + values].groupby(rows).agg(aggfunc)
+            margin = data[rows +
+                          values].groupby(rows, dropna=dropna).agg(aggfunc)
             cat_axis = 1

             for key, piece in table.groupby(level=0, axis=cat_axis):
@@ -325,7 +326,8 @@ def _all_key(key):
         margin_keys = table.columns

     if len(cols) > 0:
-        row_margin = data[cols + values].groupby(cols).agg(aggfunc)
+        row_margin = data[cols +
+                          values].groupby(cols, dropna=dropna).agg(aggfunc)
         row_margin = row_margin.stack()

         # slight hack
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index df679966e0002..f8de40727e99f 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -90,6 +90,39 @@ def test_pivot_table_dropna(self):
         tm.assert_index_equal(pv_col.columns, m)
         tm.assert_index_equal(pv_ind.index, m)

+    def test_pivot_table_dropna_margins(self):
+        # GH 14072
+        df = DataFrame([
+            [1, 'a', 'A'],
+            [1, 'b', 'B'],
+            [1, 'c', None]],
+            columns=['x', 'y', 'z'])
+
+        result_false = df.pivot_table(values='x', index='y', columns='z',
+                                      aggfunc='sum', fill_value=0,
+                                      margins=True, dropna=False)
+        expected_index = Series(['a', 'b', 'c', 'All'], name='y')
+        expected_columns = Series([None, 'A', 'B', 'All'], name='z')
+        expected_false = DataFrame([[0, 1, 0, 1],
+                                    [0, 0, 1, 1],
+                                    [1, 0, 0, 1],
+                                    [1, 1, 1, 3]],
+                                   index=expected_index,
+                                   columns=expected_columns)
+        tm.assert_frame_equal(expected_false, result_false)
+
+        result_true = df.pivot_table(values='x', index='y', columns='z',
+                                     aggfunc='sum', fill_value=0,
+                                     margins=True, dropna=True)
+        expected_index = Series(['a', 'b', 'All'], name='y')
+        expected_columns = Series(['A', 'B', 'All'], name='z')
+        expected_true = DataFrame([[1, 0, 1],
+                                   [0, 1, 1],
+                                   [1, 1, 2]],
+                                  index=expected_index,
+                                  columns=expected_columns)
+        tm.assert_frame_equal(expected_true, result_true)
+
     def test_pivot_table_dropna_categoricals(self):
         # GH 15193
         categories = ['a', 'b', 'c', 'd']
@@ -1147,17 +1180,18 @@ def test_margin_dropna(self):
         df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan],
                            'b': [3, 3, 4, 4, 4, 4]})
         actual = pd.crosstab(df.a, df.b, margins=True, dropna=False)
-        expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]])
-        expected.index = Index([1.0, 2.0, 'All'], name='a')
+        expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [0, 1, 1], [2, 4, 6]])
+        expected.index = Index([1.0, 2.0, np.nan, 'All'], name='a')
         expected.columns = Index([3, 4, 'All'], name='b')
         tm.assert_frame_equal(actual, expected)

         df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, np.nan],
                         'b': [3, np.nan, 4, 4, 4, 4]})
         actual = pd.crosstab(df.a, df.b, margins=True, dropna=False)
-        expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]])
-        expected.index = Index([1.0, 2.0, 'All'], name='a')
-        expected.columns = Index([3.0, 4.0, 'All'], name='b')
+        expected = pd.DataFrame([[1, 0, 0, 1], [0, 1, 0, 1], [0, 3, 1, 4],
+                                 [1, 4, 0, 6]])
+        expected.index = Index([1.0, 2.0, np.nan, 'All'], name='a')
+        expected.columns = Index([3.0, 4.0, np.nan, 'All'], name='b')
         tm.assert_frame_equal(actual, expected)

         a = np.array(['foo', 'foo', 'foo', 'bar',
@@ -1169,21 +1203,25 @@ def test_margin_dropna(self):
         actual = pd.crosstab(a, [b, c], rownames=['a'],
                              colnames=['b', 'c'], margins=True, dropna=False)
-        m = MultiIndex.from_arrays([['one', 'one', 'two', 'two', 'All'],
-                                    ['dull', 'shiny', 'dull', 'shiny', '']],
+        m = MultiIndex.from_arrays([[np.nan, np.nan, 'one', 'one', 'two',
+                                     'two', 'All'],
+                                    ['dull', 'shiny', 'dull', 'shiny', 'dull',
+                                     'shiny', '']],
                                    names=['b', 'c'])
-        expected = DataFrame([[1, 0, 1, 0, 2], [2, 0, 1, 1, 5],
-                              [3, 0, 2, 1, 7]], columns=m)
+        expected = DataFrame([[0, 0, 1, 0, 1, 0, 2], [0, 1, 2, 0, 1, 1, 5],
+                              [0, 1, 3, 0, 2, 1, 7]], columns=m)
         expected.index = Index(['bar', 'foo', 'All'], name='a')
         tm.assert_frame_equal(actual, expected)

         actual = pd.crosstab([a, b], c, rownames=['a', 'b'],
                              colnames=['c'], margins=True, dropna=False)
-        m = MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo', 'All'],
-                                    ['one', 'two', 'one', 'two', '']],
+        m = MultiIndex.from_arrays([['bar', 'bar', 'bar', 'foo', 'foo',
+                                     'foo', 'All'],
+                                    [np.nan, 'one', 'two', np.nan, 'one',
+                                     'two', '']],
                                    names=['a', 'b'])
-        expected = DataFrame([[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2],
-                              [5, 2, 7]], index=m)
+        expected = DataFrame([[0, 0, 0], [1, 0, 1], [1, 0, 1], [0, 1, 1],
+                              [2, 0, 2], [1, 1, 2], [5, 2, 7]], index=m)
         expected.columns = Index(['dull', 'shiny', 'All'], name='c')
         tm.assert_frame_equal(actual, expected)
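
Usage note (not part of the patch): the sketch below illustrates the behaviour the new dropna keyword is meant to enable, reusing the frame and expected totals from test_pivot_table_dropna_margins above. It assumes a pandas build with this patch applied; without it, groupby() has no dropna argument and the null 'z' group is silently dropped from the margins (GH 14072).

import pandas as pd

# Same frame as test_pivot_table_dropna_margins: the last row has a null
# in the 'z' column, which would otherwise be dropped from the grouping.
df = pd.DataFrame([[1, 'a', 'A'],
                   [1, 'b', 'B'],
                   [1, 'c', None]],
                  columns=['x', 'y', 'z'])

# With dropna=False the null 'z' value is kept as its own column, so every
# row contributes to the 'All' margins and the grand total is 3.
print(df.pivot_table(values='x', index='y', columns='z', aggfunc='sum',
                     fill_value=0, margins=True, dropna=False))

# With dropna=True (the default) the row whose key is null is excluded
# before aggregating, so the grand total is only 2.
print(df.pivot_table(values='x', index='y', columns='z', aggfunc='sum',
                     fill_value=0, margins=True, dropna=True))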