Skip to content

BUG/WIP: fix pivot with nan indexes #7481

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,6 @@ def _make_sorted_values_labels(self):

comp_index, obs_ids = get_compressed_ids(to_sort, sizes)

# group_index = get_group_index(to_sort, sizes)
# comp_index, obs_ids = _compress_group_index(group_index)

ngroups = len(obs_ids)

indexer = algos.groupsort_indexer(comp_index, ngroups)[0]
Expand All @@ -132,11 +129,14 @@ def _make_selectors(self):
stride = self.index.levshape[self.level]
self.full_shape = ngroups, stride

selector = self.sorted_labels[-1] + stride * comp_index
idx = comp_index != -1
selector = self.sorted_labels[-1][idx] + stride * comp_index[idx]
mask = np.zeros(np.prod(self.full_shape), dtype=bool)
mask.put(selector, True)

if mask.sum() < len(self.index):
min_nans = min(np.sum(lab.values() != -1)
for lab in self.sorted_labels)
if mask.sum() != min_nans:
raise ValueError('Index contains duplicate entries, '
'cannot reshape')

Expand Down Expand Up @@ -197,12 +197,14 @@ def get_new_values(self):

new_mask = np.zeros(result_shape, dtype=bool)

gi = self.group_index != -1

# is there a simpler / faster way of doing this?
for i in range(values.shape[1]):
chunk = new_values[:, i * width: (i + 1) * width]
mask_chunk = new_mask[:, i * width: (i + 1) * width]

chunk.flat[self.mask] = self.sorted_values[:, i]
chunk.flat[self.mask] = self.sorted_values[gi, i]
mask_chunk.flat[self.mask] = True

return new_values, new_mask
Expand Down
27 changes: 19 additions & 8 deletions pandas/tools/tests/test_pivot.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import datetime

import numpy as np
from numpy import nan
from numpy.testing import assert_equal

import pandas as pd
Expand Down Expand Up @@ -103,7 +104,6 @@ def test_pivot_table_dropna(self):
assert_equal(pv_col.columns.values, m.values)
assert_equal(pv_ind.index.values, m.values)


def test_pass_array(self):
result = self.data.pivot_table('D', index=self.data.A, columns=self.data.C)
expected = self.data.pivot_table('D', index='A', columns='C')
Expand Down Expand Up @@ -172,15 +172,25 @@ def test_pivot_multi_functions(self):

def test_pivot_index_with_nan(self):
# GH 3588
nan = np.nan
df = DataFrame({"a":['R1', 'R2', nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, nan , 20]})
result = df.pivot('a','b','c')
expected = DataFrame([[nan,nan,nan,nan],[nan,10,nan,nan],
[nan,nan,nan,nan],[nan,nan,15,20]],
index = Index(['R1','R2',nan,'R4'],name='a'),
columns = Index(['C1','C2','C3','C4'],name='b'))
df = DataFrame({'a': ['R1', 'R2', nan, 'R4'],
'b': ['C1', 'C2', 'C3', 'C4'],
'c': [10, 15, nan, 20]})
result = df.pivot('a', 'b', 'c')
expected = DataFrame([[10, nan, nan, nan],
[nan, 15, nan, nan],
[nan, nan, nan, nan],
[nan, nan, nan, 20]],
index=Index(df.a.values, name='a'),
columns=Index(df.b.values, name='b'))
tm.assert_frame_equal(result, expected)

def test_pivot_dups(self):
    # The second and third rows share the same ('R2', 'C2') pair for the
    # index/columns arguments, so pivot is expected to reject the frame
    # with a "duplicate entries" ValueError.
    frame = DataFrame({'a': ['R1', 'R2', 'R2', 'R4'],
                       'b': ['C1', 'C2', 'C2', 'C4'],
                       'c': [10, 15, 17, 20]})
    expected_message = "Index contains duplicate .+"
    with tm.assertRaisesRegexp(ValueError, expected_message):
        frame.pivot('a', 'b', 'c')

def test_pivot_with_tz(self):
# GH 5878
df = DataFrame({'dt1': [datetime.datetime(2013, 1, 1, 9, 0),
Expand Down Expand Up @@ -638,6 +648,7 @@ def test_crosstab_dropna(self):
('two', 'dull'), ('two', 'shiny')])
assert_equal(res.columns.values, m.values)


if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down