From a32f3c5f6675d1fae78ba0e526392584bd280c33 Mon Sep 17 00:00:00 2001 From: Phillip Cloud Date: Mon, 16 Jun 2014 22:12:54 -0400 Subject: [PATCH] BUG/WIP: fix pivot with nan indexes --- pandas/core/reshape.py | 14 ++++++++------ pandas/tools/tests/test_pivot.py | 27 +++++++++++++++++++-------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index e1712be7b5a5f..d5b80f239d5db 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -107,9 +107,6 @@ def _make_sorted_values_labels(self): comp_index, obs_ids = get_compressed_ids(to_sort, sizes) - # group_index = get_group_index(to_sort, sizes) - # comp_index, obs_ids = _compress_group_index(group_index) - ngroups = len(obs_ids) indexer = algos.groupsort_indexer(comp_index, ngroups)[0] @@ -132,11 +129,14 @@ def _make_selectors(self): stride = self.index.levshape[self.level] self.full_shape = ngroups, stride - selector = self.sorted_labels[-1] + stride * comp_index + idx = comp_index != -1 + selector = self.sorted_labels[-1][idx] + stride * comp_index[idx] mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) - if mask.sum() < len(self.index): + min_nans = min(np.sum(lab.values() != -1) + for lab in self.sorted_labels) + if mask.sum() != min_nans: raise ValueError('Index contains duplicate entries, ' 'cannot reshape') @@ -197,12 +197,14 @@ def get_new_values(self): new_mask = np.zeros(result_shape, dtype=bool) + gi = self.group_index != -1 + # is there a simpler / faster way of doing this? for i in range(values.shape[1]): chunk = new_values[:, i * width: (i + 1) * width] mask_chunk = new_mask[:, i * width: (i + 1) * width] - chunk.flat[self.mask] = self.sorted_values[:, i] + chunk.flat[self.mask] = self.sorted_values[gi, i] mask_chunk.flat[self.mask] = True return new_values, new_mask diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index a16df00351d76..684faca508f64 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -1,6 +1,7 @@ import datetime import numpy as np +from numpy import nan from numpy.testing import assert_equal import pandas as pd @@ -103,7 +104,6 @@ def test_pivot_table_dropna(self): assert_equal(pv_col.columns.values, m.values) assert_equal(pv_ind.index.values, m.values) - def test_pass_array(self): result = self.data.pivot_table('D', index=self.data.A, columns=self.data.C) expected = self.data.pivot_table('D', index='A', columns='C') @@ -172,15 +172,25 @@ def test_pivot_multi_functions(self): def test_pivot_index_with_nan(self): # GH 3588 - nan = np.nan - df = DataFrame({"a":['R1', 'R2', nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, nan , 20]}) - result = df.pivot('a','b','c') - expected = DataFrame([[nan,nan,nan,nan],[nan,10,nan,nan], - [nan,nan,nan,nan],[nan,nan,15,20]], - index = Index(['R1','R2',nan,'R4'],name='a'), - columns = Index(['C1','C2','C3','C4'],name='b')) + df = DataFrame({'a': ['R1', 'R2', nan, 'R4'], + 'b': ['C1', 'C2', 'C3', 'C4'], + 'c': [10, 15, nan, 20]}) + result = df.pivot('a', 'b', 'c') + expected = DataFrame([[10, nan, nan, nan], + [nan, 15, nan, nan], + [nan, nan, nan, nan], + [nan, nan, nan, 20]], + index=Index(df.a.values, name='a'), + columns=Index(df.b.values, name='b')) tm.assert_frame_equal(result, expected) + def test_pivot_dups(self): + df = DataFrame({'a': ['R1', 'R2', 'R2', 'R4'], + 'b': ['C1', 'C2', 'C2', 'C4'], + 'c': [10, 15, 17, 20]}) + with tm.assertRaisesRegexp(ValueError, "Index contains duplicate .+"): + df.pivot('a', 'b', 'c') + def test_pivot_with_tz(self): # GH 5878 df = DataFrame({'dt1': [datetime.datetime(2013, 1, 1, 9, 0), @@ -638,6 +648,7 @@ def test_crosstab_dropna(self): ('two', 'dull'), ('two', 'shiny')]) assert_equal(res.columns.values, m.values) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],