diff --git a/RELEASE.rst b/RELEASE.rst index 87acddb74df8a..46f7c832ae149 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -163,6 +163,11 @@ pandas 0.11.0 when invalid shapes are passed - Methods return None when inplace=True (GH1893_) + - ``HDFStore`` + + - added the method ``select_column`` to select a single column from a table as a Series. + - deprecated the ``unique`` method; it can be replicated by ``select_column(key,column).unique()`` + **Bug Fixes** - Fix seg fault on empty data frame when fillna with ``pad`` or ``backfill`` diff --git a/doc/source/io.rst b/doc/source/io.rst index 8440f6f566659..25c42780afd65 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1352,16 +1352,17 @@ then the ``nrows`` of the table are considered. Advanced Queries ~~~~~~~~~~~~~~~~ -**Unique** +**Select a Single Column** -To retrieve the *unique* values of an indexable or data column, use the -method ``unique``. This will, for example, enable you to get the index -very quickly. Note ``nan`` are excluded from the result set. +To retrieve a single indexable or data column, use the +method ``select_column``. This will, for example, enable you to get the index +very quickly. The method returns a ``Series`` indexed by the row number. +It does not currently accept the ``where`` selector (coming soon). .. ipython:: python - store.unique('df_dc', 'index') - store.unique('df_dc', 'string') + store.select_column('df_dc', 'index') + store.select_column('df_dc', 'string') **Replicating or** diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt index 4c460849c0588..c6553b909f7a6 100644 --- a/doc/source/v0.11.0.txt +++ b/doc/source/v0.11.0.txt @@ -226,6 +226,10 @@ Astype conversion on ``datetime64[ns]`` to ``object``, implicity converts ``NaT` API changes ~~~~~~~~~~~ + - In ``HDFStore``, added the method ``select_column`` to select a single column from a table as a Series. 
+ + - In ``HDFStore``, deprecated the ``unique`` method; it can be replicated by ``select_column(key,column).unique()`` + Enhancements ~~~~~~~~~~~~ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 1c223f58471f0..0568ee7f7f8bf 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -423,8 +423,13 @@ def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs) def unique(self, key, column, **kwargs): + warnings.warn("unique(key,column) is deprecated\n" + "use select_column(key,column).unique() instead") + return self.get_storer(key).read_column(column = column, **kwargs).unique() + + def select_column(self, key, column, **kwargs): """ - return a single column uniquely from the table. This is generally only useful to select an indexable + return a single column from the table. This is generally only useful to select an indexable Parameters ---------- @@ -2525,7 +2530,7 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs) return Coordinates(self.selection.select_coords(), group=self.group, where=where) - def read_column(self, column, **kwargs): + def read_column(self, column, where = None, **kwargs): """ return a single column from the table, generally only indexables are interesting """ # validate the version @@ -2535,6 +2540,9 @@ def read_column(self, column, **kwargs): if not self.infer_axes(): return False + if where is not None: + raise Exception("read_column does not currently accept a where clause") + # find the axes for a in self.axes: if column == a.name: @@ -2544,7 +2552,7 @@ def read_column(self, column, **kwargs): # column must be an indexable or a data column c = getattr(self.table.cols, column) - return Categorical.from_array(a.convert(c[:], nan_rep=self.nan_rep).take_data()).levels + return Series(a.convert(c[:], 
nan_rep=self.nan_rep).take_data()) raise KeyError("column [%s] not found in the table" % column) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 071bcfbb8b3e9..1973c578cb9e6 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -2068,6 +2068,7 @@ def test_string_select(self): expected = df2[isnull(df2.x)] assert_frame_equal(result,expected) + # int ==/!= df['int'] = 1 df.ix[2:7,'int'] = 2 @@ -2083,42 +2084,44 @@ def test_string_select(self): assert_frame_equal(result,expected) - def test_unique(self): + def test_read_column(self): df = tm.makeTimeDataFrame() - def check(x, y): - self.assert_((np.unique(x) == np.unique(y)).all() == True) - with ensure_clean(self.path) as store: store.remove('df') store.append('df', df) # error - self.assertRaises(KeyError, store.unique, 'df', 'foo') + self.assertRaises(KeyError, store.select_column, 'df', 'foo') + + def f(): + store.select_column('df', 'index', where = ['index>5']) + self.assertRaises(Exception, f) # valid - result = store.unique('df', 'index') - check(result.values, df.index.values) - + result = store.select_column('df', 'index') + tm.assert_almost_equal(result.values, Series(df.index).values) + self.assert_(isinstance(result,Series)) + # not a data indexable column self.assertRaises( - ValueError, store.unique, 'df', 'values_block_0') + ValueError, store.select_column, 'df', 'values_block_0') # a data column df2 = df.copy() df2['string'] = 'foo' store.append('df2', df2, data_columns=['string']) - result = store.unique('df2', 'string') - check(result.values, df2['string'].unique()) + result = store.select_column('df2', 'string') + tm.assert_almost_equal(result.values, df2['string'].values) # a data column with NaNs, result excludes the NaNs df3 = df.copy() df3['string'] = 'foo' df3.ix[4:6, 'string'] = np.nan store.append('df3', df3, data_columns=['string']) - result = store.unique('df3', 'string') - check(result.values, 
df3['string'].valid().unique()) + result = store.select_column('df3', 'string') + tm.assert_almost_equal(result.values, df3['string'].values) def test_coordinates(self): df = tm.makeTimeDataFrame()