Merge pull request #3256 from jreback/hdf_column

jreback · jreback · commit 3e4665eb9d1a · 2013-04-04T05:11:15.000-07:00
ENH: In HDFStore, add select_column method, deprecate unique method
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -163,6 +163,11 @@ pandas 0.11.0
     when invalid shapes are passed
   - Methods return None when inplace=True (GH1893_)
 
+  - ``HDFStore``
+
+     - added the method ``select_column`` to select a single column from a table as a Series.
+     - deprecated the ``unique`` method; it can be replicated by ``select_column(key,column).unique()``
+
 **Bug Fixes**
 
   - Fix seg fault on empty data frame when fillna with ``pad`` or ``backfill``
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -1352,16 +1352,17 @@ then the ``nrows`` of the table are considered.
 Advanced Queries
 ~~~~~~~~~~~~~~~~
 
-**Unique**
+**Select a Single Column**
 
-To retrieve the *unique* values of an indexable or data column, use the
-method ``unique``. This will, for example, enable you to get the index
-very quickly. Note ``nan`` are excluded from the result set.
+To retrieve a single indexable or data column, use the
+method ``select_column``. This will, for example, enable you to get the index
+very quickly. It returns a ``Series`` of the result, indexed by the row number.
+It does not currently accept the ``where`` selector (coming soon).
 
 .. ipython:: python
 
-   store.unique('df_dc', 'index')
-   store.unique('df_dc', 'string')
+   store.select_column('df_dc', 'index')
+   store.select_column('df_dc', 'string')
 
 **Replicating or**
 
diff --git a/doc/source/v0.11.0.txt b/doc/source/v0.11.0.txt
@@ -226,6 +226,10 @@ Astype conversion on ``datetime64[ns]`` to ``object``, implicity converts ``NaT`
 API changes
 ~~~~~~~~~~~
 
+  - In ``HDFStore``, added the method ``select_column`` to select a single column from a table as a Series.
+
+  - In ``HDFStore``, deprecated the ``unique`` method; it can be replicated by ``select_column(key,column).unique()``
+
 Enhancements
 ~~~~~~~~~~~~
 
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -423,8 +423,13 @@ def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs
         return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs)
 
     def unique(self, key, column, **kwargs):
+        warnings.warn("unique(key,column) is deprecated\n"
+                      "use select_column(key,column).unique() instead")
+        return self.get_storer(key).read_column(column = column, **kwargs).unique()
+
+    def select_column(self, key, column, **kwargs):
         """
-        return a single column uniquely from the table. This is generally only useful to select an indexable
+        return a single column from the table. This is generally only useful to select an indexable
 
         Parameters
         ----------
@@ -2525,7 +2530,7 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs):
         self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs)
         return Coordinates(self.selection.select_coords(), group=self.group, where=where)
 
-    def read_column(self, column, **kwargs):
+    def read_column(self, column, where = None, **kwargs):
         """ return a single column from the table, generally only indexables are interesting """
 
         # validate the version
@@ -2535,6 +2540,9 @@ def read_column(self, column, **kwargs):
         if not self.infer_axes():
             return False
 
+        if where is not None:
+            raise Exception("read_column does not currently accept a where clause")
+
         # find the axes
         for a in self.axes:
             if column == a.name:
@@ -2544,7 +2552,7 @@ def read_column(self, column, **kwargs):
 
                 # column must be an indexable or a data column
                 c = getattr(self.table.cols, column)
-                return Categorical.from_array(a.convert(c[:], nan_rep=self.nan_rep).take_data()).levels
+                return Series(a.convert(c[:], nan_rep=self.nan_rep).take_data())
 
         raise KeyError("column [%s] not found in the table" % column)
 
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
@@ -2068,6 +2068,7 @@ def test_string_select(self):
             expected = df2[isnull(df2.x)]
             assert_frame_equal(result,expected)
 
+
             # int ==/!=
             df['int'] = 1
             df.ix[2:7,'int'] = 2
@@ -2083,42 +2084,44 @@ def test_string_select(self):
             assert_frame_equal(result,expected)
 
 
-    def test_unique(self):
+    def test_read_column(self):
 
         df = tm.makeTimeDataFrame()
 
-        def check(x, y):
-            self.assert_((np.unique(x) == np.unique(y)).all() == True)
-
         with ensure_clean(self.path) as store:
             store.remove('df')
             store.append('df', df)
             
             # error
-            self.assertRaises(KeyError, store.unique, 'df', 'foo')
+            self.assertRaises(KeyError, store.select_column, 'df', 'foo')
+
+            def f():
+                store.select_column('df', 'index', where = ['index>5'])
+            self.assertRaises(Exception, f)
 
             # valid
-            result = store.unique('df', 'index')
-            check(result.values, df.index.values)
-            
+            result = store.select_column('df', 'index')
+            tm.assert_almost_equal(result.values, Series(df.index).values)
+            self.assert_(isinstance(result,Series))
+
             # not a data indexable column
             self.assertRaises(
-                ValueError, store.unique, 'df', 'values_block_0')
+                ValueError, store.select_column, 'df', 'values_block_0')
 
             # a data column
             df2 = df.copy()
             df2['string'] = 'foo'
             store.append('df2', df2, data_columns=['string'])
-            result = store.unique('df2', 'string')
-            check(result.values, df2['string'].unique())
+            result = store.select_column('df2', 'string')
+            tm.assert_almost_equal(result.values, df2['string'].values)
             
             # a data column with NaNs, result excludes the NaNs
             df3 = df.copy()
             df3['string'] = 'foo'
             df3.ix[4:6, 'string'] = np.nan
             store.append('df3', df3, data_columns=['string'])
-            result = store.unique('df3', 'string')
-            check(result.values, df3['string'].valid().unique())
+            result = store.select_column('df3', 'string')
+            tm.assert_almost_equal(result.values, df3['string'].values)
 
     def test_coordinates(self):
         df = tm.makeTimeDataFrame()