Skip to content

ENH: In HDFStore, add select_column method, deprecate unique method #3256

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 4, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,11 @@ pandas 0.11.0
when invalid shapes are passed
- Methods return None when inplace=True (GH1893_)

- ``HDFStore``

- added the method ``select_column`` to select a single column from a table as a Series.
- deprecated the ``unique`` method; it can be replicated by ``select_column(key,column).unique()``

**Bug Fixes**

- Fix seg fault on empty data frame when fillna with ``pad`` or ``backfill``
Expand Down
13 changes: 7 additions & 6 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1352,16 +1352,17 @@ then the ``nrows`` of the table are considered.
Advanced Queries
~~~~~~~~~~~~~~~~

**Unique**
**Select a Single Column**

To retrieve the *unique* values of an indexable or data column, use the
method ``unique``. This will, for example, enable you to get the index
very quickly. Note ``nan`` are excluded from the result set.
To retrieve a single indexable or data column, use the
method ``select_column``. This will, for example, enable you to get the index
very quickly. This method returns a ``Series`` of the result, indexed by the row number.
It does not currently accept the ``where`` selector (coming soon).

.. ipython:: python

store.unique('df_dc', 'index')
store.unique('df_dc', 'string')
store.select_column('df_dc', 'index')
store.select_column('df_dc', 'string')

**Replicating or**

Expand Down
4 changes: 4 additions & 0 deletions doc/source/v0.11.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,10 @@ Astype conversion on ``datetime64[ns]`` to ``object``, implicitly converts ``NaT``
API changes
~~~~~~~~~~~

- In ``HDFStore``, added the method ``select_column`` to select a single column from a table as a Series.

- In ``HDFStore``, deprecated the ``unique`` method; it can be replicated by ``select_column(key,column).unique()``

Enhancements
~~~~~~~~~~~~

Expand Down
14 changes: 11 additions & 3 deletions pandas/io/pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,8 +423,13 @@ def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs
return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs)

def unique(self, key, column, **kwargs):
warnings.warn("unique(key,column) is deprecated\n"
"use select_column(key,column).unique() instead")
return self.get_storer(key).read_column(column = column, **kwargs).unique()

def select_column(self, key, column, **kwargs):
"""
return a single column uniquely from the table. This is generally only useful to select an indexable
return a single column from the table. This is generally only useful to select an indexable

Parameters
----------
Expand Down Expand Up @@ -2525,7 +2530,7 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs):
self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs)
return Coordinates(self.selection.select_coords(), group=self.group, where=where)

def read_column(self, column, **kwargs):
def read_column(self, column, where = None, **kwargs):
""" return a single column from the table, generally only indexables are interesting """

# validate the version
Expand All @@ -2535,6 +2540,9 @@ def read_column(self, column, **kwargs):
if not self.infer_axes():
return False

if where is not None:
raise Exception("read_column does not currently accept a where clause")

# find the axes
for a in self.axes:
if column == a.name:
Expand All @@ -2544,7 +2552,7 @@ def read_column(self, column, **kwargs):

# column must be an indexable or a data column
c = getattr(self.table.cols, column)
return Categorical.from_array(a.convert(c[:], nan_rep=self.nan_rep).take_data()).levels
return Series(a.convert(c[:], nan_rep=self.nan_rep).take_data())

raise KeyError("column [%s] not found in the table" % column)

Expand Down
29 changes: 16 additions & 13 deletions pandas/io/tests/test_pytables.py
Original file line number Diff line number Diff line change
Expand Up @@ -2068,6 +2068,7 @@ def test_string_select(self):
expected = df2[isnull(df2.x)]
assert_frame_equal(result,expected)


# int ==/!=
df['int'] = 1
df.ix[2:7,'int'] = 2
Expand All @@ -2083,42 +2084,44 @@ def test_string_select(self):
assert_frame_equal(result,expected)


def test_unique(self):
def test_read_column(self):

df = tm.makeTimeDataFrame()

def check(x, y):
self.assert_((np.unique(x) == np.unique(y)).all() == True)

with ensure_clean(self.path) as store:
store.remove('df')
store.append('df', df)

# error
self.assertRaises(KeyError, store.unique, 'df', 'foo')
self.assertRaises(KeyError, store.select_column, 'df', 'foo')

def f():
store.select_column('df', 'index', where = ['index>5'])
self.assertRaises(Exception, f)

# valid
result = store.unique('df', 'index')
check(result.values, df.index.values)

result = store.select_column('df', 'index')
tm.assert_almost_equal(result.values, Series(df.index).values)
self.assert_(isinstance(result,Series))

# not a data indexable column
self.assertRaises(
ValueError, store.unique, 'df', 'values_block_0')
ValueError, store.select_column, 'df', 'values_block_0')

# a data column
df2 = df.copy()
df2['string'] = 'foo'
store.append('df2', df2, data_columns=['string'])
result = store.unique('df2', 'string')
check(result.values, df2['string'].unique())
result = store.select_column('df2', 'string')
tm.assert_almost_equal(result.values, df2['string'].values)

# a data column with NaNs, result includes the NaNs
df3 = df.copy()
df3['string'] = 'foo'
df3.ix[4:6, 'string'] = np.nan
store.append('df3', df3, data_columns=['string'])
result = store.unique('df3', 'string')
check(result.values, df3['string'].valid().unique())
result = store.select_column('df3', 'string')
tm.assert_almost_equal(result.values, df3['string'].values)

def test_coordinates(self):
df = tm.makeTimeDataFrame()
Expand Down