Skip to content

Commit 3e4665e

Browse files
committed
Merge pull request #3256 from jreback/hdf_column
ENH: In HDFStore, add select_column method, deprecate unique method
2 parents 6e67862 + 332795e commit 3e4665e

File tree

5 files changed

+43
-22
lines changed

5 files changed

+43
-22
lines changed

RELEASE.rst

+5
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,11 @@ pandas 0.11.0
163163
when invalid shapes are passed
164164
- Methods return None when inplace=True (GH1893_)
165165

166+
- ``HDFStore``
167+
168+
- added the method ``select_column`` to select a single column from a table as a Series.
169+
- deprecated the ``unique`` method; it can be replicated by ``select_column(key, column).unique()``
170+
166171
**Bug Fixes**
167172

168173
- Fix seg fault on empty data frame when fillna with ``pad`` or ``backfill``

doc/source/io.rst

+7-6
Original file line numberDiff line numberDiff line change
@@ -1352,16 +1352,17 @@ then the ``nrows`` of the table are considered.
13521352
Advanced Queries
13531353
~~~~~~~~~~~~~~~~
13541354

1355-
**Unique**
1355+
**Select a Single Column**
13561356

1357-
To retrieve the *unique* values of an indexable or data column, use the
1358-
method ``unique``. This will, for example, enable you to get the index
1359-
very quickly. Note ``nan`` are excluded from the result set.
1357+
To retrieve a single indexable or data column, use the
1358+
method ``select_column``. This will, for example, enable you to get the index
1359+
very quickly. The result is returned as a ``Series``, indexed by the row number.
1360+
``select_column`` does not currently accept the ``where`` selector (coming soon).
13601361

13611362
.. ipython:: python
13621363
1363-
store.unique('df_dc', 'index')
1364-
store.unique('df_dc', 'string')
1364+
store.select_column('df_dc', 'index')
1365+
store.select_column('df_dc', 'string')
13651366
13661367
**Replicating or**
13671368

doc/source/v0.11.0.txt

+4
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,10 @@ Astype conversion on ``datetime64[ns]`` to ``object``, implicitly converts ``NaT`
226226
API changes
227227
~~~~~~~~~~~
228228

229+
- In ``HDFStore``, added the method ``select_column`` to select a single column from a table as a Series.
230+
231+
- In ``HDFStore``, deprecated the ``unique`` method; it can be replicated by ``select_column(key, column).unique()``
232+
229233
Enhancements
230234
~~~~~~~~~~~~
231235

pandas/io/pytables.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -423,8 +423,13 @@ def select_as_coordinates(self, key, where=None, start=None, stop=None, **kwargs
423423
return self.get_storer(key).read_coordinates(where=where, start=start, stop=stop, **kwargs)
424424

425425
def unique(self, key, column, **kwargs):
426+
warnings.warn("unique(key,column) is deprecated\n"
427+
"use select_column(key,column).unique() instead")
428+
return self.get_storer(key).read_column(column = column, **kwargs).unique()
429+
430+
def select_column(self, key, column, **kwargs):
426431
"""
427-
return a single column uniquely from the table. This is generally only useful to select an indexable
432+
return a single column from the table. This is generally only useful to select an indexable
428433
429434
Parameters
430435
----------
@@ -2525,7 +2530,7 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs):
25252530
self.selection = Selection(self, where=where, start=start, stop=stop, **kwargs)
25262531
return Coordinates(self.selection.select_coords(), group=self.group, where=where)
25272532

2528-
def read_column(self, column, **kwargs):
2533+
def read_column(self, column, where = None, **kwargs):
25292534
""" return a single column from the table, generally only indexables are interesting """
25302535

25312536
# validate the version
@@ -2535,6 +2540,9 @@ def read_column(self, column, **kwargs):
25352540
if not self.infer_axes():
25362541
return False
25372542

2543+
if where is not None:
2544+
raise Exception("read_column does not currently accept a where clause")
2545+
25382546
# find the axes
25392547
for a in self.axes:
25402548
if column == a.name:
@@ -2544,7 +2552,7 @@ def read_column(self, column, **kwargs):
25442552

25452553
# column must be an indexable or a data column
25462554
c = getattr(self.table.cols, column)
2547-
return Categorical.from_array(a.convert(c[:], nan_rep=self.nan_rep).take_data()).levels
2555+
return Series(a.convert(c[:], nan_rep=self.nan_rep).take_data())
25482556

25492557
raise KeyError("column [%s] not found in the table" % column)
25502558

pandas/io/tests/test_pytables.py

+16-13
Original file line numberDiff line numberDiff line change
@@ -2068,6 +2068,7 @@ def test_string_select(self):
20682068
expected = df2[isnull(df2.x)]
20692069
assert_frame_equal(result,expected)
20702070

2071+
20712072
# int ==/!=
20722073
df['int'] = 1
20732074
df.ix[2:7,'int'] = 2
@@ -2083,42 +2084,44 @@ def test_string_select(self):
20832084
assert_frame_equal(result,expected)
20842085

20852086

2086-
def test_unique(self):
2087+
def test_read_column(self):
20872088

20882089
df = tm.makeTimeDataFrame()
20892090

2090-
def check(x, y):
2091-
self.assert_((np.unique(x) == np.unique(y)).all() == True)
2092-
20932091
with ensure_clean(self.path) as store:
20942092
store.remove('df')
20952093
store.append('df', df)
20962094

20972095
# error
2098-
self.assertRaises(KeyError, store.unique, 'df', 'foo')
2096+
self.assertRaises(KeyError, store.select_column, 'df', 'foo')
2097+
2098+
def f():
2099+
store.select_column('df', 'index', where = ['index>5'])
2100+
self.assertRaises(Exception, f)
20992101

21002102
# valid
2101-
result = store.unique('df', 'index')
2102-
check(result.values, df.index.values)
2103-
2103+
result = store.select_column('df', 'index')
2104+
tm.assert_almost_equal(result.values, Series(df.index).values)
2105+
self.assert_(isinstance(result,Series))
2106+
21042107
# not a data indexable column
21052108
self.assertRaises(
2106-
ValueError, store.unique, 'df', 'values_block_0')
2109+
ValueError, store.select_column, 'df', 'values_block_0')
21072110

21082111
# a data column
21092112
df2 = df.copy()
21102113
df2['string'] = 'foo'
21112114
store.append('df2', df2, data_columns=['string'])
2112-
result = store.unique('df2', 'string')
2113-
check(result.values, df2['string'].unique())
2115+
result = store.select_column('df2', 'string')
2116+
tm.assert_almost_equal(result.values, df2['string'].values)
21142117

21152118
# a data column with NaNs; the NaNs are retained in the returned Series
21162119
df3 = df.copy()
21172120
df3['string'] = 'foo'
21182121
df3.ix[4:6, 'string'] = np.nan
21192122
store.append('df3', df3, data_columns=['string'])
2120-
result = store.unique('df3', 'string')
2121-
check(result.values, df3['string'].valid().unique())
2123+
result = store.select_column('df3', 'string')
2124+
tm.assert_almost_equal(result.values, df3['string'].values)
21222125

21232126
def test_coordinates(self):
21242127
df = tm.makeTimeDataFrame()

0 commit comments

Comments
 (0)