Skip to content

Commit 197b3c7

Browse files
committed
BUG: HDFStore didn't implement != correctly for string columns, GH 2973
1 parent 3790f16 commit 197b3c7

File tree

4 files changed

+75
-15
lines changed

4 files changed

+75
-15
lines changed

RELEASE.rst

+5
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ pandas 0.11.0
106106
(e.g. store.df == store['df'])
107107
- Internally, change all variables to be private-like (now have leading
108108
underscore)
109+
- fixes for query parsing to correctly interpret boolean and != (GH2849_, GH2973_)
110+
- fixes for pathological case on SparseSeries with 0-len array and compression (GH2931_)
109111

110112
- Bug showing up in applymap where some object type columns are converted (GH2909_)
111113
had an incorrect default in convert_objects
@@ -138,8 +140,11 @@ pandas 0.11.0
138140
.. _GH2845: https://github.com/pydata/pandas/issues/2845
139141
.. _GH2867: https://github.com/pydata/pandas/issues/2867
140142
.. _GH2807: https://github.com/pydata/pandas/issues/2807
143+
.. _GH2849: https://github.com/pydata/pandas/issues/2849
141144
.. _GH2898: https://github.com/pydata/pandas/issues/2898
142145
.. _GH2909: https://github.com/pydata/pandas/issues/2909
146+
.. _GH2931: https://github.com/pydata/pandas/issues/2931
147+
.. _GH2973: https://github.com/pydata/pandas/issues/2973
143148

144149

145150
pandas 0.10.1

pandas/core/panelnd.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def _combine_with_constructor(self, other, func):
105105
klass._combine_with_constructor = _combine_with_constructor
106106

107107
# set as NotImplemented the operations which we don't support
108-
for f in ['to_frame', 'to_excel', 'to_sparse', 'groupby', 'join', 'filter', 'dropna', 'shift', 'take']:
108+
for f in ['to_frame', 'to_excel', 'to_sparse', 'groupby', 'join', 'filter', 'dropna', 'shift']:
109109
def func(self, *args, **kwargs):
110110
raise NotImplementedError
111111
setattr(klass, f, func)

pandas/io/pytables.py

+20-13
Original file line numberDiff line numberDiff line change
@@ -2296,7 +2296,7 @@ def process_axes(self, obj, columns=None):
22962296

22972297
# apply the selection filters (but keep in the same order)
22982298
if self.selection.filter:
2299-
for field, filt in self.selection.filter:
2299+
for field, op, filt in self.selection.filter:
23002300

23012301
def process_filter(field, filt):
23022302

@@ -2306,9 +2306,8 @@ def process_filter(field, filt):
23062306

23072307
# see if the field is the name of an axis
23082308
if field == axis_name:
2309-
ordd = axis_values & filt
2310-
ordd = sorted(axis_values.get_indexer(ordd))
2311-
return obj.reindex_axis(axis_values.take(ordd), axis=axis_number, copy=False)
2309+
takers = op(axis_values,filt)
2310+
return obj.ix._getitem_axis(takers,axis=axis_number)
23122311

23132312
# this might be the name of a field in an axis
23142313
elif field in axis_values:
@@ -2320,7 +2319,8 @@ def process_filter(field, filt):
23202319
# hack until we support reversed dim flags
23212320
if isinstance(obj,DataFrame):
23222321
axis_number = 1-axis_number
2323-
return obj.ix._getitem_axis(values.isin(filt),axis=axis_number)
2322+
takers = op(values,filt)
2323+
return obj.ix._getitem_axis(takers,axis=axis_number)
23242324

23252325
raise Exception("cannot find the field [%s] for filtering!" % field)
23262326

@@ -2969,7 +2969,7 @@ def __init__(self, field, op=None, value=None, queryables=None):
29692969
# backwards compatible
29702970
if isinstance(field, dict):
29712971
self.field = field.get('field')
2972-
self.op = field.get('op') or '='
2972+
self.op = field.get('op') or '=='
29732973
self.value = field.get('value')
29742974

29752975
# passed a term
@@ -2996,7 +2996,7 @@ def __init__(self, field, op=None, value=None, queryables=None):
29962996
self.op = op
29972997
self.value = value
29982998
else:
2999-
self.op = '='
2999+
self.op = '=='
30003000
self.value = op
30013001

30023002
else:
@@ -3008,8 +3008,8 @@ def __init__(self, field, op=None, value=None, queryables=None):
30083008
raise Exception("Could not create this term [%s]" % str(self))
30093009

30103010
# = vs ==
3011-
if self.op == '==':
3012-
self.op = '='
3011+
if self.op == '=':
3012+
self.op = '=='
30133013

30143014
# we have valid conditions
30153015
if self.op in ['>', '>=', '<', '<=']:
@@ -3055,22 +3055,29 @@ def eval(self):
30553055
values = [[v, v] for v in self.value]
30563056

30573057
# equality conditions
3058-
if self.op in ['=', '!=']:
3058+
if self.op in ['==', '!=']:
3059+
3060+
# our filter op expression
3061+
if self.op == '!=':
3062+
# `isin` returns a boolean array, so it must be negated element-wise
# with `~`; `not` on an array with more than one element raises
# ValueError instead of producing the inverted mask.
filter_op = lambda axis, values: ~axis.isin(values)
3063+
else:
3064+
filter_op = lambda axis, values: axis.isin(values)
3065+
30593066

30603067
if self.is_in_table:
30613068

30623069
# too many values to create the expression?
30633070
if len(values) <= self._max_selectors:
30643071
self.condition = "(%s)" % ' | '.join(
3065-
["(%s == %s)" % (self.field, v[0]) for v in values])
3072+
["(%s %s %s)" % (self.field, self.op, v[0]) for v in values])
30663073

30673074
# use a filter after reading
30683075
else:
3069-
self.filter = (self.field, Index([v[1] for v in values]))
3076+
self.filter = (self.field, filter_op, Index([v[1] for v in values]))
30703077

30713078
else:
30723079

3073-
self.filter = (self.field, Index([v[1] for v in values]))
3080+
self.filter = (self.field, filter_op, Index([v[1] for v in values]))
30743081

30753082
else:
30763083

pandas/io/tests/test_pytables.py

+49-1
Original file line numberDiff line numberDiff line change
@@ -1673,7 +1673,7 @@ def test_select_dtypes(self):
16731673
expected = df[df.ts >= Timestamp('2012-02-01')]
16741674
tm.assert_frame_equal(expected, result)
16751675

1676-
# bool columns
1676+
# bool columns (GH #2849)
16771677
df = DataFrame(np.random.randn(5,2), columns =['A','B'])
16781678
df['object'] = 'foo'
16791679
df.ix[4:5,'object'] = 'bar'
@@ -1801,6 +1801,54 @@ def test_frame_select(self):
18011801
# self.assertRaises(Exception, store.select,
18021802
# 'frame', [crit1, crit2])
18031803

1804+
def test_string_select(self):
1805+
1806+
# GH 2973
1807+
1808+
df = tm.makeTimeDataFrame()
1809+
1810+
with ensure_clean(self.path) as store:
1811+
1812+
1813+
# test string ==/!=
1814+
1815+
df['x'] = 'none'
1816+
df.ix[2:7,'x'] = ''
1817+
1818+
store.append('df',df,data_columns=['x'])
1819+
1820+
result = store.select('df',Term('x=none'))
1821+
expected = df[df.x == 'none']
1822+
assert_frame_equal(result,expected)
1823+
1824+
result = store.select('df',Term('x!=none'))
1825+
expected = df[df.x != 'none']
1826+
assert_frame_equal(result,expected)
1827+
1828+
df2 = df.copy()
1829+
df2.x[df2.x==''] = np.nan
1830+
1831+
from pandas import isnull
1832+
store.append('df2',df2,data_columns=['x'])
1833+
result = store.select('df2',Term('x!=none'))
1834+
expected = df2[isnull(df2.x)]
1835+
assert_frame_equal(result,expected)
1836+
1837+
# int ==/!=
1838+
df['int'] = 1
1839+
df.ix[2:7,'int'] = 2
1840+
1841+
store.append('df3',df,data_columns=['int'])
1842+
1843+
result = store.select('df3',Term('int=2'))
1844+
expected = df[df.int==2]
1845+
assert_frame_equal(result,expected)
1846+
1847+
result = store.select('df3',Term('int!=2'))
1848+
expected = df[df.int!=2]
1849+
assert_frame_equal(result,expected)
1850+
1851+
18041852
def test_unique(self):
18051853

18061854
df = tm.makeTimeDataFrame()

0 commit comments

Comments
 (0)