BUG: Fix filter method so that it accepts byte and unicode column names (#18238)

Licht-T · TomAugspurger · commit 98d4cc72dde8 · 2017-12-11T11:24:50.000-06:00
(cherry picked from commit ec065b2)
diff --git a/doc/source/whatsnew/v0.21.1.txt b/doc/source/whatsnew/v0.21.1.txt
@@ -116,7 +116,7 @@ Reshaping
 
 - Error message in ``pd.merge_asof()`` for key datatype mismatch now includes datatype of left and right key (:issue:`18068`)
 - Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`)
--
+- Bug in ``DataFrame.filter(...)`` when :class:`unicode` is passed as a condition in Python 2 (:issue:`13101`)
 -
 
 Numeric
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
@@ -257,6 +257,16 @@ def u(s):
     def u_safe(s):
         return s
 
+    def to_str(s):
+        """
+        Convert bytes and non-string into Python 3 str
+        """
+        if isinstance(s, binary_type):
+            s = bytes_to_str(s)
+        elif not isinstance(s, string_types):
+            s = str(s)
+        return s
+
     def strlen(data, encoding=None):
         # encoding is for compat with PY2
         return len(data)
@@ -302,6 +312,14 @@ def u_safe(s):
         except:
             return s
 
+    def to_str(s):
+        """
+        Convert non-string objects into Python 2 str (string_types are returned as-is)
+        """
+        if not isinstance(s, string_types):
+            s = str(s)
+        return s
+
     def strlen(data, encoding=None):
         try:
             data = data.decode(encoding)
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -49,7 +49,7 @@
 from pandas.tseries.frequencies import to_offset
 from pandas import compat
 from pandas.compat.numpy import function as nv
-from pandas.compat import (map, zip, lzip, lrange, string_types,
+from pandas.compat import (map, zip, lzip, lrange, string_types, to_str,
                            isidentifier, set_function_name, cPickle as pkl)
 from pandas.core.ops import _align_method_FRAME
 import pandas.core.nanops as nanops
@@ -3235,14 +3235,14 @@ def filter(self, items=None, like=None, regex=None, axis=None):
                 **{name: [r for r in items if r in labels]})
         elif like:
             def f(x):
-                if not isinstance(x, string_types):
-                    x = str(x)
-                return like in x
+                return like in to_str(x)
             values = labels.map(f)
             return self.loc(axis=axis)[values]
         elif regex:
+            def f(x):
+                return matcher.search(to_str(x)) is not None
             matcher = re.compile(regex)
-            values = labels.map(lambda x: matcher.search(str(x)) is not None)
+            values = labels.map(f)
             return self.loc(axis=axis)[values]
         else:
             raise TypeError('Must pass either `items`, `like`, or `regex`')
diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py
@@ -884,6 +884,27 @@ def test_filter_regex_search(self):
         exp = df[[x for x in df.columns if 'BB' in x]]
         assert_frame_equal(result, exp)
 
+    @pytest.mark.parametrize('name,expected', [
+        ('a', DataFrame({u'a': [1, 2]})),
+        (u'a', DataFrame({u'a': [1, 2]})),
+        (u'あ', DataFrame({u'あ': [3, 4]}))
+    ])
+    def test_filter_unicode(self, name, expected):
+        # GH13101
+        df = DataFrame({u'a': [1, 2], u'あ': [3, 4]})
+
+        assert_frame_equal(df.filter(like=name), expected)
+        assert_frame_equal(df.filter(regex=name), expected)
+
+    @pytest.mark.parametrize('name', ['a', u'a'])
+    def test_filter_bytestring(self, name):
+        # GH13101
+        df = DataFrame({b'a': [1, 2], b'b': [3, 4]})
+        expected = DataFrame({b'a': [1, 2]})
+
+        assert_frame_equal(df.filter(like=name), expected)
+        assert_frame_equal(df.filter(regex=name), expected)
+
     def test_filter_corner(self):
         empty = DataFrame()
 

Original file line number	Diff line number	Diff line change
`@@ -116,7 +116,7 @@ Reshaping`
`116`	`116`
`117`	`117`	- Error message in ``pd.merge_asof()`` for key datatype mismatch now includes datatype of left and right key (:issue:`18068`)
`118`	`118`	- Bug in ``pd.concat`` when empty and non-empty DataFrames or Series are concatenated (:issue:`18178` :issue:`18187`)
`119`		`--`
	`119`	+- Bug in ``DataFrame.filter(...)`` when :class:`unicode` is passed as a condition in Python 2 (:issue:`13101`)
`120`	`120`	`-`
`121`	`121`
`122`	`122`	`Numeric`