COMPAT: Pypy tweaks (#17351)

mattip · jreback · commit ee6185e2fb94 · 2017-09-07T07:56:33.000-04:00
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -371,13 +371,11 @@ Performance Improvements
 Bug Fixes
 ~~~~~~~~~
 
-
 Conversion
 ^^^^^^^^^^
 
 - Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`)
 - Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`)
-- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size, so an approximation is used instead (:issue:`17228`)
 - Fixed the return type of ``IntervalIndex.is_non_overlapping_monotonic`` to be a Python ``bool`` for consistency with similar attributes/methods.  Previously returned a ``numpy.bool_``. (:issue:`17237`)
 - Bug in ``IntervalIndex.is_non_overlapping_monotonic`` when intervals are closed on both sides and overlap at a point (:issue:`16560`)
 - Bug in :func:`Series.fillna` returns frame when ``inplace=True`` and ``value`` is dict (:issue:`16156`)
@@ -463,6 +461,15 @@ Categorical
   the ``.categories`` to be an empty ``Float64Index`` rather than an empty
   ``Index`` with object dtype (:issue:`17248`)
 
+PyPy
+^^^^
+
+- Compatibility with PyPy in :func:`read_csv` with ``usecols=[<unsorted ints>]`` and
+  :func:`read_json` (:issue:`17351`)
+- Split tests into cases for CPython and PyPy where needed, which highlights the fragility
+  of index matching with ``float('nan')``, ``np.nan`` and ``NAT`` (:issue:`17351`)
+- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size,
+  so an approximation is used instead (:issue:`17228`)
 
 Other
 ^^^^^
diff --git a/pandas/_libs/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c
@@ -409,7 +409,7 @@ JSOBJ Object_npyEndObject(void *prv, JSOBJ obj) {
 }
 
 int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
-    PyObject *label;
+    PyObject *label, *labels;
     npy_intp labelidx;
     // add key to label array, value to values array
     NpyArrContext *npyarr = (NpyArrContext *)obj;
@@ -424,11 +424,11 @@ int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
     if (!npyarr->labels[labelidx]) {
         npyarr->labels[labelidx] = PyList_New(0);
     }
-
+    labels = npyarr->labels[labelidx];
     // only fill label array once, assumes all column labels are the same
     // for 2-dimensional arrays.
-    if (PyList_GET_SIZE(npyarr->labels[labelidx]) <= npyarr->elcount) {
-        PyList_Append(npyarr->labels[labelidx], label);
+    if (PyList_Check(labels) && PyList_GET_SIZE(labels) <= npyarr->elcount) {
+        PyList_Append(labels, label);
     }
 
     if (((JSONObjectDecoder *)npyarr->dec)->arrayAddItem(prv, obj, value)) {
@@ -439,16 +439,16 @@ int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
 }
 
 int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
-    PyDict_SetItem(obj, name, value);
+    int ret = PyDict_SetItem(obj, name, value);
     Py_DECREF((PyObject *)name);
     Py_DECREF((PyObject *)value);
-    return 1;
+    return ret == 0 ? 1 : 0;
 }
 
 int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) {
-    PyList_Append(obj, value);
+    int ret = PyList_Append(obj, value);
     Py_DECREF((PyObject *)value);
-    return 1;
+    return ret == 0 ? 1 : 0;
 }
 
 JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) {
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1716,6 +1716,7 @@ def _set_noconvert_columns(self):
             # A set of integers will be converted to a list in
             # the correct order every single time.
             usecols = list(self.usecols)
+            usecols.sort()
         elif (callable(self.usecols) or
                 self.usecols_dtype not in ('empty', None)):
             # The names attribute should have the correct columns
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
@@ -9,7 +9,7 @@
 from pandas.tests.indexes.common import Base
 
 from pandas.compat import (range, lrange, lzip, u,
-                           text_type, zip, PY3, PY36)
+                           text_type, zip, PY3, PY36, PYPY)
 import operator
 import numpy as np
 
@@ -1370,13 +1370,21 @@ def test_isin(self):
         assert len(result) == 0
         assert result.dtype == np.bool_
 
-    def test_isin_nan(self):
+    @pytest.mark.skipif(PYPY, reason="np.nan is float('nan') on PyPy")
+    def test_isin_nan_not_pypy(self):
+        tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([float('nan')]),
+                                    np.array([False, False]))
+
+    @pytest.mark.skipif(not PYPY, reason="np.nan is float('nan') on PyPy")
+    def test_isin_nan_pypy(self):
+        tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([float('nan')]),
+                                    np.array([False, True]))
+
+    def test_isin_nan_common(self):
         tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([np.nan]),
                                     np.array([False, True]))
         tm.assert_numpy_array_equal(Index(['a', pd.NaT]).isin([pd.NaT]),
                                     np.array([False, True]))
-        tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([float('nan')]),
-                                    np.array([False, False]))
         tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([pd.NaT]),
                                     np.array([False, False]))
 
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
@@ -14,7 +14,7 @@
 
 from pandas import (CategoricalIndex, DataFrame, Index, MultiIndex,
                     compat, date_range, period_range)
-from pandas.compat import PY3, long, lrange, lzip, range, u
+from pandas.compat import PY3, long, lrange, lzip, range, u, PYPY
 from pandas.errors import PerformanceWarning, UnsortedIndexError
 from pandas.core.indexes.base import InvalidIndexError
 from pandas._libs import lib
@@ -2571,13 +2571,22 @@ def test_isin(self):
         assert len(result) == 0
         assert result.dtype == np.bool_
 
-    def test_isin_nan(self):
+    @pytest.mark.skipif(PYPY, reason="tuples cmp recursively on PyPy")
+    def test_isin_nan_not_pypy(self):
         idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]])
         tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]),
                                     np.array([False, False]))
         tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]),
                                     np.array([False, False]))
 
+    @pytest.mark.skipif(not PYPY, reason="tuples cmp recursively on PyPy")
+    def test_isin_nan_pypy(self):
+        idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]])
+        tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]),
+                                    np.array([False, True]))
+        tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]),
+                                    np.array([False, True]))
+
     def test_isin_level_kwarg(self):
         idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'], np.arange(
             4)])
diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py
@@ -3,8 +3,10 @@
 import os
 import pandas.util.testing as tm
 
-from pandas import read_csv, read_table
+from pandas import read_csv, read_table, DataFrame
 from pandas.core.common import AbstractMethodError
+from pandas._libs.lib import Timestamp
+from pandas.compat import StringIO
 
 from .common import ParserTests
 from .header import HeaderTests
@@ -100,3 +102,51 @@ def read_table(self, *args, **kwds):
         kwds = kwds.copy()
         kwds['engine'] = self.engine
         return read_table(*args, **kwds)
+
+
+class TestUnsortedUsecols(object):
+    def test_override__set_noconvert_columns(self):
+        # GH 17351 - usecols needs to be sorted in _setnoconvert_columns
+        # based on the test_usecols_with_parse_dates test from usecols.py
+        from pandas.io.parsers import CParserWrapper, TextFileReader
+
+        s = """a,b,c,d,e
+        0,1,20140101,0900,4
+        0,1,20140102,1000,4"""
+
+        parse_dates = [[1, 2]]
+        cols = {
+            'a': [0, 0],
+            'c_d': [
+                Timestamp('2014-01-01 09:00:00'),
+                Timestamp('2014-01-02 10:00:00')
+            ]
+        }
+        expected = DataFrame(cols, columns=['c_d', 'a'])
+
+        class MyTextFileReader(TextFileReader):
+            def __init__(self):
+                self._currow = 0
+                self.squeeze = False
+
+        class MyCParserWrapper(CParserWrapper):
+            def _set_noconvert_columns(self):
+                if self.usecols_dtype == 'integer':
+                    # self.usecols is a set, which is documented as unordered
+                    # but in practice, a CPython set of integers is sorted.
+                    # In other implementations this assumption does not hold.
+                    # The following code simulates a different order, which
+                    # before GH 17351 would cause the wrong columns to be
+                    # converted via the parse_dates parameter
+                    self.usecols = list(self.usecols)
+                    self.usecols.reverse()
+                return CParserWrapper._set_noconvert_columns(self)
+
+        parser = MyTextFileReader()
+        parser.options = {'usecols': [0, 2, 3],
+                          'parse_dates': parse_dates,
+                          'delimiter': ','}
+        parser._engine = MyCParserWrapper(StringIO(s), **parser.options)
+        df = parser.read()
+
+        tm.assert_frame_equal(df, expected)