Skip to content

Commit ee6185e

Browse files
mattip authored and jreback committed
COMPAT: Pypy tweaks (#17351)
1 parent 3a291bb commit ee6185e

File tree

6 files changed

+92
-17
lines changed

6 files changed

+92
-17
lines changed

doc/source/whatsnew/v0.21.0.txt

+9-2
Original file line number | Diff line number | Diff line change
@@ -371,13 +371,11 @@ Performance Improvements
371371
Bug Fixes
372372
~~~~~~~~~
373373

374-
375374
Conversion
376375
^^^^^^^^^^
377376

378377
- Bug in assignment against datetime-like data with ``int`` may incorrectly convert to datetime-like (:issue:`14145`)
379378
- Bug in assignment against ``int64`` data with ``np.ndarray`` with ``float64`` dtype may keep ``int64`` dtype (:issue:`14001`)
380-
- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size, so an approximation is used instead (:issue:`17228`)
381379
- Fixed the return type of ``IntervalIndex.is_non_overlapping_monotonic`` to be a Python ``bool`` for consistency with similar attributes/methods. Previously returned a ``numpy.bool_``. (:issue:`17237`)
382380
- Bug in ``IntervalIndex.is_non_overlapping_monotonic`` when intervals are closed on both sides and overlap at a point (:issue:`16560`)
383381
- Bug in :func:`Series.fillna` returns frame when ``inplace=True`` and ``value`` is dict (:issue:`16156`)
@@ -463,6 +461,15 @@ Categorical
463461
the ``.categories`` to be an empty ``Float64Index`` rather than an empty
464462
``Index`` with object dtype (:issue:`17248`)
465463

464+
PyPy
465+
^^^^
466+
467+
- Compatibility with PyPy in :func:`read_csv` with ``usecols=[<unsorted ints>]`` and
468+
:func:`read_json` (:issue:`17351`)
469+
- Split tests into cases for CPython and PyPy where needed, which highlights the fragility
470+
of index matching with ``float('nan')``, ``np.nan`` and ``NaT`` (:issue:`17351`)
471+
- Fix :func:`DataFrame.memory_usage` to support PyPy. Objects on PyPy do not have a fixed size,
472+
so an approximation is used instead (:issue:`17228`)
466473

467474
Other
468475
^^^^^

pandas/_libs/src/ujson/python/JSONtoObj.c

+8-8
Original file line number | Diff line number | Diff line change
@@ -409,7 +409,7 @@ JSOBJ Object_npyEndObject(void *prv, JSOBJ obj) {
409409
}
410410

411411
int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
412-
PyObject *label;
412+
PyObject *label, *labels;
413413
npy_intp labelidx;
414414
// add key to label array, value to values array
415415
NpyArrContext *npyarr = (NpyArrContext *)obj;
@@ -424,11 +424,11 @@ int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
424424
if (!npyarr->labels[labelidx]) {
425425
npyarr->labels[labelidx] = PyList_New(0);
426426
}
427-
427+
labels = npyarr->labels[labelidx];
428428
// only fill label array once, assumes all column labels are the same
429429
// for 2-dimensional arrays.
430-
if (PyList_GET_SIZE(npyarr->labels[labelidx]) <= npyarr->elcount) {
431-
PyList_Append(npyarr->labels[labelidx], label);
430+
if (PyList_Check(labels) && PyList_GET_SIZE(labels) <= npyarr->elcount) {
431+
PyList_Append(labels, label);
432432
}
433433

434434
if (((JSONObjectDecoder *)npyarr->dec)->arrayAddItem(prv, obj, value)) {
@@ -439,16 +439,16 @@ int Object_npyObjectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
439439
}
440440

441441
int Object_objectAddKey(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value) {
442-
PyDict_SetItem(obj, name, value);
442+
int ret = PyDict_SetItem(obj, name, value);
443443
Py_DECREF((PyObject *)name);
444444
Py_DECREF((PyObject *)value);
445-
return 1;
445+
return ret == 0 ? 1 : 0;
446446
}
447447

448448
int Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) {
449-
PyList_Append(obj, value);
449+
int ret = PyList_Append(obj, value);
450450
Py_DECREF((PyObject *)value);
451-
return 1;
451+
return ret == 0 ? 1 : 0;
452452
}
453453

454454
JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) {

pandas/io/parsers.py

+1
Original file line number | Diff line number | Diff line change
@@ -1716,6 +1716,7 @@ def _set_noconvert_columns(self):
17161716
# A set of integers will be converted to a list in
17171717
# the correct order every single time.
17181718
usecols = list(self.usecols)
1719+
usecols.sort()
17191720
elif (callable(self.usecols) or
17201721
self.usecols_dtype not in ('empty', None)):
17211722
# The names attribute should have the correct columns

pandas/tests/indexes/test_base.py

+12-4
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
from pandas.tests.indexes.common import Base
1010

1111
from pandas.compat import (range, lrange, lzip, u,
12-
text_type, zip, PY3, PY36)
12+
text_type, zip, PY3, PY36, PYPY)
1313
import operator
1414
import numpy as np
1515

@@ -1370,13 +1370,21 @@ def test_isin(self):
13701370
assert len(result) == 0
13711371
assert result.dtype == np.bool_
13721372

1373-
def test_isin_nan(self):
1373+
@pytest.mark.skipif(PYPY, reason="np.nan is float('nan') on PyPy")
1374+
def test_isin_nan_not_pypy(self):
1375+
tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([float('nan')]),
1376+
np.array([False, False]))
1377+
1378+
@pytest.mark.skipif(not PYPY, reason="np.nan is float('nan') on PyPy")
1379+
def test_isin_nan_pypy(self):
1380+
tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([float('nan')]),
1381+
np.array([False, True]))
1382+
1383+
def test_isin_nan_common(self):
13741384
tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([np.nan]),
13751385
np.array([False, True]))
13761386
tm.assert_numpy_array_equal(Index(['a', pd.NaT]).isin([pd.NaT]),
13771387
np.array([False, True]))
1378-
tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([float('nan')]),
1379-
np.array([False, False]))
13801388
tm.assert_numpy_array_equal(Index(['a', np.nan]).isin([pd.NaT]),
13811389
np.array([False, False]))
13821390

pandas/tests/indexes/test_multi.py

+11-2
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414

1515
from pandas import (CategoricalIndex, DataFrame, Index, MultiIndex,
1616
compat, date_range, period_range)
17-
from pandas.compat import PY3, long, lrange, lzip, range, u
17+
from pandas.compat import PY3, long, lrange, lzip, range, u, PYPY
1818
from pandas.errors import PerformanceWarning, UnsortedIndexError
1919
from pandas.core.indexes.base import InvalidIndexError
2020
from pandas._libs import lib
@@ -2571,13 +2571,22 @@ def test_isin(self):
25712571
assert len(result) == 0
25722572
assert result.dtype == np.bool_
25732573

2574-
def test_isin_nan(self):
2574+
@pytest.mark.skipif(PYPY, reason="tuples cmp recursively on PyPy")
2575+
def test_isin_nan_not_pypy(self):
25752576
idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]])
25762577
tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]),
25772578
np.array([False, False]))
25782579
tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]),
25792580
np.array([False, False]))
25802581

2582+
@pytest.mark.skipif(not PYPY, reason="tuples cmp recursively on PyPy")
2583+
def test_isin_nan_pypy(self):
2584+
idx = MultiIndex.from_arrays([['foo', 'bar'], [1.0, np.nan]])
2585+
tm.assert_numpy_array_equal(idx.isin([('bar', np.nan)]),
2586+
np.array([False, True]))
2587+
tm.assert_numpy_array_equal(idx.isin([('bar', float('nan'))]),
2588+
np.array([False, True]))
2589+
25812590
def test_isin_level_kwarg(self):
25822591
idx = MultiIndex.from_arrays([['qux', 'baz', 'foo', 'bar'], np.arange(
25832592
4)])

pandas/tests/io/parser/test_parsers.py

+51-1
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,10 @@
33
import os
44
import pandas.util.testing as tm
55

6-
from pandas import read_csv, read_table
6+
from pandas import read_csv, read_table, DataFrame
77
from pandas.core.common import AbstractMethodError
8+
from pandas._libs.lib import Timestamp
9+
from pandas.compat import StringIO
810

911
from .common import ParserTests
1012
from .header import HeaderTests
@@ -100,3 +102,51 @@ def read_table(self, *args, **kwds):
100102
kwds = kwds.copy()
101103
kwds['engine'] = self.engine
102104
return read_table(*args, **kwds)
105+
106+
107+
class TestUnsortedUsecols(object):
108+
def test_override__set_noconvert_columns(self):
109+
# GH 17351 - usecols needs to be sorted in _set_noconvert_columns
110+
# based on the test_usecols_with_parse_dates test from usecols.py
111+
from pandas.io.parsers import CParserWrapper, TextFileReader
112+
113+
s = """a,b,c,d,e
114+
0,1,20140101,0900,4
115+
0,1,20140102,1000,4"""
116+
117+
parse_dates = [[1, 2]]
118+
cols = {
119+
'a': [0, 0],
120+
'c_d': [
121+
Timestamp('2014-01-01 09:00:00'),
122+
Timestamp('2014-01-02 10:00:00')
123+
]
124+
}
125+
expected = DataFrame(cols, columns=['c_d', 'a'])
126+
127+
class MyTextFileReader(TextFileReader):
128+
def __init__(self):
129+
self._currow = 0
130+
self.squeeze = False
131+
132+
class MyCParserWrapper(CParserWrapper):
133+
def _set_noconvert_columns(self):
134+
if self.usecols_dtype == 'integer':
135+
# self.usecols is a set, which is documented as unordered
136+
# but in practice, a CPython set of integers is sorted.
137+
# In other implementations this assumption does not hold.
138+
# The following code simulates a different order, which
139+
# before GH 17351 would cause the wrong columns to be
140+
# converted via the parse_dates parameter
141+
self.usecols = list(self.usecols)
142+
self.usecols.reverse()
143+
return CParserWrapper._set_noconvert_columns(self)
144+
145+
parser = MyTextFileReader()
146+
parser.options = {'usecols': [0, 2, 3],
147+
'parse_dates': parse_dates,
148+
'delimiter': ','}
149+
parser._engine = MyCParserWrapper(StringIO(s), **parser.options)
150+
df = parser.read()
151+
152+
tm.assert_frame_equal(df, expected)

0 commit comments

Comments
 (0)