From 0338b5d3e9b7a7f1e48031a3fefadf4c6138d120 Mon Sep 17 00:00:00 2001 From: gfyoung Date: Sat, 23 Jul 2016 05:52:20 -0400 Subject: [PATCH] BUG: Fix segfault in lib.isnullobj Weird segfault arises when you call lib.isnullobj (or any of its equivalents like lib.isnullobj2d) with an array that uses 0-field values to mean None. Changed input to be a Python object (i.e. no typing), and the segfault went away. Closes gh-13717. [ci skip] --- asv_bench/benchmarks/frame_methods.py | 16 +++++++++ doc/source/whatsnew/v0.19.0.txt | 1 + pandas/lib.pyx | 16 ++++++--- pandas/tests/test_lib.py | 50 +++++++++++++++++++++++++++ 4 files changed, 79 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 5c5a1df4ea1f8..29a8733e88601 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -1,4 +1,5 @@ from .pandas_vb_common import * +import string class frame_apply_axis_1(object): @@ -606,6 +607,21 @@ def time_frame_isnull(self): isnull(self.df) +class frame_isnull_strings(object): + goal_time = 0.2 + + def setup(self): + np.random.seed(1234) + self.sample = np.array(list(string.ascii_lowercase) + + list(string.ascii_uppercase) + + list(string.whitespace)) + self.data = np.random.choice(self.sample, (1000, 1000)) + self.df = DataFrame(self.data) + + def time_frame_isnull(self): + isnull(self.df) + + class frame_isnull_obj(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 646e8822ed46f..9f10190bcec89 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -694,6 +694,7 @@ Bug Fixes - Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset, that had one or more categorical columns, failed unless the key argument was set to the name of the dataset. 
(:issue:`13231`) - Bug in ``.rolling()`` that allowed a negative integer window in contruction of the ``Rolling()`` object, but would later fail on aggregation (:issue:`13383`) +- Bug in printing ``pd.DataFrame`` where unusual elements with the object dtype were causing segfaults (:issue:`13717`) - Bug in various index types, which did not propagate the name of passed index (:issue:`12309`) - Bug in ``DatetimeIndex``, which did not honour the ``copy=True`` (:issue:`13205`) - Bug in ``DatetimeIndex.is_normalized`` returns incorrectly for normalized date_range in case of local timezones (:issue:`13459`) diff --git a/pandas/lib.pyx b/pandas/lib.pyx index 7cbb502315b64..bf1dd1246120b 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -342,11 +342,13 @@ def item_from_zerodim(object val): @cython.wraparound(False) @cython.boundscheck(False) -def isnullobj(ndarray[object] arr): +def isnullobj(ndarray arr): cdef Py_ssize_t i, n cdef object val cdef ndarray[uint8_t] result + assert arr.ndim == 1, "'arr' must be 1-D." + n = len(arr) result = np.empty(n, dtype=np.uint8) for i from 0 <= i < n: @@ -356,11 +358,13 @@ def isnullobj(ndarray[object] arr): @cython.wraparound(False) @cython.boundscheck(False) -def isnullobj_old(ndarray[object] arr): +def isnullobj_old(ndarray arr): cdef Py_ssize_t i, n cdef object val cdef ndarray[uint8_t] result + assert arr.ndim == 1, "'arr' must be 1-D." + n = len(arr) result = np.zeros(n, dtype=np.uint8) for i from 0 <= i < n: @@ -370,11 +374,13 @@ def isnullobj_old(ndarray[object] arr): @cython.wraparound(False) @cython.boundscheck(False) -def isnullobj2d(ndarray[object, ndim=2] arr): +def isnullobj2d(ndarray arr): cdef Py_ssize_t i, j, n, m cdef object val cdef ndarray[uint8_t, ndim=2] result + assert arr.ndim == 2, "'arr' must be 2-D." 
+ n, m = (<object> arr).shape result = np.zeros((n, m), dtype=np.uint8) for i from 0 <= i < n: @@ -386,11 +392,13 @@ def isnullobj2d(ndarray[object, ndim=2] arr): @cython.wraparound(False) @cython.boundscheck(False) -def isnullobj2d_old(ndarray[object, ndim=2] arr): +def isnullobj2d_old(ndarray arr): cdef Py_ssize_t i, j, n, m cdef object val cdef ndarray[uint8_t, ndim=2] result + assert arr.ndim == 2, "'arr' must be 2-D." + n, m = (<object> arr).shape result = np.zeros((n, m), dtype=np.uint8) for i from 0 <= i < n: diff --git a/pandas/tests/test_lib.py b/pandas/tests/test_lib.py index 84d7226f1b2f5..80b5e41e881cd 100644 --- a/pandas/tests/test_lib.py +++ b/pandas/tests/test_lib.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import numpy as np +import pandas as pd import pandas.lib as lib import pandas.util.testing as tm @@ -184,6 +185,55 @@ def test_get_reverse_indexer(self): self.assertTrue(np.array_equal(result, expected)) +class TestNullObj(tm.TestCase): + + _1d_methods = ['isnullobj', 'isnullobj_old'] + _2d_methods = ['isnullobj2d', 'isnullobj2d_old'] + + def _check_behavior(self, arr, expected): + for method in TestNullObj._1d_methods: + result = getattr(lib, method)(arr) + tm.assert_numpy_array_equal(result, expected) + + arr = np.atleast_2d(arr) + expected = np.atleast_2d(expected) + + for method in TestNullObj._2d_methods: + result = getattr(lib, method)(arr) + tm.assert_numpy_array_equal(result, expected) + + def test_basic(self): + arr = np.array([1, None, 'foo', -5.1, pd.NaT, np.nan]) + expected = np.array([False, True, False, False, True, True]) + + self._check_behavior(arr, expected) + + def test_non_obj_dtype(self): + arr = np.array([1, 3, np.nan, 5], dtype=float) + expected = np.array([False, False, True, False]) + + self._check_behavior(arr, expected) + + def test_empty_arr(self): + arr = np.array([]) + expected = np.array([], dtype=bool) + + self._check_behavior(arr, expected) + + def test_empty_str_inp(self): + arr = np.array([""]) # empty but not null + expected = 
np.array([False]) + + self._check_behavior(arr, expected) + + def test_empty_like(self): + # see gh-13717: no segfaults! + arr = np.empty_like([None]) + expected = np.array([True]) + + self._check_behavior(arr, expected) + + def test_duplicated_with_nas(): keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object)