From 2f00c7fec4945f4748315273dff57295a7f7a42e Mon Sep 17 00:00:00 2001 From: "HE, Tao" Date: Thu, 14 Mar 2019 22:59:16 +0800 Subject: [PATCH 1/7] Convert tuple to list before `_list_to_arrays` when construct DataFrame. Signed-off-by: HE, Tao --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/internals/construction.py | 2 +- pandas/tests/frame/test_constructors.py | 18 ++++++++++++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 72c40b04a1195..bd36fcb598dcc 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -124,7 +124,7 @@ Bug Fixes ~~~~~~~~~ - Bug in :func:`to_datetime` which would raise an (incorrect) ``ValueError`` when called with a date far into the future and the ``format`` argument specified instead of raising ``OutOfBoundsDatetime`` (:issue:`23830`) - Bug in an error message in :meth:`DataFrame.plot`. Improved the error message if non-numerics are passed to :meth:`DataFrame.plot` (:issue:`25481`) -- +- Segmentation fault when construct :class:`DataFrame` from non-empty tuples (:issue:`25691`) Categorical ^^^^^^^^^^^ diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 7e97512682720..cab8078521505 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -394,7 +394,7 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): return [[]] * len(columns), columns return [], [] # columns if columns is not None else [] if isinstance(data[0], (list, tuple)): - return _list_to_arrays(data, columns, coerce_float=coerce_float, + return _list_to_arrays(list(data), columns, coerce_float=coerce_float, dtype=dtype) elif isinstance(data[0], compat.Mapping): return _list_of_dict_to_arrays(data, columns, diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 1d5cbfec8de52..142a87e1a7424 100644 --- 
a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1202,12 +1202,26 @@ def test_constructor_mixed_type_rows(self): expected = DataFrame([[1, 2], [3, 4]]) tm.assert_frame_equal(result, expected) - def test_constructor_tuples(self): + @pytest.mark.parametrize("tuples,lists", [ + ((), []), + ((()), []), + (((), ()), [(), ()]), + (((), ()), [[], []]), + (([], []), [[], []]), + (([1, 2, 3], [4, 5, 6]), [[1, 2, 3], [4, 5, 6]]) + ]) + def test_constructor_tuple(self, tuples, lists): + # GH 25691 + result = DataFrame(tuples) + expected = DataFrame(lists) + tm.assert_frame_equal(result, expected) + + def test_constructor_list_of_tuples(self): result = DataFrame({'A': [(1, 2), (3, 4)]}) expected = DataFrame({'A': Series([(1, 2), (3, 4)])}) tm.assert_frame_equal(result, expected) - def test_constructor_namedtuples(self): + def test_constructor_list_of_namedtuples(self): # GH11181 from collections import namedtuple named_tuple = namedtuple("Pandas", list('ab')) From e4e68ceeddf81cfe86ac5892c18e5b7bde6c5933 Mon Sep 17 00:00:00 2001 From: "HE, Tao" Date: Fri, 15 Mar 2019 00:15:06 +0800 Subject: [PATCH 2/7] Revise. 
Signed-off-by: HE, Tao --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/internals/construction.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 86975cbe7a275..83e7df35bf46a 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -245,7 +245,7 @@ Reshaping - Bug in :func:`merge` when merging by index name would sometimes result in an incorrectly numbered index (:issue:`24212`) - :func:`to_records` now accepts dtypes to its `column_dtypes` parameter (:issue:`24895`) - Bug in :func:`concat` where order of ``OrderedDict`` (and ``dict`` in Python 3.6+) is not respected, when passed in as ``objs`` argument (:issue:`21510`) -- Segmentation fault when construct :class:`DataFrame` from non-empty tuples (:issue:`25691`) +- Bug in :class:`DataFrame` construction when passing non-empty tuples would cause a segmentation fault (:issue:`25691`) Sparse diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index cab8078521505..46656bf5430f5 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -394,7 +394,7 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): return [[]] * len(columns), columns return [], [] # columns if columns is not None else [] if isinstance(data[0], (list, tuple)): - return _list_to_arrays(list(data), columns, coerce_float=coerce_float, + return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) elif isinstance(data[0], compat.Mapping): return _list_of_dict_to_arrays(data, columns, @@ -422,10 +422,10 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): def _list_to_arrays(data, columns, coerce_float=False, dtype=None): if len(data) > 0 and isinstance(data[0], tuple): - content = list(lib.to_object_array_tuples(data).T) + content = list(lib.to_object_array_tuples(list(data)).T) else: # list of lists - content = 
list(lib.to_object_array(data).T) + content = list(lib.to_object_array(list(data)).T) return _convert_object_array(content, columns, dtype=dtype, coerce_float=coerce_float) From 4aa629ed72f67af588c4f7df03e2f5e226ea1d1f Mon Sep 17 00:00:00 2001 From: "HE, Tao" Date: Fri, 15 Mar 2019 22:07:35 +0800 Subject: [PATCH 3/7] Remove the unnecessary conversion. --- pandas/_libs/lib.pyx | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 34ceeb20e260e..21ad592ac3851 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2245,7 +2245,7 @@ def map_infer(ndarray arr, object f, bint convert=1): return result -def to_object_array(rows: object, int min_width=0): +def to_object_array(rows: list, int min_width=0): """ Convert a list of lists into an object array. @@ -2266,22 +2266,20 @@ def to_object_array(rows: object, int min_width=0): cdef: Py_ssize_t i, j, n, k, tmp ndarray[object, ndim=2] result - list input_rows list row - input_rows = rows - n = len(input_rows) + n = len(rows) k = min_width for i in range(n): - tmp = len(input_rows[i]) + tmp = len(rows[i]) if tmp > k: k = tmp result = np.empty((n, k), dtype=object) for i in range(n): - row = list(input_rows[i]) + row = list(rows[i]) for j in range(len(row)): result[i, j] = row[j] From 2c896f994f39494cf57d15ea53e27a4cd343f0d6 Mon Sep 17 00:00:00 2001 From: Tao He Date: Tue, 26 Mar 2019 12:01:07 +0800 Subject: [PATCH 4/7] Add comment in to_object_array_tuples. --- pandas/_libs/lib.pyx | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 21ad592ac3851..d4a592f7fa691 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2305,6 +2305,24 @@ def tuples_to_object_array(ndarray[object] tuples): def to_object_array_tuples(rows: list): + """ + Convert a list of tuples into an object array. Any subclass of + tuple in `rows` will be casted to tuple. 
+ + Parameters + ---------- + rows : 2-d array (N, K) + A list of tuples to be converted into an array + min_width : int + The minimum width of the object array. If a tuple + in `rows` contains fewer than `width` elements, + the remaining elements in the corresponding row + will all be `NaN`. + + Returns + ------- + obj_array : numpy array of the object dtype + """ cdef: Py_ssize_t i, j, n, k, tmp ndarray[object, ndim=2] result From d9e3fe72957884eebb9a8b4047b3686262e10578 Mon Sep 17 00:00:00 2001 From: Tao He Date: Tue, 26 Mar 2019 20:20:52 +0800 Subject: [PATCH 5/7] Do conversion in cython routines. --- pandas/_libs/lib.pyx | 25 ++++++++++++++++--------- pandas/core/internals/construction.py | 4 ++-- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d4a592f7fa691..4ca184f7d4665 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2245,7 +2245,7 @@ def map_infer(ndarray arr, object f, bint convert=1): return result -def to_object_array(rows: list, int min_width=0): +def to_object_array(rows: object, int min_width=0): """ Convert a list of lists into an object array. @@ -2266,20 +2266,22 @@ def to_object_array(rows: list, int min_width=0): cdef: Py_ssize_t i, j, n, k, tmp ndarray[object, ndim=2] result + list input_rows list row - n = len(rows) + input_rows = list(rows) + n = len(input_rows) k = min_width for i in range(n): - tmp = len(rows[i]) + tmp = len(input_rows[i]) if tmp > k: k = tmp result = np.empty((n, k), dtype=object) for i in range(n): - row = list(rows[i]) + row = list(input_rows[i]) for j in range(len(row)): result[i, j] = row[j] @@ -2304,7 +2306,7 @@ def tuples_to_object_array(ndarray[object] tuples): return result -def to_object_array_tuples(rows: list): +def to_object_array_tuples(rows: object): """ Convert a list of tuples into an object array. Any subclass of tuple in `rows` will be casted to tuple. 
@@ -2326,13 +2328,15 @@ def to_object_array_tuples(rows: list): cdef: Py_ssize_t i, j, n, k, tmp ndarray[object, ndim=2] result + list input_rows tuple row - n = len(rows) + input_rows = list(rows) + n = len(input_rows) k = 0 for i in range(n): - tmp = 1 if checknull(rows[i]) else len(rows[i]) + tmp = 1 if checknull(input_rows[i]) else len(input_rows[i]) if tmp > k: k = tmp @@ -2340,13 +2344,16 @@ def to_object_array_tuples(rows: list): try: for i in range(n): - row = rows[i] + row = input_rows[i] for j in range(len(row)): result[i, j] = row[j] except Exception: # upcast any subclasses to tuple for i in range(n): - row = (rows[i],) if checknull(rows[i]) else tuple(rows[i]) + if (checknull(input_rows[i])): + row = (input_rows[i],) + else: + row = tuple(input_rows[i]) for j in range(len(row)): result[i, j] = row[j] diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index d019677862410..d06be8b70fc96 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -422,10 +422,10 @@ def to_arrays(data, columns, coerce_float=False, dtype=None): def _list_to_arrays(data, columns, coerce_float=False, dtype=None): if len(data) > 0 and isinstance(data[0], tuple): - content = list(lib.to_object_array_tuples(list(data)).T) + content = list(lib.to_object_array_tuples(data).T) else: # list of lists - content = list(lib.to_object_array(list(data)).T) + content = list(lib.to_object_array(data).T) return _convert_object_array(content, columns, dtype=dtype, coerce_float=coerce_float) From e9a0ab3601f1b295c8c38f068a2490742ad56ff5 Mon Sep 17 00:00:00 2001 From: Tao He Date: Fri, 29 Mar 2019 22:46:08 +0800 Subject: [PATCH 6/7] Remove input_rows. 
--- pandas/_libs/lib.pyx | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a7f6cdace3623..3e47e8859b473 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2264,22 +2264,21 @@ def to_object_array(rows: object, int min_width=0): cdef: Py_ssize_t i, j, n, k, tmp ndarray[object, ndim=2] result - list input_rows list row - input_rows = list(rows) - n = len(input_rows) + rows = list(rows) + n = len(rows) k = min_width for i in range(n): - tmp = len(input_rows[i]) + tmp = len(rows[i]) if tmp > k: k = tmp result = np.empty((n, k), dtype=object) for i in range(n): - row = list(input_rows[i]) + row = list(rows[i]) for j in range(len(row)): result[i, j] = row[j] @@ -2326,15 +2325,14 @@ def to_object_array_tuples(rows: object): cdef: Py_ssize_t i, j, n, k, tmp ndarray[object, ndim=2] result - list input_rows tuple row - input_rows = list(rows) - n = len(input_rows) + rows = list(rows) + n = len(rows) k = 0 for i in range(n): - tmp = 1 if checknull(input_rows[i]) else len(input_rows[i]) + tmp = 1 if checknull(rows[i]) else len(rows[i]) if tmp > k: k = tmp @@ -2342,16 +2340,13 @@ def to_object_array_tuples(rows: object): try: for i in range(n): - row = input_rows[i] + row = rows[i] for j in range(len(row)): result[i, j] = row[j] except Exception: # upcast any subclasses to tuple for i in range(n): - if (checknull(input_rows[i])): - row = (input_rows[i],) - else: - row = tuple(input_rows[i]) + row = (rows[i],) if checknull(rows[i]) else tuple(rows[i]) for j in range(len(row)): result[i, j] = row[j] From 4e41de3d7b8aa5fbcb3c2a4316a3d16f140f2725 Mon Sep 17 00:00:00 2001 From: Tao He Date: Sun, 31 Mar 2019 14:34:11 +0800 Subject: [PATCH 7/7] Remove the extra docstring. 
--- pandas/_libs/lib.pyx | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3e47e8859b473..f98f29d178c15 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2311,12 +2311,7 @@ def to_object_array_tuples(rows: object): Parameters ---------- rows : 2-d array (N, K) - A list of tuples to be converted into an array - min_width : int - The minimum width of the object array. If a tuple - in `rows` contains fewer than `width` elements, - the remaining elements in the corresponding row - will all be `NaN`. + A list of tuples to be converted into an array. Returns -------