Regression in DataFrame.sum with mixed datetime, numeric and missing values #30886

TomAugspurger · 2020-01-10T14:36:13Z

On 0.25.3

In [3]: df = pd.DataFrame({"A": pd.date_range("2000", periods=4), "B": [1, 2, 3, 4]}).reindex([2, 3, 4])

In [4]: df.sum()

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
TypeError: unsupported operand type(s) for +: 'Timestamp' and 'Timestamp'

The above exception was the direct cause of the following exception:

SystemError                               Traceback (most recent call last)
<ipython-input-4-7e5fdb616c56> in <module>
----> 1 df.sum()

~/sandbox/pandas/pandas/core/generic.py in stat_func(self, axis, skipna, level, numeric_only, min_count, **kwargs)
  11035             skipna=skipna,
  11036             numeric_only=numeric_only,
> 11037             min_count=min_count,
  11038         )
  11039

~/sandbox/pandas/pandas/core/frame.py in _reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
   7885             values = self.values
   7886             try:
-> 7887                 result = f(values)
   7888
   7889                 if filter_type == "bool" and is_object_dtype(values) and axis is None:

~/sandbox/pandas/pandas/core/frame.py in f(x)
   7843
   7844         def f(x):
-> 7845             return op(x, axis=axis, skipna=skipna, **kwds)
   7846
   7847         def _get_data(axis_matters):

~/sandbox/pandas/pandas/core/nanops.py in _f(*args, **kwargs)
     67             try:
     68                 with np.errstate(invalid="ignore"):
---> 69                     return f(*args, **kwargs)
     70             except ValueError as e:
     71                 # we want to transform an object array

~/sandbox/pandas/pandas/core/nanops.py in nansum(values, axis, skipna, min_count, mask)
    491     elif is_timedelta64_dtype(dtype):
    492         dtype_sum = np.float64
--> 493     the_sum = values.sum(axis, dtype=dtype_sum)
    494     the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count)
    495

~/Envs/pandas-dev/lib/python3.7/site-packages/numpy/core/_methods.py in _sum(a, axis, dtype, out, keepdims, initial, where)
     36 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
     37          initial=_NoValue, where=True):
---> 38     return umr_sum(a, axis, dtype, out, keepdims, initial, where)
     39
     40 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,

~/sandbox/pandas/pandas/_libs/tslibs/c_timestamp.pyx in pandas._libs.tslibs.c_timestamp._Timestamp.__add__()

~/sandbox/pandas/pandas/_libs/tslibs/c_timestamp.pyx in pandas._libs.tslibs.c_timestamp.integer_op_not_supported()

SystemError: <class 'TypeError'> returned a result with an error set

On 0.25.x

In [1]: import pandas as pd
df
In [2]: df = pd.DataFrame({"A": pd.date_range("2000", periods=4), "B": [1, 2, 3, 4]}).reindex([2, 3, 4])

In [3]: df.sum()
Out[3]:
B    7.0
dtype: float64

cc @jbrockmendel

jbrockmendel · 2020-01-10T16:46:47Z

huh, the fact that the reindex is necessary is weird. I'll take a look a this.

TomAugspurger · 2020-01-10T17:31:45Z

I assume it's the introduction of the missing values, but haven't looked closely.

jbrockmendel · 2020-01-10T21:38:20Z

Tentatively looks like moving from fstring back to .format in integer_op_not_supported fixes this

scoder · 2020-01-11T06:43:47Z

Could you paste the C code that is generated by Cython for the f-string and the return TypeError(…) ?

jbrockmendel · 2020-01-11T17:23:22Z

Could you paste the C code that is generated by Cython for the f-string and the return TypeError(…) ?

fstring version:


  /* "pandas/_libs/tslibs/c_timestamp.pyx":61
 * 
 *     int_addsub_msg = (
 *         f"Addition/subtraction of integers and integer-arrays with {cls} is "             # <<<<<<<<<<<<<<
 *         "no longer supported.  Instead of adding/subtracting `n`, "
 *         "use `n * obj.freq`"
 */
  __pyx_t_1 = PyTuple_New(3); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 61, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_t_2 = 0;
  __pyx_t_3 = 127;
  __Pyx_INCREF(__pyx_kp_u_Addition_subtraction_of_integers);
  __pyx_t_2 += 57;
  __Pyx_GIVEREF(__pyx_kp_u_Addition_subtraction_of_integers);
  PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_kp_u_Addition_subtraction_of_integers);
  __pyx_t_4 = __Pyx_PyObject_FormatSimple(__pyx_v_cls, __pyx_empty_unicode); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 61, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_4);
  __pyx_t_3 = (__Pyx_PyUnicode_MAX_CHAR_VALUE(__pyx_t_4) > __pyx_t_3) ? __Pyx_PyUnicode_MAX_CHAR_VALUE(__pyx_t_4) : __pyx_t_3;
  __pyx_t_2 += __Pyx_PyUnicode_GET_LENGTH(__pyx_t_4);
  __Pyx_GIVEREF(__pyx_t_4);
  PyTuple_SET_ITEM(__pyx_t_1, 1, __pyx_t_4);
  __pyx_t_4 = 0;
  __Pyx_INCREF(__pyx_kp_u_is_no_longer_supported_Instead);
  __pyx_t_2 += 79;
  __Pyx_GIVEREF(__pyx_kp_u_is_no_longer_supported_Instead);
  PyTuple_SET_ITEM(__pyx_t_1, 2, __pyx_kp_u_is_no_longer_supported_Instead);
  __pyx_t_4 = __Pyx_PyUnicode_Join(__pyx_t_1, 3, __pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 61, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_4);
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __pyx_v_int_addsub_msg = ((PyObject*)__pyx_t_4);
  __pyx_t_4 = 0;

  /* "pandas/_libs/tslibs/c_timestamp.pyx":65
 *         "use `n * obj.freq`"
 *     )
 *     return TypeError(int_addsub_msg)             # <<<<<<<<<<<<<<
 * 
 * 
 */
  __Pyx_XDECREF(__pyx_r);
  __pyx_t_4 = __Pyx_PyObject_CallOneArg(__pyx_builtin_TypeError, __pyx_v_int_addsub_msg); if (unlikely(!__pyx_t_4)) __PYX_ERR(0, 65, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_4);
  __pyx_r = __pyx_t_4;
  __pyx_t_4 = 0;
  goto __pyx_L0;

.format version

  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(((PyObject *)Py_TYPE(__pyx_v_obj)), __pyx_n_s_name); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 58, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_v_cls = __pyx_t_1;
  __pyx_t_1 = 0;

  /* "pandas/_libs/tslibs/c_timestamp.pyx":65
 *         "no longer supported.  Instead of adding/subtracting `n`, "
 *         "use `n * obj.freq`"
 *     ).format(cls=cls)             # <<<<<<<<<<<<<<
 *     return TypeError(int_addsub_msg)
 * 
 */
  __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_kp_u_Addition_subtraction_of_integers, __pyx_n_s_format); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 65, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_1);
  __pyx_t_2 = __Pyx_PyDict_NewPresized(1); if (unlikely(!__pyx_t_2)) __PYX_ERR(0, 65, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_2);
  if (PyDict_SetItem(__pyx_t_2, __pyx_n_s_cls, __pyx_v_cls) < 0) __PYX_ERR(0, 65, __pyx_L1_error)
  __pyx_t_3 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_empty_tuple, __pyx_t_2); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 65, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;
  __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0;
  __pyx_v_int_addsub_msg = __pyx_t_3;
  __pyx_t_3 = 0;

  /* "pandas/_libs/tslibs/c_timestamp.pyx":66
 *         "use `n * obj.freq`"
 *     ).format(cls=cls)
 *     return TypeError(int_addsub_msg)             # <<<<<<<<<<<<<<
 * 
 * 
 */
  __Pyx_XDECREF(__pyx_r);
  __pyx_t_3 = __Pyx_PyObject_CallOneArg(__pyx_builtin_TypeError, __pyx_v_int_addsub_msg); if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 66, __pyx_L1_error)
  __Pyx_GOTREF(__pyx_t_3);
  __pyx_r = __pyx_t_3;
  __pyx_t_3 = 0;
  goto __pyx_L0;

scoder · 2020-01-12T06:45:21Z

Thanks. Both look completely correct to me. Even the call to TypeError is exactly the same, I can see no reason why one would raise a SystemError and the other wouldn't.

@TomAugspurger, are both tests using the exact same CPython installation and version? Can you confirm that the only difference is the Pandas version?

@jbrockmendel, can you confirm that the C code snippets you pasted are those that allowed you to reproduce the differing behaviour?

jbrockmendel · 2020-01-12T16:22:28Z

can you confirm that the C code snippets you pasted are those that allowed you to reproduce the differing behaviour?

Just double checked: the pasted code matches what I have locally for master and #30905, respectively, which reproduce the differing behavior.

scoder · 2020-01-29T14:35:15Z

the fact that the reindex is necessary is weird

I agree. May suggest that the problem is somewhere else entirely. From what I gather, the creation of the TypeError isn't the place where things go wrong. It's only the call that happens to detect that an exception was set earlier without being reported and now reports the inconsistent system state.

It could be that the .format() accidentally clears the error state along the way without looking at it, so that the TypeError creation doesn't find it any more and succeeds normally. My guess is that the f-string version is correct in reporting that there is an issue, and that going back to the .format() just covers it up without solving it.

Could someone run both versions in a CPython debug build to see if that's more informative?

jbrockmendel · 2020-01-29T16:45:02Z

To get things rolling, I can reproduce without the reindex, but it does appear to require NaT:

(this is on 8bdd7b1)

dti = pd.date_range('2000', periods=4).append(pd.Index([pd.NaT]))
df = pd.DataFrame({"A": dti, "B": [1, 2, 3, 4, 5]})

>>> df.sum()
[...]
SystemError: <class 'TypeError'> returned a result with an error set

>>> df.iloc[:-1].sum()
B    10.0
dtype: float64

The traceback for df.sum() includes a result = f(values) in DataFrame._reduce (core.frame L7871, corresponds to L7976 in master). This is inside a "try/except TypeError" but I don't see any obvious candidates for what would have raised and been ignored before we got here

A Cpython debug build is going to have tom wait until after a bunch more caffeine

jbrockmendel · 2020-01-29T23:20:00Z

Digging in a little further, it looks like it has something to do with whether the ndarray we end up summing in nanops.nansum is F-contiguous or not. The relevant call ends up being values.sum(0)

dti = pd.date_range('2000', periods=4).append(pd.Index([pd.NaT]))
df = pd.DataFrame({"A": dti, "B": [1, 2, 3, 4, 5]})

values = df.values
values[-1, 0] = 0   #  this is effectively done inside nanops.nansum, is necessary here

>>> values.flags["F_CONTIGUOUS"]
True
>>> values.sum(0)  
TypeError: unsupported operand type(s) for +: 'Timestamp' and 'Timestamp'

values2 = np.array(values, order="C")  # like the array produced by nanops._get_values

>>> values2.flags["F_CONTIGUOUS"]
False
>>> values2.sum(0)
SystemError: <class 'TypeError'> returned a result with an error set

@scoder is this verging on being helpful? Worth pinging one of our numpy friends?

scoder · 2020-01-30T05:39:19Z

@jbrockmendel ah, yes, that looks like a bug in NumPy then. Could just be a missing error return on their side. Worth pinging them, absolutely.

TomAugspurger added this to the 1.0.0 milestone Jan 10, 2020

TomAugspurger added Numeric Operations Arithmetic, Comparison, and Logical operations Regression Functionality that used to work in a prior pandas version Datetime Datetime data dtype labels Jan 10, 2020

jbrockmendel self-assigned this Jan 10, 2020

jbrockmendel mentioned this issue Jan 10, 2020

BUG: SystemError in df.sum #30905

Merged

5 tasks

TomAugspurger closed this as completed in #30905 Jan 15, 2020

jbrockmendel mentioned this issue Feb 4, 2020

SystemError: Exception set without being reported? numpy/numpy#15502

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Regression in DataFrame.sum with mixed datetime, numeric and missing values #30886

Regression in DataFrame.sum with mixed datetime, numeric and missing values #30886

TomAugspurger commented Jan 10, 2020

jbrockmendel commented Jan 10, 2020

TomAugspurger commented Jan 10, 2020

jbrockmendel commented Jan 10, 2020 •

edited

Loading

scoder commented Jan 11, 2020

jbrockmendel commented Jan 11, 2020

scoder commented Jan 12, 2020

jbrockmendel commented Jan 12, 2020

scoder commented Jan 29, 2020

jbrockmendel commented Jan 29, 2020

jbrockmendel commented Jan 29, 2020

scoder commented Jan 30, 2020

Regression in DataFrame.sum with mixed datetime, numeric and missing values #30886

Regression in DataFrame.sum with mixed datetime, numeric and missing values #30886

Comments

TomAugspurger commented Jan 10, 2020

jbrockmendel commented Jan 10, 2020

TomAugspurger commented Jan 10, 2020

jbrockmendel commented Jan 10, 2020 • edited Loading

scoder commented Jan 11, 2020

jbrockmendel commented Jan 11, 2020

scoder commented Jan 12, 2020

jbrockmendel commented Jan 12, 2020

scoder commented Jan 29, 2020

jbrockmendel commented Jan 29, 2020

jbrockmendel commented Jan 29, 2020

scoder commented Jan 30, 2020

jbrockmendel commented Jan 10, 2020 •

edited

Loading