From 2763717b04f802cd84d7e278fb3ccafaef0d46b6 Mon Sep 17 00:00:00 2001 From: "a.bogachev" Date: Sun, 25 Nov 2018 18:52:35 +0100 Subject: [PATCH 1/6] wip --- pandas/core/frame.py | 7 ++++--- pandas/tests/frame/test_convert_to.py | 28 +++++++++++++++++++++------ 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5104cf815abf6..9fa9cd1721e29 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1227,9 +1227,10 @@ def to_dict(self, orient='dict', into=dict): return into_c((k, com.maybe_box_datetimelike(v)) for k, v in compat.iteritems(self)) elif orient.lower().startswith('r'): - return [into_c((k, com.maybe_box_datetimelike(v)) - for k, v in zip(self.columns, np.atleast_1d(row))) - for row in self.values] + return [ + into_c((k, com.maybe_box_datetimelike(v)) + for k, v in compat.iteritems(row._asdict())) + for row in self.itertuples(index=False)] elif orient.lower().startswith('i'): if not self.index.is_unique: raise ValueError( diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 61fe9d12c173c..76c392b68ece3 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -282,16 +282,32 @@ def test_to_records_datetimeindex_with_tz(self, tz): tm.assert_numpy_array_equal(result, expected) def test_to_dict_box_scalars(self): - # 14216 + # 14216, 23753 # make sure that we are boxing properly - d = {'a': [1], 'b': ['b']} + df = DataFrame({'a': [1, 2], 'b': [.1, .2]}) - result = DataFrame(d).to_dict() - assert isinstance(list(result['a'])[0], (int, long)) - assert isinstance(list(result['b'])[0], (int, long)) + result = df.to_dict() + assert isinstance(result['a'][0], (int, long)) + assert isinstance(result['b'][0], float) - result = DataFrame(d).to_dict(orient='records') + result = df.to_dict(orient='records') + assert isinstance(result[0]['a'], (int, long)) + assert isinstance(result[0]['b'], float) + + result = 
df.to_dict(orient='list') + assert isinstance(result['a'][0], (int, long)) + assert isinstance(result['b'][0], float) + + result = df.to_dict(orient='split') + assert isinstance(result['data'][0][0], (int, long)) + assert isinstance(result['data'][0][1], float) + + result = df.to_dict(orient='index') assert isinstance(result[0]['a'], (int, long)) + assert isinstance(result[0]['b'], float) + + + def test_frame_to_dict_tz(self): # GH18372 When converting to dict with orient='records' columns of From e872f5045291587471bd88a1a22f40c76f8f0c1c Mon Sep 17 00:00:00 2001 From: "a.bogachev" Date: Sun, 25 Nov 2018 19:56:40 +0100 Subject: [PATCH 2/6] tests for all meaningful orientations --- pandas/core/frame.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9fa9cd1721e29..8d7ba10ad795f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1220,9 +1220,9 @@ def to_dict(self, orient='dict', into=dict): elif orient.lower().startswith('sp'): return into_c((('index', self.index.tolist()), ('columns', self.columns.tolist()), - ('data', lib.map_infer(self.values.ravel(), - com.maybe_box_datetimelike) - .reshape(self.values.shape).tolist()))) + ('data', [ + [com.maybe_box_datetimelike(v) for v in t] for t in self.itertuples(index=False)] + ))) elif orient.lower().startswith('s'): return into_c((k, com.maybe_box_datetimelike(v)) for k, v in compat.iteritems(self)) From 20c0d6658a7e15b9b56e9bca6f77ea8ea25eb4a7 Mon Sep 17 00:00:00 2001 From: "a.bogachev" Date: Mon, 26 Nov 2018 10:37:33 +0100 Subject: [PATCH 3/6] add whatsnew --- doc/source/whatsnew/v0.23.5.txt | 2 +- pandas/core/frame.py | 17 +++++++++++------ pandas/tests/frame/test_convert_to.py | 9 +++------ 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.23.5.txt b/doc/source/whatsnew/v0.23.5.txt index 8f4b1a13c2e9d..c1716ff59c171 100644 --- a/doc/source/whatsnew/v0.23.5.txt +++ 
b/doc/source/whatsnew/v0.23.5.txt @@ -42,7 +42,6 @@ Bug Fixes **Groupby/Resample/Rolling** - Bug in :meth:`DataFrame.resample` when resampling ``NaT`` in ``TimeDeltaIndex`` (:issue:`13223`). -- **Missing** @@ -52,3 +51,4 @@ Bug Fixes **I/O** - Bug in :func:`read_csv` that caused it to raise ``OverflowError`` when trying to use 'inf' as ``na_value`` with integer index column (:issue:`17128`) +- Bug in :meth:`DataFrame.to_dict` when the result dict contains non-Python scalars (:issue:`23753`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8d7ba10ad795f..e5bb9239617c7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -280,6 +280,7 @@ Index(['value'], dtype='object') """ + # ----------------------------------------------------------------------- # DataFrame class @@ -1221,7 +1222,8 @@ def to_dict(self, orient='dict', into=dict): return into_c((('index', self.index.tolist()), ('columns', self.columns.tolist()), ('data', [ - [com.maybe_box_datetimelike(v) for v in t] for t in self.itertuples(index=False)] + list(map(com.maybe_box_datetimelike, t)) + for t in self.itertuples(index=False)] ))) elif orient.lower().startswith('s'): return into_c((k, com.maybe_box_datetimelike(v)) @@ -1229,8 +1231,8 @@ def to_dict(self, orient='dict', into=dict): elif orient.lower().startswith('r'): return [ into_c((k, com.maybe_box_datetimelike(v)) - for k, v in compat.iteritems(row._asdict())) - for row in self.itertuples(index=False)] + for k, v in compat.iteritems(row._asdict())) + for row in self.itertuples(index=False)] elif orient.lower().startswith('i'): if not self.index.is_unique: raise ValueError( @@ -2654,6 +2656,7 @@ def _get_value(self, index, col, takeable=False): col = self.columns.get_loc(col) index = self.index.get_loc(index) return self._get_value(index, col, takeable=True) + _get_value.__doc__ = get_value.__doc__ def set_value(self, index, col, value, takeable=False): @@ -2698,6 +2701,7 @@ def _set_value(self, index, col, value, 
takeable=False): self._item_cache.pop(col, None) return self + _set_value.__doc__ = set_value.__doc__ def _ixs(self, i, axis=0): @@ -3161,6 +3165,7 @@ def select_dtypes(self, include=None, exclude=None): 4 True 1.0 5 False 2.0 """ + def _get_info_slice(obj, indexer): """Slice the info axis of `obj` with `indexer`.""" if not hasattr(obj, '_info_axis_number'): @@ -6045,9 +6050,9 @@ def diff(self, periods=1, axis=0): # Function application def _gotitem(self, - key, # type: Union[str, List[str]] - ndim, # type: int - subset=None # type: Union[Series, DataFrame, None] + key, # type: Union[str, List[str]] + ndim, # type: int + subset=None # type: Union[Series, DataFrame, None] ): # type: (...) -> Union[Series, DataFrame] """ diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 76c392b68ece3..b6ede3972a1c7 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -150,7 +150,7 @@ def test_to_records_index_name(self): def test_to_records_with_unicode_index(self): # GH13172 # unicode_literals conflict with to_records - result = DataFrame([{u'a': u'x', u'b': 'y'}]).set_index(u'a')\ + result = DataFrame([{u'a': u'x', u'b': 'y'}]).set_index(u'a') \ .to_records() expected = np.rec.array([('x', 'y')], dtype=[('a', 'O'), ('b', 'O')]) tm.assert_almost_equal(result, expected) @@ -284,7 +284,7 @@ def test_to_records_datetimeindex_with_tz(self, tz): def test_to_dict_box_scalars(self): # 14216, 23753 # make sure that we are boxing properly - df = DataFrame({'a': [1, 2], 'b': [.1, .2]}) + df = DataFrame({'a': [1, 2], 'b': [.1, .2]}) result = df.to_dict() assert isinstance(result['a'][0], (int, long)) @@ -306,14 +306,11 @@ def test_to_dict_box_scalars(self): assert isinstance(result[0]['a'], (int, long)) assert isinstance(result[0]['b'], float) - - - def test_frame_to_dict_tz(self): # GH18372 When converting to dict with orient='records' columns of # datetime that are tz-aware were not converted to 
required arrays data = [(datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),), - (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc,),)] + (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc, ),)] df = DataFrame(list(data), columns=["d", ]) result = df.to_dict(orient='records') From dd5313b70fb339f1041e6b9ea6b2d641fea803c9 Mon Sep 17 00:00:00 2001 From: "a.bogachev" Date: Mon, 26 Nov 2018 20:53:32 +0100 Subject: [PATCH 4/6] Fixes: formatting + test parametrization --- doc/source/whatsnew/v0.23.5.txt | 1 - doc/source/whatsnew/v0.24.0.rst | 2 ++ pandas/core/frame.py | 10 +++----- pandas/tests/frame/test_convert_to.py | 34 ++++++++++----------------- 4 files changed, 17 insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v0.23.5.txt b/doc/source/whatsnew/v0.23.5.txt index c1716ff59c171..250f56316b594 100644 --- a/doc/source/whatsnew/v0.23.5.txt +++ b/doc/source/whatsnew/v0.23.5.txt @@ -51,4 +51,3 @@ Bug Fixes **I/O** - Bug in :func:`read_csv` that caused it to raise ``OverflowError`` when trying to use 'inf' as ``na_value`` with integer index column (:issue:`17128`) -- Bug in :meth:`DataFrame.to_dict` when the result dict contains non-Python scalars (:issue:`23753`) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 4ff3cc728f7f7..6ee1a8f93a3ea 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1334,6 +1334,8 @@ MultiIndex I/O ^^^ +- Bug in :meth:`DataFrame.to_dict` when the resulting dict contains non-Python scalars in the case of numeric data (:issue:`23753`) + .. 
_whatsnew_0240.bug_fixes.nan_with_str_dtype: Proper handling of `np.NaN` in a string data-typed column with the Python engine diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e5bb9239617c7..2caec0f6a6c4d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -280,7 +280,6 @@ Index(['value'], dtype='object') """ - # ----------------------------------------------------------------------- # DataFrame class @@ -2656,7 +2655,6 @@ def _get_value(self, index, col, takeable=False): col = self.columns.get_loc(col) index = self.index.get_loc(index) return self._get_value(index, col, takeable=True) - _get_value.__doc__ = get_value.__doc__ def set_value(self, index, col, value, takeable=False): @@ -2701,7 +2699,6 @@ def _set_value(self, index, col, value, takeable=False): self._item_cache.pop(col, None) return self - _set_value.__doc__ = set_value.__doc__ def _ixs(self, i, axis=0): @@ -3165,7 +3162,6 @@ def select_dtypes(self, include=None, exclude=None): 4 True 1.0 5 False 2.0 """ - def _get_info_slice(obj, indexer): """Slice the info axis of `obj` with `indexer`.""" if not hasattr(obj, '_info_axis_number'): @@ -6050,9 +6046,9 @@ def diff(self, periods=1, axis=0): # Function application def _gotitem(self, - key, # type: Union[str, List[str]] - ndim, # type: int - subset=None # type: Union[Series, DataFrame, None] + key, # type: Union[str, List[str]] + ndim, # type: int + subset=None # type: Union[Series, DataFrame, None] ): # type: (...) 
-> Union[Series, DataFrame] """ diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index b6ede3972a1c7..24b7a401f15d1 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -281,36 +281,26 @@ def test_to_records_datetimeindex_with_tz(self, tz): # both converted to UTC, so they are equal tm.assert_numpy_array_equal(result, expected) - def test_to_dict_box_scalars(self): + @pytest.mark.parametrize('orient,item_getter', + [('dict', lambda d, col, idx: d[col][idx]), + ('records', lambda d, col, idx: d[idx][col]), + ('list', lambda d, col, idx: d[col][idx]), + ('split', lambda d, col, idx: d['data'][idx][d['columns'].index(col)]), + ('index', lambda d, col, idx: d[idx][col]) + ]) + def test_to_dict_box_scalars(self, orient, item_getter): # 14216, 23753 # make sure that we are boxing properly df = DataFrame({'a': [1, 2], 'b': [.1, .2]}) - - result = df.to_dict() - assert isinstance(result['a'][0], (int, long)) - assert isinstance(result['b'][0], float) - - result = df.to_dict(orient='records') - assert isinstance(result[0]['a'], (int, long)) - assert isinstance(result[0]['b'], float) - - result = df.to_dict(orient='list') - assert isinstance(result['a'][0], (int, long)) - assert isinstance(result['b'][0], float) - - result = df.to_dict(orient='split') - assert isinstance(result['data'][0][0], (int, long)) - assert isinstance(result['data'][0][1], float) - - result = df.to_dict(orient='index') - assert isinstance(result[0]['a'], (int, long)) - assert isinstance(result[0]['b'], float) + result = df.to_dict(orient=orient) + assert isinstance(item_getter(result, 'a', 0), (int, long)) + assert isinstance(item_getter(result, 'b', 0), float) def test_frame_to_dict_tz(self): # GH18372 When converting to dict with orient='records' columns of # datetime that are tz-aware were not converted to required arrays data = [(datetime(2017, 11, 18, 21, 53, 0, 219225, tzinfo=pytz.utc),), - (datetime(2017, 11, 
18, 22, 6, 30, 61810, tzinfo=pytz.utc, ),)] + (datetime(2017, 11, 18, 22, 6, 30, 61810, tzinfo=pytz.utc,),)] df = DataFrame(list(data), columns=["d", ]) result = df.to_dict(orient='records') From 0bd960eb579f1120c17b554640e9707f7dab0c7e Mon Sep 17 00:00:00 2001 From: "a.bogachev" Date: Tue, 27 Nov 2018 20:07:55 +0100 Subject: [PATCH 5/6] Another set of fixes --- doc/source/whatsnew/v0.23.5.txt | 1 + doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/core/frame.py | 9 +++++---- pandas/tests/frame/test_convert_to.py | 14 ++++++++------ 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v0.23.5.txt b/doc/source/whatsnew/v0.23.5.txt index 250f56316b594..8f4b1a13c2e9d 100644 --- a/doc/source/whatsnew/v0.23.5.txt +++ b/doc/source/whatsnew/v0.23.5.txt @@ -42,6 +42,7 @@ Bug Fixes **Groupby/Resample/Rolling** - Bug in :meth:`DataFrame.resample` when resampling ``NaT`` in ``TimeDeltaIndex`` (:issue:`13223`). +- **Missing** diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 6ee1a8f93a3ea..823314fe9826e 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1334,7 +1334,6 @@ MultiIndex I/O ^^^ -- Bug in :meth:`DataFrame.to_dict` when the resulting dict contains non-Python scalars in the case of numeric data (:issue:`23753`) .. 
_whatsnew_0240.bug_fixes.nan_with_str_dtype: @@ -1397,6 +1396,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`) - Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`) - Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`) +- Bug in :meth:`DataFrame.to_dict` when the resulting dict contains non-Python scalars in the case of numeric data (:issue:`23753`) Plotting ^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2caec0f6a6c4d..d709630aebc60 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1117,6 +1117,7 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None): return cls(data, index=index, columns=columns, dtype=dtype) + def to_dict(self, orient='dict', into=dict): """ Convert the DataFrame to a dictionary. 
@@ -1184,10 +1185,10 @@ def to_dict(self, orient='dict', into=dict): >>> df.to_dict('split') {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], - 'data': [[1.0, 0.5], [2.0, 0.75]]} + 'data': [[1, 0.5], [2, 0.75]]} >>> df.to_dict('records') - [{'col1': 1.0, 'col2': 0.5}, {'col1': 2.0, 'col2': 0.75}] + [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] >>> df.to_dict('index') {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} @@ -1203,8 +1204,8 @@ def to_dict(self, orient='dict', into=dict): >>> dd = defaultdict(list) >>> df.to_dict('records', into=dd) - [defaultdict(<class 'list'>, {'col1': 1.0, 'col2': 0.5}), - defaultdict(<class 'list'>, {'col1': 2.0, 'col2': 0.75})] + [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}), + defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})] """ if not self.columns.is_unique: warnings.warn("DataFrame columns are not unique, some " diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index 24b7a401f15d1..ea983193d8c11 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -281,12 +281,14 @@ def test_to_records_datetimeindex_with_tz(self, tz): # both converted to UTC, so they are equal tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize('orient,item_getter', - [('dict', lambda d, col, idx: d[col][idx]), - ('records', lambda d, col, idx: d[idx][col]), - ('list', lambda d, col, idx: d[col][idx]), - ('split', lambda d, col, idx: d['data'][idx][d['columns'].index(col)]), - ('index', lambda d, col, idx: d[idx][col]) + # orient - orient argument to to_dict function + # item_getter - function for extracting value from resulting dict using column name and index + @pytest.mark.parametrize('orient,item_getter', [ + ('dict', lambda d, col, idx: d[col][idx]), + ('records', lambda d, col, idx: d[idx][col]), + ('list', lambda d, col, idx: d[col][idx]), + ('split', lambda d, col, idx: d['data'][idx][d['columns'].index(col)]), + ('index', lambda d, col, idx: 
d[idx][col]) ]) def test_to_dict_box_scalars(self, orient, item_getter): # 14216, 23753 From dd4833f5571293ae219ade4f9a4eadfe6fbd240b Mon Sep 17 00:00:00 2001 From: "a.bogachev" Date: Wed, 28 Nov 2018 16:07:04 +0100 Subject: [PATCH 6/6] fix some linting errors --- pandas/core/frame.py | 1 - pandas/tests/frame/test_convert_to.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d709630aebc60..5a31e25cffe95 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1117,7 +1117,6 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None): return cls(data, index=index, columns=columns, dtype=dtype) - def to_dict(self, orient='dict', into=dict): """ Convert the DataFrame to a dictionary. diff --git a/pandas/tests/frame/test_convert_to.py b/pandas/tests/frame/test_convert_to.py index ea983193d8c11..f1eb6a33eddeb 100644 --- a/pandas/tests/frame/test_convert_to.py +++ b/pandas/tests/frame/test_convert_to.py @@ -282,7 +282,8 @@ def test_to_records_datetimeindex_with_tz(self, tz): tm.assert_numpy_array_equal(result, expected) # orient - orient argument to to_dict function - # item_getter - function for extracting value from resulting dict using column name and index + # item_getter - function for extracting value from + # the resulting dict using column name and index @pytest.mark.parametrize('orient,item_getter', [ ('dict', lambda d, col, idx: d[col][idx]), ('records', lambda d, col, idx: d[idx][col]),