Commit 5036c4f

REF: Refactor DataFrame dict constructor
When passing a dict and `columns=` to DataFrame, we previously passed the dict of {column: array} to the Series constructor. This eventually hit `construct_1d_object_array_from_listlike` [1]. For extension arrays, this ends up calling `ExtensionArray.__iter__`, iterating over the elements of the ExtensionArray, which is prohibitively slow.

We try to properly handle all the edge cases that we were papering over earlier by just passing the `data` to Series. We fix a bug or two along the way, but don't change any *tested* behavior, even if it looks fishy (e.g. pandas-dev#24385).

[1]: pandas-dev#24368 (comment)

Closes pandas-dev#24368
Closes pandas-dev#24386
1 parent a022bae commit 5036c4f
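For context, a minimal sketch (not part of the commit) of the case the commit message describes: a dict value that is an extension array, passed together with an explicit `columns=` argument. The names and sizes below are illustrative.

    import pandas as pd

    # Previously this was routed through the Series constructor, which could
    # fall back to construct_1d_object_array_from_listlike and iterate the
    # ExtensionArray element by element; the refactored init_dict maps each
    # dict key to its column position and keeps the array intact.
    cat = pd.Categorical(["a", "b"] * 50000)
    df = pd.DataFrame({"A": cat}, columns=["A"])
    print(df.dtypes)  # A    category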

3 files changed: +105 -32 lines changed

doc/source/whatsnew/v0.24.0.rst (+2 -1)

@@ -1401,10 +1401,11 @@ Numeric
 - Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`)
 - Checking PEP 3141 numbers in :func:`~pandas.api.types.is_scalar` function returns ``True`` (:issue:`22903`)
 
-Conversion
+Conversion
 ^^^^^^^^^^
 
 - Bug in :meth:`DataFrame.combine_first` in which column types were unexpectedly converted to float (:issue:`20699`)
+- Bug in :meth:`DataFrame.__init__` when providing a ``dict`` data, ``columns`` that don't overlap with the keys in ``data``, and an integer ``dtype`` returning a DataFrame with floating-point values (:issue:`24386`)
 
 Strings
 ^^^^^^^
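A spelled-out instance of the whatsnew entry above (GH 24386), mirroring the test added in this commit; treat it as an illustration rather than committed code.

    import numpy as np
    import pandas as pd

    # The dict key "A" does not overlap with the requested columns, so column
    # "B" is created empty; with this fix the explicit integer dtype is
    # honoured rather than being upcast to float.
    df = pd.DataFrame({"A": [1, 2]}, columns=["B"], dtype=np.int64)
    print(df.dtypes["B"])  # int64 (float64 before the fix)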

pandas/core/internals/construction.py (+91 -31)

@@ -21,7 +21,8 @@
 from pandas.core.dtypes.common import (
     is_categorical_dtype, is_datetime64tz_dtype, is_dtype_equal,
     is_extension_array_dtype, is_extension_type, is_float_dtype,
-    is_integer_dtype, is_iterator, is_list_like, is_object_dtype, pandas_dtype)
+    is_integer_dtype, is_iterator, is_list_like, is_object_dtype,
+    is_string_dtype, pandas_dtype)
 from pandas.core.dtypes.generic import (
     ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex, ABCSeries,
     ABCTimedeltaIndex)
@@ -173,49 +174,108 @@ def init_dict(data, index, columns, dtype=None):
     """
     from pandas.core.series import Series
 
+    # Converting a dict of arrays to a list of arrays sounds easy enough,
+    # right? Well, it's a bit more nuanced than that. Some problems:
+    # 1. Pandas allows missing values in the keys. If a user provides a dict
+    #    where the keys never compare equal (np.nan, pd.NaT, float('nan')),
+    #    we can't ever do a `data[key]`. So we *have* to iterate over the
+    #    key, value pairs of `data`, no way around it.
+    # 2. The key, value pairs of `data` may have
+    #    1. A subset of the desired columns
+    #    2. A superset of the columns
+    #    3. Just the right columns
+    #    And may or may not be in the right order (or ordered, period).
+    #    So we need to get a mapping from `key in data -> position`.
+    # 3. Inconsistencies between the Series and DataFrame constructors
+    #    w.r.t. dtypes make for a lot of special casing later on.
     if columns is None:
         columns = list(data)
 
     if not isinstance(columns, Index):
         columns = Index(columns, copy=False)
 
-    if columns.nlevels > 1:
-        # MultiIndex.__iter__ may be incorrect for integer levels
-        # with some missing values. The integer values are cast to
-        # float. The upshot is that we can't look up keys from the
-        # dict below.
-        column_iter = (columns[i] for i in range(len(columns)))
+    if data:
+        normalized_keys = Index(data, copy=False)
+        positions = Series(columns.get_indexer_for(normalized_keys),
+                           index=normalized_keys)
     else:
-        column_iter = iter(columns)
-
-    new_data = type(data)()  # dict or OrderedDict
-    sentinel = object()
-
-    for key in column_iter:
-        # We use an object() sentinel for two reasons:
-        # 1. We avoid having to allocate the Series in each iteration
-        # 2. We can use data.get(key, None), since the user is allowed
-        #    to pass DataFrame({"A": None}, index=[...]), which is
-        #    different from DataFrame({"A": Series(None, index=[...])})
-        #    which is probably a bug.
-        val = data.get(key, sentinel)
-
-        if val is sentinel:
-            val = Series(index=index, dtype=dtype)
-        elif val is None:
-            val = Series([None] * len(index), index=index,
-                         dtype=dtype or object)
+        positions = Series()
+
+    new_data = {}
+    index_len = 0 if index is None else len(index)
+
+    for key, val in data.items():
+        position = positions[key]
+        if position < 0:
+            # Something like data={"A": [...]}, columns={"B"}
+            continue
         if (isinstance(val, ABCDatetimeIndex) and
                 data[key].tz is not None):
             # GH#24096 need copy to be deep for datetime64tz case
             # TODO: See if we can avoid these copies
-            val = val[key].copy(deep=True)
+            val = val.copy(deep=True)
 
-        new_data[key] = val
-
-    keys = com.dict_keys_to_ordered_list(new_data)
+        elif val is None:
+            # Users may provide scalars as values. These are broadcast to
+            # the correct shape to align with `index`. We would use the
+            # Series constructor, but Series(None, index=index) is converted
+            # to NaNs. In DataFrame,
+            #     DataFrame({"A": None}, index=[1, 2], columns=["A"])
+            # is an array of Nones.
+            val = Series([None] * index_len, index=index,
+                         dtype=dtype or object)
 
-    arrays = [new_data[k] for k in keys]
+        elif index_len and lib.is_scalar(val):
+            val = Series(val, index=index, dtype=dtype)
+
+        new_data[position] = val
+
+    # OK, so the user-provided columns in `data` are taken care of. Let's
+    # move on to the "extra" columns as defined by `columns`. First, we
+    # figure out the positions of the holes we're filling in.
+    extra_positions = np.arange(len(columns))
+    mask = np.isin(extra_positions, positions, invert=True)
+    extra_positions = extra_positions[mask]
+
+    # And now, what should the dtype of these new guys be? Well, that's a
+    # little tricky.
+    # 1. User provided a dtype: just use that...
+    #    unless the user provided dtype=int and an index (GH-24385).
+    # 2. Empty data.keys() & columns is object (unless specified by the user).
+    # 3. No data and no dtype is object (unless specified by the user).
+
+    # https://github.com/pandas-dev/pandas/issues/24385
+    # Series(None, dtype=int) and DataFrame(None, dtype=dtype)
+    # differ when the index is provided.
+    # But if dtype is not provided, then we fall back to object.
+    # We have to pass this dtype through to arrays_to_mgr.
+
+    # Some things I'd like to change.
+    # With DataFrame(None, index=[1], columns=['a'], dtype=dtype):
+    #     for dtype=object, the result is object,
+    #     but for dtype=int, the result is float.
+    empty_columns = len(positions.index & columns) == 0
+
+    if empty_columns and dtype is None:
+        dtype = object
+    elif (index_len
+          and is_integer_dtype(dtype)):
+        # That's one complicated condition:
+        # DataFrame(None, index=idx, columns=cols, dtype=int) must be float
+        # DataFrame(None, index=idx, columns=cols, dtype=object) is object
+        # DataFrame({'a': 2}, columns=['b']) is object (empty)
+        dtype = float
+    elif not data and dtype is None:
+        dtype = np.dtype('object')
+
+    for position in extra_positions:
+        new_data[position] = Series(index=index, dtype=dtype)
+
+    arrays = [new_data[i] for i in range(len(columns))]
+
+    # hrm, this probably belongs in arrays_to_mgr...
+    if is_string_dtype(dtype) and not is_categorical_dtype(dtype):
+        dtype = np.dtype("object")
 
     return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
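A short sketch of the key-to-position mapping the new `init_dict` relies on. It is simplified: the real code passes the dict straight to `Index` and special-cases empty `data`; the variable names below are illustrative.

    from pandas import Index, Series

    # Build an Index from the dict's keys and ask the target columns where
    # each key lands; -1 marks keys that are not among the requested columns,
    # and any column position never hit is a "hole" later filled with an
    # empty Series of the resolved dtype.
    data = {"A": [1, 2], "C": [3, 4]}
    columns = Index(["A", "B"])
    keys = Index(list(data))
    positions = Series(columns.get_indexer_for(keys), index=keys)
    print(positions)
    # A    0    ("A" maps to column position 0)
    # C   -1    ("C" is not in columns, so it is skipped)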

pandas/tests/frame/test_constructors.py (+12 -0)

@@ -816,6 +816,18 @@ def test_constructor_dtype(self, data, index, columns, dtype, expected):
         df = DataFrame(data, index, columns, dtype)
         assert df.values.dtype == expected
 
+    @pytest.mark.parametrize('dtype', [
+        np.dtype("int64"),
+        np.dtype("float32"),
+        np.dtype("object"),
+        np.dtype("datetime64[ns]"),
+        "category"
+    ])
+    def test_constructor_dtype_non_overlapping_columns(self, dtype):
+        df = DataFrame({"A": [1, 2]}, columns=['B'], dtype=dtype)
+        result = df.dtypes['B']
+        assert result == dtype
+
     def test_constructor_scalar_inference(self):
         data = {'int': 1, 'bool': True,
                 'float': 3., 'complex': 4j, 'object': 'foo'}
