@@ -21,7 +21,8 @@
 from pandas.core.dtypes.common import (
     is_categorical_dtype, is_datetime64tz_dtype, is_dtype_equal,
     is_extension_array_dtype, is_extension_type, is_float_dtype,
-    is_integer_dtype, is_iterator, is_list_like, is_object_dtype, pandas_dtype)
+    is_integer_dtype, is_iterator, is_list_like, is_object_dtype,
+    is_string_dtype, pandas_dtype)
 from pandas.core.dtypes.generic import (
     ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex, ABCSeries,
     ABCTimedeltaIndex)
@@ -173,49 +174,108 @@ def init_dict(data, index, columns, dtype=None):
     """
     from pandas.core.series import Series
 
+    # Converting a dict of arrays to a list of arrays sounds easy enough,
+    # right? Well, it's a bit more nuanced than that. Some problems:
+    # 1. Pandas allows missing values in the keys. If a user provides a dict
+    #    whose keys never compare equal (np.nan, pd.NaT, float('nan')),
+    #    we can never do a `data[key]` lookup. So we *have* to iterate over
+    #    the key, value pairs of `data`, no way around it.
+    # 2. The key, value pairs of `data` may contain
+    #    1. a subset of the desired columns
+    #    2. a superset of the columns
+    #    3. exactly the right columns
+    #    and may or may not be in the right order (or ordered, period).
+    #    So we need a mapping from `key in data -> position`.
+    # 3. Inconsistencies between the Series and DataFrame constructors
+    #    w.r.t. dtypes make for a lot of special casing later on.
     if columns is None:
         columns = list(data)
 
     if not isinstance(columns, Index):
         columns = Index(columns, copy=False)
 
-    if columns.nlevels > 1:
-        # MultiIndex.__iter__ may be incorrect for integer levels
-        # with some missing values. The integer values are cast to
-        # float. The upshot is that we can't look up keys from the
-        # dict below.
-        column_iter = (columns[i] for i in range(len(columns)))
+    if data:
+        normalized_keys = Index(data, copy=False)
+        positions = Series(columns.get_indexer_for(normalized_keys),
+                           index=normalized_keys)
     else:
-        column_iter = iter(columns)
-
-    new_data = type(data)()  # dict or OrderedDict
-    sentinel = object()
-
-    for key in column_iter:
-        # We use an object() sentinel for two reasons:
-        # 1. We avoid having to allocate the Series in each iteration
-        # 2. We can use data.get(key, None), since the user is allowed
-        #    to pass DataFrame({"A": None}, index=[...]), which is
-        #    different from DataFrame({"A": Series(None, index=[...])})
-        #    which is probably a bug.
-        val = data.get(key, sentinel)
-
-        if val is sentinel:
-            val = Series(index=index, dtype=dtype)
-        elif val is None:
-            val = Series([None] * len(index), index=index,
-                         dtype=dtype or object)
+        positions = Series()
+
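A quick aside, not part of the patch: a sketch of why `data[key]` lookups are off the table and what the `positions` mapping above looks like. The `data` dict and column labels are invented for illustration, and the behavior is as of a pandas of this vintage.

```python
import numpy as np
import pandas as pd

# A NaN key never compares equal to a fresh NaN, so key lookups
# can't be trusted; iterating over data.items() is the only safe route.
data = {np.nan: [1, 2], "A": [3, 4]}
float("nan") in data  # False, even though np.nan is one of the keys

# get_indexer_for builds the `key in data -> position in columns`
# mapping described above; keys absent from `columns` map to -1.
columns = pd.Index(["A", "B"])
keys = pd.Index(list(data))
positions = pd.Series(columns.get_indexer_for(keys), index=keys)
# positions: np.nan -> -1 (skipped by the loop below), "A" -> 0
```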
+    new_data = {}
+    index_len = 0 if index is None else len(index)
+
+    for key, val in data.items():
+        position = positions[key]
+        if position < 0:
+            # Something like data={"A": [...]}, columns={"B"}
+            continue
         if (isinstance(val, ABCDatetimeIndex) and
                 data[key].tz is not None):
             # GH#24096 need copy to be deep for datetime64tz case
             # TODO: See if we can avoid these copies
-            val = val[key].copy(deep=True)
+            val = val.copy(deep=True)
 
-        new_data[key] = val
-
-    keys = com.dict_keys_to_ordered_list(new_data)
+        elif val is None:
+            # Users may provide scalars as values. These are expanded to
+            # the correct shape to align with `index`. We would use the
+            # Series constructor, but Series(None, index=index) is
+            # converted to NaNs. In DataFrame,
+            #     DataFrame({"A": None}, index=[1, 2], columns=["A"])
+            # is an array of Nones.
+            val = Series([None] * index_len, index=index,
+                         dtype=dtype or object)
 
-    arrays = [new_data[k] for k in keys]
+        elif index_len and lib.is_scalar(val):
+            val = Series(val, index=index, dtype=dtype)
+
+        new_data[position] = val
+
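Another aside for context (invented inputs, behavior per the pandas this patch targets): the constructor asymmetries that the `val is None` and scalar branches above are special-casing.

```python
import pandas as pd

# Series turns a None scalar into NaNs...
pd.Series(None, index=[1, 2])            # dtype float64, values NaN

# ...while DataFrame keeps an object column of actual Nones:
pd.DataFrame({"A": None}, index=[1, 2])  # dtype object, values None

# Other scalars are simply broadcast along the index:
pd.DataFrame({"A": 2}, index=[0, 1, 2])  # column of [2, 2, 2]
```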
+    # OK, so the user-provided columns in `data` are taken care of. Let's
+    # move on to the "extra" columns as defined by `columns`. First, we
+    # figure out the positions of the holes we're filling in.
+    extra_positions = np.arange(len(columns))
+    mask = np.isin(extra_positions, positions, invert=True)
+    extra_positions = extra_positions[mask]
+
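For readers unfamiliar with `np.isin`, a tiny standalone demo of the hole-finding step above (array values invented):

```python
import numpy as np

filled = np.array([2, 0])   # positions already covered by data's keys
holes = np.arange(4)        # every position of a 4-column frame
holes = holes[np.isin(holes, filled, invert=True)]
# holes -> array([1, 3]): the columns still needing an empty Series
```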
+    # And now, what should the dtype of these new guys be? Well, that's a
+    # little tricky.
+    # 1. User-provided dtype: just use that...
+    #    unless the user provided dtype=int and an index (GH-24385).
+    # 2. Empty data.keys() & columns: object (unless specified by the user).
+    # 3. No data and no dtype: object (unless specified by the user).
+
+    # https://github.com/pandas-dev/pandas/issues/24385
+    # Series(None, dtype=int) and DataFrame(None, dtype=dtype)
+    # differ when the index is provided.
+    # But if dtype is not provided, then we fall back to object.
+    # We have to pass this dtype through to arrays_to_mgr.
+
+    # Some things I'd like to change.
+    # With DataFrame(None, index=[1], columns=['a'], dtype=dtype):
+    #     For dtype=object, the result is object.
+    #     But for dtype=int, the result is float.
+    empty_columns = len(positions.index & columns) == 0
+
+    if empty_columns and dtype is None:
+        dtype = object
+    elif (index_len
+          and is_integer_dtype(dtype)):
+        # That's one complicated condition:
+        # DataFrame(None, index=idx, columns=cols, dtype=int) must be float
+        # DataFrame(None, index=idx, columns=cols, dtype=object) is object
+        # DataFrame({'a': 2}, columns=['b']) is object (empty)
+        dtype = float
+    elif not data and dtype is None:
+        dtype = np.dtype('object')
+
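The dtype fallbacks above, illustrated (these are the behaviors the comments assert; exact output may vary across pandas versions):

```python
import pandas as pd

# dtype=int plus an index must come out float, since the empty
# cells need to hold NaN (GH-24385):
pd.DataFrame(None, index=[1], columns=['a'], dtype=int).dtypes     # a: float64

# dtype=object is honored as-is:
pd.DataFrame(None, index=[1], columns=['a'], dtype=object).dtypes  # a: object

# No overlap between data and columns, no dtype: empty object column.
pd.DataFrame({'a': 2}, columns=['b']).dtypes                       # b: object
```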
+    for position in extra_positions:
+        new_data[position] = Series(index=index, dtype=dtype)
+
+    arrays = [new_data[i] for i in range(len(columns))]
+
+    # hrm, this probably belongs in arrays_to_mgr...
+    if is_string_dtype(dtype) and not is_categorical_dtype(dtype):
+        dtype = np.dtype("object")
 
     return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)
 
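And a last sketch of why that trailing `is_string_dtype` check exists: a NumPy string dtype can't usefully back these columns, so a requested string dtype is stored as object, while categoricals (which also pass `is_string_dtype` in this era of pandas) must be left alone. Inputs invented for illustration:

```python
import pandas as pd
from pandas.core.dtypes.common import is_categorical_dtype, is_string_dtype

is_string_dtype(str)                    # True
is_string_dtype(pd.CategoricalDtype())  # also True, hence the extra check

# A user-requested string dtype ends up as object storage:
pd.DataFrame({"A": ["x", "y"]}, dtype=str).dtypes  # A: object
```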