@@ -21,7 +21,8 @@
 from pandas.core.dtypes.common import (
     is_categorical_dtype, is_datetime64tz_dtype, is_dtype_equal,
     is_extension_array_dtype, is_extension_type, is_float_dtype,
-    is_integer_dtype, is_iterator, is_list_like, is_object_dtype, pandas_dtype)
+    is_integer_dtype, is_iterator, is_list_like, is_object_dtype,
+    is_string_dtype, pandas_dtype)
 from pandas.core.dtypes.generic import (
     ABCDataFrame, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex, ABCSeries,
     ABCTimedeltaIndex)
@@ -171,44 +172,112 @@ def init_dict(data, index, columns, dtype=None):
     Segregate Series based on type and coerce into matrices.
     Needs to handle a lot of exceptional cases.
     """
-    if columns is not None:
-        from pandas.core.series import Series
-        arrays = Series(data, index=columns, dtype=object)
-        data_names = arrays.index
+    from pandas.core.series import Series
+
+    # Converting a dict of arrays to a list of arrays sounds easy enough,
+    # right? Well, it's a bit more nuanced than that. Some problems:
+    # 1. Pandas allows missing values in the keys. If a user provides a dict
+    #    where the keys never compare equal (np.nan, pd.NaT, float('nan')),
+    #    we can't ever do a `data[key]`. So we *have* to iterate over the
+    #    key, value pairs of `data`; there is no way around it.
+    # 2. The key, value pairs of `data` may contain
+    #    1. A subset of the desired columns
+    #    2. A superset of the columns
+    #    3. Just the right columns
+    #    and may or may not be in the right order (or ordered, period).
+    #    So we need a mapping from `key in data -> position`; see the
+    #    example below.
+    # 3. Inconsistencies between the Series and DataFrame constructors
+    #    w.r.t. dtypes make for a lot of special casing later on.
+    if columns is None:
+        columns = list(data)
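+    # For example (illustration only): data = {'b': [1], 'a': [2]} with
+    # columns=['a', 'b', 'c'] needs the mapping {'a': 0, 'b': 1}, and the
+    # "extra" column 'c' still has to be filled in afterwards.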

-        missing = arrays.isnull()
-        if index is None:
-            # GH10856
-            # raise ValueError if only scalars in dict
-            index = extract_index(arrays[~missing])
-        else:
-            index = ensure_index(index)
-
-        # no obvious "empty" int column
-        if missing.any() and not is_integer_dtype(dtype):
-            if dtype is None or np.issubdtype(dtype, np.flexible):
-                # GH#1783
-                nan_dtype = object
-            else:
-                nan_dtype = dtype
-            val = construct_1d_arraylike_from_scalar(np.nan, len(index),
-                                                     nan_dtype)
-            arrays.loc[missing] = [val] * missing.sum()
+    if not isinstance(columns, Index):
+        columns = Index(columns, copy=False)

+    if data:
+        normalized_keys = Index(data.keys(), copy=False)
+        positions = Series(columns.get_indexer_for(normalized_keys),
+                           index=normalized_keys)
     else:
-
-        for key in data:
-            if (isinstance(data[key], ABCDatetimeIndex) and
-                    data[key].tz is not None):
-                # GH#24096 need copy to be deep for datetime64tz case
-                # TODO: See if we can avoid these copies
-                data[key] = data[key].copy(deep=True)
-
-        keys = com.dict_keys_to_ordered_list(data)
-        columns = data_names = Index(keys)
-        arrays = [data[k] for k in keys]
-
-    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
+        positions = Series()
+
+    new_data = {}
+    index_len = 0 if index is None else len(index)
+
+    for key, val in data.items():
+        position = positions[key]
+        if position < 0:
+            # Something like data={"A": [...]}, columns={"B"}
+            continue
+        if (isinstance(val, ABCDatetimeIndex) and
+                val.tz is not None):
+            # GH#24096 need copy to be deep for datetime64tz case
+            # TODO: See if we can avoid these copies
+            val = val.copy(deep=True)
+
+        elif val is None:
+            # Users may provide scalars as values. These are broadcast to
+            # the correct shape to align with `index`. We would use the
+            # Series constructor, but Series(None, index=index) is converted
+            # to NaNs. In DataFrame,
+            #   DataFrame({"A": None}, index=[1, 2], columns=["A"])
+            # is an array of Nones.
+            val = Series([None] * index_len, index=index,
+                         dtype=dtype or object)
+
+        elif index_len and lib.is_scalar(val):
+            val = Series(val, index=index, dtype=dtype)
+
+        new_data[position] = val
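+    # For example (illustration only): with index=[1, 2], {"A": None}
+    # becomes a length-2 object Series of Nones, while {"A": 2}
+    # broadcasts to Series([2, 2], index=[1, 2]).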
+
+    # OK, so the user-provided columns in `data` are taken care of. Let's
+    # move on to the "extra" columns as defined by `columns`. First, we
+    # figure out the positions of the holes we're filling in.
+    extra_positions = np.arange(len(columns))
+    mask = np.isin(extra_positions, positions, invert=True)
+    extra_positions = extra_positions[mask]
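+    # For example (illustration only): columns=['a', 'b', 'c'] with data
+    # keys {'a', 'b'} gives positions [0, 1], so extra_positions ends up
+    # as [2] and only column 'c' needs a filler array.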
+
+    # And now, what should the dtype of these new guys be? Well, that's a
+    # little tricky.
+    # 1. A user-provided dtype: just use that...
+    #    unless the user provided dtype=int and an index (GH-24385).
+    # 2. Empty data.keys() & columns: object (unless specified by the user).
+    # 3. No data and no dtype: object (unless specified by the user).
+
+    # https://github.com/pandas-dev/pandas/issues/24385
+    # Series(None, dtype=int) and DataFrame(None, dtype=dtype)
+    # differ when the index is provided.
+    # But if dtype is not provided, then we fall back to object.
+    # We have to pass this dtype through to arrays_to_mgr.
+
+    # Some things I'd like to change:
+    # With DataFrame(None, index=[1], columns=['a'], dtype=dtype):
+    #   For dtype=object, the result is object.
+    #   But for dtype=int, the result is float.
+    empty_columns = len(positions.index & columns) == 0
+
+    if empty_columns and dtype is None:
+        dtype = object
+    elif index_len and is_integer_dtype(dtype):
+        # That's one complicated condition:
+        # DataFrame(None, index=idx, columns=cols, dtype=int) must be float
+        # DataFrame(None, index=idx, columns=cols, dtype=object) is object
+        # DataFrame({'a': 2}, columns=['b']) is object (empty)
+        dtype = float
+    elif not data and dtype is None:
+        dtype = np.dtype('object')
+
+    for position in extra_positions:
+        new_data[position] = Series(index=index, dtype=dtype)
+
+    arrays = [new_data[i] for i in range(len(columns))]
+
+    # hrm, this probably belongs in arrays_to_mgr...
+    if is_string_dtype(dtype) and not is_categorical_dtype(dtype):
+        dtype = np.dtype("object")
+
+    return arrays_to_mgr(arrays, columns, index, columns, dtype=dtype)


 # ---------------------------------------------------------------------
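
For reference, a minimal sketch of the constructor behaviors the comments
above describe (assuming a pandas build with this patch applied; exact
dtypes can vary across versions):

    import numpy as np
    import pandas as pd

    # Missing-value keys never compare equal, which is why init_dict
    # iterates over data.items() instead of indexing into the dict.
    pd.DataFrame({np.nan: [1, 2], "a": [3, 4]})

    # None aligns to the index as an object column of Nones, while a
    # scalar broadcasts to the full index length.
    pd.DataFrame({"A": None}, index=[1, 2], columns=["A"])
    pd.DataFrame({"A": 2}, index=[1, 2], columns=["A"])

    # With an index, requested columns absent from the data, and
    # dtype=int, the all-NaN filler column is upcast to float (GH-24385).
    pd.DataFrame(None, index=[0, 1], columns=["a"], dtype=int)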