forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconcat.py
393 lines (319 loc) · 12.2 KB
/
concat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
"""
Utility functions related to concat
"""
import numpy as np
import pandas.core.common as com
import pandas.tslib as tslib
from pandas import compat
from pandas.compat import map
def get_dtype_kinds(l):
    """
    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set of kinds that exist in this list of arrays
    """
    # ordered checks: the first matching predicate labels the array;
    # order matters (e.g. categorical must be detected before the raw
    # numpy kind fallback)
    checks = [
        (lambda dtype, arr: com.is_categorical_dtype(dtype), 'category'),
        (lambda dtype, arr: com.is_sparse(arr), 'sparse'),
        (lambda dtype, arr: com.is_datetimetz(arr), 'datetimetz'),
        (lambda dtype, arr: com.is_datetime64_dtype(dtype), 'datetime'),
        (lambda dtype, arr: com.is_timedelta64_dtype(dtype), 'timedelta'),
        (lambda dtype, arr: com.is_object_dtype(dtype), 'object'),
        (lambda dtype, arr: com.is_bool_dtype(dtype), 'bool'),
    ]

    kinds = set()
    for arr in l:
        dtype = arr.dtype
        for predicate, label in checks:
            if predicate(dtype, arr):
                kinds.add(label)
                break
        else:
            # fall back to the single-character numpy kind code
            kinds.add(dtype.kind)
    return kinds
def _get_series_result_type(result):
    """
    return appropriate class of Series concat
    input is either dict or array-like
    """
    if isinstance(result, dict):
        # dict input: we are concatenating Series along axis=1; a
        # SparseDataFrame results only when every column is sparse
        if all(com.is_sparse(col) for col in compat.itervalues(result)):
            from pandas.sparse.api import SparseDataFrame
            return SparseDataFrame
        from pandas.core.frame import DataFrame
        return DataFrame

    # array-like input: sparse in -> sparse out
    if com.is_sparse(result):
        from pandas.sparse.api import SparseSeries
        return SparseSeries

    from pandas.core.series import Series
    return Series
def _get_frame_result_type(result, objs):
"""
return appropriate class of DataFrame-like concat
if any block is SparseBlock, return SparseDataFrame
otherwise, return 1st obj
"""
if any(b.is_sparse for b in result.blocks):
from pandas.sparse.api import SparseDataFrame
return SparseDataFrame
else:
return objs[0]
def _concat_compat(to_concat, axis=0):
    """
    provide concatenation of an array of arrays each of which is a single
    'normalized' dtypes (in that for example, if it's object, then it is a
    non-datetimelike and provide a combined dtype for the resulting array that
    preserves the overall dtype if possible)

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation

    Returns
    -------
    a single array, preserving the combined dtypes
    """

    # filter empty arrays
    # 1-d dtypes always are included here
    def is_nonempty(x):
        try:
            return x.shape[axis] > 0
        except Exception:
            # no usable shape along ``axis`` -> keep it
            return True

    nonempty = [x for x in to_concat if is_nonempty(x)]

    # If all arrays are empty, there's nothing to convert, just short-cut to
    # the concatenation, #3121.
    #
    # Creating an empty array directly is tempting, but the winnings would be
    # marginal given that it would still require shape & dtype calculation and
    # np.concatenate which has them both implemented is compiled.

    typs = get_dtype_kinds(to_concat)

    # these are mandated to handle empties as well
    if 'datetime' in typs or 'datetimetz' in typs or 'timedelta' in typs:
        return _concat_datetime(to_concat, axis=axis, typs=typs)

    elif 'sparse' in typs:
        return _concat_sparse(to_concat, axis=axis, typs=typs)

    elif 'category' in typs:
        return _concat_categorical(to_concat, axis=axis)

    if not nonempty:
        # we have all empties, but may need to coerce the result dtype to
        # object if we have non-numeric type operands (numpy would otherwise
        # cast this to float)
        # NOTE: reuse ``typs`` computed above; the original recomputed
        # get_dtype_kinds(to_concat) here redundantly
        if len(typs) != 1:

            if (not len(typs - set(['i', 'u', 'f'])) or
                    not len(typs - set(['bool', 'i', 'u']))):
                # all-numeric or bool+integer mixes: let numpy coerce
                pass
            else:
                # coerce to object
                to_concat = [x.astype('object') for x in to_concat]

    return np.concatenate(to_concat, axis=axis)
def _concat_categorical(to_concat, axis=0):
    """Concatenate an object/categorical array of arrays, each of which is a
    single dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : int
        Axis to provide concatenation in the current implementation this is
        always 0, e.g. we only have 1D categoricals

    Returns
    -------
    Categorical
        A single array, preserving the combined dtypes
    """
    from pandas.core.categorical import Categorical

    def _flat_values(arr):
        # categoricals contribute their materialized (object) values;
        # anything else is flattened to 1D
        if com.is_categorical_dtype(arr.dtype):
            return arr.get_values()
        return arr.ravel()

    other_kinds = get_dtype_kinds(to_concat) - set(['object', 'category'])
    if other_kinds:
        # convert to object type and perform a regular concat
        coerced = [np.array(x, copy=False, dtype=object) for x in to_concat]
        return _concat_compat(coerced, axis=0)

    # we could have object blocks and categoricals here
    # if we only have a single categoricals then combine everything
    # else its a non-compat categorical
    categoricals = [x for x in to_concat if com.is_categorical_dtype(x.dtype)]

    # validate the categories
    first = categoricals[0]
    rawcats = first.categories
    for other in categoricals[1:]:
        if not first.is_dtype_equal(other):
            raise ValueError("incompatible categories in categorical concat")

    # we've already checked that all categoricals are the same, so if their
    # length is equal to the input then we have all the same categories
    if len(categoricals) == len(to_concat):
        # concating numeric types is much faster than concating object types
        # and fastpath takes a shorter path through the constructor
        codes = np.concatenate([x.codes for x in to_concat], axis=0)
        return Categorical(codes, rawcats, ordered=first.ordered,
                           fastpath=True)

    flattened = [_flat_values(x) for x in to_concat]
    return Categorical(np.concatenate(flattened, axis=0), rawcats)
def union_categoricals(to_union):
    """
    Combine list-like of Categoricals, unioning categories. All
    must have the same dtype, and none can be ordered.

    .. versionadded:: 0.18.2

    Parameters
    ----------
    to_union : list-like of Categoricals

    Returns
    -------
    Categorical
        A single array, categories will be ordered as they
        appear in the list

    Raises
    ------
    TypeError
        If any of the categoricals are ordered or all do not
        have the same dtype
    ValueError
        Empty list of categoricals passed
    """
    from pandas import Index, Categorical

    if len(to_union) == 0:
        raise ValueError('No Categoricals to union')

    first = to_union[0]

    if any(c.ordered for c in to_union):
        raise TypeError("Can only combine unordered Categoricals")

    same_dtype = all(com.is_dtype_equal(c.categories.dtype,
                                        first.categories.dtype)
                     for c in to_union)
    if not same_dtype:
        raise TypeError("dtype of categories must be the same")

    # union of all categories, keeping first-appearance order
    combined = first.categories.append([c.categories for c in to_union[1:]])
    categories = Index(combined.unique())

    # remap each input's codes onto the unioned categories
    recoded = []
    for c in to_union:
        indexer = categories.get_indexer(c.categories)
        recoded.append(indexer.take(c.codes))

    return Categorical(np.concatenate(recoded), categories=categories,
                       ordered=False, fastpath=True)
def _concat_datetime(to_concat, axis=0, typs=None):
    """
    provide concatenation of a datetimelike array of arrays each of which is a
    single M8[ns], datetime64[ns, tz] or m8[ns] dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    typs : set of to_concat dtypes

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    def convert_to_pydatetime(x, axis):
        # coerce to an object dtype
        # if dtype is of datetimetz or timezone
        if x.dtype.kind == com._NS_DTYPE.kind:
            if getattr(x, 'tz', None) is not None:
                # tz-aware: take the boxed object values
                x = x.asobject.values
            else:
                # naive datetime64: round-trip through i8 to datetime
                # objects, preserving the original shape
                shape = x.shape
                x = tslib.ints_to_pydatetime(x.view(np.int64).ravel())
                x = x.reshape(shape)
        elif x.dtype == com._TD_DTYPE:
            # timedelta64: same i8 round-trip, to timedelta objects
            shape = x.shape
            x = tslib.ints_to_pytimedelta(x.view(np.int64).ravel())
            x = x.reshape(shape)
        # np.concatenate along axis=1 needs 2-D operands
        if axis == 1:
            x = np.atleast_2d(x)
        return x
    if typs is None:
        typs = get_dtype_kinds(to_concat)
    # must be single dtype
    if len(typs) == 1:
        if 'datetimetz' in typs:
            # datetime with no tz should be stored as "datetime" in typs,
            # thus no need to care
            # we require ALL of the same tz for datetimetz
            tzs = set([str(x.tz) for x in to_concat])
            if len(tzs) == 1:
                from pandas.tseries.index import DatetimeIndex
                # concatenate the localized i8 values, then re-attach the
                # (single) shared tz
                new_values = np.concatenate([x.tz_localize(None).asi8
                                             for x in to_concat])
                return DatetimeIndex(new_values, tz=list(tzs)[0])
        elif 'datetime' in typs:
            # concatenate as i8 and reinterpret as datetime64[ns]
            new_values = np.concatenate([x.view(np.int64) for x in to_concat],
                                        axis=axis)
            return new_values.view(com._NS_DTYPE)
        elif 'timedelta' in typs:
            # concatenate as i8 and reinterpret as timedelta64[ns]
            new_values = np.concatenate([x.view(np.int64) for x in to_concat],
                                        axis=axis)
            return new_values.view(com._TD_DTYPE)
    # need to coerce to object
    # (mixed datetimelike dtypes, or datetimetz with differing tzs)
    to_concat = [convert_to_pydatetime(x, axis) for x in to_concat]
    return np.concatenate(to_concat, axis=axis)
def _concat_sparse(to_concat, axis=0, typs=None):
    """
    provide concatenation of a sparse/dense array of arrays each of which is a
    single dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    typs : set of to_concat dtypes

    Returns
    -------
    a single array, preserving the combined dtypes
    """
    from pandas.sparse.array import SparseArray, _make_index

    def convert_sparse(x, axis):
        # coerce to native (dense) type
        if isinstance(x, SparseArray):
            x = x.get_values()
        x = x.ravel()
        if axis > 0:
            x = np.atleast_2d(x)
        return x

    if typs is None:
        # BUG FIX: ``get_dtype_kinds`` lives in this module, not in
        # pandas.core.common; ``com.get_dtype_kinds`` raised AttributeError.
        # This fallback only runs when the caller does not supply ``typs``.
        typs = get_dtype_kinds(to_concat)

    if len(typs) == 1:
        # concat input as it is if all inputs are sparse
        # and have the same fill_value
        fill_values = set(c.fill_value for c in to_concat)
        if len(fill_values) == 1:
            sp_values = [c.sp_values for c in to_concat]
            indexes = [c.sp_index.to_int_index() for c in to_concat]

            # shift each block's indices past all preceding blocks
            indices = []
            loc = 0
            for idx in indexes:
                indices.append(idx.indices + loc)
                loc += idx.length
            sp_values = np.concatenate(sp_values)
            indices = np.concatenate(indices)
            sp_index = _make_index(loc, indices, kind=to_concat[0].sp_index)

            return SparseArray(sp_values, sparse_index=sp_index,
                               fill_value=to_concat[0].fill_value)

    # input may be sparse / dense mixed and may have different fill_value
    # input must contain sparse at least 1
    sparses = [c for c in to_concat if com.is_sparse(c)]
    fill_values = [c.fill_value for c in sparses]
    sp_indexes = [c.sp_index for c in sparses]

    # densify and regular concat
    to_concat = [convert_sparse(x, axis) for x in to_concat]
    result = np.concatenate(to_concat, axis=axis)

    if not len(typs - set(['sparse', 'f', 'i'])):
        # sparsify if inputs are sparse and dense numerics
        # first sparse input's fill_value and SparseIndex is used
        result = SparseArray(result.ravel(), fill_value=fill_values[0],
                             kind=sp_indexes[0])
    else:
        # coerce to object if needed
        result = result.astype('object')
    return result