Skip to content

Commit ca6d88b

Browse files
max-sixtyjreback
authored andcommitted
CLN: clean up PeriodIndex constructor
closes #13232 Material clean up of PeriodIndex constructor, which was doing a few weird things (#13232 (comment) nt-220788816), and generally getting messy. Author: Maximilian Roos <[email protected]> Closes #13277 from MaximilianR/period-float and squashes the following commits: 5cae7aa [Maximilian Roos] @jreback changes 75ff54d [Maximilian Roos] _new_PeriodIndex for unpickling 240172f [Maximilian Roos] coerce freq object earlier for perf ba5133b [Maximilian Roos] documentation b0fc0a7 [Maximilian Roos] final changes fa0fa9d [Maximilian Roos] clean up PeriodIndex constructor
1 parent 5f0b69a commit ca6d88b

File tree

8 files changed

+98
-86
lines changed

8 files changed

+98
-86
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -657,6 +657,7 @@ Bug Fixes
657657
- Bug in ``DataFrame.sort_values()`` when sorting by multiple columns where one column is of type ``int64`` and contains ``NaT`` (:issue:`14922`)
658658
- Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`)
659659
- Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly casted (:issue:`14941`, :issue:`15005`)
660+
- Cleaned up ``PeriodIndex`` constructor, including raising on floats more consistently (:issue:`13277`)
660661
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
661662
- Bug in ``pd.read_fwf`` where the skiprows parameter was not being respected during column width inference (:issue:`11256`)
662663
- Bug in ``pd.read_csv()`` in which missing data was being improperly handled with ``usecols`` (:issue:`6710`)

pandas/core/algorithms.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -471,8 +471,8 @@ def _value_counts_arraylike(values, dropna=True):
471471
# dtype handling
472472
if is_datetimetz_type:
473473
keys = DatetimeIndex._simple_new(keys, tz=orig.dtype.tz)
474-
if is_period_type:
475-
keys = PeriodIndex._simple_new(keys, freq=freq)
474+
elif is_period_type:
475+
keys = PeriodIndex._from_ordinals(keys, freq=freq)
476476

477477
elif is_signed_integer_dtype(dtype):
478478
values = _ensure_int64(values)

pandas/indexes/base.py

+5
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,11 @@ def _new_Index(cls, d):
8888
""" This is called upon unpickling, rather than the default which doesn't
8989
have arguments and breaks __new__
9090
"""
91+
# required for backward compat, because PI can't be instantiated with
92+
# ordinals through __new__ GH #13277
93+
if issubclass(cls, ABCPeriodIndex):
94+
from pandas.tseries.period import _new_PeriodIndex
95+
return _new_PeriodIndex(cls, **d)
9196
return cls.__new__(cls, **d)
9297

9398

pandas/io/packers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -573,7 +573,7 @@ def decode(obj):
573573
elif typ == u'period_index':
574574
data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
575575
d = dict(name=obj[u'name'], freq=obj[u'freq'])
576-
return globals()[obj[u'klass']](data, **d)
576+
return globals()[obj[u'klass']]._from_ordinals(data, **d)
577577
elif typ == u'datetime_index':
578578
data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
579579
d = dict(name=obj[u'name'], freq=obj[u'freq'], verify_integrity=False)

pandas/tests/indexes/period/test_construction.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ def test_constructor_fromarraylike(self):
120120

121121
self.assertRaises(ValueError, PeriodIndex, idx._values)
122122
self.assertRaises(ValueError, PeriodIndex, list(idx._values))
123-
self.assertRaises(ValueError, PeriodIndex,
123+
self.assertRaises(TypeError, PeriodIndex,
124124
data=Period('2007', freq='A'))
125125

126126
result = PeriodIndex(iter(idx))
@@ -285,12 +285,15 @@ def test_constructor_simple_new_empty(self):
285285
result = idx._simple_new(idx, name='p', freq='M')
286286
tm.assert_index_equal(result, idx)
287287

288-
def test_constructor_simple_new_floats(self):
288+
def test_constructor_floats(self):
289289
# GH13079
290-
for floats in [[1.1], np.array([1.1])]:
290+
for floats in [[1.1, 2.1], np.array([1.1, 2.1])]:
291291
with self.assertRaises(TypeError):
292292
pd.PeriodIndex._simple_new(floats, freq='M')
293293

294+
with self.assertRaises(TypeError):
295+
pd.PeriodIndex(floats, freq='M')
296+
294297
def test_constructor_nat(self):
295298
self.assertRaises(ValueError, period_range, start='NaT',
296299
end='2011-01-01', freq='M')

pandas/tests/indexes/period/test_period.py

+6
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,12 @@ def test_astype_raises(self):
5353
def test_pickle_compat_construction(self):
5454
pass
5555

56+
def test_pickle_round_trip(self):
57+
for freq in ['D', 'M', 'Y']:
58+
idx = PeriodIndex(['2016-05-16', 'NaT', NaT, np.NaN], freq='D')
59+
result = self.round_trip_pickle(idx)
60+
tm.assert_index_equal(result, idx)
61+
5662
def test_get_loc(self):
5763
idx = pd.period_range('2000-01-01', periods=3)
5864

pandas/tseries/period.py

+76-80
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
is_period_dtype,
1818
is_bool_dtype,
1919
pandas_dtype,
20-
_ensure_int64,
2120
_ensure_object)
2221
from pandas.types.dtypes import PeriodDtype
2322
from pandas.types.generic import ABCSeries
@@ -114,6 +113,13 @@ def wrapper(self, other):
114113
return wrapper
115114

116115

116+
def _new_PeriodIndex(cls, **d):
117+
# GH13277 for unpickling
118+
if d['data'].dtype == 'int64':
119+
values = d.pop('data')
120+
return cls._from_ordinals(values=values, **d)
121+
122+
117123
class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index):
118124
"""
119125
Immutable ndarray holding ordinal values indicating regular periods in
@@ -209,17 +215,57 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
209215
msg = 'specified freq and dtype are different'
210216
raise IncompatibleFrequency(msg)
211217

218+
# coerce freq to freq object, otherwise it can be coerced elementwise
219+
# which is slow
220+
if freq:
221+
freq = Period._maybe_convert_freq(freq)
222+
212223
if data is None:
213224
if ordinal is not None:
214225
data = np.asarray(ordinal, dtype=np.int64)
215226
else:
216227
data, freq = cls._generate_range(start, end, periods,
217228
freq, kwargs)
218-
else:
219-
ordinal, freq = cls._from_arraylike(data, freq, tz)
220-
data = np.array(ordinal, dtype=np.int64, copy=copy)
229+
return cls._from_ordinals(data, name=name, freq=freq)
221230

222-
return cls._simple_new(data, name=name, freq=freq)
231+
if isinstance(data, PeriodIndex):
232+
if freq is None or freq == data.freq: # no freq change
233+
freq = data.freq
234+
data = data._values
235+
else:
236+
base1, _ = _gfc(data.freq)
237+
base2, _ = _gfc(freq)
238+
data = period.period_asfreq_arr(data._values,
239+
base1, base2, 1)
240+
return cls._simple_new(data, name=name, freq=freq)
241+
242+
# not array / index
243+
if not isinstance(data, (np.ndarray, PeriodIndex,
244+
DatetimeIndex, Int64Index)):
245+
if is_scalar(data) or isinstance(data, Period):
246+
cls._scalar_data_error(data)
247+
248+
# other iterable of some kind
249+
if not isinstance(data, (list, tuple)):
250+
data = list(data)
251+
252+
data = np.asarray(data)
253+
254+
# datetime other than period
255+
if is_datetime64_dtype(data.dtype):
256+
data = dt64arr_to_periodarr(data, freq, tz)
257+
return cls._from_ordinals(data, name=name, freq=freq)
258+
259+
# check not floats
260+
if infer_dtype(data) == 'floating' and len(data) > 0:
261+
raise TypeError("PeriodIndex does not allow "
262+
"floating point in construction")
263+
264+
# anything else, likely an array of strings or periods
265+
data = _ensure_object(data)
266+
freq = freq or period.extract_freq(data)
267+
data = period.extract_ordinals(data, freq)
268+
return cls._from_ordinals(data, name=name, freq=freq)
223269

224270
@classmethod
225271
def _generate_range(cls, start, end, periods, freq, fields):
@@ -240,85 +286,34 @@ def _generate_range(cls, start, end, periods, freq, fields):
240286

241287
return subarr, freq
242288

243-
@classmethod
244-
def _from_arraylike(cls, data, freq, tz):
245-
if freq is not None:
246-
freq = Period._maybe_convert_freq(freq)
247-
248-
if not isinstance(data, (np.ndarray, PeriodIndex,
249-
DatetimeIndex, Int64Index)):
250-
if is_scalar(data) or isinstance(data, Period):
251-
raise ValueError('PeriodIndex() must be called with a '
252-
'collection of some kind, %s was passed'
253-
% repr(data))
254-
255-
# other iterable of some kind
256-
if not isinstance(data, (list, tuple)):
257-
data = list(data)
258-
259-
try:
260-
data = _ensure_int64(data)
261-
if freq is None:
262-
raise ValueError('freq not specified')
263-
data = np.array([Period(x, freq=freq) for x in data],
264-
dtype=np.int64)
265-
except (TypeError, ValueError):
266-
data = _ensure_object(data)
267-
268-
if freq is None:
269-
freq = period.extract_freq(data)
270-
data = period.extract_ordinals(data, freq)
271-
else:
272-
if isinstance(data, PeriodIndex):
273-
if freq is None or freq == data.freq:
274-
freq = data.freq
275-
data = data._values
276-
else:
277-
base1, _ = _gfc(data.freq)
278-
base2, _ = _gfc(freq)
279-
data = period.period_asfreq_arr(data._values,
280-
base1, base2, 1)
281-
else:
282-
if is_object_dtype(data):
283-
inferred = infer_dtype(data)
284-
if inferred == 'integer':
285-
data = data.astype(np.int64)
286-
287-
if freq is None and is_object_dtype(data):
288-
# must contain Period instance and thus extract ordinals
289-
freq = period.extract_freq(data)
290-
data = period.extract_ordinals(data, freq)
291-
292-
if freq is None:
293-
msg = 'freq not specified and cannot be inferred'
294-
raise ValueError(msg)
295-
296-
if data.dtype != np.int64:
297-
if np.issubdtype(data.dtype, np.datetime64):
298-
data = dt64arr_to_periodarr(data, freq, tz)
299-
else:
300-
data = _ensure_object(data)
301-
data = period.extract_ordinals(data, freq)
302-
303-
return data, freq
304-
305289
@classmethod
306290
def _simple_new(cls, values, name=None, freq=None, **kwargs):
307-
291+
"""
292+
Values can be any type that can be coerced to Periods.
293+
Ordinals in an ndarray are fastpath-ed to `_from_ordinals`
294+
"""
308295
if not is_integer_dtype(values):
309296
values = np.array(values, copy=False)
310-
if (len(values) > 0 and is_float_dtype(values)):
297+
if len(values) > 0 and is_float_dtype(values):
311298
raise TypeError("PeriodIndex can't take floats")
312-
else:
313-
return cls(values, name=name, freq=freq, **kwargs)
299+
return cls(values, name=name, freq=freq, **kwargs)
300+
301+
return cls._from_ordinals(values, name, freq, **kwargs)
302+
303+
@classmethod
304+
def _from_ordinals(cls, values, name=None, freq=None, **kwargs):
305+
"""
306+
Values should be int ordinals
307+
`__new__` & `_simple_new` cooerce to ordinals and call this method
308+
"""
314309

315310
values = np.array(values, dtype='int64', copy=False)
316311

317312
result = object.__new__(cls)
318313
result._data = values
319314
result.name = name
320315
if freq is None:
321-
raise ValueError('freq is not specified')
316+
raise ValueError('freq is not specified and cannot be inferred')
322317
result.freq = Period._maybe_convert_freq(freq)
323318
result._reset_identity()
324319
return result
@@ -327,13 +322,13 @@ def _shallow_copy_with_infer(self, values=None, **kwargs):
327322
""" we always want to return a PeriodIndex """
328323
return self._shallow_copy(values=values, **kwargs)
329324

330-
def _shallow_copy(self, values=None, **kwargs):
331-
if kwargs.get('freq') is None:
332-
# freq must be provided
333-
kwargs['freq'] = self.freq
325+
def _shallow_copy(self, values=None, freq=None, **kwargs):
326+
if freq is None:
327+
freq = self.freq
334328
if values is None:
335329
values = self._values
336-
return super(PeriodIndex, self)._shallow_copy(values=values, **kwargs)
330+
return super(PeriodIndex, self)._shallow_copy(values=values,
331+
freq=freq, **kwargs)
337332

338333
def _coerce_scalar_to_index(self, item):
339334
"""
@@ -413,7 +408,7 @@ def __array_wrap__(self, result, context=None):
413408
return result
414409
# the result is object dtype array of Period
415410
# cannot pass _simple_new as it is
416-
return PeriodIndex(result, freq=self.freq, name=self.name)
411+
return self._shallow_copy(result, freq=self.freq, name=self.name)
417412

418413
@property
419414
def _box_func(self):
@@ -708,7 +703,7 @@ def shift(self, n):
708703
values = self._values + n * self.freq.n
709704
if self.hasnans:
710705
values[self._isnan] = tslib.iNaT
711-
return PeriodIndex(data=values, name=self.name, freq=self.freq)
706+
return self._shallow_copy(values=values)
712707

713708
@cache_readonly
714709
def dtype(self):
@@ -945,7 +940,8 @@ def _wrap_union_result(self, other, result):
945940

946941
def _apply_meta(self, rawarr):
947942
if not isinstance(rawarr, PeriodIndex):
948-
rawarr = PeriodIndex(rawarr, freq=self.freq)
943+
rawarr = PeriodIndex._from_ordinals(rawarr, freq=self.freq,
944+
name=self.name)
949945
return rawarr
950946

951947
def _format_native_types(self, na_rep=u('NaT'), date_format=None,

setup.cfg

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ parentdir_prefix = pandas-
1313

1414
[flake8]
1515
ignore = E731,E402
16+
max-line-length = 79
1617

1718
[yapf]
1819
based_on_style = pep8

0 commit comments

Comments
 (0)