Skip to content

Commit c02c400

Browse files
authored
Merge branch 'master' into fix-no-raise-dup-cols
2 parents adf2283 + aa9e002 commit c02c400

21 files changed

+737
-300
lines changed

doc/source/io.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ squeeze : boolean, default ``False``
149149
prefix : str, default ``None``
150150
Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
151151
mangle_dupe_cols : boolean, default ``True``
152-
Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X'.
152+
Duplicate columns will be specified as 'X', 'X.1'...'X.N', rather than 'X'...'X'.
153153
Passing in False will cause data to be overwritten if there are duplicate
154154
names in the columns.
155155

@@ -548,7 +548,7 @@ these names so as to prevent data overwrite:
548548
pd.read_csv(StringIO(data))
549549
550550
There is no more duplicate data because ``mangle_dupe_cols=True`` by default, which modifies
551-
a series of duplicate columns 'X'...'X' to become 'X.0'...'X.N'. If ``mangle_dupe_cols
551+
a series of duplicate columns 'X'...'X' to become 'X', 'X.1',...'X.N'. If ``mangle_dupe_cols
552552
=False``, duplicate data can arise:
553553

554554
.. code-block :: python

doc/source/whatsnew/v0.23.0.txt

+7
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ Other Enhancements
202202
- ``Resampler`` objects now have a functioning :attr:`~pandas.core.resample.Resampler.pipe` method.
203203
Previously, calls to ``pipe`` were diverted to the ``mean`` method (:issue:`17905`).
204204
- :func:`~pandas.api.types.is_scalar` now returns ``True`` for ``DateOffset`` objects (:issue:`18943`).
205+
- ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`)
205206

206207
.. _whatsnew_0230.api_breaking:
207208

@@ -272,6 +273,7 @@ Other API Changes
272273
- The default ``Timedelta`` constructor now accepts an ``ISO 8601 Duration`` string as an argument (:issue:`19040`)
273274
- ``IntervalDtype`` now returns ``True`` when compared against ``'interval'`` regardless of subtype, and ``IntervalDtype.name`` now returns ``'interval'`` regardless of subtype (:issue:`18980`)
274275
- A ``KeyError`` is now raised instead of a ``ValueError`` when using :meth:`drop()` to remove a non-existent element in an axis of ``Series``, ``Index``, ``DataFrame`` and ``Panel`` (:issue:`19186`)
276+
- :func:`Series.to_csv` now accepts a ``compression`` argument that works in the same way as the ``compression`` argument in :func:`DataFrame.to_csv` (:issue:`18958`)
275277

276278
.. _whatsnew_0230.deprecations:
277279

@@ -386,6 +388,11 @@ Conversion
386388
- Bug in localization of a naive, datetime string in a ``Series`` constructor with a ``datetime64[ns, tz]`` dtype (:issue:`17415`)
387389
- :func:`Timestamp.replace` will now handle Daylight Saving Time transitions gracefully (:issue:`18319`)
388390

391+
392+
393+
- Bug where ``.astype()`` to non-ns timedelta units would result in data holding the incorrect dtype (:issue:`19176`, :issue:`19223`, :issue:`12425`)
394+
395+
389396
Indexing
390397
^^^^^^^^
391398

pandas/_libs/tslibs/conversion.pyx

+24-2
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ from np_datetime cimport (check_dts_bounds,
2929

3030
from util cimport (is_string_object,
3131
is_datetime64_object,
32-
is_integer_object, is_float_object)
32+
is_integer_object, is_float_object, is_array)
3333

3434
from timedeltas cimport cast_from_unit
3535
from timezones cimport (is_utc, is_tzlocal, is_fixed_offset,
@@ -45,6 +45,8 @@ from nattype cimport NPY_NAT, checknull_with_nat
4545
# Constants
4646

4747
cdef int64_t DAY_NS = 86400000000000LL
48+
NS_DTYPE = np.dtype('M8[ns]')
49+
TD_DTYPE = np.dtype('m8[ns]')
4850

4951
UTC = pytz.UTC
5052

@@ -73,13 +75,14 @@ cdef inline int64_t get_datetime64_nanos(object val) except? -1:
7375
return ival
7476

7577

76-
def ensure_datetime64ns(ndarray arr):
78+
def ensure_datetime64ns(ndarray arr, copy=True):
7779
"""
7880
Ensure a np.datetime64 array has dtype specifically 'datetime64[ns]'
7981
8082
Parameters
8183
----------
8284
arr : ndarray
85+
copy : boolean, default True
8386
8487
Returns
8588
-------
@@ -104,6 +107,8 @@ def ensure_datetime64ns(ndarray arr):
104107

105108
unit = get_datetime64_unit(arr.flat[0])
106109
if unit == PANDAS_FR_ns:
110+
if copy:
111+
arr = arr.copy()
107112
result = arr
108113
else:
109114
for i in range(n):
@@ -117,6 +122,23 @@ def ensure_datetime64ns(ndarray arr):
117122
return result
118123

119124

125+
def ensure_timedelta64ns(ndarray arr, copy=True):
126+
"""
127+
Ensure a np.timedelta64 array has dtype specifically 'timedelta64[ns]'
128+
129+
Parameters
130+
----------
131+
arr : ndarray
132+
copy : boolean, default True
133+
134+
Returns
135+
-------
136+
result : ndarray with dtype timedelta64[ns]
137+
138+
"""
139+
return arr.astype(TD_DTYPE, copy=copy)
140+
141+
120142
def datetime_to_datetime64(ndarray[object] values):
121143
"""
122144
Convert ndarray of datetime-like objects to int64 array representing

pandas/core/dtypes/cast.py

+38-31
Original file line numberDiff line numberDiff line change
@@ -649,50 +649,70 @@ def astype_nansafe(arr, dtype, copy=True):
649649
if issubclass(dtype.type, text_type):
650650
# in Py3 that's str, in Py2 that's unicode
651651
return lib.astype_unicode(arr.ravel()).reshape(arr.shape)
652+
652653
elif issubclass(dtype.type, string_types):
653654
return lib.astype_str(arr.ravel()).reshape(arr.shape)
655+
654656
elif is_datetime64_dtype(arr):
655-
if dtype == object:
657+
if is_object_dtype(dtype):
656658
return tslib.ints_to_pydatetime(arr.view(np.int64))
657659
elif dtype == np.int64:
658660
return arr.view(dtype)
659-
elif dtype != _NS_DTYPE:
660-
raise TypeError("cannot astype a datetimelike from [{from_dtype}] "
661-
"to [{to_dtype}]".format(from_dtype=arr.dtype,
662-
to_dtype=dtype))
663-
return arr.astype(_NS_DTYPE)
661+
662+
# allow frequency conversions
663+
if dtype.kind == 'M':
664+
return arr.astype(dtype)
665+
666+
raise TypeError("cannot astype a datetimelike from [{from_dtype}] "
667+
"to [{to_dtype}]".format(from_dtype=arr.dtype,
668+
to_dtype=dtype))
669+
664670
elif is_timedelta64_dtype(arr):
665-
if dtype == np.int64:
666-
return arr.view(dtype)
667-
elif dtype == object:
671+
if is_object_dtype(dtype):
668672
return tslib.ints_to_pytimedelta(arr.view(np.int64))
673+
elif dtype == np.int64:
674+
return arr.view(dtype)
669675

670676
# in py3, timedelta64[ns] are int64
671-
elif ((PY3 and dtype not in [_INT64_DTYPE, _TD_DTYPE]) or
672-
(not PY3 and dtype != _TD_DTYPE)):
677+
if ((PY3 and dtype not in [_INT64_DTYPE, _TD_DTYPE]) or
678+
(not PY3 and dtype != _TD_DTYPE)):
673679

674680
# allow frequency conversions
681+
# we return a float here!
675682
if dtype.kind == 'm':
676683
mask = isna(arr)
677684
result = arr.astype(dtype).astype(np.float64)
678685
result[mask] = np.nan
679686
return result
687+
elif dtype == _TD_DTYPE:
688+
return arr.astype(_TD_DTYPE, copy=copy)
680689

681-
raise TypeError("cannot astype a timedelta from [{from_dtype}] "
682-
"to [{to_dtype}]".format(from_dtype=arr.dtype,
683-
to_dtype=dtype))
690+
raise TypeError("cannot astype a timedelta from [{from_dtype}] "
691+
"to [{to_dtype}]".format(from_dtype=arr.dtype,
692+
to_dtype=dtype))
684693

685-
return arr.astype(_TD_DTYPE)
686694
elif (np.issubdtype(arr.dtype, np.floating) and
687695
np.issubdtype(dtype, np.integer)):
688696

689697
if not np.isfinite(arr).all():
690698
raise ValueError('Cannot convert non-finite values (NA or inf) to '
691699
'integer')
692700

693-
elif is_object_dtype(arr.dtype) and np.issubdtype(dtype.type, np.integer):
701+
elif is_object_dtype(arr):
702+
694703
# work around NumPy brokenness, #1987
695-
return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
704+
if np.issubdtype(dtype.type, np.integer):
705+
return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
706+
707+
# if we have a datetime/timedelta array of objects
708+
# then coerce to a proper dtype and recall astype_nansafe
709+
710+
elif is_datetime64_dtype(dtype):
711+
from pandas import to_datetime
712+
return astype_nansafe(to_datetime(arr).values, dtype, copy=copy)
713+
elif is_timedelta64_dtype(dtype):
714+
from pandas import to_timedelta
715+
return astype_nansafe(to_timedelta(arr).values, dtype, copy=copy)
696716

697717
if dtype.name in ("datetime64", "timedelta64"):
698718
msg = ("Passing in '{dtype}' dtype with no frequency is "
@@ -703,20 +723,7 @@ def astype_nansafe(arr, dtype, copy=True):
703723
dtype = np.dtype(dtype.name + "[ns]")
704724

705725
if copy:
706-
707-
if arr.dtype == dtype:
708-
return arr.copy()
709-
710-
# we handle datetimelikes with pandas machinery
711-
# to be robust to the input type
712-
elif is_datetime64_dtype(dtype):
713-
from pandas import to_datetime
714-
return to_datetime(arr).values
715-
elif is_timedelta64_dtype(dtype):
716-
from pandas import to_timedelta
717-
return to_timedelta(arr).values
718-
719-
return arr.astype(dtype)
726+
return arr.astype(dtype, copy=True)
720727
return arr.view(dtype)
721728

722729

pandas/core/dtypes/common.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from pandas.compat import (string_types, text_type, binary_type,
55
PY3, PY36)
66
from pandas._libs import algos, lib
7+
from pandas._libs.tslibs import conversion
78
from .dtypes import (CategoricalDtype, CategoricalDtypeType,
89
DatetimeTZDtype, DatetimeTZDtypeType,
910
PeriodDtype, PeriodDtypeType,
@@ -21,8 +22,8 @@
2122
for t in ['O', 'int8', 'uint8', 'int16', 'uint16',
2223
'int32', 'uint32', 'int64', 'uint64']])
2324

24-
_NS_DTYPE = np.dtype('M8[ns]')
25-
_TD_DTYPE = np.dtype('m8[ns]')
25+
_NS_DTYPE = conversion.NS_DTYPE
26+
_TD_DTYPE = conversion.TD_DTYPE
2627
_INT64_DTYPE = np.dtype(np.int64)
2728

2829
# oh the troubles to reduce import time
@@ -31,6 +32,9 @@
3132
_ensure_float64 = algos.ensure_float64
3233
_ensure_float32 = algos.ensure_float32
3334

35+
_ensure_datetime64ns = conversion.ensure_datetime64ns
36+
_ensure_timedelta64ns = conversion.ensure_timedelta64ns
37+
3438

3539
def _ensure_float(arr):
3640
"""

pandas/core/dtypes/dtypes.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -710,7 +710,8 @@ def __eq__(self, other):
710710
# None should match any subtype
711711
return True
712712
else:
713-
return self.subtype == other.subtype
713+
from pandas.core.dtypes.common import is_dtype_equal
714+
return is_dtype_equal(self.subtype, other.subtype)
714715

715716
@classmethod
716717
def is_dtype(cls, dtype):

pandas/core/indexes/interval.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
is_scalar,
2121
is_float,
2222
is_number,
23-
is_integer)
23+
is_integer,
24+
pandas_dtype)
2425
from pandas.core.indexes.base import (
2526
Index, _ensure_index,
2627
default_pprint, _index_shared_docs)
@@ -699,8 +700,16 @@ def copy(self, deep=False, name=None):
699700

700701
@Appender(_index_shared_docs['astype'])
701702
def astype(self, dtype, copy=True):
702-
if is_interval_dtype(dtype):
703-
return self.copy() if copy else self
703+
dtype = pandas_dtype(dtype)
704+
if is_interval_dtype(dtype) and dtype != self.dtype:
705+
try:
706+
new_left = self.left.astype(dtype.subtype)
707+
new_right = self.right.astype(dtype.subtype)
708+
except TypeError:
709+
msg = ('Cannot convert {dtype} to {new_dtype}; subtypes are '
710+
'incompatible')
711+
raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype))
712+
return self._shallow_copy(new_left, new_right)
704713
return super(IntervalIndex, self).astype(dtype, copy=copy)
705714

706715
@cache_readonly

pandas/core/internals.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -631,7 +631,7 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
631631
values = astype_nansafe(values.ravel(), dtype, copy=True)
632632
values = values.reshape(self.shape)
633633

634-
newb = make_block(values, placement=self.mgr_locs, dtype=dtype,
634+
newb = make_block(values, placement=self.mgr_locs,
635635
klass=klass)
636636
except:
637637
if errors == 'raise':
@@ -1954,6 +1954,13 @@ class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock):
19541954
_can_hold_na = True
19551955
is_numeric = False
19561956

1957+
def __init__(self, values, placement, fastpath=False, **kwargs):
1958+
if values.dtype != _TD_DTYPE:
1959+
values = conversion.ensure_timedelta64ns(values)
1960+
1961+
super(TimeDeltaBlock, self).__init__(values, fastpath=True,
1962+
placement=placement, **kwargs)
1963+
19571964
@property
19581965
def _box_func(self):
19591966
return lambda x: tslib.Timedelta(x, unit='ns')

pandas/core/series.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -2881,7 +2881,8 @@ def from_csv(cls, path, sep=',', parse_dates=True, header=None,
28812881

28822882
def to_csv(self, path=None, index=True, sep=",", na_rep='',
28832883
float_format=None, header=False, index_label=None,
2884-
mode='w', encoding=None, date_format=None, decimal='.'):
2884+
mode='w', encoding=None, compression=None, date_format=None,
2885+
decimal='.'):
28852886
"""
28862887
Write Series to a comma-separated values (csv) file
28872888
@@ -2908,6 +2909,10 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='',
29082909
encoding : string, optional
29092910
a string representing the encoding to use if the contents are
29102911
non-ascii, for python versions prior to 3
2912+
compression : string, optional
2913+
a string representing the compression to use in the output file,
2914+
allowed values are 'gzip', 'bz2', 'xz', only used when the first
2915+
argument is a filename
29112916
date_format: string, default None
29122917
Format string for datetime objects.
29132918
decimal: string, default '.'
@@ -2920,8 +2925,8 @@ def to_csv(self, path=None, index=True, sep=",", na_rep='',
29202925
result = df.to_csv(path, index=index, sep=sep, na_rep=na_rep,
29212926
float_format=float_format, header=header,
29222927
index_label=index_label, mode=mode,
2923-
encoding=encoding, date_format=date_format,
2924-
decimal=decimal)
2928+
encoding=encoding, compression=compression,
2929+
date_format=date_format, decimal=decimal)
29252930
if path is None:
29262931
return result
29272932

pandas/io/parsers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@
114114
prefix : str, default None
115115
Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
116116
mangle_dupe_cols : boolean, default True
117-
Duplicate columns will be specified as 'X.0'...'X.N', rather than
117+
Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
118118
'X'...'X'. Passing in False will cause data to be overwritten if there
119119
are duplicate names in the columns.
120120
dtype : Type name or dict of column -> type, default None

pandas/tests/dtypes/test_dtypes.py

+6
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,12 @@ def test_equality(self):
534534
assert not is_dtype_equal(IntervalDtype('int64'),
535535
IntervalDtype('float64'))
536536

537+
# invalid subtype comparisons do not raise when directly compared
538+
dtype1 = IntervalDtype('float64')
539+
dtype2 = IntervalDtype('datetime64[ns, US/Eastern]')
540+
assert dtype1 != dtype2
541+
assert dtype2 != dtype1
542+
537543
@pytest.mark.parametrize('subtype', [
538544
None, 'interval', 'Interval', 'int64', 'uint64', 'float64',
539545
'complex128', 'datetime64', 'timedelta64', PeriodDtype('Q')])

0 commit comments

Comments
 (0)