Skip to content

Commit f70c68e

Browse files
committed
Merge branch 'master' into pandas-dev#17778
2 parents b8b7a66 + 1181622 commit f70c68e

27 files changed

+588
-111
lines changed

appveyor.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ environment:
2222
PYTHON_VERSION: "3.6"
2323
PYTHON_ARCH: "64"
2424
CONDA_PY: "36"
25-
CONDA_NPY: "112"
25+
CONDA_NPY: "113"
2626

2727
- CONDA_ROOT: "C:\\Miniconda3_64"
2828
PYTHON_VERSION: "2.7"

ci/requirements-3.6_WIN.run

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
python-dateutil
22
pytz
3-
numpy=1.12*
3+
numpy=1.13*
44
bottleneck
55
openpyxl
66
xlsxwriter
77
xlrd
88
xlwt
9-
# scipy
9+
scipy
1010
feather-format
1111
numexpr
1212
pytables

doc/source/whatsnew/v0.21.1.txt

+6-3
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ Documentation Changes
5757
Bug Fixes
5858
~~~~~~~~~
5959
- Bug in ``DataFrame.resample(...).apply(...)`` when there is a callable that returns different columns (:issue:`15169`)
60+
- Bug in :class:`TimedeltaIndex` subtraction could incorrectly overflow when ``NaT`` is present (:issue:`17791`)
61+
- Bug in :class:`DatetimeIndex` where subtracting a datetime-like from a ``DatetimeIndex`` could silently overflow instead of raising ``OverflowError`` (:issue:`18020`)
6062

6163
Conversion
6264
^^^^^^^^^^
@@ -76,7 +78,8 @@ I/O
7678
^^^
7779

7880
- Bug in :class:`~pandas.io.stata.StataReader` where date/time columns with display formatting were not converted (:issue:`17990`). Previously columns with display formatting were normally left as ordinal numbers and not converted to datetime objects.
79-
81+
- Bug in :func:`read_csv` when reading a compressed UTF-16 encoded file (:issue:`18071`)
82+
- Bug in :meth:`DataFrame.to_csv` when the table had ``MultiIndex`` columns, and a list of strings was passed in for ``header`` (:issue:`5539`)
8083

8184
Plotting
8285
^^^^^^^^
@@ -102,7 +105,7 @@ Sparse
102105
Reshaping
103106
^^^^^^^^^
104107

105-
-
108+
- Error message in ``pd.merge_asof()`` for key datatype mismatch now includes datatype of left and right key (:issue:`18068`)
106109
-
107110
-
108111

@@ -119,7 +122,7 @@ Categorical
119122
- Bug in :meth:`DataFrame.astype` where casting to 'category' on an empty ``DataFrame`` causes a segmentation fault (:issue:`18004`)
120123
- Error messages in the testing module have been improved when items have
121124
different ``CategoricalDtype`` (:issue:`18069`)
122-
-
125+
- ``CategoricalIndex`` can now correctly take a ``pd.api.types.CategoricalDtype`` as its dtype (:issue:`18116`)
123126

124127
Other
125128
^^^^^

doc/source/whatsnew/v0.22.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ Conversion
101101
Indexing
102102
^^^^^^^^
103103

104-
-
104+
- Bug in :meth:`PeriodIndex.truncate` which raises ``TypeError`` when ``PeriodIndex`` is monotonic (:issue:`17717`)
105105
-
106106
-
107107

pandas/_libs/index.pyx

+52-2
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ from tslib cimport _to_i8
1717

1818
from hashtable cimport HashTable
1919

20-
from pandas._libs import algos, hashtable as _hash
20+
from pandas._libs import algos, period as periodlib, hashtable as _hash
2121
from pandas._libs.tslib import Timestamp, Timedelta
2222
from datetime import datetime, timedelta
2323

@@ -270,13 +270,16 @@ cdef class IndexEngine:
270270

271271
values = self._get_index_values()
272272
self.mapping = self._make_hash_table(len(values))
273-
self.mapping.map_locations(values)
273+
self._call_map_locations(values)
274274

275275
if len(self.mapping) == len(values):
276276
self.unique = 1
277277

278278
self.need_unique_check = 0
279279

280+
cpdef _call_map_locations(self, values):
281+
self.mapping.map_locations(values)
282+
280283
def clear_mapping(self):
281284
self.mapping = None
282285
self.need_monotonic_check = 1
@@ -490,6 +493,53 @@ cdef class TimedeltaEngine(DatetimeEngine):
490493
cdef _get_box_dtype(self):
491494
return 'm8[ns]'
492495

496+
497+
cdef class PeriodEngine(Int64Engine):
498+
499+
cdef _get_index_values(self):
500+
return super(PeriodEngine, self).vgetter()
501+
502+
cpdef _call_map_locations(self, values):
503+
super(PeriodEngine, self)._call_map_locations(values.view('i8'))
504+
505+
def _call_monotonic(self, values):
506+
return super(PeriodEngine, self)._call_monotonic(values.view('i8'))
507+
508+
def get_indexer(self, values):
509+
cdef ndarray[int64_t, ndim=1] ordinals
510+
511+
super(PeriodEngine, self)._ensure_mapping_populated()
512+
513+
freq = super(PeriodEngine, self).vgetter().freq
514+
ordinals = periodlib.extract_ordinals(values, freq)
515+
516+
return self.mapping.lookup(ordinals)
517+
518+
def get_pad_indexer(self, other, limit=None):
519+
freq = super(PeriodEngine, self).vgetter().freq
520+
ordinal = periodlib.extract_ordinals(other, freq)
521+
522+
return algos.pad_int64(self._get_index_values(),
523+
np.asarray(ordinal), limit=limit)
524+
525+
def get_backfill_indexer(self, other, limit=None):
526+
freq = super(PeriodEngine, self).vgetter().freq
527+
ordinal = periodlib.extract_ordinals(other, freq)
528+
529+
return algos.backfill_int64(self._get_index_values(),
530+
np.asarray(ordinal), limit=limit)
531+
532+
def get_indexer_non_unique(self, targets):
533+
freq = super(PeriodEngine, self).vgetter().freq
534+
ordinal = periodlib.extract_ordinals(targets, freq)
535+
ordinal_array = np.asarray(ordinal)
536+
537+
return super(PeriodEngine, self).get_indexer_non_unique(ordinal_array)
538+
539+
cdef _get_index_values_for_bool_indexer(self):
540+
return self._get_index_values().view('i8')
541+
542+
493543
cpdef convert_scalar(ndarray arr, object value):
494544
# we don't turn integers
495545
# into datetimes/timedeltas

pandas/_libs/index_class_helper.pxi.in

+4-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ cdef class {{name}}Engine(IndexEngine):
6666
raise KeyError(val)
6767
{{endif}}
6868

69-
values = self._get_index_values()
69+
values = self._get_index_values_for_bool_indexer()
7070
n = len(values)
7171

7272
result = np.empty(n, dtype=bool)
@@ -86,6 +86,9 @@ cdef class {{name}}Engine(IndexEngine):
8686
return last_true
8787

8888
return result
89+
90+
cdef _get_index_values_for_bool_indexer(self):
91+
return self._get_index_values()
8992
{{endif}}
9093

9194
{{endfor}}

pandas/_libs/parsers.pyx

+19-11
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,17 @@ cdef class TextReader:
374374
float_precision=None,
375375
skip_blank_lines=True):
376376

377+
# set encoding for native Python and C library
378+
if encoding is not None:
379+
if not isinstance(encoding, bytes):
380+
encoding = encoding.encode('utf-8')
381+
encoding = encoding.lower()
382+
self.c_encoding = <char*> encoding
383+
else:
384+
self.c_encoding = NULL
385+
386+
self.encoding = encoding
387+
377388
self.parser = parser_new()
378389
self.parser.chunksize = tokenize_chunksize
379390

@@ -495,17 +506,6 @@ cdef class TextReader:
495506
self.parser.double_converter_nogil = NULL
496507
self.parser.double_converter_withgil = round_trip
497508

498-
# encoding
499-
if encoding is not None:
500-
if not isinstance(encoding, bytes):
501-
encoding = encoding.encode('utf-8')
502-
encoding = encoding.lower()
503-
self.c_encoding = <char*> encoding
504-
else:
505-
self.c_encoding = NULL
506-
507-
self.encoding = encoding
508-
509509
if isinstance(dtype, dict):
510510
dtype = {k: pandas_dtype(dtype[k])
511511
for k in dtype}
@@ -684,6 +684,14 @@ cdef class TextReader:
684684
else:
685685
raise ValueError('Unrecognized compression type: %s' %
686686
self.compression)
687+
688+
if b'utf-16' in (self.encoding or b''):
689+
# we need to read utf-16 through UTF8Recoder.
690+
# if source is utf-16, convert source to utf-8 by UTF8Recoder.
691+
source = com.UTF8Recoder(source, self.encoding.decode('utf-8'))
692+
self.encoding = b'utf-8'
693+
self.c_encoding = <char*> self.encoding
694+
687695
self.handle = source
688696

689697
if isinstance(source, basestring):

pandas/core/frame.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -5105,7 +5105,7 @@ def append(self, other, ignore_index=False, verify_integrity=False):
51055105
51065106
>>> df = pd.DataFrame(columns=['A'])
51075107
>>> for i in range(5):
5108-
... df = df.append({'A'}: i}, ignore_index=True)
5108+
... df = df.append({'A': i}, ignore_index=True)
51095109
>>> df
51105110
A
51115111
0 0

pandas/core/indexes/category.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,8 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
7979
if data is not None or categories is None:
8080
cls._scalar_data_error(data)
8181
data = []
82-
data = cls._create_categorical(cls, data, categories, ordered)
82+
data = cls._create_categorical(cls, data, categories, ordered,
83+
dtype)
8384

8485
if copy:
8586
data = data.copy()

pandas/core/indexes/datetimelike.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -679,7 +679,7 @@ def __sub__(self, other):
679679
return self._add_delta(-other)
680680
elif is_integer(other):
681681
return self.shift(-other)
682-
elif isinstance(other, datetime):
682+
elif isinstance(other, (datetime, np.datetime64)):
683683
return self._sub_datelike(other)
684684
elif isinstance(other, Period):
685685
return self._sub_period(other)

pandas/core/indexes/datetimes.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import pandas.core.dtypes.concat as _concat
3030
from pandas.errors import PerformanceWarning
3131
from pandas.core.common import _values_from_object, _maybe_box
32+
from pandas.core.algorithms import checked_add_with_arr
3233

3334
from pandas.core.indexes.base import Index, _index_shared_docs
3435
from pandas.core.indexes.numeric import Int64Index, Float64Index
@@ -767,7 +768,7 @@ def _sub_datelike(self, other):
767768
raise TypeError("DatetimeIndex subtraction must have the same "
768769
"timezones or no timezones")
769770
result = self._sub_datelike_dti(other)
770-
elif isinstance(other, datetime):
771+
elif isinstance(other, (datetime, np.datetime64)):
771772
other = Timestamp(other)
772773
if other is libts.NaT:
773774
result = self._nat_new(box=False)
@@ -777,7 +778,8 @@ def _sub_datelike(self, other):
777778
"timezones or no timezones")
778779
else:
779780
i8 = self.asi8
780-
result = i8 - other.value
781+
result = checked_add_with_arr(i8, -other.value,
782+
arr_mask=self._isnan)
781783
result = self._maybe_mask_results(result,
782784
fill_value=libts.iNaT)
783785
else:

pandas/core/indexes/period.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
import pandas.tseries.offsets as offsets
3232

3333
from pandas._libs.lib import infer_dtype
34-
from pandas._libs import tslib, period
34+
from pandas._libs import tslib, period, index as libindex
3535
from pandas._libs.period import (Period, IncompatibleFrequency,
3636
get_period_field_arr, _validate_end_alias,
3737
_quarter_to_myear)
@@ -192,6 +192,8 @@ class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index):
192192

193193
freq = None
194194

195+
_engine_type = libindex.PeriodEngine
196+
195197
__eq__ = _period_index_cmp('__eq__')
196198
__ne__ = _period_index_cmp('__ne__', nat_result=True)
197199
__lt__ = _period_index_cmp('__lt__')
@@ -275,6 +277,10 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
275277
data = period.extract_ordinals(data, freq)
276278
return cls._from_ordinals(data, name=name, freq=freq)
277279

280+
@cache_readonly
281+
def _engine(self):
282+
return self._engine_type(lambda: self, len(self))
283+
278284
@classmethod
279285
def _generate_range(cls, start, end, periods, freq, fields):
280286
if freq is not None:

pandas/core/indexes/timedeltas.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,8 @@ def _add_datelike(self, other):
361361
else:
362362
other = Timestamp(other)
363363
i8 = self.asi8
364-
result = checked_add_with_arr(i8, other.value)
364+
result = checked_add_with_arr(i8, other.value,
365+
arr_mask=self._isnan)
365366
result = self._maybe_mask_results(result, fill_value=iNaT)
366367
return DatetimeIndex(result, name=self.name, copy=False)
367368

pandas/core/reshape/merge.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -1253,10 +1253,12 @@ def _get_merge_keys(self):
12531253
join_names) = super(_AsOfMerge, self)._get_merge_keys()
12541254

12551255
# validate index types are the same
1256-
for lk, rk in zip(left_join_keys, right_join_keys):
1256+
for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
12571257
if not is_dtype_equal(lk.dtype, rk.dtype):
1258-
raise MergeError("incompatible merge keys, "
1259-
"must be the same type")
1258+
raise MergeError("incompatible merge keys [{i}] {lkdtype} and "
1259+
"{rkdtype}, must be the same type"
1260+
.format(i=i, lkdtype=lk.dtype,
1261+
rkdtype=rk.dtype))
12601262

12611263
# validate tolerance; must be a Timedelta if we have a DTI
12621264
if self.tolerance is not None:

pandas/io/formats/format.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1695,7 +1695,7 @@ def _save_header(self):
16951695
else:
16961696
encoded_labels = []
16971697

1698-
if not has_mi_columns:
1698+
if not has_mi_columns or has_aliases:
16991699
encoded_labels += list(write_cols)
17001700
writer.writerow(encoded_labels)
17011701
else:

pandas/io/parsers.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -1431,7 +1431,6 @@ def ix(col):
14311431
if not isinstance(col, compat.string_types):
14321432
return col
14331433
raise ValueError('Index %s invalid' % col)
1434-
index = None
14351434

14361435
to_remove = []
14371436
index = []
@@ -1462,8 +1461,6 @@ def _get_name(icol):
14621461
if i == icol:
14631462
return c
14641463

1465-
index = None
1466-
14671464
to_remove = []
14681465
index = []
14691466
for idx in self.index_col:
@@ -1484,7 +1481,7 @@ def _agg_index(self, index, try_parse_dates=True):
14841481

14851482
for i, arr in enumerate(index):
14861483

1487-
if (try_parse_dates and self._should_parse_dates(i)):
1484+
if try_parse_dates and self._should_parse_dates(i):
14881485
arr = self._date_conv(arr)
14891486

14901487
col_na_values = self.na_values
@@ -1671,7 +1668,9 @@ def __init__(self, src, **kwds):
16711668

16721669
ParserBase.__init__(self, kwds)
16731670

1674-
if 'utf-16' in (kwds.get('encoding') or ''):
1671+
if (kwds.get('compression') is None
1672+
and 'utf-16' in (kwds.get('encoding') or '')):
1673+
# if source is utf-16 plain text, convert source to utf-8
16751674
if isinstance(src, compat.string_types):
16761675
src = open(src, 'rb')
16771676
self.handles.append(src)

pandas/tests/frame/test_to_csv.py

+13
Original file line numberDiff line numberDiff line change
@@ -1203,3 +1203,16 @@ def test_period_index_date_overflow(self):
12031203

12041204
expected = ',0\n1990-01-01,4\n,5\n3005-01-01,6\n'
12051205
assert result == expected
1206+
1207+
def test_multi_index_header(self):
1208+
# see gh-5539
1209+
columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2),
1210+
("b", 1), ("b", 2)])
1211+
df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]])
1212+
df.columns = columns
1213+
1214+
header = ["a", "b", "c", "d"]
1215+
result = df.to_csv(header=header)
1216+
1217+
expected = ",a,b,c,d\n0,1,2,3,4\n1,5,6,7,8\n"
1218+
assert result == expected

0 commit comments

Comments
 (0)