Skip to content

Commit 5896f93

Browse files
committed
Merge branch 'master' into pandas-dev#17778
2 parents f70c68e + e23bd24 commit 5896f93

File tree

11 files changed

+248
-91
lines changed

11 files changed

+248
-91
lines changed

doc/source/whatsnew/v0.22.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ Bug Fixes
8989

9090
- Bug in ``pd.read_msgpack()`` when a non-existent file is passed in Python 2 (:issue:`15296`)
9191
- Bug in ``DataFrame.groupby`` where a tuple key in a ``MultiIndex`` was interpreted as a list of keys (:issue:`17979`)
92+
- Bug in :func:`pd.read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)
9293
- Bug where ``DataFrame.to_pickle()`` failed for the ``.zip`` format (:issue:`17778`)
9394

9495
Conversion

pandas/_libs/lib.pyx

+1-2
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,8 @@ from cpython.datetime cimport (PyDateTime_Check, PyDate_Check,
4848
PyTime_Check, PyDelta_Check,
4949
PyDateTime_IMPORT)
5050
PyDateTime_IMPORT
51-
# this is our tseries.pxd
52-
from datetime cimport get_timedelta64_value, get_datetime64_value
5351

52+
from tslibs.np_datetime cimport get_timedelta64_value, get_datetime64_value
5453

5554
from tslib cimport _check_all_nulls
5655
import tslib

pandas/_libs/src/datetime.pxd

+6-57
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,20 @@
11
# cython: profile=False
2-
from numpy cimport int64_t, int32_t, npy_int64, npy_int32, ndarray
3-
from cpython cimport PyObject
2+
from numpy cimport int64_t, npy_int64, npy_int32
43

54
from cpython cimport PyUnicode_Check, PyUnicode_AsASCIIString
65

76

8-
cdef extern from "datetime.h":
9-
10-
ctypedef class datetime.date [object PyDateTime_Date]:
11-
pass
12-
13-
ctypedef class datetime.datetime [object PyDateTime_DateTime]:
14-
pass
15-
16-
ctypedef class datetime.timedelta [object PyDateTime_Delta]:
17-
pass
18-
19-
void PyDateTime_IMPORT()
20-
21-
int PyDateTime_GET_YEAR(date)
22-
int PyDateTime_GET_MONTH(date)
23-
int PyDateTime_GET_DAY(date)
24-
int PyDateTime_DATE_GET_HOUR(object o)
25-
int PyDateTime_DATE_GET_MINUTE(object o)
26-
int PyDateTime_DATE_GET_SECOND(object o)
27-
int PyDateTime_DATE_GET_MICROSECOND(object o)
28-
int PyDateTime_TIME_GET_HOUR(object o)
29-
int PyDateTime_TIME_GET_MINUTE(object o)
30-
int PyDateTime_TIME_GET_SECOND(object o)
31-
int PyDateTime_TIME_GET_MICROSECOND(object o)
32-
bint PyDateTime_Check(object o)
33-
bint PyDate_Check(object o)
34-
bint PyTime_Check(object o)
35-
bint PyDelta_Check(object o)
36-
object PyDateTime_FromDateAndTime(int year, int month, int day, int hour,
37-
int minute, int second, int us)
38-
397
cdef extern from "numpy/ndarrayobject.h":
408

419
ctypedef int64_t npy_timedelta
4210
ctypedef int64_t npy_datetime
4311

4412
ctypedef enum NPY_CASTING:
45-
NPY_NO_CASTING
46-
NPY_EQUIV_CASTING
47-
NPY_SAFE_CASTING
48-
NPY_SAME_KIND_CASTING
49-
NPY_UNSAFE_CASTING
13+
NPY_NO_CASTING
14+
NPY_EQUIV_CASTING
15+
NPY_SAFE_CASTING
16+
NPY_SAME_KIND_CASTING
17+
NPY_UNSAFE_CASTING
5018

5119

5220
cdef extern from "numpy_helper.h":
@@ -79,9 +47,6 @@ cdef extern from "datetime/np_datetime.h":
7947
npy_int64 year
8048
npy_int32 month, day, hour, min, sec, us, ps, as
8149

82-
int cmp_pandas_datetimestruct(pandas_datetimestruct *a,
83-
pandas_datetimestruct *b)
84-
8550
npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr,
8651
pandas_datetimestruct *d) nogil
8752
void pandas_datetime_to_datetimestruct(npy_datetime val,
@@ -102,8 +67,6 @@ cdef extern from "datetime/np_datetime_strings.h":
10267
PANDAS_DATETIMEUNIT *out_bestunit,
10368
npy_bool *out_special)
10469

105-
# int parse_python_string(object obj, pandas_datetimestruct *out) except -1
106-
10770

10871

10972

@@ -134,17 +97,3 @@ cdef inline int _cstring_to_dts(char *val, int length,
13497
NPY_UNSAFE_CASTING,
13598
dts, out_local, out_tzoffset, &out_bestunit, &special)
13699
return result
137-
138-
139-
cdef inline bint check_dts_bounds(pandas_datetimestruct *dts):
140-
"""Returns True if an error needs to be raised"""
141-
cdef:
142-
bint error = False
143-
144-
if (dts.year <= 1677 and
145-
cmp_pandas_datetimestruct(dts, &_NS_MIN_DTS) == -1):
146-
error = True
147-
elif (dts.year >= 2262 and
148-
cmp_pandas_datetimestruct(dts, &_NS_MAX_DTS) == 1):
149-
error = True
150-
return error

pandas/_libs/tslib.pyx

+4-5
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,8 @@ PyDateTime_IMPORT
4141
from datetime cimport (
4242
pandas_datetime_to_datetimestruct,
4343
days_per_month_table,
44-
get_datetime64_value,
45-
get_timedelta64_value,
46-
get_datetime64_unit,
4744
PANDAS_DATETIMEUNIT,
4845
_string_to_dts,
49-
npy_datetime,
5046
is_leapyear,
5147
dayofweek,
5248
PANDAS_FR_ns)
@@ -59,7 +55,10 @@ from tslibs.np_datetime cimport (check_dts_bounds,
5955
cmp_scalar,
6056
pandas_datetimestruct,
6157
dt64_to_dtstruct, dtstruct_to_dt64,
62-
pydatetime_to_dt64, pydate_to_dt64)
58+
pydatetime_to_dt64, pydate_to_dt64,
59+
npy_datetime,
60+
get_datetime64_unit, get_datetime64_value,
61+
get_timedelta64_value)
6362
from tslibs.np_datetime import OutOfBoundsDatetime
6463

6564
from khash cimport (

pandas/_libs/tslibs/np_datetime.pxd

+38
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,46 @@ from cpython.datetime cimport date, datetime
55

66
from numpy cimport int64_t, int32_t
77

8+
cdef extern from "numpy/ndarrayobject.h":
9+
ctypedef int64_t npy_timedelta
10+
ctypedef int64_t npy_datetime
11+
12+
cdef extern from "numpy/ndarraytypes.h":
13+
ctypedef struct PyArray_DatetimeMetaData:
14+
PANDAS_DATETIMEUNIT base
15+
int64_t num
16+
17+
cdef extern from "numpy/arrayscalars.h":
18+
ctypedef struct PyDatetimeScalarObject:
19+
# PyObject_HEAD
20+
npy_datetime obval
21+
PyArray_DatetimeMetaData obmeta
22+
23+
ctypedef struct PyTimedeltaScalarObject:
24+
# PyObject_HEAD
25+
npy_timedelta obval
26+
PyArray_DatetimeMetaData obmeta
827

928
cdef extern from "../src/datetime/np_datetime.h":
1029
ctypedef struct pandas_datetimestruct:
1130
int64_t year
1231
int32_t month, day, hour, min, sec, us, ps, as
1332

33+
ctypedef enum PANDAS_DATETIMEUNIT:
34+
PANDAS_FR_Y
35+
PANDAS_FR_M
36+
PANDAS_FR_W
37+
PANDAS_FR_D
38+
PANDAS_FR_B
39+
PANDAS_FR_h
40+
PANDAS_FR_m
41+
PANDAS_FR_s
42+
PANDAS_FR_ms
43+
PANDAS_FR_us
44+
PANDAS_FR_ns
45+
PANDAS_FR_ps
46+
PANDAS_FR_fs
47+
PANDAS_FR_as
1448

1549
cdef int reverse_ops[6]
1650

@@ -23,3 +57,7 @@ cdef void dt64_to_dtstruct(int64_t dt64, pandas_datetimestruct* out) nogil
2357

2458
cdef int64_t pydatetime_to_dt64(datetime val, pandas_datetimestruct *dts)
2559
cdef int64_t pydate_to_dt64(date val, pandas_datetimestruct *dts)
60+
61+
cdef npy_datetime get_datetime64_value(object obj) nogil
62+
cdef npy_timedelta get_timedelta64_value(object obj) nogil
63+
cdef PANDAS_DATETIMEUNIT get_datetime64_unit(object obj) nogil

pandas/_libs/tslibs/np_datetime.pyx

+26-20
Original file line numberDiff line numberDiff line change
@@ -14,27 +14,7 @@ PyDateTime_IMPORT
1414

1515
from numpy cimport int64_t
1616

17-
cdef extern from "numpy/ndarrayobject.h":
18-
ctypedef int64_t npy_timedelta
19-
ctypedef int64_t npy_datetime
20-
2117
cdef extern from "../src/datetime/np_datetime.h":
22-
ctypedef enum PANDAS_DATETIMEUNIT:
23-
PANDAS_FR_Y
24-
PANDAS_FR_M
25-
PANDAS_FR_W
26-
PANDAS_FR_D
27-
PANDAS_FR_B
28-
PANDAS_FR_h
29-
PANDAS_FR_m
30-
PANDAS_FR_s
31-
PANDAS_FR_ms
32-
PANDAS_FR_us
33-
PANDAS_FR_ns
34-
PANDAS_FR_ps
35-
PANDAS_FR_fs
36-
PANDAS_FR_as
37-
3818
int cmp_pandas_datetimestruct(pandas_datetimestruct *a,
3919
pandas_datetimestruct *b)
4020

@@ -48,6 +28,32 @@ cdef extern from "../src/datetime/np_datetime.h":
4828

4929
pandas_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS
5030

31+
# ----------------------------------------------------------------------
32+
# numpy object inspection
33+
34+
cdef inline npy_datetime get_datetime64_value(object obj) nogil:
35+
"""
36+
returns the int64 value underlying scalar numpy datetime64 object
37+
38+
Note that to interpret this as a datetime, the corresponding unit is
39+
also needed. That can be found using `get_datetime64_unit`.
40+
"""
41+
return (<PyDatetimeScalarObject*>obj).obval
42+
43+
44+
cdef inline npy_timedelta get_timedelta64_value(object obj) nogil:
45+
"""
46+
returns the int64 value underlying scalar numpy timedelta64 object
47+
"""
48+
return (<PyTimedeltaScalarObject*>obj).obval
49+
50+
51+
cdef inline PANDAS_DATETIMEUNIT get_datetime64_unit(object obj) nogil:
52+
"""
53+
returns the unit part of the dtype for a numpy datetime64 object.
54+
"""
55+
return <PANDAS_DATETIMEUNIT>(<PyDatetimeScalarObject*>obj).obmeta.base
56+
5157
# ----------------------------------------------------------------------
5258
# Comparison
5359

pandas/io/parsers.py

+24-4
Original file line numberDiff line numberDiff line change
@@ -1106,6 +1106,24 @@ def _is_index_col(col):
11061106
return col is not None and col is not False
11071107

11081108

1109+
def _is_potential_multi_index(columns):
1110+
"""
1111+
Check whether or not the `columns` parameter
1112+
could be converted into a MultiIndex.
1113+
1114+
Parameters
1115+
----------
1116+
columns : array-like
1117+
Object which may or may not be convertible into a MultiIndex
1118+
1119+
Returns
1120+
-------
1121+
boolean : Whether or not columns could become a MultiIndex
1122+
"""
1123+
return (len(columns) and not isinstance(columns, MultiIndex) and
1124+
all([isinstance(c, tuple) for c in columns]))
1125+
1126+
11091127
def _evaluate_usecols(usecols, names):
11101128
"""
11111129
Check whether or not the 'usecols' parameter
@@ -1374,14 +1392,18 @@ def _maybe_dedup_names(self, names):
13741392
if self.mangle_dupe_cols:
13751393
names = list(names) # so we can index
13761394
counts = defaultdict(int)
1395+
is_potential_mi = _is_potential_multi_index(names)
13771396

13781397
for i, col in enumerate(names):
13791398
cur_count = counts[col]
13801399

13811400
while cur_count > 0:
13821401
counts[col] = cur_count + 1
13831402

1384-
col = '%s.%d' % (col, cur_count)
1403+
if is_potential_mi:
1404+
col = col[:-1] + ('%s.%d' % (col[-1], cur_count),)
1405+
else:
1406+
col = '%s.%d' % (col, cur_count)
13851407
cur_count = counts[col]
13861408

13871409
names[i] = col
@@ -1391,9 +1413,7 @@ def _maybe_dedup_names(self, names):
13911413

13921414
def _maybe_make_multi_index_columns(self, columns, col_names=None):
13931415
# possibly create a column mi here
1394-
if (not self.tupleize_cols and len(columns) and
1395-
not isinstance(columns, MultiIndex) and
1396-
all([isinstance(c, tuple) for c in columns])):
1416+
if _is_potential_multi_index(columns):
13971417
columns = MultiIndex.from_tuples(columns, names=col_names)
13981418
return columns
13991419

pandas/tests/io/parser/c_parser_only.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -290,11 +290,11 @@ def test_empty_header_read(count):
290290
test_empty_header_read(count)
291291

292292
def test_parse_trim_buffers(self):
293-
# This test is part of a bugfix for issue #13703. It attmepts to
293+
# This test is part of a bugfix for issue #13703. It attempts to
294294
# stress the system memory allocator, to cause it to move the
295295
# stream buffer and either let the OS reclaim the region, or let
296296
# other memory requests of parser otherwise modify the contents
297-
# of memory space, where it was formely located.
297+
# of memory space, where it was formerly located.
298298
# This test is designed to cause a `segfault` with unpatched
299299
# `tokenizer.c`. Sometimes the test fails on `segfault`, other
300300
# times it fails due to memory corruption, which causes the
@@ -346,7 +346,7 @@ def test_parse_trim_buffers(self):
346346

347347
# Generate the expected output: manually create the dataframe
348348
# by splitting by comma and repeating the `n_lines` times.
349-
row = tuple(val_ if val_ else float("nan")
349+
row = tuple(val_ if val_ else np.nan
350350
for val_ in record_.split(","))
351351
expected = pd.DataFrame([row for _ in range(n_lines)],
352352
dtype=object, columns=None, index=None)
@@ -359,6 +359,15 @@ def test_parse_trim_buffers(self):
359359
# Check for data corruption if there was no segfault
360360
tm.assert_frame_equal(result, expected)
361361

362+
# This extra test was added to replicate the fault in gh-5291.
363+
# Force 'utf-8' encoding, so that `_string_convert` would take
364+
# a different execution branch.
365+
chunks_ = self.read_csv(StringIO(csv_data), header=None,
366+
dtype=object, chunksize=chunksize,
367+
encoding='utf_8')
368+
result = pd.concat(chunks_, axis=0, ignore_index=True)
369+
tm.assert_frame_equal(result, expected)
370+
362371
def test_internal_null_byte(self):
363372
# see gh-14012
364373
#

pandas/tests/io/parser/header.py

+27
Original file line numberDiff line numberDiff line change
@@ -290,3 +290,30 @@ def test_singleton_header(self):
290290
df = self.read_csv(StringIO(data), header=[0])
291291
expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
292292
tm.assert_frame_equal(df, expected)
293+
294+
def test_mangles_multi_index(self):
295+
# See GH 18062
296+
data = """A,A,A,B\none,one,one,two\n0,40,34,0.1"""
297+
df = self.read_csv(StringIO(data), header=[0, 1])
298+
expected = DataFrame([[0, 40, 34, 0.1]],
299+
columns=MultiIndex.from_tuples(
300+
[('A', 'one'), ('A', 'one.1'),
301+
('A', 'one.2'), ('B', 'two')]))
302+
tm.assert_frame_equal(df, expected)
303+
304+
data = """A,A,A,B\none,one,one.1,two\n0,40,34,0.1"""
305+
df = self.read_csv(StringIO(data), header=[0, 1])
306+
expected = DataFrame([[0, 40, 34, 0.1]],
307+
columns=MultiIndex.from_tuples(
308+
[('A', 'one'), ('A', 'one.1'),
309+
('A', 'one.1.1'), ('B', 'two')]))
310+
tm.assert_frame_equal(df, expected)
311+
312+
data = """A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1"""
313+
df = self.read_csv(StringIO(data), header=[0, 1])
314+
expected = DataFrame([[0, 40, 34, 0.1, 0.1]],
315+
columns=MultiIndex.from_tuples(
316+
[('A', 'one'), ('A', 'one.1'),
317+
('A', 'one.1.1'), ('B', 'two'),
318+
('B', 'two.1')]))
319+
tm.assert_frame_equal(df, expected)

0 commit comments

Comments
 (0)