Merge branch 'master' into pandas-dev#17778

kchomski · kchomski · commit 5896f93a48e9 · 2017-11-06T19:54:43.000+01:00
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -89,6 +89,7 @@ Bug Fixes
 
 - Bug in ``pd.read_msgpack()`` with a non existent file is passed in Python 2 (:issue:`15296`)
 - Bug in ``DataFrame.groupby`` where key as tuple in a ``MultiIndex`` were interpreted as a list of keys (:issue:`17979`)
+- Bug in :func:`pd.read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`)
 - Bug in ``DataFrame.to_pickle()`` fails for .zip format (:issue:`17778`)
 
 Conversion
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -48,9 +48,8 @@ from cpython.datetime cimport (PyDateTime_Check, PyDate_Check,
                                PyTime_Check, PyDelta_Check,
                                PyDateTime_IMPORT)
 PyDateTime_IMPORT
-# this is our tseries.pxd
-from datetime cimport get_timedelta64_value, get_datetime64_value
 
+from tslibs.np_datetime cimport get_timedelta64_value, get_datetime64_value
 
 from tslib cimport _check_all_nulls
 import tslib
diff --git a/pandas/_libs/src/datetime.pxd b/pandas/_libs/src/datetime.pxd
@@ -1,52 +1,20 @@
 # cython: profile=False
-from numpy cimport int64_t, int32_t, npy_int64, npy_int32, ndarray
-from cpython cimport PyObject
+from numpy cimport int64_t, npy_int64, npy_int32
 
 from cpython cimport PyUnicode_Check, PyUnicode_AsASCIIString
 
 
-cdef extern from "datetime.h":
-
-    ctypedef class datetime.date [object PyDateTime_Date]:
-        pass
-
-    ctypedef class datetime.datetime [object PyDateTime_DateTime]:
-        pass
-
-    ctypedef class datetime.timedelta [object PyDateTime_Delta]:
-        pass
-
-    void PyDateTime_IMPORT()
-
-    int PyDateTime_GET_YEAR(date)
-    int PyDateTime_GET_MONTH(date)
-    int PyDateTime_GET_DAY(date)
-    int PyDateTime_DATE_GET_HOUR(object o)
-    int PyDateTime_DATE_GET_MINUTE(object o)
-    int PyDateTime_DATE_GET_SECOND(object o)
-    int PyDateTime_DATE_GET_MICROSECOND(object o)
-    int PyDateTime_TIME_GET_HOUR(object o)
-    int PyDateTime_TIME_GET_MINUTE(object o)
-    int PyDateTime_TIME_GET_SECOND(object o)
-    int PyDateTime_TIME_GET_MICROSECOND(object o)
-    bint PyDateTime_Check(object o)
-    bint PyDate_Check(object o)
-    bint PyTime_Check(object o)
-    bint PyDelta_Check(object o)
-    object PyDateTime_FromDateAndTime(int year, int month, int day, int hour,
-                                      int minute, int second, int us)
-
 cdef extern from "numpy/ndarrayobject.h":
 
     ctypedef int64_t npy_timedelta
     ctypedef int64_t npy_datetime
 
     ctypedef enum NPY_CASTING:
-            NPY_NO_CASTING
-            NPY_EQUIV_CASTING
-            NPY_SAFE_CASTING
-            NPY_SAME_KIND_CASTING
-            NPY_UNSAFE_CASTING
+        NPY_NO_CASTING
+        NPY_EQUIV_CASTING
+        NPY_SAFE_CASTING
+        NPY_SAME_KIND_CASTING
+        NPY_UNSAFE_CASTING
 
 
 cdef extern from "numpy_helper.h":
@@ -79,9 +47,6 @@ cdef extern from "datetime/np_datetime.h":
         npy_int64 year
         npy_int32 month, day, hour, min, sec, us, ps, as
 
-    int cmp_pandas_datetimestruct(pandas_datetimestruct *a,
-                                  pandas_datetimestruct *b)
-
     npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr,
                                                    pandas_datetimestruct *d) nogil
     void pandas_datetime_to_datetimestruct(npy_datetime val,
@@ -102,8 +67,6 @@ cdef extern from "datetime/np_datetime_strings.h":
                                 PANDAS_DATETIMEUNIT *out_bestunit,
                                 npy_bool *out_special)
 
-    # int parse_python_string(object obj, pandas_datetimestruct *out) except -1
-
 
 
 
@@ -134,17 +97,3 @@ cdef inline int _cstring_to_dts(char *val, int length,
                                      NPY_UNSAFE_CASTING,
                                      dts, out_local, out_tzoffset, &out_bestunit, &special)
     return result
-
-
-cdef inline bint check_dts_bounds(pandas_datetimestruct *dts):
-    """Returns True if an error needs to be raised"""
-    cdef:
-        bint error = False
-
-    if (dts.year <= 1677 and
-            cmp_pandas_datetimestruct(dts, &_NS_MIN_DTS) == -1):
-        error = True
-    elif (dts.year >= 2262 and
-          cmp_pandas_datetimestruct(dts, &_NS_MAX_DTS) == 1):
-        error = True
-    return error
diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
@@ -41,12 +41,8 @@ PyDateTime_IMPORT
 from datetime cimport (
     pandas_datetime_to_datetimestruct,
     days_per_month_table,
-    get_datetime64_value,
-    get_timedelta64_value,
-    get_datetime64_unit,
     PANDAS_DATETIMEUNIT,
     _string_to_dts,
-    npy_datetime,
     is_leapyear,
     dayofweek,
     PANDAS_FR_ns)
@@ -59,7 +55,10 @@ from tslibs.np_datetime cimport (check_dts_bounds,
                                  cmp_scalar,
                                  pandas_datetimestruct,
                                  dt64_to_dtstruct, dtstruct_to_dt64,
-                                 pydatetime_to_dt64, pydate_to_dt64)
+                                 pydatetime_to_dt64, pydate_to_dt64,
+                                 npy_datetime,
+                                 get_datetime64_unit, get_datetime64_value,
+                                 get_timedelta64_value)
 from tslibs.np_datetime import OutOfBoundsDatetime
 
 from khash cimport (
diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd
@@ -5,12 +5,46 @@ from cpython.datetime cimport date, datetime
 
 from numpy cimport int64_t, int32_t
 
+cdef extern from "numpy/ndarrayobject.h":
+    ctypedef int64_t npy_timedelta
+    ctypedef int64_t npy_datetime
+
+cdef extern from "numpy/ndarraytypes.h":
+    ctypedef struct PyArray_DatetimeMetaData:
+        PANDAS_DATETIMEUNIT base
+        int64_t num
+
+cdef extern from "numpy/arrayscalars.h":
+    ctypedef struct PyDatetimeScalarObject:
+        # PyObject_HEAD
+        npy_datetime obval
+        PyArray_DatetimeMetaData obmeta
+
+    ctypedef struct PyTimedeltaScalarObject:
+        # PyObject_HEAD
+        npy_timedelta obval
+        PyArray_DatetimeMetaData obmeta
 
 cdef extern from "../src/datetime/np_datetime.h":
     ctypedef struct pandas_datetimestruct:
         int64_t year
         int32_t month, day, hour, min, sec, us, ps, as
 
+    ctypedef enum PANDAS_DATETIMEUNIT:
+        PANDAS_FR_Y
+        PANDAS_FR_M
+        PANDAS_FR_W
+        PANDAS_FR_D
+        PANDAS_FR_B
+        PANDAS_FR_h
+        PANDAS_FR_m
+        PANDAS_FR_s
+        PANDAS_FR_ms
+        PANDAS_FR_us
+        PANDAS_FR_ns
+        PANDAS_FR_ps
+        PANDAS_FR_fs
+        PANDAS_FR_as
 
 cdef int reverse_ops[6]
 
@@ -23,3 +57,7 @@ cdef void dt64_to_dtstruct(int64_t dt64, pandas_datetimestruct* out) nogil
 
 cdef int64_t pydatetime_to_dt64(datetime val, pandas_datetimestruct *dts)
 cdef int64_t pydate_to_dt64(date val, pandas_datetimestruct *dts)
+
+cdef npy_datetime get_datetime64_value(object obj) nogil
+cdef npy_timedelta get_timedelta64_value(object obj) nogil
+cdef PANDAS_DATETIMEUNIT get_datetime64_unit(object obj) nogil
diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx
@@ -14,27 +14,7 @@ PyDateTime_IMPORT
 
 from numpy cimport int64_t
 
-cdef extern from "numpy/ndarrayobject.h":
-    ctypedef int64_t npy_timedelta
-    ctypedef int64_t npy_datetime
-
 cdef extern from "../src/datetime/np_datetime.h":
-    ctypedef enum PANDAS_DATETIMEUNIT:
-        PANDAS_FR_Y
-        PANDAS_FR_M
-        PANDAS_FR_W
-        PANDAS_FR_D
-        PANDAS_FR_B
-        PANDAS_FR_h
-        PANDAS_FR_m
-        PANDAS_FR_s
-        PANDAS_FR_ms
-        PANDAS_FR_us
-        PANDAS_FR_ns
-        PANDAS_FR_ps
-        PANDAS_FR_fs
-        PANDAS_FR_as
-
     int cmp_pandas_datetimestruct(pandas_datetimestruct *a,
                                   pandas_datetimestruct *b)
 
@@ -48,6 +28,32 @@ cdef extern from "../src/datetime/np_datetime.h":
 
     pandas_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS
 
+# ----------------------------------------------------------------------
+# numpy object inspection
+
+cdef inline npy_datetime get_datetime64_value(object obj) nogil:
+    """
+    returns the int64 value underlying scalar numpy datetime64 object
+
+    Note that to interpret this as a datetime, the corresponding unit is
+    also needed.  That can be found using `get_datetime64_unit`.
+    """
+    return (<PyDatetimeScalarObject*>obj).obval
+
+
+cdef inline npy_timedelta get_timedelta64_value(object obj) nogil:
+    """
+    returns the int64 value underlying scalar numpy timedelta64 object
+    """
+    return (<PyTimedeltaScalarObject*>obj).obval
+
+
+cdef inline PANDAS_DATETIMEUNIT get_datetime64_unit(object obj) nogil:
+    """
+    returns the unit part of the dtype for a numpy datetime64 object.
+    """
+    return <PANDAS_DATETIMEUNIT>(<PyDatetimeScalarObject*>obj).obmeta.base
+
 # ----------------------------------------------------------------------
 # Comparison
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -1106,6 +1106,24 @@ def _is_index_col(col):
     return col is not None and col is not False
 
 
+def _is_potential_multi_index(columns):
+    """
+    Check whether or not the `columns` parameter
+    could be converted into a MultiIndex.
+
+    Parameters
+    ----------
+    columns : array-like
+        Object which may or may not be convertible into a MultiIndex
+
+    Returns
+    -------
+    boolean : Whether or not columns could become a MultiIndex
+    """
+    return (len(columns) and not isinstance(columns, MultiIndex) and
+            all([isinstance(c, tuple) for c in columns]))
+
+
 def _evaluate_usecols(usecols, names):
     """
     Check whether or not the 'usecols' parameter
@@ -1374,14 +1392,18 @@ def _maybe_dedup_names(self, names):
         if self.mangle_dupe_cols:
             names = list(names)  # so we can index
             counts = defaultdict(int)
+            is_potential_mi = _is_potential_multi_index(names)
 
             for i, col in enumerate(names):
                 cur_count = counts[col]
 
                 while cur_count > 0:
                     counts[col] = cur_count + 1
 
-                    col = '%s.%d' % (col, cur_count)
+                    if is_potential_mi:
+                        col = col[:-1] + ('%s.%d' % (col[-1], cur_count),)
+                    else:
+                        col = '%s.%d' % (col, cur_count)
                     cur_count = counts[col]
 
                 names[i] = col
@@ -1391,9 +1413,7 @@ def _maybe_dedup_names(self, names):
 
     def _maybe_make_multi_index_columns(self, columns, col_names=None):
         # possibly create a column mi here
-        if (not self.tupleize_cols and len(columns) and
-                not isinstance(columns, MultiIndex) and
-                all([isinstance(c, tuple) for c in columns])):
+        if _is_potential_multi_index(columns):
             columns = MultiIndex.from_tuples(columns, names=col_names)
         return columns
 
diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py
@@ -290,11 +290,11 @@ def test_empty_header_read(count):
             test_empty_header_read(count)
 
     def test_parse_trim_buffers(self):
-        # This test is part of a bugfix for issue #13703. It attmepts to
+        # This test is part of a bugfix for issue #13703. It attempts
         # to stress the system memory allocator, to cause it to move the
         # stream buffer and either let the OS reclaim the region, or let
         # other memory requests of parser otherwise modify the contents
-        # of memory space, where it was formely located.
+        # of memory space, where it was formerly located.
         # This test is designed to cause a `segfault` with unpatched
         # `tokenizer.c`. Sometimes the test fails on `segfault`, other
         # times it fails due to memory corruption, which causes the
@@ -346,7 +346,7 @@ def test_parse_trim_buffers(self):
 
         # Generate the expected output: manually create the dataframe
         # by splitting by comma and repeating the `n_lines` times.
-        row = tuple(val_ if val_ else float("nan")
+        row = tuple(val_ if val_ else np.nan
                     for val_ in record_.split(","))
         expected = pd.DataFrame([row for _ in range(n_lines)],
                                 dtype=object, columns=None, index=None)
@@ -359,6 +359,15 @@ def test_parse_trim_buffers(self):
         # Check for data corruption if there was no segfault
         tm.assert_frame_equal(result, expected)
 
+        # This extra test was added to replicate the fault in gh-5291.
+        # Force 'utf-8' encoding, so that `_string_convert` would take
+        # a different execution branch.
+        chunks_ = self.read_csv(StringIO(csv_data), header=None,
+                                dtype=object, chunksize=chunksize,
+                                encoding='utf_8')
+        result = pd.concat(chunks_, axis=0, ignore_index=True)
+        tm.assert_frame_equal(result, expected)
+
     def test_internal_null_byte(self):
         # see gh-14012
         #
diff --git a/pandas/tests/io/parser/header.py b/pandas/tests/io/parser/header.py
@@ -290,3 +290,30 @@ def test_singleton_header(self):
         df = self.read_csv(StringIO(data), header=[0])
         expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
         tm.assert_frame_equal(df, expected)
+
+    def test_mangles_multi_index(self):
+        # See GH 18062
+        data = """A,A,A,B\none,one,one,two\n0,40,34,0.1"""
+        df = self.read_csv(StringIO(data), header=[0, 1])
+        expected = DataFrame([[0, 40, 34, 0.1]],
+                             columns=MultiIndex.from_tuples(
+                                 [('A', 'one'), ('A', 'one.1'),
+                                  ('A', 'one.2'), ('B', 'two')]))
+        tm.assert_frame_equal(df, expected)
+
+        data = """A,A,A,B\none,one,one.1,two\n0,40,34,0.1"""
+        df = self.read_csv(StringIO(data), header=[0, 1])
+        expected = DataFrame([[0, 40, 34, 0.1]],
+                             columns=MultiIndex.from_tuples(
+                                 [('A', 'one'), ('A', 'one.1'),
+                                  ('A', 'one.1.1'), ('B', 'two')]))
+        tm.assert_frame_equal(df, expected)
+
+        data = """A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1"""
+        df = self.read_csv(StringIO(data), header=[0, 1])
+        expected = DataFrame([[0, 40, 34, 0.1, 0.1]],
+                             columns=MultiIndex.from_tuples(
+                                 [('A', 'one'), ('A', 'one.1'),
+                                  ('A', 'one.1.1'), ('B', 'two'),
+                                  ('B', 'two.1')]))
+        tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/io/parser/parse_dates.py b/pandas/tests/io/parser/parse_dates.py
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py