
Commit 03b4997

Merge pull request #4388 from jreback/GH4377
BUG: fixes for GH4377
2 parents: 8f387f6 + 40d08df

File tree

6 files changed, +97 -43 lines


doc/source/release.rst

+4
@@ -83,6 +83,10 @@ pandas 0.13
 - In ``to_json``, raise if a passed ``orient`` would cause loss of data because
   of a duplicate index (:issue:`4359`)
 - Fixed passing ``keep_default_na=False`` when ``na_values=None`` (:issue:`4318`)
+- Fixed bug with ``values`` raising an error on a DataFrame with duplicate columns and mixed
+  dtypes, surfaced in (:issue:`4377`)
+- Fixed bug with duplicate columns and type conversion in ``read_json`` when
+  ``orient='split'`` (:issue:`4377`)

 pandas 0.12
 ===========
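The ``read_json`` entry is the headline of GH4377: a frame whose column labels repeat now survives a ``to_json``/``read_json`` round trip with ``orient='split'``. A minimal sketch of the fixed behaviour, assuming a pandas build that includes this commit (it mirrors the test added below in ``test_pandas.py``):

```python
import pandas as pd
from pandas.util.testing import assert_frame_equal  # pandas.testing on modern versions

# duplicate column labels with mixed dtypes, as in the new test cases
df = pd.DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])

# round-trip through orient='split'; with the fix the frame comes back intact
result = pd.read_json(df.to_json(orient='split'), orient='split', convert_dates=['x'])
assert_frame_equal(result, df)
```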

doc/source/v0.13.0.txt

-3
@@ -61,9 +61,6 @@ Bug Fixes
 - Fixed bug where ``network`` testing was throwing ``NameError`` because a
   local variable was undefined (:issue:`4381`)

-- In ``to_json``, raise if a passed ``orient`` would cause loss of data because
-  of a duplicate index (:issue:`4359`)
-
 - Suppressed DeprecationWarning associated with internal calls issued by repr() (:issue:`4391`)

 See the :ref:`full release notes

pandas/core/internals.py

+9 -9
@@ -1538,23 +1538,23 @@ def _interleave(self, items):
         # By construction, all of the item should be covered by one of the
         # blocks
         if items.is_unique:
+
             for block in self.blocks:
                 indexer = items.get_indexer(block.items)
                 if (indexer == -1).any():
                     raise AssertionError('Items must contain all block items')
                 result[indexer] = block.get_values(dtype)
                 itemmask[indexer] = 1
+
+            if not itemmask.all():
+                raise AssertionError('Some items were not contained in blocks')
+
         else:
-            for block in self.blocks:
-                mask = items.isin(block.items)
-                indexer = mask.nonzero()[0]
-                if (len(indexer) != len(block.items)):
-                    raise AssertionError('All items must be in block items')
-                result[indexer] = block.get_values(dtype)
-                itemmask[indexer] = 1

-        if not itemmask.all():
-            raise AssertionError('Some items were not contained in blocks')
+            # non-unique, must use ref_locs
+            rl = self._set_ref_locs()
+            for i, (block, idx) in enumerate(rl):
+                result[i] = block.iget(idx)

         return result
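The new ``else`` branch is what the ``values`` release note refers to: when column labels are duplicated, a label-based indexer is ambiguous, so the block manager now walks positional references (``ref_locs``) and pulls each column out of its block by position. A minimal sketch of the user-visible effect, assuming a pandas build with this commit (it mirrors the test added in ``test_frame.py`` below):

```python
import numpy as np
import pandas as pd

# duplicate column labels with mixed dtypes: the int and the float column live
# in separate internal blocks but share the label 'x'
df = pd.DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])

# previously interleaving the blocks raised; with the fix the values are
# assembled by position and upcast to a common float dtype
assert (df.values == np.array([[1.0, 2.5], [3.0, 4.5]])).all()
```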

pandas/io/json.py

+63 -31
@@ -52,19 +52,24 @@ def __init__(self, obj, orient, date_format, double_precision, ensure_ascii):
         self._format_axes()
         self._format_dates()

+    def _needs_to_date(self, obj):
+        return obj.dtype == 'datetime64[ns]'
+
     def _format_dates(self):
         raise NotImplementedError

     def _format_axes(self):
         raise NotImplementedError

-    def _needs_to_date(self, data):
-        return self.date_format == 'iso' and data.dtype == 'datetime64[ns]'
-
     def _format_to_date(self, data):
-        if self._needs_to_date(data):
+
+        # iso
+        if self.date_format == 'iso':
             return data.apply(lambda x: x.isoformat())
-        return data
+
+        # int64
+        else:
+            return data.astype(np.int64)

     def copy_if_needed(self):
         """ copy myself if necessary """
@@ -87,13 +92,11 @@ def _format_axes(self):
             self.obj.index = self._format_to_date(self.obj.index.to_series())

     def _format_dates(self):
-        if self._needs_to_date(self.obj):
-            self.copy_if_needed()
+        if self.obj.dtype == 'datetime64[ns]':
             self.obj = self._format_to_date(self.obj)

     def _format_bools(self):
         if self._needs_to_bool(self.obj):
-            self.copy_if_needed()
             self.obj = self._format_to_bool(self.obj)

 class FrameWriter(Writer):
@@ -123,13 +126,22 @@ def _format_axes(self):
             setattr(self.obj,axis,self._format_to_date(a.to_series()))

     def _format_dates(self):
-        if self.date_format == 'iso':
-            dtypes = self.obj.dtypes
-            dtypes = dtypes[dtypes == 'datetime64[ns]']
-            if len(dtypes):
-                self.copy_if_needed()
-                for c in dtypes.index:
-                    self.obj[c] = self._format_to_date(self.obj[c])
+        dtypes = self.obj.dtypes
+        if len(dtypes[dtypes == 'datetime64[ns]']):
+
+            # need to create a new object
+            d = {}
+
+            for i, (col, c) in enumerate(self.obj.iteritems()):
+
+                if c.dtype == 'datetime64[ns]':
+                    c = self._format_to_date(c)
+
+                d[i] = c
+
+            d = DataFrame(d,index=self.obj.index)
+            d.columns = self.obj.columns
+            self.obj = d

 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
               convert_axes=True, convert_dates=True, keep_default_dates=True,
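The rewritten ``FrameWriter._format_dates`` avoids label-based assignment (``self.obj[c] = ...``), which is ambiguous when a label appears more than once: columns are collected into a dict keyed by position, a new frame is built from it, and the original (possibly duplicated) labels are reattached. A standalone sketch of that pattern; ``transform_columns`` is a hypothetical helper, not pandas API:

```python
import pandas as pd

def transform_columns(df, func):
    """Apply func to every column, keyed by position so duplicate labels survive."""
    new = {}
    for i, (_, col) in enumerate(df.iteritems()):  # df.items() on modern pandas
        new[i] = func(col)
    out = pd.DataFrame(new, index=df.index)
    out.columns = df.columns  # restore the original, possibly duplicated, labels
    return out

df = pd.DataFrame([[1, 2.5], [3, 4.5]], columns=['x', 'x'])
doubled = transform_columns(df, lambda c: c * 2)
```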
@@ -291,14 +303,16 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):
         except:
             pass

-        if data.dtype == 'float':
+        if data.dtype.kind == 'f':

-            # coerce floats to 64
-            try:
-                data = data.astype('float64')
-                result = True
-            except:
-                pass
+            if data.dtype != 'float64':
+
+                # coerce floats to 64
+                try:
+                    data = data.astype('float64')
+                    result = True
+                except:
+                    pass

         # do't coerce 0-len data
         if len(data) and (data.dtype == 'float' or data.dtype == 'object'):
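Checking ``data.dtype.kind == 'f'`` matches any float width rather than only the exact ``'float'`` dtype, and the nested guard skips a pointless ``astype`` when the data is already ``float64``. The same guard in isolation, as a minimal NumPy-only sketch:

```python
import numpy as np

data = np.array([1.5, 2.5], dtype='float32')

# any float kind (float16/32/64) qualifies; only re-cast when not already float64
if data.dtype.kind == 'f' and data.dtype != 'float64':
    data = data.astype('float64')

assert data.dtype == np.float64
```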
@@ -448,14 +462,35 @@ def _parse_no_numpy(self):
         self.obj = DataFrame(
             loads(json, precise_float=self.precise_float), dtype=None)

+    def _process_converter(self, f, filt=None):
+        """ take a conversion function and possibly recreate the frame """
+
+        if filt is None:
+            filt = lambda col, c: True
+
+        needs_new_obj = False
+        new_obj = dict()
+        for i, (col, c) in enumerate(self.obj.iteritems()):
+            if filt(col, c):
+                new_data, result = f(col, c)
+                if result:
+                    c = new_data
+                    needs_new_obj = True
+            new_obj[i] = c
+
+        if needs_new_obj:
+
+            # possibly handle dup columns
+            new_obj = DataFrame(new_obj,index=self.obj.index)
+            new_obj.columns = self.obj.columns
+            self.obj = new_obj
+
     def _try_convert_types(self):
         if self.obj is None: return
         if self.convert_dates:
             self._try_convert_dates()
-        for col in self.obj.columns:
-            new_data, result = self._try_convert_data(col, self.obj[col], convert_dates=False)
-            if result:
-                self.obj[col] = new_data
+
+        self._process_converter(lambda col, c: self._try_convert_data(col, c, convert_dates=False))

     def _try_convert_dates(self):
         if self.obj is None: return
@@ -478,9 +513,6 @@ def is_ok(col):
                 return True
             return False

+        self._process_converter(lambda col, c: self._try_convert_to_date(c),
+                                lambda col, c: (self.keep_default_dates and is_ok(col)) or col in convert_dates)

-        for col in self.obj.columns:
-            if (self.keep_default_dates and is_ok(col)) or col in convert_dates:
-                new_data, result = self._try_convert_to_date(self.obj[col])
-                if result:
-                    self.obj[col] = new_data

pandas/io/tests/test_json/test_pandas.py

+15
@@ -83,6 +83,21 @@ def test_frame_non_unique_columns(self):
         unser = read_json(df.to_json(orient='values'), orient='values')
         np.testing.assert_equal(df.values, unser.values)

+        # GH4377; duplicate columns not processing correctly
+        df = DataFrame([['a','b'],['c','d']], index=[1,2], columns=['x','y'])
+        result = read_json(df.to_json(orient='split'), orient='split')
+        assert_frame_equal(result, df)
+
+        def _check(df):
+            result = read_json(df.to_json(orient='split'), orient='split', convert_dates=['x'])
+            assert_frame_equal(result, df)
+
+        for o in [[['a','b'],['c','d']],
+                  [[1.5,2.5],[3.5,4.5]],
+                  [[1,2.5],[3,4.5]],
+                  [[Timestamp('20130101'),3.5],[Timestamp('20130102'),4.5]]]:
+            _check(DataFrame(o, index=[1,2], columns=['x','x']))
+
     def test_frame_from_json_to_json(self):

         def _check_orient(df, orient, dtype=None, numpy=False, convert_axes=True, check_dtype=True, raise_ok=None):

pandas/tests/test_frame.py

+6
@@ -2950,6 +2950,12 @@ def check(result, expected=None):
         expected = DataFrame([[1],[1],[1]],columns=['bar'])
         check(df,expected)

+        # values
+        df = DataFrame([[1,2.5],[3,4.5]], index=[1,2], columns=['x','x'])
+        result = df.values
+        expected = np.array([[1,2.5],[3,4.5]])
+        self.assert_((result == expected).all().all())
+
     def test_insert_benchmark(self):
         # from the vb_suite/frame_methods/frame_insert_columns
         N = 10
