
Commit 03b4997

Merge pull request #4388 from jreback/GH4377
BUG: fixes for GH4377
2 parents: 8f387f6 + 40d08df

File tree

6 files changed, +97 -43 lines


doc/source/release.rst

+4
@@ -83,6 +83,10 @@ pandas 0.13
 - In ``to_json``, raise if a passed ``orient`` would cause loss of data because
   of a duplicate index (:issue:`4359`)
 - Fixed passing ``keep_default_na=False`` when ``na_values=None`` (:issue:`4318`)
+- Fixed bug with ``values`` raising an error on a DataFrame with duplicate columns and mixed
+  dtypes, surfaced in (:issue:`4377`)
+- Fixed bug with duplicate columns and type conversion in ``read_json`` when
+  ``orient='split'`` (:issue:`4377`)

 pandas 0.12
 ===========
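The ``read_json`` entry is the headline of GH4377: a frame whose column labels repeat now survives a ``to_json``/``read_json`` round trip with ``orient='split'``. A minimal sketch of the fixed behaviour, assuming a pandas build that includes this commit (it mirrors the test added below in ``test_pandas.py``):

```python
import pandas as pd
from pandas.util.testing import assert_frame_equal  # pandas.testing on modern versions

# duplicate column labels with mixed dtypes, as in the new test cases
df = pd.DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])

# round-trip through orient='split'; with the fix the frame comes back intact
result = pd.read_json(df.to_json(orient='split'), orient='split', convert_dates=['x'])
assert_frame_equal(result, df)
```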

doc/source/v0.13.0.txt

-3
@@ -61,9 +61,6 @@ Bug Fixes
 - Fixed bug where ``network`` testing was throwing ``NameError`` because a
   local variable was undefined (:issue:`4381`)

-- In ``to_json``, raise if a passed ``orient`` would cause loss of data because
-  of a duplicate index (:issue:`4359`)
-
 - Suppressed DeprecationWarning associated with internal calls issued by repr() (:issue:`4391`)

 See the :ref:`full release notes

pandas/core/internals.py

+9 -9
@@ -1538,23 +1538,23 @@ def _interleave(self, items):
         # By construction, all of the item should be covered by one of the
         # blocks
         if items.is_unique:
+
             for block in self.blocks:
                 indexer = items.get_indexer(block.items)
                 if (indexer == -1).any():
                     raise AssertionError('Items must contain all block items')
                 result[indexer] = block.get_values(dtype)
                 itemmask[indexer] = 1
+
+            if not itemmask.all():
+                raise AssertionError('Some items were not contained in blocks')
+
         else:
-            for block in self.blocks:
-                mask = items.isin(block.items)
-                indexer = mask.nonzero()[0]
-                if (len(indexer) != len(block.items)):
-                    raise AssertionError('All items must be in block items')
-                result[indexer] = block.get_values(dtype)
-                itemmask[indexer] = 1

-        if not itemmask.all():
-            raise AssertionError('Some items were not contained in blocks')
+            # non-unique, must use ref_locs
+            rl = self._set_ref_locs()
+            for i, (block, idx) in enumerate(rl):
+                result[i] = block.iget(idx)

         return result
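The new ``else`` branch is what the ``values`` release note refers to: when column labels are duplicated, a label-based indexer is ambiguous, so the block manager now walks positional references (``ref_locs``) and pulls each column out of its block by position. A minimal sketch of the user-visible effect, assuming a pandas build with this commit (it mirrors the test added in ``test_frame.py`` below):

```python
import numpy as np
import pandas as pd

# duplicate column labels with mixed dtypes: the int and the float column live
# in separate internal blocks but share the label 'x'
df = pd.DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])

# previously interleaving the blocks raised; with the fix the values are
# assembled by position and upcast to a common float dtype
assert (df.values == np.array([[1.0, 2.5], [3.0, 4.5]])).all()
```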

pandas/io/json.py

+63 -31
@@ -52,19 +52,24 @@ def __init__(self, obj, orient, date_format, double_precision, ensure_ascii):
         self._format_axes()
         self._format_dates()

+    def _needs_to_date(self, obj):
+        return obj.dtype == 'datetime64[ns]'
+
     def _format_dates(self):
         raise NotImplementedError

     def _format_axes(self):
         raise NotImplementedError

-    def _needs_to_date(self, data):
-        return self.date_format == 'iso' and data.dtype == 'datetime64[ns]'
-
     def _format_to_date(self, data):
-        if self._needs_to_date(data):
+
+        # iso
+        if self.date_format == 'iso':
             return data.apply(lambda x: x.isoformat())
-        return data
+
+        # int64
+        else:
+            return data.astype(np.int64)

     def copy_if_needed(self):
         """ copy myself if necessary """
@@ -87,13 +92,11 @@ def _format_axes(self):
             self.obj.index = self._format_to_date(self.obj.index.to_series())

     def _format_dates(self):
-        if self._needs_to_date(self.obj):
-            self.copy_if_needed()
+        if self.obj.dtype == 'datetime64[ns]':
             self.obj = self._format_to_date(self.obj)

     def _format_bools(self):
         if self._needs_to_bool(self.obj):
-            self.copy_if_needed()
             self.obj = self._format_to_bool(self.obj)

 class FrameWriter(Writer):
@@ -123,13 +126,22 @@ def _format_axes(self):
             setattr(self.obj,axis,self._format_to_date(a.to_series()))

     def _format_dates(self):
-        if self.date_format == 'iso':
-            dtypes = self.obj.dtypes
-            dtypes = dtypes[dtypes == 'datetime64[ns]']
-            if len(dtypes):
-                self.copy_if_needed()
-                for c in dtypes.index:
-                    self.obj[c] = self._format_to_date(self.obj[c])
+        dtypes = self.obj.dtypes
+        if len(dtypes[dtypes == 'datetime64[ns]']):
+
+            # need to create a new object
+            d = {}
+
+            for i, (col, c) in enumerate(self.obj.iteritems()):
+
+                if c.dtype == 'datetime64[ns]':
+                    c = self._format_to_date(c)
+
+                d[i] = c
+
+            d = DataFrame(d,index=self.obj.index)
+            d.columns = self.obj.columns
+            self.obj = d

 def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
               convert_axes=True, convert_dates=True, keep_default_dates=True,
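The rewritten ``FrameWriter._format_dates`` avoids label-based assignment (``self.obj[c] = ...``), which is ambiguous when a label appears more than once: columns are collected into a dict keyed by position, a new frame is built from it, and the original (possibly duplicated) labels are reattached. A standalone sketch of that pattern; ``transform_columns`` is a hypothetical helper, not pandas API:

```python
import pandas as pd

def transform_columns(df, func):
    """Apply func to every column, keyed by position so duplicate labels survive."""
    new = {}
    for i, (_, col) in enumerate(df.iteritems()):  # df.items() on modern pandas
        new[i] = func(col)
    out = pd.DataFrame(new, index=df.index)
    out.columns = df.columns  # restore the original, possibly duplicated, labels
    return out

df = pd.DataFrame([[1, 2.5], [3, 4.5]], columns=['x', 'x'])
doubled = transform_columns(df, lambda c: c * 2)
```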
@@ -291,14 +303,16 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True):
         except:
             pass

-        if data.dtype == 'float':
+        if data.dtype.kind == 'f':

-            # coerce floats to 64
-            try:
-                data = data.astype('float64')
-                result = True
-            except:
-                pass
+            if data.dtype != 'float64':
+
+                # coerce floats to 64
+                try:
+                    data = data.astype('float64')
+                    result = True
+                except:
+                    pass

         # do't coerce 0-len data
         if len(data) and (data.dtype == 'float' or data.dtype == 'object'):
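Checking ``data.dtype.kind == 'f'`` matches any float width rather than only the exact ``'float'`` dtype, and the nested guard skips a pointless ``astype`` when the data is already ``float64``. The same guard in isolation, as a minimal NumPy-only sketch:

```python
import numpy as np

data = np.array([1.5, 2.5], dtype='float32')

# any float kind (float16/32/64) qualifies; only re-cast when not already float64
if data.dtype.kind == 'f' and data.dtype != 'float64':
    data = data.astype('float64')

assert data.dtype == np.float64
```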
@@ -448,14 +462,35 @@ def _parse_no_numpy(self):
         self.obj = DataFrame(
             loads(json, precise_float=self.precise_float), dtype=None)

+    def _process_converter(self, f, filt=None):
+        """ take a conversion function and possibly recreate the frame """
+
+        if filt is None:
+            filt = lambda col, c: True
+
+        needs_new_obj = False
+        new_obj = dict()
+        for i, (col, c) in enumerate(self.obj.iteritems()):
+            if filt(col, c):
+                new_data, result = f(col, c)
+                if result:
+                    c = new_data
+                    needs_new_obj = True
+            new_obj[i] = c
+
+        if needs_new_obj:
+
+            # possibly handle dup columns
+            new_obj = DataFrame(new_obj,index=self.obj.index)
+            new_obj.columns = self.obj.columns
+            self.obj = new_obj
+
     def _try_convert_types(self):
         if self.obj is None: return
         if self.convert_dates:
             self._try_convert_dates()
-        for col in self.obj.columns:
-            new_data, result = self._try_convert_data(col, self.obj[col], convert_dates=False)
-            if result:
-                self.obj[col] = new_data
+
+        self._process_converter(lambda col, c: self._try_convert_data(col, c, convert_dates=False))

     def _try_convert_dates(self):
         if self.obj is None: return
@@ -478,9 +513,6 @@ def is_ok(col):
                 return True
             return False

+        self._process_converter(lambda col, c: self._try_convert_to_date(c),
+                                lambda col, c: (self.keep_default_dates and is_ok(col)) or col in convert_dates)

-        for col in self.obj.columns:
-            if (self.keep_default_dates and is_ok(col)) or col in convert_dates:
-                new_data, result = self._try_convert_to_date(self.obj[col])
-                if result:
-                    self.obj[col] = new_data

pandas/io/tests/test_json/test_pandas.py

+15
@@ -83,6 +83,21 @@ def test_frame_non_unique_columns(self):
         unser = read_json(df.to_json(orient='values'), orient='values')
         np.testing.assert_equal(df.values, unser.values)

+        # GH4377; duplicate columns not processing correctly
+        df = DataFrame([['a','b'],['c','d']], index=[1,2], columns=['x','y'])
+        result = read_json(df.to_json(orient='split'), orient='split')
+        assert_frame_equal(result, df)
+
+        def _check(df):
+            result = read_json(df.to_json(orient='split'), orient='split', convert_dates=['x'])
+            assert_frame_equal(result, df)
+
+        for o in [[['a','b'],['c','d']],
+                  [[1.5,2.5],[3.5,4.5]],
+                  [[1,2.5],[3,4.5]],
+                  [[Timestamp('20130101'),3.5],[Timestamp('20130102'),4.5]]]:
+            _check(DataFrame(o, index=[1,2], columns=['x','x']))
+
     def test_frame_from_json_to_json(self):

         def _check_orient(df, orient, dtype=None, numpy=False, convert_axes=True, check_dtype=True, raise_ok=None):

pandas/tests/test_frame.py

+6
@@ -2950,6 +2950,12 @@ def check(result, expected=None):
         expected = DataFrame([[1],[1],[1]],columns=['bar'])
         check(df,expected)

+        # values
+        df = DataFrame([[1,2.5],[3,4.5]], index=[1,2], columns=['x','x'])
+        result = df.values
+        expected = np.array([[1,2.5],[3,4.5]])
+        self.assert_((result == expected).all().all())
+
     def test_insert_benchmark(self):
         # from the vb_suite/frame_methods/frame_insert_columns
         N = 10
