Skip to content

Commit c271d4d

Browse files
WillAyd authored and jreback committed
Fixed issue with read_json and partially missing MI names (pandas-dev#19177)
1 parent eee83e2 commit c271d4d

File tree

4 files changed

+53
-21
lines changed

4 files changed

+53
-21
lines changed

doc/source/io.rst

+4-3
Original file line numberDiff line numberDiff line change
@@ -2228,9 +2228,10 @@ round-trippable manner.
22282228
new_df
22292229
new_df.dtypes
22302230
2231-
Please note that the string `index` is not supported with the round trip
2232-
format, as it is used by default in ``write_json`` to indicate a missing index
2233-
name.
2231+
Please note that the literal string 'index' as the name of an :class:`Index`
2232+
is not round-trippable, nor are any names beginning with 'level_' within a
2233+
:class:`MultiIndex`. These are used by default in :func:`DataFrame.to_json` to
2234+
indicate missing names, and the subsequent read cannot distinguish the intent.
22342235

22352236
.. ipython:: python
22362237

pandas/io/json/json.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -341,12 +341,14 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
341341
342342
Notes
343343
-----
344-
Specific to ``orient='table'``, if a ``DataFrame`` with a literal ``Index``
345-
name of `index` gets written with ``write_json``, the subsequent read
346-
operation will incorrectly set the ``Index`` name to ``None``. This is
347-
because `index` is also used by ``write_json`` to denote a missing
348-
``Index`` name, and the subsequent ``read_json`` operation cannot
349-
distinguish between the two.
344+
Specific to ``orient='table'``, if a :class:`DataFrame` with a literal
345+
:class:`Index` name of `index` gets written with :func:`to_json`, the
346+
subsequent read operation will incorrectly set the :class:`Index` name to
347+
``None``. This is because `index` is also used by :func:`DataFrame.to_json`
348+
to denote a missing :class:`Index` name, and the subsequent
349+
:func:`read_json` operation cannot distinguish between the two. The same
350+
limitation is encountered with a :class:`MultiIndex` and any names
351+
beginning with 'level_'.
350352
351353
See Also
352354
--------

pandas/io/json/table_schema.py

+20-8
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
44
http://specs.frictionlessdata.io/json-table-schema/
55
"""
6+
import warnings
7+
68
import pandas._libs.json as json
79
from pandas import DataFrame
810
from pandas.api.types import CategoricalDtype
@@ -68,6 +70,12 @@ def as_json_table_type(x):
6870
def set_default_names(data):
6971
"""Sets index names to 'index' for regular, or 'level_x' for Multi"""
7072
if _all_not_none(*data.index.names):
73+
nms = data.index.names
74+
if len(nms) == 1 and data.index.name == 'index':
75+
warnings.warn("Index name of 'index' is not round-trippable")
76+
elif len(nms) > 1 and any(x.startswith('level_') for x in nms):
77+
warnings.warn("Index names beginning with 'level_' are not "
78+
"round-trippable")
7179
return data
7280

7381
data = data.copy()
@@ -273,10 +281,13 @@ def parse_table_schema(json, precise_float):
273281
274282
Notes
275283
-----
276-
Because ``write_json`` uses the string `index` to denote a name-less
277-
``Index``, this function sets the name of the returned ``DataFrame`` to
278-
``None`` when said string is encountered. Therefore, intentional usage
279-
of `index` as the ``Index`` name is not supported.
284+
Because :func:`DataFrame.to_json` uses the string 'index' to denote a
285+
name-less :class:`Index`, this function sets the name of the returned
286+
:class:`DataFrame` to ``None`` when said string is encountered with a
287+
normal :class:`Index`. For a :class:`MultiIndex`, the same limitation
288+
applies to any strings beginning with 'level_'. Therefore, an
289+
:class:`Index` name of 'index' and :class:`MultiIndex` names starting
290+
with 'level_' are not supported.
280291
281292
See also
282293
--------
@@ -303,10 +314,11 @@ def parse_table_schema(json, precise_float):
303314
df = df.astype(dtypes)
304315

305316
df = df.set_index(table['schema']['primaryKey'])
306-
if len(df.index.names) == 1 and df.index.name == 'index':
307-
df.index.name = None
317+
if len(df.index.names) == 1:
318+
if df.index.name == 'index':
319+
df.index.name = None
308320
else:
309-
if all(x.startswith('level_') for x in df.index.names):
310-
df.index.names = [None] * len(df.index.names)
321+
df.index.names = [None if x.startswith('level_') else x for x in
322+
df.index.names]
311323

312324
return df

pandas/tests/io/json/test_json_table_schema.py

+21-4
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,20 @@ def test_set_names_unset(self, idx, nm, prop):
451451
result = set_default_names(data)
452452
assert getattr(result.index, prop) == nm
453453

454+
@pytest.mark.parametrize("idx", [
455+
pd.Index([], name='index'),
456+
pd.MultiIndex.from_arrays([['foo'], ['bar']],
457+
names=('level_0', 'level_1')),
458+
pd.MultiIndex.from_arrays([['foo'], ['bar']],
459+
names=('foo', 'level_1'))
460+
])
461+
def test_warns_non_roundtrippable_names(self, idx):
462+
# GH 19130
463+
df = pd.DataFrame([[]], index=idx)
464+
df.index.name = 'index'
465+
with tm.assert_produces_warning():
466+
set_default_names(df)
467+
454468
def test_timestamp_in_columns(self):
455469
df = pd.DataFrame([[1, 2]], columns=[pd.Timestamp('2016'),
456470
pd.Timedelta(10, unit='s')])
@@ -481,7 +495,8 @@ def test_mi_falsey_name(self):
481495
class TestTableOrientReader(object):
482496

483497
@pytest.mark.parametrize("index_nm", [
484-
None, "idx", pytest.param("index", marks=pytest.mark.xfail)])
498+
None, "idx", pytest.param("index", marks=pytest.mark.xfail),
499+
'level_0'])
485500
@pytest.mark.parametrize("vals", [
486501
{'ints': [1, 2, 3, 4]},
487502
{'objects': ['a', 'b', 'c', 'd']},
@@ -492,7 +507,7 @@ class TestTableOrientReader(object):
492507
pytest.param({'floats': [1., 2., 3., 4.]}, marks=pytest.mark.xfail),
493508
{'floats': [1.1, 2.2, 3.3, 4.4]},
494509
{'bools': [True, False, False, True]}])
495-
def test_read_json_table_orient(self, index_nm, vals):
510+
def test_read_json_table_orient(self, index_nm, vals, recwarn):
496511
df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
497512
out = df.to_json(orient="table")
498513
result = pd.read_json(out, orient="table")
@@ -504,7 +519,7 @@ def test_read_json_table_orient(self, index_nm, vals):
504519
{'timedeltas': pd.timedelta_range('1H', periods=4, freq='T')},
505520
{'timezones': pd.date_range('2016-01-01', freq='d', periods=4,
506521
tz='US/Central')}])
507-
def test_read_json_table_orient_raises(self, index_nm, vals):
522+
def test_read_json_table_orient_raises(self, index_nm, vals, recwarn):
508523
df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
509524
out = df.to_json(orient="table")
510525
with tm.assert_raises_regex(NotImplementedError, 'can not yet read '):
@@ -530,7 +545,9 @@ def test_comprehensive(self):
530545
result = pd.read_json(out, orient="table")
531546
tm.assert_frame_equal(df, result)
532547

533-
@pytest.mark.parametrize("index_names", [[None, None], ['foo', 'bar']])
548+
@pytest.mark.parametrize("index_names", [
549+
[None, None], ['foo', 'bar'], ['foo', None], [None, 'foo'],
550+
['index', 'foo']])
534551
def test_multiindex(self, index_names):
535552
# GH 18912
536553
df = pd.DataFrame(

0 commit comments

Comments
 (0)