From bcaa92c293fbaea4b0b196cab574a5a33b83e6db Mon Sep 17 00:00:00 2001 From: Bolke de Bruin Date: Wed, 18 Dec 2019 13:47:12 +0100 Subject: [PATCH] Fix TypeError when pulling field that is None When normalizing a JSON structure, a field can be None because of a schema change. Co-Authored-By: William Ayd --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/io/json/_normalize.py | 16 +++++++++++++--- pandas/tests/io/json/test_normalize.py | 24 ++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 96ea682dd3caf..f2bbf2a31b325 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -832,6 +832,7 @@ I/O - Bug in :func:`read_json` where default encoding was not set to ``utf-8`` (:issue:`29565`) - Bug in :class:`PythonParser` where str and bytes were being mixed when dealing with the decimal field (:issue:`29650`) - :meth:`read_gbq` now accepts ``progress_bar_type`` to display progress bar while the data downloads. (:issue:`29857`) +- Bug in :func:`pandas.io.json.json_normalize` where a missing value in the location specified by ``record_path`` would raise a ``TypeError`` (:issue:`30148`) Plotting ^^^^^^^^ diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index aa14c3f3a63f3..d486e0000332b 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -3,13 +3,14 @@ from collections import defaultdict import copy -from typing import DefaultDict, Dict, List, Optional, Union +from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Union import numpy as np from pandas._libs.writers import convert_json_to_lines from pandas.util._decorators import deprecate +import pandas as pd from pandas import DataFrame @@ -229,14 +230,23 @@ def _json_normalize( Returns normalized data with columns prefixed with the given string. 
""" - def _pull_field(js, spec): - result = js + def _pull_field(js: Dict[str, Any], spec: Union[List, str]) -> Iterable: + result = js # type: ignore if isinstance(spec, list): for field in spec: result = result[field] else: result = result[spec] + if not isinstance(result, Iterable): + if pd.isnull(result): + result = [] # type: ignore + else: + raise TypeError( + f"{js} has non iterable value {result} for path {spec}. " + "Must be iterable or null." + ) + return result if isinstance(data, list) and not data: diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index 038dd2df4d632..46dd1e94aa739 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -462,6 +462,30 @@ def test_nested_flattening_consistent(self): # They should be the same. tm.assert_frame_equal(df1, df2) + def test_nonetype_record_path(self, nulls_fixture): + # see gh-30148 + # should not raise TypeError + result = json_normalize( + [ + {"state": "Texas", "info": nulls_fixture}, + {"state": "Florida", "info": [{"i": 2}]}, + ], + record_path=["info"], + ) + expected = DataFrame({"i": 2}, index=[0]) + tm.assert_equal(result, expected) + + def test_non_interable_record_path_errors(self): + # see gh-30148 + test_input = {"state": "Texas", "info": 1} + test_path = "info" + msg = ( + f"{test_input} has non iterable value 1 for path {test_path}. " + "Must be iterable or null." + ) + with pytest.raises(TypeError, match=msg): + json_normalize([test_input], record_path=[test_path]) + class TestNestedToRecord: def test_flat_stays_flat(self):