Skip to content

Commit 52bdfdc

Browse files
BUG: Fix pd.json_normalize to not skip the first element of a generator input (#38698)
1 parent 94810d1 commit 52bdfdc

File tree

3 files changed

+39
-4
lines changed

3 files changed

+39
-4
lines changed

doc/source/whatsnew/v1.3.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,8 @@ I/O
264264
- Allow custom error values for parse_dates argument of :func:`read_sql`, :func:`read_sql_query` and :func:`read_sql_table` (:issue:`35185`)
265265
- Bug in :func:`to_hdf` raising ``KeyError`` when called
266266
for subclasses of ``DataFrame`` or ``Series`` (:issue:`33748`).
267+
- Bug in :func:`json_normalize` resulting in the first element of a generator object not being included in the returned ``DataFrame`` (:issue:`35923`)
268+
267269

268270
Period
269271
^^^^^^

pandas/io/json/_normalize.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# ---------------------------------------------------------------------
22
# JSON normalization routines
33

4-
from collections import defaultdict
4+
from collections import abc, defaultdict
55
import copy
66
from typing import Any, DefaultDict, Dict, Iterable, List, Optional, Union
77

@@ -261,10 +261,15 @@ def _pull_records(js: Dict[str, Any], spec: Union[List, str]) -> List:
261261

262262
if isinstance(data, list) and not data:
263263
return DataFrame()
264-
265-
# A bit of a hackjob
266-
if isinstance(data, dict):
264+
elif isinstance(data, dict):
265+
# A bit of a hackjob
267266
data = [data]
267+
elif isinstance(data, abc.Iterable) and not isinstance(data, str):
268+
# GH35923 Fix pd.json_normalize to not skip the first element of a
269+
# generator input
270+
data = list(data)
271+
else:
272+
raise NotImplementedError
268273

269274
if record_path is None:
270275
if any([isinstance(x, dict) for x in y.values()] for y in data):

pandas/tests/io/json/test_normalize.py

+28
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from contextlib import nullcontext as does_not_raise
12
import json
23

34
import numpy as np
@@ -168,6 +169,22 @@ def test_empty_array(self):
168169
expected = DataFrame()
169170
tm.assert_frame_equal(result, expected)
170171

172+
@pytest.mark.parametrize(
173+
"data, record_path, error",
174+
[
175+
([{"a": 0}, {"a": 1}], None, does_not_raise()),
176+
({"a": [{"a": 0}, {"a": 1}]}, "a", does_not_raise()),
177+
('{"a": [{"a": 0}, {"a": 1}]}', None, pytest.raises(NotImplementedError)),
178+
(None, None, pytest.raises(NotImplementedError)),
179+
],
180+
)
181+
def test_accepted_input(self, data, record_path, error):
182+
with error:
183+
result = json_normalize(data, record_path=record_path)
184+
expected = DataFrame([0, 1], columns=["a"])
185+
186+
tm.assert_frame_equal(result, expected)
187+
171188
def test_simple_normalize_with_separator(self, deep_nested):
172189
# GH 14883
173190
result = json_normalize({"A": {"A": 1, "B": 2}})
@@ -518,6 +535,17 @@ def test_meta_non_iterable(self):
518535
)
519536
tm.assert_frame_equal(result, expected)
520537

538+
def test_generator(self, state_data):
539+
# GH35923 Fix pd.json_normalize to not skip the first element of a
540+
# generator input
541+
def generator_data():
542+
yield from state_data[0]["counties"]
543+
544+
result = json_normalize(generator_data())
545+
expected = DataFrame(state_data[0]["counties"])
546+
547+
tm.assert_frame_equal(result, expected)
548+
521549

522550
class TestNestedToRecord:
523551
def test_flat_stays_flat(self):

0 commit comments

Comments
 (0)