ENH: Preserve Series index on json_normalize (#57422)

nworb-cire · web-flow · commit 81e3e0febf25 · 2024-02-14T13:12:55.000-08:00
* Preserve index on json_normalize

* Update unit tests

* Update release notes

* Pass linter

* Set index in constructor

* Use tm assert_index_equal in unit test

* Update docstring and examples

* Update release notes
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -31,6 +31,7 @@ Other enhancements
 - :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
 - :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
 - Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`)
+- Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py
@@ -19,7 +19,10 @@
 from pandas._libs.writers import convert_json_to_lines
 
 import pandas as pd
-from pandas import DataFrame
+from pandas import (
+    DataFrame,
+    Series,
+)
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
@@ -266,7 +269,7 @@ def _simple_json_normalize(
 
 
 def json_normalize(
-    data: dict | list[dict],
+    data: dict | list[dict] | Series,
     record_path: str | list | None = None,
     meta: str | list[str | list[str]] | None = None,
     meta_prefix: str | None = None,
@@ -280,7 +283,7 @@ def json_normalize(
 
     Parameters
     ----------
-    data : dict or list of dicts
+    data : dict, list of dicts, or Series of dicts
         Unserialized JSON objects.
     record_path : str or list of str, default None
         Path in each object to list of records. If not passed, data will be
@@ -365,6 +368,26 @@ def json_normalize(
     1  NaN    Mark Reg             130              60
     2  2.0  Faye Raker             130              60
 
+    >>> data = [
+    ...     {
+    ...         "id": 1,
+    ...         "name": "Cole Volk",
+    ...         "fitness": {"height": 130, "weight": 60},
+    ...     },
+    ...     {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
+    ...     {
+    ...         "id": 2,
+    ...         "name": "Faye Raker",
+    ...         "fitness": {"height": 130, "weight": 60},
+    ...     },
+    ... ]
+    >>> series = pd.Series(data, index=pd.Index(["a", "b", "c"]))
+    >>> pd.json_normalize(series)
+        id        name  fitness.height  fitness.weight
+    a  1.0   Cole Volk             130              60
+    b  NaN    Mark Reg             130              60
+    c  2.0  Faye Raker             130              60
+
     >>> data = [
     ...     {
     ...         "state": "Florida",
@@ -455,6 +478,11 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list:
                 )
         return result
 
+    if isinstance(data, Series):
+        index = data.index
+    else:
+        index = None
+
     if isinstance(data, list) and not data:
         return DataFrame()
     elif isinstance(data, dict):
@@ -477,7 +505,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list:
         and record_prefix is None
         and max_level is None
     ):
-        return DataFrame(_simple_json_normalize(data, sep=sep))
+        return DataFrame(_simple_json_normalize(data, sep=sep), index=index)
 
     if record_path is None:
         if any([isinstance(x, dict) for x in y.values()] for y in data):
@@ -489,7 +517,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list:
             # TODO: handle record value which are lists, at least error
             #       reasonably
             data = nested_to_record(data, sep=sep, max_level=max_level)
-        return DataFrame(data)
+        return DataFrame(data, index=index)
     elif not isinstance(record_path, list):
         record_path = [record_path]
 
@@ -564,4 +592,6 @@ def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
                 values[i] = val
 
         result[k] = values.repeat(lengths)
+    if index is not None:
+        result.index = index.repeat(lengths)
     return result
diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py
@@ -561,6 +561,14 @@ def test_top_column_with_leading_underscore(self):
 
         tm.assert_frame_equal(result, expected)
 
+    def test_series_index(self, state_data):
+        idx = Index([7, 8])
+        series = Series(state_data, index=idx)
+        result = json_normalize(series)
+        tm.assert_index_equal(result.index, idx)
+        result = json_normalize(series, "counties")
+        tm.assert_index_equal(result.index, idx.repeat([3, 2]))
+
 
 class TestNestedToRecord:
     def test_flat_stays_flat(self):
@@ -891,6 +899,7 @@ def test_series_non_zero_index(self):
                 "elements.a": [1.0, np.nan, np.nan],
                 "elements.b": [np.nan, 2.0, np.nan],
                 "elements.c": [np.nan, np.nan, 3.0],
-            }
+            },
+            index=[1, 2, 3],
         )
         tm.assert_frame_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -31,6 +31,7 @@ Other enhancements`
`31`	`31`	- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
`32`	`32`	- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
`33`	`33`	- Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`)
	`34`	+- Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
`34`	`35`	`-`
`35`	`36`
`36`	`37`	`.. ---------------------------------------------------------------------------`