Skip to content

REF: Make read_json less stateful #59124

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 2, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 68 additions & 90 deletions pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -969,7 +969,7 @@ def read(self) -> DataFrame | Series:
else:
return obj

def _get_object_parser(self, json) -> DataFrame | Series:
def _get_object_parser(self, json: str) -> DataFrame | Series:
"""
Parses a json document into a pandas object.
"""
Expand All @@ -985,16 +985,14 @@ def _get_object_parser(self, json) -> DataFrame | Series:
"date_unit": self.date_unit,
"dtype_backend": self.dtype_backend,
}
obj = None
if typ == "frame":
obj = FrameParser(json, **kwargs).parse()

if typ == "series" or obj is None:
return FrameParser(json, **kwargs).parse()
elif typ == "series":
if not isinstance(dtype, bool):
kwargs["dtype"] = dtype
obj = SeriesParser(json, **kwargs).parse()

return obj
return SeriesParser(json, **kwargs).parse()
else:
raise ValueError(f"{typ=} must be 'frame' or 'series'.")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should add a bugfix note for this at least to have it visible. Something to the effect of "read_json now raises if the typ argument is neither frame nor series"


def close(self) -> None:
"""
Expand Down Expand Up @@ -1107,7 +1105,6 @@ def __init__(
self.convert_dates = convert_dates
self.date_unit = date_unit
self.keep_default_dates = keep_default_dates
self.obj: DataFrame | Series | None = None
self.dtype_backend = dtype_backend

@final
Expand All @@ -1121,26 +1118,22 @@ def check_keys_split(self, decoded: dict) -> None:
raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}")

@final
def parse(self):
self._parse()
def parse(self) -> DataFrame | Series:
obj = self._parse()

if self.obj is None:
return None
if self.convert_axes:
self._convert_axes()
self._try_convert_types()
return self.obj
obj = self._convert_axes(obj)
obj = self._try_convert_types(obj)
return obj

def _parse(self) -> None:
def _parse(self) -> DataFrame | Series:
raise AbstractMethodError(self)

@final
def _convert_axes(self) -> None:
def _convert_axes(self, obj: DataFrame | Series) -> DataFrame | Series:
"""
Try to convert axes.
"""
obj = self.obj
assert obj is not None # for mypy
for axis_name in obj._AXIS_ORDERS:
ax = obj._get_axis(axis_name)
ser = Series(ax, dtype=ax.dtype, copy=False)
Expand All @@ -1153,9 +1146,10 @@ def _convert_axes(self) -> None:
)
if result:
new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False)
setattr(self.obj, axis_name, new_axis)
setattr(obj, axis_name, new_axis)
return obj

def _try_convert_types(self) -> None:
def _try_convert_types(self, obj):
raise AbstractMethodError(self)

@final
Expand Down Expand Up @@ -1292,42 +1286,31 @@ def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]:
class SeriesParser(Parser):
_default_orient = "index"
_split_keys = ("name", "index", "data")
obj: Series | None

def _parse(self) -> None:
def _parse(self) -> Series:
data = ujson_loads(self.json, precise_float=self.precise_float)

if self.orient == "split":
decoded = {str(k): v for k, v in data.items()}
self.check_keys_split(decoded)
self.obj = Series(**decoded)
return Series(**decoded)
else:
self.obj = Series(data)
return Series(data)

def _try_convert_types(self) -> None:
if self.obj is None:
return
obj, result = self._try_convert_data(
"data", self.obj, convert_dates=self.convert_dates
)
if result:
self.obj = obj
def _try_convert_types(self, obj: Series) -> Series:
obj, _ = self._try_convert_data("data", obj, convert_dates=self.convert_dates)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The existing code is pretty strange, but wouldn't this return something different in the case where the result value was False?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think obj should be the same if result value was False i.e. the conversion was a no-op. I'm relying on that fact here that a no-op result should just be passed through here with _try_convert_data

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah OK - definitely not a fan of relying on the implementation of another function like that...but that can be cleaned up in a separate PR

return obj


class FrameParser(Parser):
_default_orient = "columns"
_split_keys = ("columns", "index", "data")
obj: DataFrame | None

def _parse(self) -> None:
def _parse(self) -> DataFrame:
json = self.json
orient = self.orient

if orient == "columns":
self.obj = DataFrame(
ujson_loads(json, precise_float=self.precise_float), dtype=None
)
elif orient == "split":
if orient == "split":
decoded = {
str(k): v
for k, v in ujson_loads(json, precise_float=self.precise_float).items()
Expand All @@ -1341,34 +1324,34 @@ def _parse(self) -> None:
orig_names,
is_potential_multi_index(orig_names, None),
)
self.obj = DataFrame(dtype=None, **decoded)
return DataFrame(dtype=None, **decoded)
elif orient == "index":
self.obj = DataFrame.from_dict(
return DataFrame.from_dict(
ujson_loads(json, precise_float=self.precise_float),
dtype=None,
orient="index",
)
elif orient == "table":
self.obj = parse_table_schema(json, precise_float=self.precise_float)
return parse_table_schema(json, precise_float=self.precise_float)
else:
self.obj = DataFrame(
# includes orient == "columns"
return DataFrame(
ujson_loads(json, precise_float=self.precise_float), dtype=None
)

@final
def _process_converter(
self,
obj: DataFrame,
f: Callable[[Hashable, Series], tuple[Series, bool]],
filt: Callable[[Hashable], bool] | None = None,
) -> None:
) -> DataFrame:
"""
Take a conversion function and possibly recreate the frame.
"""
if filt is None:
filt = lambda col: True

obj = self.obj
assert obj is not None # for mypy

needs_new_obj = False
new_obj = {}
for i, (col, c) in enumerate(obj.items()):
Expand All @@ -1383,48 +1366,43 @@ def _process_converter(
# possibly handle dup columns
new_frame = DataFrame(new_obj, index=obj.index)
new_frame.columns = obj.columns
self.obj = new_frame
return new_frame
return obj

def _try_convert_types(self) -> None:
if self.obj is None:
return
def _try_convert_types(self, obj: DataFrame) -> DataFrame:
if self.convert_dates:
self._try_convert_dates()

self._process_converter(
lambda col, c: self._try_convert_data(col, c, convert_dates=False)
)

def _try_convert_dates(self) -> None:
if self.obj is None:
return

# our columns to parse
convert_dates_list_bool = self.convert_dates
if isinstance(convert_dates_list_bool, bool):
convert_dates_list_bool = []
convert_dates = set(convert_dates_list_bool)

def is_ok(col) -> bool:
"""
Return if this col is ok to try for a date parse.
"""
if col in convert_dates:
return True
if not self.keep_default_dates:
return False
if not isinstance(col, str):
# our columns to parse
convert_dates_list_bool = self.convert_dates
if isinstance(convert_dates_list_bool, bool):
convert_dates_list_bool = []
convert_dates = set(convert_dates_list_bool)

def is_ok(col) -> bool:
"""
Return if this col is ok to try for a date parse.
"""
if col in convert_dates:
return True
if not self.keep_default_dates:
return False
if not isinstance(col, str):
return False

col_lower = col.lower()
if (
col_lower.endswith(("_at", "_time"))
or col_lower == "modified"
or col_lower == "date"
or col_lower == "datetime"
or col_lower.startswith("timestamp")
):
return True
return False

col_lower = col.lower()
if (
col_lower.endswith(("_at", "_time"))
or col_lower == "modified"
or col_lower == "date"
or col_lower == "datetime"
or col_lower.startswith("timestamp")
):
return True
return False

self._process_converter(lambda col, c: self._try_convert_to_date(c), filt=is_ok)
obj = self._process_converter(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to make this less stateful while maintaining these as separate functions? I find the new code harder to read, especially with the subtle difference in arguments to self._process_converter

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure thing. I just pushed a change to remove _process_converter and make things simpler

obj, lambda col, c: self._try_convert_to_date(c), filt=is_ok
)

return self._process_converter(
obj, lambda col, c: self._try_convert_data(col, c, convert_dates=False)
)
2 changes: 1 addition & 1 deletion pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,7 @@ def test_frame_from_json_precise_float(self):

def test_typ(self):
s = Series(range(6), index=["a", "b", "c", "d", "e", "f"], dtype="int64")
result = read_json(StringIO(s.to_json()), typ=None)
result = read_json(StringIO(s.to_json()), typ="series")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did the tests need to change?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test didn't align with the docs/typing of typ accepting None. I added validation in this PR to ensure this case fails now

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm OK. That's super strange we allowed that to begin with...but hopefully harmless taking it away

tm.assert_series_equal(result, s)

def test_reconstruction_index(self):
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/json/test_readlines.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,11 +165,11 @@ def test_readjson_chunks_series(request, engine):
s = pd.Series({"A": 1, "B": 2})

strio = StringIO(s.to_json(lines=True, orient="records"))
unchunked = read_json(strio, lines=True, typ="Series", engine=engine)
unchunked = read_json(strio, lines=True, typ="series", engine=engine)

strio = StringIO(s.to_json(lines=True, orient="records"))
with read_json(
strio, lines=True, typ="Series", chunksize=1, engine=engine
strio, lines=True, typ="series", chunksize=1, engine=engine
) as reader:
chunked = pd.concat(reader)

Expand Down