Skip to content

REF: Make read_json less stateful #59124

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 2, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 82 additions & 127 deletions pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -969,7 +969,7 @@ def read(self) -> DataFrame | Series:
else:
return obj

def _get_object_parser(self, json) -> DataFrame | Series:
def _get_object_parser(self, json: str) -> DataFrame | Series:
"""
Parses a json document into a pandas object.
"""
Expand All @@ -985,16 +985,14 @@ def _get_object_parser(self, json) -> DataFrame | Series:
"date_unit": self.date_unit,
"dtype_backend": self.dtype_backend,
}
obj = None
if typ == "frame":
obj = FrameParser(json, **kwargs).parse()

if typ == "series" or obj is None:
return FrameParser(json, **kwargs).parse()
elif typ == "series":
if not isinstance(dtype, bool):
kwargs["dtype"] = dtype
obj = SeriesParser(json, **kwargs).parse()

return obj
return SeriesParser(json, **kwargs).parse()
else:
raise ValueError(f"{typ=} must be 'frame' or 'series'.")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should add a bugfix note for this at least to have it visible. Something to the effect of "read_json now raises if the typ argument is neither frame nor series"


def close(self) -> None:
"""
Expand Down Expand Up @@ -1107,7 +1105,6 @@ def __init__(
self.convert_dates = convert_dates
self.date_unit = date_unit
self.keep_default_dates = keep_default_dates
self.obj: DataFrame | Series | None = None
self.dtype_backend = dtype_backend

@final
Expand All @@ -1121,26 +1118,22 @@ def check_keys_split(self, decoded: dict) -> None:
raise ValueError(f"JSON data had unexpected key(s): {bad_keys_joined}")

@final
def parse(self):
self._parse()
def parse(self) -> DataFrame | Series:
obj = self._parse()

if self.obj is None:
return None
if self.convert_axes:
self._convert_axes()
self._try_convert_types()
return self.obj
obj = self._convert_axes(obj)
obj = self._try_convert_types(obj)
return obj

def _parse(self) -> None:
def _parse(self) -> DataFrame | Series:
raise AbstractMethodError(self)

@final
def _convert_axes(self) -> None:
def _convert_axes(self, obj: DataFrame | Series) -> DataFrame | Series:
"""
Try to convert axes.
"""
obj = self.obj
assert obj is not None # for mypy
for axis_name in obj._AXIS_ORDERS:
ax = obj._get_axis(axis_name)
ser = Series(ax, dtype=ax.dtype, copy=False)
Expand All @@ -1153,9 +1146,10 @@ def _convert_axes(self) -> None:
)
if result:
new_axis = Index(new_ser, dtype=new_ser.dtype, copy=False)
setattr(self.obj, axis_name, new_axis)
setattr(obj, axis_name, new_axis)
return obj

def _try_convert_types(self) -> None:
def _try_convert_types(self, obj):
raise AbstractMethodError(self)

@final
Expand All @@ -1182,8 +1176,10 @@ def _try_convert_data(

elif self.dtype is True:
pass
else:
# dtype to force
elif not _should_convert_dates(
convert_dates, self.keep_default_dates, name
):
# convert_dates takes precedence over columns listed in dtypes
dtype = (
self.dtype.get(name) if isinstance(self.dtype, dict) else self.dtype
)
Expand All @@ -1194,8 +1190,8 @@ def _try_convert_data(
return data, False

if convert_dates:
new_data, result = self._try_convert_to_date(data)
if result:
new_data = self._try_convert_to_date(data)
if new_data is not data:
return new_data, True

converted = False
Expand Down Expand Up @@ -1245,16 +1241,16 @@ def _try_convert_data(
return data, converted

@final
def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]:
def _try_convert_to_date(self, data: Series) -> Series:
"""
Try to parse a ndarray like into a date column.

Try to coerce object in epoch/iso formats and integer/float in epoch
formats. Return a boolean if parsing was successful.
formats.
"""
# no conversion on empty
if not len(data):
return data, False
return data

new_data = data

Expand All @@ -1265,7 +1261,7 @@ def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]:
try:
new_data = data.astype("int64")
except OverflowError:
return data, False
return data
except (TypeError, ValueError):
pass

Expand All @@ -1277,57 +1273,45 @@ def _try_convert_to_date(self, data: Series) -> tuple[Series, bool]:
| (new_data._values == iNaT)
)
if not in_range.all():
return data, False
return data

date_units = (self.date_unit,) if self.date_unit else self._STAMP_UNITS
for date_unit in date_units:
try:
new_data = to_datetime(new_data, errors="raise", unit=date_unit)
return to_datetime(new_data, errors="raise", unit=date_unit)
except (ValueError, OverflowError, TypeError):
continue
return new_data, True
return data, False
return data


class SeriesParser(Parser):
_default_orient = "index"
_split_keys = ("name", "index", "data")
obj: Series | None

def _parse(self) -> None:
def _parse(self) -> Series:
data = ujson_loads(self.json, precise_float=self.precise_float)

if self.orient == "split":
decoded = {str(k): v for k, v in data.items()}
self.check_keys_split(decoded)
self.obj = Series(**decoded)
return Series(**decoded)
else:
self.obj = Series(data)
return Series(data)

def _try_convert_types(self) -> None:
if self.obj is None:
return
obj, result = self._try_convert_data(
"data", self.obj, convert_dates=self.convert_dates
)
if result:
self.obj = obj
def _try_convert_types(self, obj: Series) -> Series:
obj, _ = self._try_convert_data("data", obj, convert_dates=self.convert_dates)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The existing code is pretty strange, but wouldn't this return something different in the case where the result value was False?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think obj should be the same if result value was False i.e. the conversion was a no-op. I'm relying on that fact here that a no-op result should just be passed through here with _try_convert_data

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah OK - definitely not a fan of relying on the implementation of another function like that...but that can be cleaned up in a separate PR

return obj


class FrameParser(Parser):
_default_orient = "columns"
_split_keys = ("columns", "index", "data")
obj: DataFrame | None

def _parse(self) -> None:
def _parse(self) -> DataFrame:
json = self.json
orient = self.orient

if orient == "columns":
self.obj = DataFrame(
ujson_loads(json, precise_float=self.precise_float), dtype=None
)
elif orient == "split":
if orient == "split":
decoded = {
str(k): v
for k, v in ujson_loads(json, precise_float=self.precise_float).items()
Expand All @@ -1341,90 +1325,61 @@ def _parse(self) -> None:
orig_names,
is_potential_multi_index(orig_names, None),
)
self.obj = DataFrame(dtype=None, **decoded)
return DataFrame(dtype=None, **decoded)
elif orient == "index":
self.obj = DataFrame.from_dict(
return DataFrame.from_dict(
ujson_loads(json, precise_float=self.precise_float),
dtype=None,
orient="index",
)
elif orient == "table":
self.obj = parse_table_schema(json, precise_float=self.precise_float)
return parse_table_schema(json, precise_float=self.precise_float)
else:
self.obj = DataFrame(
# includes orient == "columns"
return DataFrame(
ujson_loads(json, precise_float=self.precise_float), dtype=None
)

def _process_converter(
self,
f: Callable[[Hashable, Series], tuple[Series, bool]],
filt: Callable[[Hashable], bool] | None = None,
) -> None:
"""
Take a conversion function and possibly recreate the frame.
"""
if filt is None:
filt = lambda col: True

obj = self.obj
assert obj is not None # for mypy

needs_new_obj = False
new_obj = {}
for i, (col, c) in enumerate(obj.items()):
if filt(col):
new_data, result = f(col, c)
if result:
c = new_data
needs_new_obj = True
new_obj[i] = c

if needs_new_obj:
# possibly handle dup columns
new_frame = DataFrame(new_obj, index=obj.index)
new_frame.columns = obj.columns
self.obj = new_frame

def _try_convert_types(self) -> None:
if self.obj is None:
return
if self.convert_dates:
self._try_convert_dates()

self._process_converter(
lambda col, c: self._try_convert_data(col, c, convert_dates=False)
def _try_convert_types(self, obj: DataFrame) -> DataFrame:
    """
    Try to convert each column of the parsed frame to a more specific dtype.

    Per-column conversion is delegated to ``_try_convert_data``; whether a
    column is additionally tried as a datetime is decided up front by
    ``_should_convert_dates`` from ``self.convert_dates`` and
    ``self.keep_default_dates``.
    """
    arrays = []
    for col_label, series in obj.items():
        # the second element of the returned tuple (a "was converted"
        # flag) is not needed here — the result is used either way
        result, _ = self._try_convert_data(
            col_label,
            series,
            convert_dates=_should_convert_dates(
                self.convert_dates,
                keep_default_dates=self.keep_default_dates,
                col=col_label,
            ),
        )
        arrays.append(result.array)
    # rebuild the frame from the (possibly converted) column arrays,
    # keeping the original column labels and index
    return DataFrame._from_arrays(
        arrays, obj.columns, obj.index, verify_integrity=False
    )

def _try_convert_dates(self) -> None:
if self.obj is None:
return

# our columns to parse
convert_dates_list_bool = self.convert_dates
if isinstance(convert_dates_list_bool, bool):
convert_dates_list_bool = []
convert_dates = set(convert_dates_list_bool)

def is_ok(col) -> bool:
"""
Return if this col is ok to try for a date parse.
"""
if col in convert_dates:
return True
if not self.keep_default_dates:
return False
if not isinstance(col, str):
return False

col_lower = col.lower()
if (
col_lower.endswith(("_at", "_time"))
or col_lower == "modified"
or col_lower == "date"
or col_lower == "datetime"
or col_lower.startswith("timestamp")
):
return True
return False

self._process_converter(lambda col, c: self._try_convert_to_date(c), filt=is_ok)
def _should_convert_dates(
convert_dates: bool | list[str],
keep_default_dates: bool,
col: Hashable,
) -> bool:
"""
Return bool whether a DataFrame column should be cast to datetime.
"""
if convert_dates is False:
# convert_dates=True means follow keep_default_dates
return False
elif not isinstance(convert_dates, bool) and col in set(convert_dates):
return True
elif not keep_default_dates:
return False
elif not isinstance(col, str):
return False
col_lower = col.lower()
if (
col_lower.endswith(("_at", "_time"))
or col_lower in {"modified", "date", "datetime"}
or col_lower.startswith("timestamp")
):
return True
return False
2 changes: 1 addition & 1 deletion pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,7 @@ def test_frame_from_json_precise_float(self):

def test_typ(self):
s = Series(range(6), index=["a", "b", "c", "d", "e", "f"], dtype="int64")
result = read_json(StringIO(s.to_json()), typ=None)
result = read_json(StringIO(s.to_json()), typ="series")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did the tests need to change?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test didn't align with the docs/typing of typ accepting None. I added validation in this PR to ensure this case fails now

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm OK. That's super strange we allowed that to begin with...but hopefully harmless taking it away

tm.assert_series_equal(result, s)

def test_reconstruction_index(self):
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/json/test_readlines.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,11 +165,11 @@ def test_readjson_chunks_series(request, engine):
s = pd.Series({"A": 1, "B": 2})

strio = StringIO(s.to_json(lines=True, orient="records"))
unchunked = read_json(strio, lines=True, typ="Series", engine=engine)
unchunked = read_json(strio, lines=True, typ="series", engine=engine)

strio = StringIO(s.to_json(lines=True, orient="records"))
with read_json(
strio, lines=True, typ="Series", chunksize=1, engine=engine
strio, lines=True, typ="series", chunksize=1, engine=engine
) as reader:
chunked = pd.concat(reader)

Expand Down