From 1e75718125fcaae3fb81266a9772c36901ed618b Mon Sep 17 00:00:00 2001 From: Robin Raymond Date: Fri, 30 Jul 2021 16:38:20 +0200 Subject: [PATCH 1/5] Fix dtypes for read_json --- pandas/io/json/_json.py | 9 +-------- pandas/tests/io/json/test_pandas.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index fdeda868fdb5e..112280016a330 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -875,11 +875,8 @@ def check_keys_split(self, decoded): def parse(self): - # try numpy - numpy = self.numpy - if numpy: + if self.numpy: self._parse_numpy() - else: self._parse_no_numpy() @@ -940,10 +937,6 @@ def _try_convert_data(self, name, data, use_dtypes=True, convert_dates=True): ) if dtype is not None: try: - # error: Argument 1 to "dtype" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; - # expected "Type[Any]" - dtype = np.dtype(dtype) # type: ignore[arg-type] return data.astype(dtype), True except (TypeError, ValueError): return data, False diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index d97ba8694818b..22c2307ca928d 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1389,6 +1389,34 @@ def test_from_json_to_json_table_dtypes(self): result = read_json(dfjson, orient="table") tm.assert_frame_equal(result, expected) + def test_to_json_from_json_columns_dtypes(self): + expected = DataFrame.from_dict( + { + "Integer": pd.Series([1, 2, 3], dtype="int64"), + "Float": pd.Series([None, 2.0, 3.0], dtype="float64"), + "Object": pd.Series([None, "", "c"], dtype="object"), + "Bool": pd.Series([True, False, True], dtype="bool"), + "Category": pd.Series(["a", "b", None], dtype="category"), + "Datetime": pd.Series( + ["2020-01-01", None, "2020-01-03"], dtype="datetime64[ns]" + ), + } + ) + dfjson = expected.to_json(orient="columns") + result = read_json( + dfjson, + orient="columns", + dtype={ + "Integer": "int64", + "Float": "float64", + "Object": "object", + "Bool": "bool", + "Category": "category", + "Datetime": "datetime64[ns]", + }, + ) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("dtype", [True, {"b": int, "c": int}]) def test_read_json_table_dtype_raises(self, dtype): # GH21345 From 44a3935f327f2ceb2c2370c8d7da8adc427f8c47 Mon Sep 17 00:00:00 2001 From: Robin Raymond Date: Sat, 31 Jul 2021 12:36:39 +0200 Subject: [PATCH 2/5] Address comments --- pandas/tests/io/json/test_pandas.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 22c2307ca928d..974832fe37387 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1389,7 +1389,9 @@ def test_from_json_to_json_table_dtypes(self): result = read_json(dfjson, orient="table") tm.assert_frame_equal(result, expected) - def test_to_json_from_json_columns_dtypes(self): + @pytest.mark.parametrize("orient", ["split", "records", "index", "columns"]) + def test_to_json_from_json_columns_dtypes(self, orient): + # GH21892 GH33205 expected = DataFrame.from_dict( { "Integer": pd.Series([1, 2, 3], dtype="int64"), @@ -1402,10 +1404,10 @@ def test_to_json_from_json_columns_dtypes(self): ), } ) - dfjson = expected.to_json(orient="columns") + dfjson = expected.to_json(orient=orient) result = read_json( dfjson, - orient="columns", + orient=orient, dtype={ "Integer": "int64", "Float": "float64", From 75eb85c15516ed3f12f2613cd6dcb52f91f75055 Mon Sep 17 00:00:00 2001 From: Robin Raymond Date: Sat, 31 Jul 2021 12:51:02 +0200 Subject: [PATCH 3/5] Add whatsnew entry --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 432dd46000eb3..ad86f142211a9 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -245,7 +245,7 @@ I/O - Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`) - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`) -- +- Bug in :func:`read_json` not handeling non-numpy dtypes correctly (especially `category`) (:issue:`21892`) (:issue:`33205`) Period ^^^^^^ From 99f5d364047ffc3b47bd5d9951eda8a660dc2d2e Mon Sep 17 00:00:00 2001 From: Robin Raymond Date: Sun, 1 Aug 2021 08:42:34 +0200 Subject: [PATCH 4/5] Update doc/source/whatsnew/v1.4.0.rst Co-authored-by: Matthew Zeitlin <37011898+mzeitlin11@users.noreply.github.com> --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index ad86f142211a9..a5a683059fbf7 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -245,7 +245,7 @@ I/O - Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`) - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`) -- Bug in :func:`read_json` not handeling non-numpy dtypes correctly (especially `category`) (:issue:`21892`) (:issue:`33205`) +- Bug in :func:`read_json` not handling non-numpy dtypes correctly (especially ``category``) (:issue:`21892`, :issue:`33205`) Period ^^^^^^ From 4872ecbfa936e25f473e1f1c2d024497ae326ad8 Mon Sep 17 00:00:00 2001 From: Robin Raymond Date: Thu, 5 Aug 2021 06:37:54 +0200 Subject: [PATCH 5/5] Linting --- pandas/tests/io/json/test_pandas.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 974832fe37387..911fce4683de4 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1394,12 +1394,12 @@ def test_to_json_from_json_columns_dtypes(self, orient): # GH21892 GH33205 expected = DataFrame.from_dict( { - "Integer": pd.Series([1, 2, 3], dtype="int64"), - "Float": pd.Series([None, 2.0, 3.0], dtype="float64"), - "Object": pd.Series([None, "", "c"], dtype="object"), - "Bool": pd.Series([True, False, True], dtype="bool"), - "Category": pd.Series(["a", "b", None], dtype="category"), - "Datetime": pd.Series( + "Integer": Series([1, 2, 3], dtype="int64"), + "Float": Series([None, 2.0, 3.0], dtype="float64"), + "Object": Series([None, "", "c"], dtype="object"), + "Bool": Series([True, False, True], dtype="bool"), + "Category": Series(["a", "b", None], dtype="category"), + "Datetime": Series( ["2020-01-01", None, "2020-01-03"], dtype="datetime64[ns]" ), }