From 5d8371da248196ab16cc63818dba434f09f07710 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 17 Nov 2022 08:10:46 +0000 Subject: [PATCH 1/3] API: read_stata with index_col=None return RangeIndex --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/io/stata.py | 2 +- pandas/tests/io/test_stata.py | 15 ++++++++++++++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index f2bed9cc9f782..8f772b4f24923 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -340,6 +340,7 @@ Other API changes - Passing strings that cannot be parsed as datetimes to :class:`Series` or :class:`DataFrame` with ``dtype="datetime64[ns]"`` will raise instead of silently ignoring the keyword and returning ``object`` dtype (:issue:`24435`) - Passing a sequence containing a type that cannot be converted to :class:`Timedelta` to :func:`to_timedelta` or to the :class:`Series` or :class:`DataFrame` constructor with ``dtype="timedelta64[ns]"`` or to :class:`TimedeltaIndex` now raises ``TypeError`` instead of ``ValueError`` (:issue:`49525`) - Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`) +- If no parameter ``index_col`` is given to :func:`read_stata`, the index will be a :class:`RangeIndex` Previously the index would have been a :class:`Int64Index` (:issue:`xxxxx`) - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`) - diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 5860aa4ae7c3e..269dd169cdeaa 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1725,7 +1725,7 @@ def read( # If index is not 
specified, use actual row number rather than # restarting at 0 for each chunk. if index_col is None: - rng = np.arange(self._lines_read - read_lines, self._lines_read) + rng = range(self._lines_read - read_lines, self._lines_read) data.index = Index(rng) # set attr instead of set_index to avoid copy if columns is not None: diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 368e9d5f6e6a1..660509c34c1cf 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -73,6 +73,19 @@ def test_read_empty_dta(self, version): empty_ds2 = read_stata(path) tm.assert_frame_equal(empty_ds, empty_ds2) + @pytest.mark.parametrize("version", [114, 117, 118, 119, None]) + def test_read_index_col_none(self, version): + df = DataFrame({"a": range(5), "b": ["b1", "b2", "b3", "b4", "b5"]}) + # GH 49745, index should be a RangeIndex when index_col is None + with tm.ensure_clean() as path: + df.to_stata(path, write_index=False, version=version) + read_df = read_stata(path) + + assert isinstance(read_df.index, pd.RangeIndex) + expected = df.copy() + expected["a"] = expected["a"].astype(np.int32) + tm.assert_frame_equal(read_df, expected) + @pytest.mark.parametrize("file", ["stata1_114", "stata1_117"]) def test_read_dta1(self, file, datapath): @@ -1054,7 +1067,7 @@ def test_categorical_sorting(self, file, datapath): parsed = parsed.sort_values("srh", na_position="first") # Don't sort index - parsed.index = np.arange(parsed.shape[0]) + parsed.index = pd.RangeIndex(len(parsed)) codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4] categories = ["Poor", "Fair", "Good", "Very good", "Excellent"] cat = pd.Categorical.from_codes( From c02818087a66789c5999b1a4a7c65f223dcbf85a Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 17 Nov 2022 18:19:50 +0000 Subject: [PATCH 2/3] fix comments --- doc/source/whatsnew/v2.0.0.rst | 3 ++- pandas/tests/io/test_stata.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst 
b/doc/source/whatsnew/v2.0.0.rst index 8f772b4f24923..b6ebee9a807fb 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -340,7 +340,7 @@ Other API changes - Passing strings that cannot be parsed as datetimes to :class:`Series` or :class:`DataFrame` with ``dtype="datetime64[ns]"`` will raise instead of silently ignoring the keyword and returning ``object`` dtype (:issue:`24435`) - Passing a sequence containing a type that cannot be converted to :class:`Timedelta` to :func:`to_timedelta` or to the :class:`Series` or :class:`DataFrame` constructor with ``dtype="timedelta64[ns]"`` or to :class:`TimedeltaIndex` now raises ``TypeError`` instead of ``ValueError`` (:issue:`49525`) - Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`) -- If no parameter ``index_col`` is given to :func:`read_stata`, the index will be a :class:`RangeIndex` Previously the index would have been a :class:`Int64Index` (:issue:`xxxxx`) +- If no parameter ``index_col`` is given to :func:`read_stata`, the index will be a :class:`RangeIndex` Previously the index would have been a :class:`Int64Index` (:issue:`49745`) - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`) - @@ -595,6 +595,7 @@ Performance improvements - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``observed=False`` (:issue:`49596`) +- 
Performance improvement in :func:`read_stata` with parameter ``index_col`` set to ``None``(the default). Now the index will be a :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49745`) - Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`) .. --------------------------------------------------------------------------- diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 660509c34c1cf..32b616cd9ab9b 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -84,7 +84,7 @@ def test_read_index_col_none(self, version): assert isinstance(read_df.index, pd.RangeIndex) expected = df.copy() expected["a"] = expected["a"].astype(np.int32) - tm.assert_frame_equal(read_df, expected) + tm.assert_frame_equal(read_df, expected, check_index_type=True) @pytest.mark.parametrize("file", ["stata1_114", "stata1_117"]) def test_read_dta1(self, file, datapath): From 7f849aba067b6c73c70ed1109c083c13ca6e0e60 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 18 Nov 2022 06:45:31 +0000 Subject: [PATCH 3/3] fix comments II --- doc/source/whatsnew/v2.0.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index b6ebee9a807fb..d8757cbcabb69 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -340,7 +340,7 @@ Other API changes - Passing strings that cannot be parsed as datetimes to :class:`Series` or :class:`DataFrame` with ``dtype="datetime64[ns]"`` will raise instead of silently ignoring the keyword and returning ``object`` dtype (:issue:`24435`) - Passing a sequence containing a type that cannot be converted to :class:`Timedelta` to :func:`to_timedelta` or to the :class:`Series` or :class:`DataFrame` constructor with ``dtype="timedelta64[ns]"`` or to :class:`TimedeltaIndex` now raises 
``TypeError`` instead of ``ValueError`` (:issue:`49525`) - Changed behavior of :class:`Index` constructor with sequence containing at least one ``NaT`` and everything else either ``None`` or ``NaN`` to infer ``datetime64[ns]`` dtype instead of ``object``, matching :class:`Series` behavior (:issue:`49340`) -- If no parameter ``index_col`` is given to :func:`read_stata`, the index will be a :class:`RangeIndex` Previously the index would have been a :class:`Int64Index` (:issue:`49745`) +- :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`) - Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`) - @@ -595,7 +595,7 @@ Performance improvements - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``observed=False`` (:issue:`49596`) -- Performance improvement in :func:`read_stata` with parameter ``index_col`` set to ``None``(the default). Now the index will be a :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49745`) +- Performance improvement in :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default). Now the index will be a :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49745`) - Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`) .. 
---------------------------------------------------------------------------