DEPR: squeeze argument in read_csv/read_table/read_excel (#43427)

lithomas1 · web-flow · commit cd61b59f7fab · 2021-09-10T09:45:47.000-04:00
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -1208,6 +1208,10 @@ Returning Series
 Using the ``squeeze`` keyword, the parser will return output with a single column
 as a ``Series``:
 
+.. deprecated:: 1.4.0
+   Users should append ``.squeeze("columns")`` to the DataFrame returned by
+   ``read_csv`` instead.
+
 .. ipython:: python
    :suppress:
 
@@ -1217,6 +1221,7 @@ as a ``Series``:
        fh.write(data)
 
 .. ipython:: python
+   :okwarning:
 
    print(open("tmp.csv").read())
 
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -278,6 +278,7 @@ Other Deprecations
 - Deprecated :meth:`Index.reindex` with a non-unique index (:issue:`42568`)
 - Deprecated :meth:`.Styler.render` in favour of :meth:`.Styler.to_html` (:issue:`42140`)
 - Deprecated passing in a string column label into ``times`` in :meth:`DataFrame.ewm` (:issue:`43265`)
+- Deprecated the ``squeeze`` argument to :meth:`read_csv`, :meth:`read_table`, and :meth:`read_excel`. Users should squeeze the DataFrame afterwards with ``.squeeze("columns")`` instead. (:issue:`43242`)
 
 .. ---------------------------------------------------------------------------
 
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -121,6 +121,10 @@
     Returns a subset of the columns according to behavior above.
 squeeze : bool, default False
     If the parsed data only contains one column then return a Series.
+
+    .. deprecated:: 1.4.0
+       Append ``.squeeze("columns")`` to the call to ``read_excel`` to squeeze
+       the data.
 dtype : Type name or dict of column -> type, default None
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
     Use `object` to preserve data as stored in Excel and not interpret dtype.
@@ -337,7 +341,7 @@ def read_excel(
     names=None,
     index_col=None,
     usecols=None,
-    squeeze=False,
+    squeeze=None,
     dtype: DtypeArg | None = None,
     engine=None,
     converters=None,
@@ -481,7 +485,7 @@ def parse(
         names=None,
         index_col=None,
         usecols=None,
-        squeeze=False,
+        squeeze=None,
         dtype: DtypeArg | None = None,
         true_values=None,
         false_values=None,
@@ -1243,7 +1247,7 @@ def parse(
         names=None,
         index_col=None,
         usecols=None,
-        squeeze=False,
+        squeeze=None,
         converters=None,
         true_values=None,
         false_values=None,
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
@@ -104,7 +104,7 @@
     "chunksize": None,
     "verbose": False,
     "encoding": None,
-    "squeeze": False,
+    "squeeze": None,
     "compression": None,
     "mangle_dupe_cols": True,
     "infer_datetime_format": False,
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
@@ -33,7 +33,6 @@ class CParserWrapper(ParserBase):
     def __init__(self, src: FilePathOrBuffer, **kwds):
         self.kwds = kwds
         kwds = kwds.copy()
-
         ParserBase.__init__(self, kwds)
 
         self.low_memory = kwds.pop("low_memory", False)
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -28,6 +28,7 @@
     Appender,
     deprecate_nonkeyword_arguments,
 )
+from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import validate_bool_kwarg
 
 from pandas.core.dtypes.common import (
@@ -131,6 +132,10 @@
     parsing time and lower memory usage.
 squeeze : bool, default False
     If the parsed data only contains one column then return a Series.
+
+    .. deprecated:: 1.4.0
+        Append ``.squeeze("columns")`` to the call to ``{func_name}`` to squeeze
+        the data.
 prefix : str, optional
     Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
 mangle_dupe_cols : bool, default True
@@ -439,7 +444,11 @@
     "low_memory",
 }
 
-_deprecated_defaults: dict[str, Any] = {"error_bad_lines": None, "warn_bad_lines": None}
+_deprecated_defaults: dict[str, Any] = {
+    "error_bad_lines": None,
+    "warn_bad_lines": None,
+    "squeeze": None,
+}
 
 
 def validate_integer(name, val, min_val=0):
@@ -552,7 +561,7 @@ def read_csv(
     names=lib.no_default,
     index_col=None,
     usecols=None,
-    squeeze=False,
+    squeeze=None,
     prefix=lib.no_default,
     mangle_dupe_cols=True,
     # General Parsing Configuration
@@ -650,7 +659,7 @@ def read_table(
     names=lib.no_default,
     index_col=None,
     usecols=None,
-    squeeze=False,
+    squeeze=None,
     prefix=lib.no_default,
     mangle_dupe_cols=True,
     # General Parsing Configuration
@@ -867,11 +876,12 @@ def __init__(self, f, engine=None, **kwds):
 
         self.chunksize = options.pop("chunksize", None)
         self.nrows = options.pop("nrows", None)
-        self.squeeze = options.pop("squeeze", False)
 
         self._check_file_or_buffer(f, engine)
         self.options, self.engine = self._clean_options(options, engine)
 
+        self.squeeze = self.options.pop("squeeze", False)
+
         if "has_index_names" in kwds:
             self.options["has_index_names"] = kwds["has_index_names"]
 
@@ -1050,7 +1060,7 @@ def _clean_options(self, options, engine):
                     f"The {arg} argument has been deprecated and will be "
                     "removed in a future version.\n\n"
                 )
-                warnings.warn(msg, FutureWarning, stacklevel=7)
+                warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
             else:
                 result[arg] = parser_default
 
@@ -1100,6 +1110,10 @@ def _clean_options(self, options, engine):
         result["na_values"] = na_values
         result["na_fvalues"] = na_fvalues
         result["skiprows"] = skiprows
+        # Default for squeeze is none since we need to check
+        # if user sets it. We then set to False to preserve
+        # previous behavior.
+        result["squeeze"] = False if options["squeeze"] is None else options["squeeze"]
 
         return result, engine
 
@@ -1149,7 +1163,7 @@ def read(self, nrows=None):
             self._currow += new_rows
 
         if self.squeeze and len(df.columns) == 1:
-            return df[df.columns[0]].copy()
+            return df.squeeze("columns").copy()
         return df
 
     def get_chunk(self, size=None):
diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py
@@ -1039,8 +1039,7 @@ def test_to_csv_compression(self, df, encoding, compression):
                 compression=compression,
                 encoding=encoding,
                 index_col=0,
-                squeeze=True,
-            )
+            ).squeeze("columns")
             tm.assert_frame_equal(df, result)
 
             # explicitly make sure file is compressed
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
@@ -1194,18 +1194,25 @@ def test_read_excel_squeeze(self, read_ext):
         # GH 12157
         f = "test_squeeze" + read_ext
 
-        actual = pd.read_excel(f, sheet_name="two_columns", index_col=0, squeeze=True)
-        expected = Series([2, 3, 4], [4, 5, 6], name="b")
-        expected.index.name = "a"
-        tm.assert_series_equal(actual, expected)
+        with tm.assert_produces_warning(
+            FutureWarning,
+            match="The squeeze argument has been deprecated "
+            "and will be removed in a future version.\n\n",
+        ):
+            actual = pd.read_excel(
+                f, sheet_name="two_columns", index_col=0, squeeze=True
+            )
+            expected = Series([2, 3, 4], [4, 5, 6], name="b")
+            expected.index.name = "a"
+            tm.assert_series_equal(actual, expected)
 
-        actual = pd.read_excel(f, sheet_name="two_columns", squeeze=True)
-        expected = DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]})
-        tm.assert_frame_equal(actual, expected)
+            actual = pd.read_excel(f, sheet_name="two_columns", squeeze=True)
+            expected = DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]})
+            tm.assert_frame_equal(actual, expected)
 
-        actual = pd.read_excel(f, sheet_name="one_column", squeeze=True)
-        expected = Series([1, 2, 3], name="a")
-        tm.assert_series_equal(actual, expected)
+            actual = pd.read_excel(f, sheet_name="one_column", squeeze=True)
+            expected = Series([1, 2, 3], name="a")
+            tm.assert_series_equal(actual, expected)
 
     def test_deprecated_kwargs(self, read_ext):
         with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False):
diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
@@ -128,7 +128,8 @@ def test_1000_sep(all_parsers):
     tm.assert_frame_equal(result, expected)
 
 
-def test_squeeze(all_parsers):
+@pytest.mark.parametrize("squeeze", [True, False])
+def test_squeeze(all_parsers, squeeze):
     data = """\
 a,1
 b,2
@@ -138,13 +139,25 @@ def test_squeeze(all_parsers):
     index = Index(["a", "b", "c"], name=0)
     expected = Series([1, 2, 3], name=1, index=index)
 
-    result = parser.read_csv(StringIO(data), index_col=0, header=None, squeeze=True)
-    tm.assert_series_equal(result, expected)
+    result = parser.read_csv_check_warnings(
+        FutureWarning,
+        "The squeeze argument has been deprecated "
+        "and will be removed in a future version.\n\n",
+        StringIO(data),
+        index_col=0,
+        header=None,
+        squeeze=squeeze,
+    )
+    if not squeeze:
+        expected = DataFrame(expected)
+        tm.assert_frame_equal(result, expected)
+    else:
+        tm.assert_series_equal(result, expected)
 
-    # see gh-8217
-    #
-    # Series should not be a view.
-    assert not result._is_view
+        # see gh-8217
+        #
+        # Series should not be a view.
+        assert not result._is_view
 
 
 @xfail_pyarrow
@@ -847,12 +860,13 @@ def test_deprecated_bad_lines_warns(all_parsers, csv1, on_bad_lines):
     # GH 15122
     parser = all_parsers
     kwds = {f"{on_bad_lines}_bad_lines": False}
-    with tm.assert_produces_warning(
+    parser.read_csv_check_warnings(
         FutureWarning,
-        match=f"The {on_bad_lines}_bad_lines argument has been deprecated "
+        f"The {on_bad_lines}_bad_lines argument has been deprecated "
         "and will be removed in a future version.\n\n",
-    ):
-        parser.read_csv(csv1, **kwds)
+        csv1,
+        **kwds,
+    )
 
 
 def test_malformed_second_line(all_parsers):
diff --git a/pandas/tests/io/parser/common/test_iterator.py b/pandas/tests/io/parser/common/test_iterator.py
@@ -8,7 +8,6 @@
 
 from pandas import (
     DataFrame,
-    Series,
     concat,
 )
 import pandas._testing as tm
@@ -94,7 +93,7 @@ def test_iterator_skipfooter_errors(all_parsers, kwargs):
 
 def test_iteration_open_handle(all_parsers):
     parser = all_parsers
-    kwargs = {"squeeze": True, "header": None}
+    kwargs = {"header": None}
 
     with tm.ensure_clean() as path:
         with open(path, "w") as f:
@@ -106,5 +105,5 @@ def test_iteration_open_handle(all_parsers):
                     break
 
             result = parser.read_csv(f, **kwargs)
-            expected = Series(["DDD", "EEE", "FFF", "GGG"], name=0)
-            tm.assert_series_equal(result, expected)
+            expected = DataFrame({0: ["DDD", "EEE", "FFF", "GGG"]})
+            tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py
@@ -10,6 +10,7 @@
     read_csv,
     read_table,
 )
+import pandas._testing as tm
 
 
 class BaseParser:
@@ -27,6 +28,16 @@ def read_csv(self, *args, **kwargs):
         kwargs = self.update_kwargs(kwargs)
         return read_csv(*args, **kwargs)
 
+    def read_csv_check_warnings(
+        self, warn_type: type[Warning], warn_msg: str, *args, **kwargs
+    ):
+        # We need to check the stacklevel here instead of in the tests
+        # since this is where read_csv is called and where the warning
+        # should point to.
+        kwargs = self.update_kwargs(kwargs)
+        with tm.assert_produces_warning(warn_type, match=warn_msg):
+            return read_csv(*args, **kwargs)
+
     def read_table(self, *args, **kwargs):
         kwargs = self.update_kwargs(kwargs)
         return read_table(*args, **kwargs)
diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py
@@ -95,7 +95,14 @@ def test_series_compression_defaults_to_infer(
     extension = icom._compression_to_extension[compression_only]
     with tm.ensure_clean("compressed" + extension) as path:
         getattr(input, write_method)(path, **write_kwargs)
-        output = read_method(path, compression=compression_only, **read_kwargs)
+        if "squeeze" in read_kwargs:
+            kwargs = read_kwargs.copy()
+            del kwargs["squeeze"]
+            output = read_method(path, compression=compression_only, **kwargs).squeeze(
+                "columns"
+            )
+        else:
+            output = read_method(path, compression=compression_only, **read_kwargs)
     tm.assert_series_equal(output, input, check_names=False)
 
 
diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py
@@ -13,11 +13,11 @@
 
 class TestSeriesToCSV:
     def read_csv(self, path, **kwargs):
-        params = {"squeeze": True, "index_col": 0, "header": None, "parse_dates": True}
+        params = {"index_col": 0, "header": None, "parse_dates": True}
         params.update(**kwargs)
 
         header = params.get("header")
-        out = pd.read_csv(path, **params)
+        out = pd.read_csv(path, **params).squeeze("columns")
 
         if header is None:
             out.name = out.index.name = None
@@ -138,8 +138,7 @@ def test_to_csv_compression(self, s, encoding, compression):
                 compression=compression,
                 encoding=encoding,
                 index_col=0,
-                squeeze=True,
-            )
+            ).squeeze("columns")
             tm.assert_series_equal(s, result)
 
             # test the round trip using file handle - to_csv -> read_csv
@@ -153,8 +152,7 @@ def test_to_csv_compression(self, s, encoding, compression):
                 compression=compression,
                 encoding=encoding,
                 index_col=0,
-                squeeze=True,
-            )
+            ).squeeze("columns")
             tm.assert_series_equal(s, result)
 
             # explicitly ensure file was compressed
@@ -164,7 +162,8 @@ def test_to_csv_compression(self, s, encoding, compression):
 
             with tm.decompress_file(filename, compression) as fh:
                 tm.assert_series_equal(
-                    s, pd.read_csv(fh, index_col=0, squeeze=True, encoding=encoding)
+                    s,
+                    pd.read_csv(fh, index_col=0, encoding=encoding).squeeze("columns"),
                 )
 
     def test_to_csv_interval_index(self):
@@ -173,7 +172,7 @@ def test_to_csv_interval_index(self):
 
         with tm.ensure_clean("__tmp_to_csv_interval_index__.csv") as path:
             s.to_csv(path, header=False)
-            result = self.read_csv(path, index_col=0, squeeze=True)
+            result = self.read_csv(path, index_col=0)
 
             # can't roundtrip intervalindex via read_csv so check string repr (GH 23595)
             expected = s.copy()