Skip to content

DEPR: squeeze argument in read_csv/read_table/read_excel #43427

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Sep 10, 2021
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ Other Deprecations
- Deprecated :meth:`Index.reindex` with a non-unique index (:issue:`42568`)
- Deprecated :meth:`.Styler.render` in favour of :meth:`.Styler.to_html` (:issue:`42140`)
- Deprecated passing in a string column label into ``times`` in :meth:`DataFrame.ewm` (:issue:`43265`)
- Deprecated the ``squeeze`` argument to :meth:`read_csv`, :meth:`read_table`, and :meth:`read_excel`. Users should squeeze the DataFrame afterwards with ``.squeeze("columns")`` instead (:issue:`43242`)

.. ---------------------------------------------------------------------------

Expand Down
77 changes: 44 additions & 33 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,9 @@
Returns a subset of the columns according to behavior above.
squeeze : bool, default False
If the parsed data only contains one column then return a Series.

.. deprecated:: 1.4.0
Append ``.squeeze("columns")`` to the call to ``read_excel`` to squeeze
the data.
dtype : Type name or dict of column -> type, default None
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
Use `object` to preserve data as stored in Excel and not interpret dtype.
Expand Down Expand Up @@ -337,7 +340,7 @@ def read_excel(
names=None,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
dtype: DtypeArg | None = None,
engine=None,
converters=None,
Expand Down Expand Up @@ -481,7 +484,7 @@ def parse(
names=None,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
dtype: DtypeArg | None = None,
true_values=None,
false_values=None,
Expand Down Expand Up @@ -598,41 +601,49 @@ def parse(
data[row][col] = last
else:
last = data[row][col]

future_warnings = []
# GH 12292 : error when read one empty column from excel file
try:
parser = TextParser(
data,
names=names,
header=header,
index_col=index_col,
has_index_names=has_index_names,
squeeze=squeeze,
dtype=dtype,
true_values=true_values,
false_values=false_values,
skiprows=skiprows,
nrows=nrows,
na_values=na_values,
skip_blank_lines=False, # GH 39808
parse_dates=parse_dates,
date_parser=date_parser,
thousands=thousands,
comment=comment,
skipfooter=skipfooter,
usecols=usecols,
mangle_dupe_cols=mangle_dupe_cols,
**kwds,
)
# Catch deprecation warnings here so they can be re-raised below at the correct stacklevel
with warnings.catch_warnings(record=True) as w:
parser = TextParser(
data,
names=names,
header=header,
index_col=index_col,
has_index_names=has_index_names,
squeeze=squeeze,
dtype=dtype,
true_values=true_values,
false_values=false_values,
skiprows=skiprows,
nrows=nrows,
na_values=na_values,
skip_blank_lines=False, # GH 39808
parse_dates=parse_dates,
date_parser=date_parser,
thousands=thousands,
comment=comment,
skipfooter=skipfooter,
usecols=usecols,
mangle_dupe_cols=mangle_dupe_cols,
**kwds,
)

output[asheetname] = parser.read(nrows=nrows)
output[asheetname] = parser.read(nrows=nrows)

if not squeeze or isinstance(output[asheetname], DataFrame):
if header_names:
output[asheetname].columns = output[
asheetname
].columns.set_names(header_names)
if not squeeze or isinstance(output[asheetname], DataFrame):
if header_names:
output[asheetname].columns = output[
asheetname
].columns.set_names(header_names)

# Record warning messages, can't raise here since it would be
# suppressed again
for warning in w:
future_warnings.append(str(warning.message))
for warning in future_warnings:
warnings.warn(warning, FutureWarning, stacklevel=5)
except EmptyDataError:
# No Data, return an empty DataFrame
output[asheetname] = DataFrame()
Expand Down Expand Up @@ -1243,7 +1254,7 @@ def parse(
names=None,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
converters=None,
true_values=None,
false_values=None,
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@
"chunksize": None,
"verbose": False,
"encoding": None,
"squeeze": False,
"squeeze": None,
"compression": None,
"mangle_dupe_cols": True,
"infer_datetime_format": False,
Expand Down
1 change: 0 additions & 1 deletion pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ class CParserWrapper(ParserBase):
def __init__(self, src: FilePathOrBuffer, **kwds):
self.kwds = kwds
kwds = kwds.copy()

ParserBase.__init__(self, kwds)

self.low_memory = kwds.pop("low_memory", False)
Expand Down
23 changes: 18 additions & 5 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,10 @@
parsing time and lower memory usage.
squeeze : bool, default False
If the parsed data only contains one column then return a Series.

.. deprecated:: 1.4.0
Append ``.squeeze("columns")`` to the call to ``read_csv`` to squeeze
the data.
prefix : str, optional
Prefix to add to column numbers when no header, e.g. 'X' for X0, X1, ...
mangle_dupe_cols : bool, default True
Expand Down Expand Up @@ -439,7 +443,11 @@
"low_memory",
}

_deprecated_defaults: dict[str, Any] = {"error_bad_lines": None, "warn_bad_lines": None}
_deprecated_defaults: dict[str, Any] = {
"error_bad_lines": None,
"warn_bad_lines": None,
"squeeze": None,
}


def validate_integer(name, val, min_val=0):
Expand Down Expand Up @@ -552,7 +560,7 @@ def read_csv(
names=lib.no_default,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
prefix=lib.no_default,
mangle_dupe_cols=True,
# General Parsing Configuration
Expand Down Expand Up @@ -650,7 +658,7 @@ def read_table(
names=lib.no_default,
index_col=None,
usecols=None,
squeeze=False,
squeeze=None,
prefix=lib.no_default,
mangle_dupe_cols=True,
# General Parsing Configuration
Expand Down Expand Up @@ -867,11 +875,12 @@ def __init__(self, f, engine=None, **kwds):

self.chunksize = options.pop("chunksize", None)
self.nrows = options.pop("nrows", None)
self.squeeze = options.pop("squeeze", False)

self._check_file_or_buffer(f, engine)
self.options, self.engine = self._clean_options(options, engine)

self.squeeze = self.options.pop("squeeze", False)

if "has_index_names" in kwds:
self.options["has_index_names"] = kwds["has_index_names"]

Expand Down Expand Up @@ -1100,6 +1109,10 @@ def _clean_options(self, options, engine):
result["na_values"] = na_values
result["na_fvalues"] = na_fvalues
result["skiprows"] = skiprows
# Default for squeeze is None since we need to check
# if the user sets it. We set it to False here since the
# behavior is equivalent
result["squeeze"] = False if options["squeeze"] is None else options["squeeze"]

return result, engine

Expand Down Expand Up @@ -1149,7 +1162,7 @@ def read(self, nrows=None):
self._currow += new_rows

if self.squeeze and len(df.columns) == 1:
return df[df.columns[0]].copy()
return df.squeeze("columns").copy()
return df

def get_chunk(self, size=None):
Expand Down
27 changes: 17 additions & 10 deletions pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1194,18 +1194,25 @@ def test_read_excel_squeeze(self, read_ext):
# GH 12157
f = "test_squeeze" + read_ext

actual = pd.read_excel(f, sheet_name="two_columns", index_col=0, squeeze=True)
expected = Series([2, 3, 4], [4, 5, 6], name="b")
expected.index.name = "a"
tm.assert_series_equal(actual, expected)
with tm.assert_produces_warning(
FutureWarning,
match="The squeeze argument has been deprecated "
"and will be removed in a future version.\n\n",
):
actual = pd.read_excel(
f, sheet_name="two_columns", index_col=0, squeeze=True
)
expected = Series([2, 3, 4], [4, 5, 6], name="b")
expected.index.name = "a"
tm.assert_series_equal(actual, expected)

actual = pd.read_excel(f, sheet_name="two_columns", squeeze=True)
expected = DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]})
tm.assert_frame_equal(actual, expected)
actual = pd.read_excel(f, sheet_name="two_columns", squeeze=True)
expected = DataFrame({"a": [4, 5, 6], "b": [2, 3, 4]})
tm.assert_frame_equal(actual, expected)

actual = pd.read_excel(f, sheet_name="one_column", squeeze=True)
expected = Series([1, 2, 3], name="a")
tm.assert_series_equal(actual, expected)
actual = pd.read_excel(f, sheet_name="one_column", squeeze=True)
expected = Series([1, 2, 3], name="a")
tm.assert_series_equal(actual, expected)

def test_deprecated_kwargs(self, read_ext):
with tm.assert_produces_warning(FutureWarning, raise_on_extra_warnings=False):
Expand Down
26 changes: 19 additions & 7 deletions pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,8 @@ def test_1000_sep(all_parsers):
tm.assert_frame_equal(result, expected)


def test_squeeze(all_parsers):
@pytest.mark.parametrize("squeeze", [True, False])
def test_squeeze(all_parsers, squeeze):
data = """\
a,1
b,2
Expand All @@ -138,13 +139,24 @@ def test_squeeze(all_parsers):
index = Index(["a", "b", "c"], name=0)
expected = Series([1, 2, 3], name=1, index=index)

result = parser.read_csv(StringIO(data), index_col=0, header=None, squeeze=True)
tm.assert_series_equal(result, expected)
with tm.assert_produces_warning(
FutureWarning,
match="The squeeze argument has been deprecated "
"and will be removed in a future version.\n\n",
):
result = parser.read_csv(
StringIO(data), index_col=0, header=None, squeeze=squeeze
)
if not squeeze:
expected = DataFrame(expected)
tm.assert_frame_equal(result, expected)
else:
tm.assert_series_equal(result, expected)

# see gh-8217
#
# Series should not be a view.
assert not result._is_view
# see gh-8217
#
# Series should not be a view.
assert not result._is_view


@xfail_pyarrow
Expand Down