
Commit 467e1c2

API: Make most arguments for read_html and read_json keyword-only (#27573)
* Deprecate use of most positional arguments for read_html and read_json
* Import pandas._testing instead of pandas.util.testing
* Import pandas._testing instead of pandas.util.testing
* Update pandas/util/_decorators.py
  Co-Authored-By: Joris Van den Bossche <[email protected]>
* Change displayed warning message
* Update pandas/io/html.py
  Co-Authored-By: Joris Van den Bossche <[email protected]>
* Update pandas/io/json/_json.py
  Co-Authored-By: Joris Van den Bossche <[email protected]>
* Restore permissions to v1.0.0.rst
* Fix expected warning message in tests for deprecate_nonkeyword_arguments
* Reformat too long line
* Remove a test too similar to another one.
* Update Whatsnew
* Fix linting

Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 25e5a74 commit 467e1c2

7 files changed (+318, -49 lines)


doc/source/whatsnew/v1.1.0.rst (+13)
@@ -250,14 +250,27 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns
 
 Deprecations
 ~~~~~~~~~~~~
+
 - Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated, will raise in a future version. Either convert the list to tuple, or pass the slice directly instead (:issue:`31333`)
+
 - :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`)
 - Setting values with ``.loc`` using a positional slice is deprecated and will raise in a future version. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`)
 - :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` in future versions (:issue:`32515`)
 - :meth:`Categorical.to_dense` is deprecated and will be removed in a future version, use ``np.asarray(cat)`` instead (:issue:`32639`)
 - The ``fastpath`` keyword in the ``SingleBlockManager`` constructor is deprecated and will be removed in a future version (:issue:`33092`)
 - :meth:`Index.is_mixed` is deprecated and will be removed in a future version, check ``index.inferred_type`` directly instead (:issue:`32922`)
 
+- Passing any arguments but the first one to :func:`read_html` as
+  positional arguments is deprecated since version 1.1. All other
+  arguments should be given as keyword arguments (:issue:`27573`).
+
+- Passing any arguments but `path_or_buf` (the first one) to
+  :func:`read_json` as positional arguments is deprecated since
+  version 1.1. All other arguments should be given as keyword
+  arguments (:issue:`27573`).
+
+-
+
 .. ---------------------------------------------------------------------------
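As a caller-facing illustration of the two whatsnew entries above, here is a minimal sketch (assuming pandas 1.1+ and an installed HTML parser such as lxml or bs4+html5lib; the table markup is made up for the example). Passing match positionally now emits a FutureWarning, while the keyword form stays silent:

import warnings

import pandas as pd

html = "<table><tr><th>Water</th></tr><tr><td>1</td></tr></table>"

# Deprecated: the second argument (match) is given positionally.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    pd.read_html(html, ".*Water.*")
assert any(issubclass(w.category, FutureWarning) for w in caught)

# Preferred: everything after the first argument is passed by keyword.
pd.read_html(html, match=".*Water.*")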

pandas/io/html.py (+2)
@@ -11,6 +11,7 @@
 
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import AbstractMethodError, EmptyDataError
+from pandas.util._decorators import deprecate_nonkeyword_arguments
 
 from pandas.core.dtypes.common import is_list_like
 
@@ -921,6 +922,7 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
     return ret
 
 
+@deprecate_nonkeyword_arguments(version="2.0")
 def read_html(
     io,
     match=".+",

pandas/io/json/_json.py (+4, -1)
@@ -11,7 +11,7 @@
 from pandas._libs.tslibs import iNaT
 from pandas._typing import JSONSerializable
 from pandas.errors import AbstractMethodError
-from pandas.util._decorators import deprecate_kwarg
+from pandas.util._decorators import deprecate_kwarg, deprecate_nonkeyword_arguments
 
 from pandas.core.dtypes.common import ensure_str, is_period_dtype
 
@@ -345,6 +345,9 @@ def _write(
 
 
 @deprecate_kwarg(old_arg_name="numpy", new_arg_name=None)
+@deprecate_nonkeyword_arguments(
+    version="2.0", allowed_args=["path_or_buf"], stacklevel=3
+)
 def read_json(
     path_or_buf=None,
     orient=None,
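Two details of this usage can be read off the diff: allowed_args=["path_or_buf"] keeps only the first parameter positional, and stacklevel=3 is presumably chosen because read_json ends up wrapped by two decorators, so the warning has to skip two wrapper frames to point at the user's call site rather than at the outer decorator's wrapper. A small, self-contained sketch of that stacklevel arithmetic (generic names, not the pandas internals):

import functools
import warnings


def outer(func):
    # Stand-in for deprecate_kwarg, the outermost decorator.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)

    return wrapper


def inner(func):
    # Stand-in for deprecate_nonkeyword_arguments.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # stacklevel=3: frame 1 is this warn() call, frame 2 is outer's
        # wrapper, frame 3 is the user's call site.
        warnings.warn(
            "positional arguments are deprecated", FutureWarning, stacklevel=3
        )
        return func(*args, **kwargs)

    return wrapper


@outer
@inner
def read_something(path_or_buf=None, orient=None):
    return orient


read_something("data.json", "split")  # the warning points at this line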
pandas/tests/io/json/test_deprecated_kwargs.py (+31, new file)
@@ -0,0 +1,31 @@
+"""
+Tests for the deprecated keyword arguments for `read_json`.
+"""
+
+import pandas as pd
+import pandas._testing as tm
+
+from pandas.io.json import read_json
+
+
+def test_deprecated_kwargs():
+    df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])
+    buf = df.to_json(orient="split")
+    with tm.assert_produces_warning(FutureWarning):
+        tm.assert_frame_equal(df, read_json(buf, "split"))
+    buf = df.to_json(orient="columns")
+    with tm.assert_produces_warning(FutureWarning):
+        tm.assert_frame_equal(df, read_json(buf, "columns"))
+    buf = df.to_json(orient="index")
+    with tm.assert_produces_warning(FutureWarning):
+        tm.assert_frame_equal(df, read_json(buf, "index"))
+
+
+def test_good_kwargs():
+    df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2])
+    with tm.assert_produces_warning(None):
+        tm.assert_frame_equal(df, read_json(df.to_json(orient="split"), orient="split"))
+        tm.assert_frame_equal(
+            df, read_json(df.to_json(orient="columns"), orient="columns")
+        )
+        tm.assert_frame_equal(df, read_json(df.to_json(orient="index"), orient="index"))

pandas/tests/io/test_html.py (+68, -48)
@@ -72,7 +72,7 @@ def test_invalid_flavor():
     msg = r"\{" + flavor + r"\} is not a valid set of flavors"
 
     with pytest.raises(ValueError, match=msg):
-        read_html(url, "google", flavor=flavor)
+        read_html(url, match="google", flavor=flavor)
 
 
 @td.skip_if_no("bs4")
@@ -121,13 +121,26 @@ def test_to_html_compat(self):
         res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0]
         tm.assert_frame_equal(res, df)
 
+    @tm.network
+    def test_banklist_url_positional_match(self):
+        url = "http://www.fdic.gov/bank/individual/failed/banklist.html"
+        # Passing match argument as positional should cause a FutureWarning.
+        with tm.assert_produces_warning(FutureWarning):
+            df1 = self.read_html(
+                url, "First Federal Bank of Florida", attrs={"id": "table"}
+            )
+        with tm.assert_produces_warning(FutureWarning):
+            df2 = self.read_html(url, "Metcalf Bank", attrs={"id": "table"})
+
+        assert_framelist_equal(df1, df2)
+
     @tm.network
     def test_banklist_url(self):
         url = "http://www.fdic.gov/bank/individual/failed/banklist.html"
         df1 = self.read_html(
-            url, "First Federal Bank of Florida", attrs={"id": "table"}
+            url, match="First Federal Bank of Florida", attrs={"id": "table"}
         )
-        df2 = self.read_html(url, "Metcalf Bank", attrs={"id": "table"})
+        df2 = self.read_html(url, match="Metcalf Bank", attrs={"id": "table"})
 
         assert_framelist_equal(df1, df2)
 
@@ -137,21 +150,25 @@ def test_spam_url(self):
             "https://raw.githubusercontent.com/pandas-dev/pandas/master/"
             "pandas/tests/io/data/html/spam.html"
         )
-        df1 = self.read_html(url, ".*Water.*")
-        df2 = self.read_html(url, "Unit")
+        df1 = self.read_html(url, match=".*Water.*")
+        df2 = self.read_html(url, match="Unit")
 
         assert_framelist_equal(df1, df2)
 
     @pytest.mark.slow
     def test_banklist(self):
-        df1 = self.read_html(self.banklist_data, ".*Florida.*", attrs={"id": "table"})
-        df2 = self.read_html(self.banklist_data, "Metcalf Bank", attrs={"id": "table"})
+        df1 = self.read_html(
+            self.banklist_data, match=".*Florida.*", attrs={"id": "table"}
+        )
+        df2 = self.read_html(
+            self.banklist_data, match="Metcalf Bank", attrs={"id": "table"}
+        )
 
         assert_framelist_equal(df1, df2)
 
     def test_spam(self):
-        df1 = self.read_html(self.spam_data, ".*Water.*")
-        df2 = self.read_html(self.spam_data, "Unit")
+        df1 = self.read_html(self.spam_data, match=".*Water.*")
+        df2 = self.read_html(self.spam_data, match="Unit")
         assert_framelist_equal(df1, df2)
 
         assert df1[0].iloc[0, 0] == "Proximates"
@@ -168,81 +185,82 @@ def test_banklist_no_match(self):
         assert isinstance(df, DataFrame)
 
     def test_spam_header(self):
-        df = self.read_html(self.spam_data, ".*Water.*", header=2)[0]
+        df = self.read_html(self.spam_data, match=".*Water.*", header=2)[0]
         assert df.columns[0] == "Proximates"
         assert not df.empty
 
     def test_skiprows_int(self):
-        df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=1)
-        df2 = self.read_html(self.spam_data, "Unit", skiprows=1)
+        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=1)
+        df2 = self.read_html(self.spam_data, match="Unit", skiprows=1)
 
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_range(self):
-        df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=range(2))[0]
-        df2 = self.read_html(self.spam_data, "Unit", skiprows=range(2))[0]
-        tm.assert_frame_equal(df1, df2)
+        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=range(2))
+        df2 = self.read_html(self.spam_data, match="Unit", skiprows=range(2))
+
+        assert_framelist_equal(df1, df2)
 
     def test_skiprows_list(self):
-        df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=[1, 2])
-        df2 = self.read_html(self.spam_data, "Unit", skiprows=[2, 1])
+        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=[1, 2])
+        df2 = self.read_html(self.spam_data, match="Unit", skiprows=[2, 1])
 
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_set(self):
-        df1 = self.read_html(self.spam_data, ".*Water.*", skiprows={1, 2})
-        df2 = self.read_html(self.spam_data, "Unit", skiprows={2, 1})
+        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows={1, 2})
+        df2 = self.read_html(self.spam_data, match="Unit", skiprows={2, 1})
 
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_slice(self):
-        df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=1)
-        df2 = self.read_html(self.spam_data, "Unit", skiprows=1)
+        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=1)
+        df2 = self.read_html(self.spam_data, match="Unit", skiprows=1)
 
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_slice_short(self):
-        df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=slice(2))
-        df2 = self.read_html(self.spam_data, "Unit", skiprows=slice(2))
+        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=slice(2))
+        df2 = self.read_html(self.spam_data, match="Unit", skiprows=slice(2))
 
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_slice_long(self):
-        df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=slice(2, 5))
-        df2 = self.read_html(self.spam_data, "Unit", skiprows=slice(4, 1, -1))
+        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=slice(2, 5))
+        df2 = self.read_html(self.spam_data, match="Unit", skiprows=slice(4, 1, -1))
 
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_ndarray(self):
-        df1 = self.read_html(self.spam_data, ".*Water.*", skiprows=np.arange(2))
-        df2 = self.read_html(self.spam_data, "Unit", skiprows=np.arange(2))
+        df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=np.arange(2))
+        df2 = self.read_html(self.spam_data, match="Unit", skiprows=np.arange(2))
 
         assert_framelist_equal(df1, df2)
 
     def test_skiprows_invalid(self):
         with pytest.raises(TypeError, match=("is not a valid type for skipping rows")):
-            self.read_html(self.spam_data, ".*Water.*", skiprows="asdf")
+            self.read_html(self.spam_data, match=".*Water.*", skiprows="asdf")
 
     def test_index(self):
-        df1 = self.read_html(self.spam_data, ".*Water.*", index_col=0)
-        df2 = self.read_html(self.spam_data, "Unit", index_col=0)
+        df1 = self.read_html(self.spam_data, match=".*Water.*", index_col=0)
+        df2 = self.read_html(self.spam_data, match="Unit", index_col=0)
        assert_framelist_equal(df1, df2)
 
     def test_header_and_index_no_types(self):
-        df1 = self.read_html(self.spam_data, ".*Water.*", header=1, index_col=0)
-        df2 = self.read_html(self.spam_data, "Unit", header=1, index_col=0)
+        df1 = self.read_html(self.spam_data, match=".*Water.*", header=1, index_col=0)
+        df2 = self.read_html(self.spam_data, match="Unit", header=1, index_col=0)
         assert_framelist_equal(df1, df2)
 
     def test_header_and_index_with_types(self):
-        df1 = self.read_html(self.spam_data, ".*Water.*", header=1, index_col=0)
-        df2 = self.read_html(self.spam_data, "Unit", header=1, index_col=0)
+        df1 = self.read_html(self.spam_data, match=".*Water.*", header=1, index_col=0)
+        df2 = self.read_html(self.spam_data, match="Unit", header=1, index_col=0)
         assert_framelist_equal(df1, df2)
 
     def test_infer_types(self):
 
         # 10892 infer_types removed
-        df1 = self.read_html(self.spam_data, ".*Water.*", index_col=0)
-        df2 = self.read_html(self.spam_data, "Unit", index_col=0)
+        df1 = self.read_html(self.spam_data, match=".*Water.*", index_col=0)
+        df2 = self.read_html(self.spam_data, match="Unit", index_col=0)
         assert_framelist_equal(df1, df2)
 
     def test_string_io(self):
@@ -252,25 +270,25 @@ def test_string_io(self):
         with open(self.spam_data, **self.spam_data_kwargs) as f:
             data2 = StringIO(f.read())
 
-        df1 = self.read_html(data1, ".*Water.*")
-        df2 = self.read_html(data2, "Unit")
+        df1 = self.read_html(data1, match=".*Water.*")
+        df2 = self.read_html(data2, match="Unit")
         assert_framelist_equal(df1, df2)
 
     def test_string(self):
         with open(self.spam_data, **self.spam_data_kwargs) as f:
             data = f.read()
 
-        df1 = self.read_html(data, ".*Water.*")
-        df2 = self.read_html(data, "Unit")
+        df1 = self.read_html(data, match=".*Water.*")
+        df2 = self.read_html(data, match="Unit")
 
         assert_framelist_equal(df1, df2)
 
     def test_file_like(self):
         with open(self.spam_data, **self.spam_data_kwargs) as f:
-            df1 = self.read_html(f, ".*Water.*")
+            df1 = self.read_html(f, match=".*Water.*")
 
         with open(self.spam_data, **self.spam_data_kwargs) as f:
-            df2 = self.read_html(f, "Unit")
+            df2 = self.read_html(f, match="Unit")
 
         assert_framelist_equal(df1, df2)
 
@@ -292,7 +310,7 @@ def test_invalid_url(self):
     def test_file_url(self):
         url = self.banklist_data
         dfs = self.read_html(
-            file_path_to_url(os.path.abspath(url)), "First", attrs={"id": "table"}
+            file_path_to_url(os.path.abspath(url)), match="First", attrs={"id": "table"}
         )
         assert isinstance(dfs, list)
         for df in dfs:
@@ -308,7 +326,7 @@ def test_invalid_table_attrs(self):
 
     def _bank_data(self, *args, **kwargs):
         return self.read_html(
-            self.banklist_data, "Metcalf", attrs={"id": "table"}, *args, **kwargs
+            self.banklist_data, match="Metcalf", attrs={"id": "table"}, *args, **kwargs
         )
 
     @pytest.mark.slow
@@ -358,7 +376,7 @@ def test_regex_idempotency(self):
     def test_negative_skiprows(self):
         msg = r"\(you passed a negative value\)"
         with pytest.raises(ValueError, match=msg):
-            self.read_html(self.spam_data, "Water", skiprows=-1)
+            self.read_html(self.spam_data, match="Water", skiprows=-1)
 
     @tm.network
     def test_multiple_matches(self):
@@ -600,7 +618,9 @@ def test_gold_canyon(self):
             raw_text = f.read()
 
         assert gc in raw_text
-        df = self.read_html(self.banklist_data, "Gold Canyon", attrs={"id": "table"})[0]
+        df = self.read_html(
+            self.banklist_data, match="Gold Canyon", attrs={"id": "table"}
+        )[0]
         assert gc in df.to_string()
 
     def test_different_number_of_cols(self):
@@ -855,7 +875,7 @@ def test_wikipedia_states_table(self, datapath):
         data = datapath("io", "data", "html", "wikipedia_states.html")
         assert os.path.isfile(data), f"{repr(data)} is not a file"
         assert os.path.getsize(data), f"{repr(data)} is an empty file"
-        result = self.read_html(data, "Arizona", header=1)[0]
+        result = self.read_html(data, match="Arizona", header=1)[0]
         assert result.shape == (60, 12)
         assert "Unnamed" in result.columns[-1]
         assert result["sq mi"].dtype == np.dtype("float64")
@@ -1065,7 +1085,7 @@ def test_works_on_valid_markup(self, datapath):
     @pytest.mark.slow
     def test_fallback_success(self, datapath):
         banklist_data = datapath("io", "data", "html", "banklist.html")
-        self.read_html(banklist_data, ".*Water.*", flavor=["lxml", "html5lib"])
+        self.read_html(banklist_data, match=".*Water.*", flavor=["lxml", "html5lib"])
 
     def test_to_html_timestamp(self):
         rng = date_range("2000-01-01", periods=10)
