diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d76af1ce42546..2a34c71412789 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1365,8 +1365,6 @@ def itertuples( ----- The column names will be renamed to positional names if they are invalid Python identifiers, repeated, or start with an underscore. - On python versions < 3.7 regular tuples are returned for DataFrames - with a large number of columns (>254). Examples -------- diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index f3a6f1f80359c..3717952a2183e 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -25,16 +25,6 @@ ) import pandas._testing as tm -_seriesd = tm.getSeriesData() - -_frame = DataFrame(_seriesd) - -_cat_frame = _frame.copy() -cat = ["bah"] * 5 + ["bar"] * 5 + ["baz"] * 5 + ["foo"] * (len(_cat_frame) - 15) -_cat_frame.index = pd.CategoricalIndex(cat, name="E") -_cat_frame["E"] = list(reversed(cat)) -_cat_frame["sort"] = np.arange(len(_cat_frame), dtype="int64") - def assert_json_roundtrip_equal(result, expected, orient): if orient == "records" or orient == "values": @@ -49,11 +39,17 @@ def assert_json_roundtrip_equal(result, expected, orient): ) @pytest.mark.filterwarnings("ignore:the 'numpy' keyword is deprecated:FutureWarning") class TestPandasContainer: - @pytest.fixture(autouse=True) - def setup(self): - self.categorical = _cat_frame.copy() + @pytest.fixture + def categorical_frame(self): + _seriesd = tm.getSeriesData() + + _cat_frame = DataFrame(_seriesd) - yield + cat = ["bah"] * 5 + ["bar"] * 5 + ["baz"] * 5 + ["foo"] * (len(_cat_frame) - 15) + _cat_frame.index = pd.CategoricalIndex(cat, name="E") + _cat_frame["E"] = list(reversed(cat)) + _cat_frame["sort"] = np.arange(len(_cat_frame), dtype="int64") + return _cat_frame @pytest.fixture def datetime_series(self): @@ -215,7 +211,9 @@ def test_roundtrip_str_axes(self, request, orient, convert_axes, numpy, dtype): @pytest.mark.parametrize("convert_axes", [True, False]) @pytest.mark.parametrize("numpy", [True, False]) - def test_roundtrip_categorical(self, request, orient, convert_axes, numpy): + def test_roundtrip_categorical( + self, request, orient, categorical_frame, convert_axes, numpy + ): # TODO: create a better frame to test with and improve coverage if orient in ("index", "columns"): request.node.add_marker( @@ -224,7 +222,7 @@ def test_roundtrip_categorical(self, request, orient, convert_axes, numpy): ) ) - data = self.categorical.to_json(orient=orient) + data = categorical_frame.to_json(orient=orient) if numpy and orient in ("records", "values"): request.node.add_marker( pytest.mark.xfail(reason=f"Orient {orient} is broken with numpy=True") @@ -232,7 +230,7 @@ def test_roundtrip_categorical(self, request, orient, convert_axes, numpy): result = read_json(data, orient=orient, convert_axes=convert_axes, numpy=numpy) - expected = self.categorical.copy() + expected = categorical_frame.copy() expected.index = expected.index.astype(str) # Categorical not preserved expected.index.name = None # index names aren't preserved in JSON diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index d594bf8a75d49..a58ed02d30ef9 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -6,7 +6,6 @@ BytesIO, StringIO, ) -import os import numpy as np import pytest @@ -25,27 +24,23 @@ class TestTextReader: - @pytest.fixture(autouse=True) - def setup_method(self, datapath): - self.dirpath = datapath("io", "parser", "data") - csv1_dirpath = datapath("io", "data", "csv") - self.csv1 = os.path.join(csv1_dirpath, "test1.csv") - self.csv2 = os.path.join(self.dirpath, "test2.csv") - self.xls1 = os.path.join(self.dirpath, "test.xls") - - def test_file_handle(self): - with open(self.csv1, "rb") as f: + @pytest.fixture + def csv_path(self, datapath): + return datapath("io", "data", "csv", "test1.csv") + + def test_file_handle(self, csv_path): + with open(csv_path, "rb") as f: reader = TextReader(f) reader.read() - def test_file_handle_mmap(self): + def test_file_handle_mmap(self, csv_path): # this was never using memory_map=True - with open(self.csv1, "rb") as f: + with open(csv_path, "rb") as f: reader = TextReader(f, header=None) reader.read() - def test_StringIO(self): - with open(self.csv1, "rb") as f: + def test_StringIO(self, csv_path): + with open(csv_path, "rb") as f: text = f.read() src = BytesIO(text) reader = TextReader(src, header=None) diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index 9232ea8a25e4d..2046427deeaf0 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -1,5 +1,3 @@ -import os - import numpy as np import pytest @@ -24,113 +22,122 @@ def numeric_as_float(data): class TestXport: @pytest.fixture(autouse=True) - def setup_method(self, datapath): - self.dirpath = datapath("io", "sas", "data") - self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt") - self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt") - self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt") - self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt") - self.file05 = os.path.join(self.dirpath, "DEMO_PUF.cpt") - + def setup_method(self): with td.file_leak_context(): yield + @pytest.fixture + def file01(self, datapath): + return datapath("io", "sas", "data", "DEMO_G.xpt") + + @pytest.fixture + def file02(self, datapath): + return datapath("io", "sas", "data", "SSHSV1_A.xpt") + + @pytest.fixture + def file03(self, datapath): + return datapath("io", "sas", "data", "DRXFCD_G.xpt") + + @pytest.fixture + def file04(self, datapath): + return datapath("io", "sas", "data", "paxraw_d_short.xpt") + + @pytest.fixture + def file05(self, datapath): + return datapath("io", "sas", "data", "DEMO_PUF.cpt") + @pytest.mark.slow - def test1_basic(self): + def test1_basic(self, file01): # Tests with DEMO_G.xpt (all numeric file) # Compare to this - data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv")) + data_csv = pd.read_csv(file01.replace(".xpt", ".csv")) numeric_as_float(data_csv) # Read full file - data = read_sas(self.file01, format="xport") + data = read_sas(file01, format="xport") tm.assert_frame_equal(data, data_csv) num_rows = data.shape[0] # Test reading beyond end of file - with read_sas(self.file01, format="xport", iterator=True) as reader: + with read_sas(file01, format="xport", iterator=True) as reader: data = reader.read(num_rows + 100) assert data.shape[0] == num_rows # Test incremental read with `read` method. - with read_sas(self.file01, format="xport", iterator=True) as reader: + with read_sas(file01, format="xport", iterator=True) as reader: data = reader.read(10) tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) # Test incremental read with `get_chunk` method. - with read_sas(self.file01, format="xport", chunksize=10) as reader: + with read_sas(file01, format="xport", chunksize=10) as reader: data = reader.get_chunk() tm.assert_frame_equal(data, data_csv.iloc[0:10, :]) # Test read in loop m = 0 - with read_sas(self.file01, format="xport", chunksize=100) as reader: + with read_sas(file01, format="xport", chunksize=100) as reader: for x in reader: m += x.shape[0] assert m == num_rows # Read full file with `read_sas` method - data = read_sas(self.file01) + data = read_sas(file01) tm.assert_frame_equal(data, data_csv) - def test1_index(self): + def test1_index(self, file01): # Tests with DEMO_G.xpt using index (all numeric file) # Compare to this - data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv")) + data_csv = pd.read_csv(file01.replace(".xpt", ".csv")) data_csv = data_csv.set_index("SEQN") numeric_as_float(data_csv) # Read full file - data = read_sas(self.file01, index="SEQN", format="xport") + data = read_sas(file01, index="SEQN", format="xport") tm.assert_frame_equal(data, data_csv, check_index_type=False) # Test incremental read with `read` method. - with read_sas( - self.file01, index="SEQN", format="xport", iterator=True - ) as reader: + with read_sas(file01, index="SEQN", format="xport", iterator=True) as reader: data = reader.read(10) tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) # Test incremental read with `get_chunk` method. - with read_sas( - self.file01, index="SEQN", format="xport", chunksize=10 - ) as reader: + with read_sas(file01, index="SEQN", format="xport", chunksize=10) as reader: data = reader.get_chunk() tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) - def test1_incremental(self): + def test1_incremental(self, file01): # Test with DEMO_G.xpt, reading full file incrementally - data_csv = pd.read_csv(self.file01.replace(".xpt", ".csv")) + data_csv = pd.read_csv(file01.replace(".xpt", ".csv")) data_csv = data_csv.set_index("SEQN") numeric_as_float(data_csv) - with read_sas(self.file01, index="SEQN", chunksize=1000) as reader: + with read_sas(file01, index="SEQN", chunksize=1000) as reader: all_data = list(reader) data = pd.concat(all_data, axis=0) tm.assert_frame_equal(data, data_csv, check_index_type=False) - def test2(self): + def test2(self, file02): # Test with SSHSV1_A.xpt # Compare to this - data_csv = pd.read_csv(self.file02.replace(".xpt", ".csv")) + data_csv = pd.read_csv(file02.replace(".xpt", ".csv")) numeric_as_float(data_csv) - data = read_sas(self.file02) + data = read_sas(file02) tm.assert_frame_equal(data, data_csv) - def test2_binary(self): + def test2_binary(self, file02): # Test with SSHSV1_A.xpt, read as a binary file # Compare to this - data_csv = pd.read_csv(self.file02.replace(".xpt", ".csv")) + data_csv = pd.read_csv(file02.replace(".xpt", ".csv")) numeric_as_float(data_csv) - with open(self.file02, "rb") as fd: + with open(file02, "rb") as fd: with td.file_leak_context(): # GH#35693 ensure that if we pass an open file, we # dont incorrectly close it in read_sas @@ -138,31 +145,31 @@ def test2_binary(self): tm.assert_frame_equal(data, data_csv) - def test_multiple_types(self): + def test_multiple_types(self, file03): # Test with DRXFCD_G.xpt (contains text and numeric variables) # Compare to this - data_csv = pd.read_csv(self.file03.replace(".xpt", ".csv")) + data_csv = pd.read_csv(file03.replace(".xpt", ".csv")) - data = read_sas(self.file03, encoding="utf-8") + data = read_sas(file03, encoding="utf-8") tm.assert_frame_equal(data, data_csv) - def test_truncated_float_support(self): + def test_truncated_float_support(self, file04): # Test with paxraw_d_short.xpt, a shortened version of: # http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP # This file has truncated floats (5 bytes in this case). # GH 11713 - data_csv = pd.read_csv(self.file04.replace(".xpt", ".csv")) + data_csv = pd.read_csv(file04.replace(".xpt", ".csv")) - data = read_sas(self.file04, format="xport") + data = read_sas(file04, format="xport") tm.assert_frame_equal(data.astype("int64"), data_csv) - def test_cport_header_found_raises(self): + def test_cport_header_found_raises(self, file05): # Test with DEMO_PUF.cpt, the beginning of puf2019_1_fall.xpt # from https://www.cms.gov/files/zip/puf2019.zip # (despite the extension, it's a cpt file) msg = "Header record indicates a CPORT file, which is not readable." with pytest.raises(ValueError, match=msg): - read_sas(self.file05, format="xport") + read_sas(file05, format="xport") diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 9c978623d4fb6..9427e389c40da 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -105,15 +105,16 @@ def test_same_ordering(datapath): scope="class", ) class TestReadHtml: - @pytest.fixture(autouse=True) - def set_files(self, datapath): - self.spam_data = datapath("io", "data", "html", "spam.html") - self.spam_data_kwargs = {} - self.spam_data_kwargs["encoding"] = "UTF-8" - self.banklist_data = datapath("io", "data", "html", "banklist.html") + @pytest.fixture + def spam_data(self, datapath): + return datapath("io", "data", "html", "spam.html") + + @pytest.fixture + def banklist_data(self, datapath): + return datapath("io", "data", "html", "banklist.html") @pytest.fixture(autouse=True, scope="function") - def set_defaults(self, flavor, request): + def set_defaults(self, flavor): self.read_html = partial(read_html, flavor=flavor) yield @@ -180,126 +181,122 @@ def test_spam_url(self): assert_framelist_equal(df1, df2) @pytest.mark.slow - def test_banklist(self): - df1 = self.read_html( - self.banklist_data, match=".*Florida.*", attrs={"id": "table"} - ) - df2 = self.read_html( - self.banklist_data, match="Metcalf Bank", attrs={"id": "table"} - ) + def test_banklist(self, banklist_data): + df1 = self.read_html(banklist_data, match=".*Florida.*", attrs={"id": "table"}) + df2 = self.read_html(banklist_data, match="Metcalf Bank", attrs={"id": "table"}) assert_framelist_equal(df1, df2) - def test_spam(self): - df1 = self.read_html(self.spam_data, match=".*Water.*") - df2 = self.read_html(self.spam_data, match="Unit") + def test_spam(self, spam_data): + df1 = self.read_html(spam_data, match=".*Water.*") + df2 = self.read_html(spam_data, match="Unit") assert_framelist_equal(df1, df2) assert df1[0].iloc[0, 0] == "Proximates" assert df1[0].columns[0] == "Nutrient" - def test_spam_no_match(self): - dfs = self.read_html(self.spam_data) + def test_spam_no_match(self, spam_data): + dfs = self.read_html(spam_data) for df in dfs: assert isinstance(df, DataFrame) - def test_banklist_no_match(self): - dfs = self.read_html(self.banklist_data, attrs={"id": "table"}) + def test_banklist_no_match(self, banklist_data): + dfs = self.read_html(banklist_data, attrs={"id": "table"}) for df in dfs: assert isinstance(df, DataFrame) - def test_spam_header(self): - df = self.read_html(self.spam_data, match=".*Water.*", header=2)[0] + def test_spam_header(self, spam_data): + df = self.read_html(spam_data, match=".*Water.*", header=2)[0] assert df.columns[0] == "Proximates" assert not df.empty - def test_skiprows_int(self): - df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=1) - df2 = self.read_html(self.spam_data, match="Unit", skiprows=1) + def test_skiprows_int(self, spam_data): + df1 = self.read_html(spam_data, match=".*Water.*", skiprows=1) + df2 = self.read_html(spam_data, match="Unit", skiprows=1) assert_framelist_equal(df1, df2) - def test_skiprows_range(self): - df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=range(2)) - df2 = self.read_html(self.spam_data, match="Unit", skiprows=range(2)) + def test_skiprows_range(self, spam_data): + df1 = self.read_html(spam_data, match=".*Water.*", skiprows=range(2)) + df2 = self.read_html(spam_data, match="Unit", skiprows=range(2)) assert_framelist_equal(df1, df2) - def test_skiprows_list(self): - df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=[1, 2]) - df2 = self.read_html(self.spam_data, match="Unit", skiprows=[2, 1]) + def test_skiprows_list(self, spam_data): + df1 = self.read_html(spam_data, match=".*Water.*", skiprows=[1, 2]) + df2 = self.read_html(spam_data, match="Unit", skiprows=[2, 1]) assert_framelist_equal(df1, df2) - def test_skiprows_set(self): - df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows={1, 2}) - df2 = self.read_html(self.spam_data, match="Unit", skiprows={2, 1}) + def test_skiprows_set(self, spam_data): + df1 = self.read_html(spam_data, match=".*Water.*", skiprows={1, 2}) + df2 = self.read_html(spam_data, match="Unit", skiprows={2, 1}) assert_framelist_equal(df1, df2) - def test_skiprows_slice(self): - df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=1) - df2 = self.read_html(self.spam_data, match="Unit", skiprows=1) + def test_skiprows_slice(self, spam_data): + df1 = self.read_html(spam_data, match=".*Water.*", skiprows=1) + df2 = self.read_html(spam_data, match="Unit", skiprows=1) assert_framelist_equal(df1, df2) - def test_skiprows_slice_short(self): - df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=slice(2)) - df2 = self.read_html(self.spam_data, match="Unit", skiprows=slice(2)) + def test_skiprows_slice_short(self, spam_data): + df1 = self.read_html(spam_data, match=".*Water.*", skiprows=slice(2)) + df2 = self.read_html(spam_data, match="Unit", skiprows=slice(2)) assert_framelist_equal(df1, df2) - def test_skiprows_slice_long(self): - df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=slice(2, 5)) - df2 = self.read_html(self.spam_data, match="Unit", skiprows=slice(4, 1, -1)) + def test_skiprows_slice_long(self, spam_data): + df1 = self.read_html(spam_data, match=".*Water.*", skiprows=slice(2, 5)) + df2 = self.read_html(spam_data, match="Unit", skiprows=slice(4, 1, -1)) assert_framelist_equal(df1, df2) - def test_skiprows_ndarray(self): - df1 = self.read_html(self.spam_data, match=".*Water.*", skiprows=np.arange(2)) - df2 = self.read_html(self.spam_data, match="Unit", skiprows=np.arange(2)) + def test_skiprows_ndarray(self, spam_data): + df1 = self.read_html(spam_data, match=".*Water.*", skiprows=np.arange(2)) + df2 = self.read_html(spam_data, match="Unit", skiprows=np.arange(2)) assert_framelist_equal(df1, df2) - def test_skiprows_invalid(self): + def test_skiprows_invalid(self, spam_data): with pytest.raises(TypeError, match=("is not a valid type for skipping rows")): - self.read_html(self.spam_data, match=".*Water.*", skiprows="asdf") + self.read_html(spam_data, match=".*Water.*", skiprows="asdf") - def test_index(self): - df1 = self.read_html(self.spam_data, match=".*Water.*", index_col=0) - df2 = self.read_html(self.spam_data, match="Unit", index_col=0) + def test_index(self, spam_data): + df1 = self.read_html(spam_data, match=".*Water.*", index_col=0) + df2 = self.read_html(spam_data, match="Unit", index_col=0) assert_framelist_equal(df1, df2) - def test_header_and_index_no_types(self): - df1 = self.read_html(self.spam_data, match=".*Water.*", header=1, index_col=0) - df2 = self.read_html(self.spam_data, match="Unit", header=1, index_col=0) + def test_header_and_index_no_types(self, spam_data): + df1 = self.read_html(spam_data, match=".*Water.*", header=1, index_col=0) + df2 = self.read_html(spam_data, match="Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) - def test_header_and_index_with_types(self): - df1 = self.read_html(self.spam_data, match=".*Water.*", header=1, index_col=0) - df2 = self.read_html(self.spam_data, match="Unit", header=1, index_col=0) + def test_header_and_index_with_types(self, spam_data): + df1 = self.read_html(spam_data, match=".*Water.*", header=1, index_col=0) + df2 = self.read_html(spam_data, match="Unit", header=1, index_col=0) assert_framelist_equal(df1, df2) - def test_infer_types(self): + def test_infer_types(self, spam_data): # 10892 infer_types removed - df1 = self.read_html(self.spam_data, match=".*Water.*", index_col=0) - df2 = self.read_html(self.spam_data, match="Unit", index_col=0) + df1 = self.read_html(spam_data, match=".*Water.*", index_col=0) + df2 = self.read_html(spam_data, match="Unit", index_col=0) assert_framelist_equal(df1, df2) - def test_string_io(self): - with open(self.spam_data, **self.spam_data_kwargs) as f: + def test_string_io(self, spam_data): + with open(spam_data, encoding="UTF-8") as f: data1 = StringIO(f.read()) - with open(self.spam_data, **self.spam_data_kwargs) as f: + with open(spam_data, encoding="UTF-8") as f: data2 = StringIO(f.read()) df1 = self.read_html(data1, match=".*Water.*") df2 = self.read_html(data2, match="Unit") assert_framelist_equal(df1, df2) - def test_string(self): - with open(self.spam_data, **self.spam_data_kwargs) as f: + def test_string(self, spam_data): + with open(spam_data, encoding="UTF-8") as f: data = f.read() df1 = self.read_html(data, match=".*Water.*") @@ -307,11 +304,11 @@ def test_string(self): assert_framelist_equal(df1, df2) - def test_file_like(self): - with open(self.spam_data, **self.spam_data_kwargs) as f: + def test_file_like(self, spam_data): + with open(spam_data, encoding="UTF-8") as f: df1 = self.read_html(f, match=".*Water.*") - with open(self.spam_data, **self.spam_data_kwargs) as f: + with open(spam_data, encoding="UTF-8") as f: df2 = self.read_html(f, match="Unit") assert_framelist_equal(df1, df2) @@ -332,8 +329,8 @@ def test_invalid_url(self): self.read_html("http://www.a23950sdfa908sd.com", match=".*Water.*") @pytest.mark.slow - def test_file_url(self): - url = self.banklist_data + def test_file_url(self, banklist_data): + url = banklist_data dfs = self.read_html( file_path_to_url(os.path.abspath(url)), match="First", attrs={"id": "table"} ) @@ -342,53 +339,55 @@ def test_file_url(self): assert isinstance(df, DataFrame) @pytest.mark.slow - def test_invalid_table_attrs(self): - url = self.banklist_data + def test_invalid_table_attrs(self, banklist_data): + url = banklist_data with pytest.raises(ValueError, match="No tables found"): self.read_html( url, match="First Federal Bank of Florida", attrs={"id": "tasdfable"} ) - def _bank_data(self, *args, **kwargs): + def _bank_data(self, path, *args, **kwargs): return self.read_html( - self.banklist_data, match="Metcalf", attrs={"id": "table"}, *args, **kwargs + path, match="Metcalf", attrs={"id": "table"}, *args, **kwargs ) @pytest.mark.slow - def test_multiindex_header(self): - df = self._bank_data(header=[0, 1])[0] + def test_multiindex_header(self, banklist_data): + df = self._bank_data(banklist_data, header=[0, 1])[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_multiindex_index(self): - df = self._bank_data(index_col=[0, 1])[0] + def test_multiindex_index(self, banklist_data): + df = self._bank_data(banklist_data, index_col=[0, 1])[0] assert isinstance(df.index, MultiIndex) @pytest.mark.slow - def test_multiindex_header_index(self): - df = self._bank_data(header=[0, 1], index_col=[0, 1])[0] + def test_multiindex_header_index(self, banklist_data): + df = self._bank_data(banklist_data, header=[0, 1], index_col=[0, 1])[0] assert isinstance(df.columns, MultiIndex) assert isinstance(df.index, MultiIndex) @pytest.mark.slow - def test_multiindex_header_skiprows_tuples(self): - df = self._bank_data(header=[0, 1], skiprows=1)[0] + def test_multiindex_header_skiprows_tuples(self, banklist_data): + df = self._bank_data(banklist_data, header=[0, 1], skiprows=1)[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_multiindex_header_skiprows(self): - df = self._bank_data(header=[0, 1], skiprows=1)[0] + def test_multiindex_header_skiprows(self, banklist_data): + df = self._bank_data(banklist_data, header=[0, 1], skiprows=1)[0] assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_multiindex_header_index_skiprows(self): - df = self._bank_data(header=[0, 1], index_col=[0, 1], skiprows=1)[0] + def test_multiindex_header_index_skiprows(self, banklist_data): + df = self._bank_data( + banklist_data, header=[0, 1], index_col=[0, 1], skiprows=1 + )[0] assert isinstance(df.index, MultiIndex) assert isinstance(df.columns, MultiIndex) @pytest.mark.slow - def test_regex_idempotency(self): - url = self.banklist_data + def test_regex_idempotency(self, banklist_data): + url = banklist_data dfs = self.read_html( file_path_to_url(os.path.abspath(url)), match=re.compile(re.compile("Florida")), @@ -398,10 +397,10 @@ def test_regex_idempotency(self): for df in dfs: assert isinstance(df, DataFrame) - def test_negative_skiprows(self): + def test_negative_skiprows(self, spam_data): msg = r"\(you passed a negative value\)" with pytest.raises(ValueError, match=msg): - self.read_html(self.spam_data, match="Water", skiprows=-1) + self.read_html(spam_data, match="Water", skiprows=-1) @tm.network def test_multiple_matches(self): @@ -589,7 +588,7 @@ def test_parse_header_of_non_string_column(self): tm.assert_frame_equal(result, expected) @pytest.mark.slow - def test_banklist_header(self, datapath): + def test_banklist_header(self, banklist_data, datapath): from pandas.io.html import _remove_whitespace def try_remove_ws(x): @@ -598,9 +597,7 @@ def try_remove_ws(x): except AttributeError: return x - df = self.read_html(self.banklist_data, match="Metcalf", attrs={"id": "table"})[ - 0 - ] + df = self.read_html(banklist_data, match="Metcalf", attrs={"id": "table"})[0] ground_truth = read_csv( datapath("io", "data", "csv", "banklist.csv"), converters={"Updated Date": Timestamp, "Closing Date": Timestamp}, @@ -639,15 +636,15 @@ def try_remove_ws(x): tm.assert_frame_equal(converted, gtnew) @pytest.mark.slow - def test_gold_canyon(self): + def test_gold_canyon(self, banklist_data): gc = "Gold Canyon" - with open(self.banklist_data) as f: + with open(banklist_data) as f: raw_text = f.read() assert gc in raw_text - df = self.read_html( - self.banklist_data, match="Gold Canyon", attrs={"id": "table"} - )[0] + df = self.read_html(banklist_data, match="Gold Canyon", attrs={"id": "table"})[ + 0 + ] assert gc in df.to_string() def test_different_number_of_cols(self): @@ -966,16 +963,16 @@ def test_decimal_rows(self): assert result["Header"].dtype == np.dtype("float64") tm.assert_frame_equal(result, expected) - def test_bool_header_arg(self): + @pytest.mark.parametrize("arg", [True, False]) + def test_bool_header_arg(self, spam_data, arg): # GH 6114 msg = re.escape( "Passing a bool to header is invalid. Use header=None for no header or " "header=int or list-like of ints to specify the row(s) making up the " "column names" ) - for arg in [True, False]: - with pytest.raises(TypeError, match=msg): - self.read_html(self.spam_data, header=arg) + with pytest.raises(TypeError, match=msg): + self.read_html(spam_data, header=arg) def test_converters(self): # GH 13461 diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index ea2f16eae6411..ebf67b0518c65 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -27,17 +27,29 @@ def read_data(self, datapath, name, dedupe=False): x.time = to_datetime(x.time) return x - @pytest.fixture(autouse=True) - def setup_method(self, datapath): + @pytest.fixture + def trades(self, datapath): + return self.read_data(datapath, "trades.csv") - self.trades = self.read_data(datapath, "trades.csv") - self.quotes = self.read_data(datapath, "quotes.csv", dedupe=True) - self.asof = self.read_data(datapath, "asof.csv") - self.tolerance = self.read_data(datapath, "tolerance.csv") - self.allow_exact_matches = self.read_data(datapath, "allow_exact_matches.csv") - self.allow_exact_matches_and_tolerance = self.read_data( - datapath, "allow_exact_matches_and_tolerance.csv" - ) + @pytest.fixture + def quotes(self, datapath): + return self.read_data(datapath, "quotes.csv", dedupe=True) + + @pytest.fixture + def asof(self, datapath): + return self.read_data(datapath, "asof.csv") + + @pytest.fixture + def tolerance(self, datapath): + return self.read_data(datapath, "tolerance.csv") + + @pytest.fixture + def allow_exact_matches(self, datapath): + return self.read_data(datapath, "allow_exact_matches.csv") + + @pytest.fixture + def allow_exact_matches_and_tolerance(self, datapath): + return self.read_data(datapath, "allow_exact_matches_and_tolerance.csv") def test_examples1(self): """doc-string examples""" @@ -163,33 +175,28 @@ def test_examples4(self): result = merge_asof(left, right, on="a", direction="nearest") tm.assert_frame_equal(result, expected) - def test_basic(self): + def test_basic(self, trades, asof, quotes): - expected = self.asof - trades = self.trades - quotes = self.quotes + expected = asof result = merge_asof(trades, quotes, on="time", by="ticker") tm.assert_frame_equal(result, expected) - def test_basic_categorical(self): + def test_basic_categorical(self, trades, asof, quotes): - expected = self.asof - trades = self.trades.copy() + expected = asof trades.ticker = trades.ticker.astype("category") - quotes = self.quotes.copy() quotes.ticker = quotes.ticker.astype("category") expected.ticker = expected.ticker.astype("category") result = merge_asof(trades, quotes, on="time", by="ticker") tm.assert_frame_equal(result, expected) - def test_basic_left_index(self): + def test_basic_left_index(self, trades, asof, quotes): # GH14253 - expected = self.asof - trades = self.trades.set_index("time") - quotes = self.quotes + expected = asof + trades = trades.set_index("time") result = merge_asof( trades, quotes, left_index=True, right_on="time", by="ticker" @@ -200,77 +207,77 @@ def test_basic_left_index(self): expected = expected[result.columns] tm.assert_frame_equal(result, expected) - def test_basic_right_index(self): + def test_basic_right_index(self, trades, asof, quotes): - expected = self.asof - trades = self.trades - quotes = self.quotes.set_index("time") + expected = asof + trades = trades + quotes = quotes.set_index("time") result = merge_asof( trades, quotes, left_on="time", right_index=True, by="ticker" ) tm.assert_frame_equal(result, expected) - def test_basic_left_index_right_index(self): + def test_basic_left_index_right_index(self, trades, asof, quotes): - expected = self.asof.set_index("time") - trades = self.trades.set_index("time") - quotes = self.quotes.set_index("time") + expected = asof.set_index("time") + trades = trades.set_index("time") + quotes = quotes.set_index("time") result = merge_asof( trades, quotes, left_index=True, right_index=True, by="ticker" ) tm.assert_frame_equal(result, expected) - def test_multi_index(self): + def test_multi_index_left(self, trades, quotes): # MultiIndex is prohibited - trades = self.trades.set_index(["time", "price"]) - quotes = self.quotes.set_index("time") + trades = trades.set_index(["time", "price"]) + quotes = quotes.set_index("time") with pytest.raises(MergeError, match="left can only have one index"): merge_asof(trades, quotes, left_index=True, right_index=True) - trades = self.trades.set_index("time") - quotes = self.quotes.set_index(["time", "bid"]) + def test_multi_index_right(self, trades, quotes): + + # MultiIndex is prohibited + trades = trades.set_index("time") + quotes = quotes.set_index(["time", "bid"]) with pytest.raises(MergeError, match="right can only have one index"): merge_asof(trades, quotes, left_index=True, right_index=True) - def test_on_and_index(self): + def test_on_and_index_left_on(self, trades, quotes): # "on" parameter and index together is prohibited - trades = self.trades.set_index("time") - quotes = self.quotes.set_index("time") + trades = trades.set_index("time") + quotes = quotes.set_index("time") msg = 'Can only pass argument "left_on" OR "left_index" not both.' with pytest.raises(MergeError, match=msg): merge_asof( trades, quotes, left_on="price", left_index=True, right_index=True ) - trades = self.trades.set_index("time") - quotes = self.quotes.set_index("time") + def test_on_and_index_right_on(self, trades, quotes): + trades = trades.set_index("time") + quotes = quotes.set_index("time") msg = 'Can only pass argument "right_on" OR "right_index" not both.' with pytest.raises(MergeError, match=msg): merge_asof( trades, quotes, right_on="bid", left_index=True, right_index=True ) - def test_basic_left_by_right_by(self): + def test_basic_left_by_right_by(self, trades, asof, quotes): # GH14253 - expected = self.asof - trades = self.trades - quotes = self.quotes + expected = asof result = merge_asof( trades, quotes, on="time", left_by="ticker", right_by="ticker" ) tm.assert_frame_equal(result, expected) - def test_missing_right_by(self): + def test_missing_right_by(self, trades, asof, quotes): - expected = self.asof - trades = self.trades - quotes = self.quotes + expected = asof q = quotes[quotes.ticker != "MSFT"] result = merge_asof(trades, q, on="time", by="ticker") @@ -466,7 +473,7 @@ def test_basic2(self, datapath): result = merge_asof(trades, quotes, on="time", by="ticker") tm.assert_frame_equal(result, expected) - def test_basic_no_by(self): + def test_basic_no_by(self, trades, asof, quotes): f = ( lambda x: x[x.ticker == "MSFT"] .drop("ticker", axis=1) @@ -474,17 +481,14 @@ def test_basic_no_by(self): ) # just use a single ticker - expected = f(self.asof) - trades = f(self.trades) - quotes = f(self.quotes) + expected = f(asof) + trades = f(trades) + quotes = f(quotes) result = merge_asof(trades, quotes, on="time") tm.assert_frame_equal(result, expected) - def test_valid_join_keys(self): - - trades = self.trades - quotes = self.quotes + def test_valid_join_keys(self, trades, quotes): msg = r"incompatible merge keys \[1\] .* must be the same type" @@ -497,14 +501,14 @@ def test_valid_join_keys(self): with pytest.raises(MergeError, match="can only asof on a key for left"): merge_asof(trades, quotes, by="ticker") - def test_with_duplicates(self, datapath): + def test_with_duplicates(self, datapath, trades, quotes): q = ( - pd.concat([self.quotes, self.quotes]) + pd.concat([quotes, quotes]) .sort_values(["time", "ticker"]) .reset_index(drop=True) ) - result = merge_asof(self.trades, q, on="time", by="ticker") + result = merge_asof(trades, q, on="time", by="ticker") expected = self.read_data(datapath, "asof.csv") tm.assert_frame_equal(result, expected) @@ -518,10 +522,7 @@ def test_with_duplicates_no_on(self): ) tm.assert_frame_equal(result, expected) - def test_valid_allow_exact_matches(self): - - trades = self.trades - quotes = self.quotes + def test_valid_allow_exact_matches(self, trades, quotes): msg = "allow_exact_matches must be boolean, passed foo" @@ -530,10 +531,7 @@ def test_valid_allow_exact_matches(self): trades, quotes, on="time", by="ticker", allow_exact_matches="foo" ) - def test_valid_tolerance(self): - - trades = self.trades - quotes = self.quotes + def test_valid_tolerance(self, trades, quotes): # dti merge_asof(trades, quotes, on="time", by="ticker", tolerance=Timedelta("1s")) @@ -580,10 +578,10 @@ def test_valid_tolerance(self): tolerance=-1, ) - def test_non_sorted(self): + def test_non_sorted(self, trades, quotes): - trades = self.trades.sort_values("time", ascending=False) - quotes = self.quotes.sort_values("time", ascending=False) + trades = trades.sort_values("time", ascending=False) + quotes = quotes.sort_values("time", ascending=False) # we require that we are already sorted on time & quotes assert not trades.time.is_monotonic_increasing @@ -591,31 +589,29 @@ def test_non_sorted(self): with pytest.raises(ValueError, match="left keys must be sorted"): merge_asof(trades, quotes, on="time", by="ticker") - trades = self.trades.sort_values("time") + trades = trades.sort_values("time") assert trades.time.is_monotonic_increasing assert not quotes.time.is_monotonic_increasing with pytest.raises(ValueError, match="right keys must be sorted"): merge_asof(trades, quotes, on="time", by="ticker") - quotes = self.quotes.sort_values("time") + quotes = quotes.sort_values("time") assert trades.time.is_monotonic_increasing assert quotes.time.is_monotonic_increasing # ok, though has dupes - merge_asof(trades, self.quotes, on="time", by="ticker") + merge_asof(trades, quotes, on="time", by="ticker") @pytest.mark.parametrize( - "tolerance", + "tolerance_ts", [Timedelta("1day"), datetime.timedelta(days=1)], ids=["Timedelta", "datetime.timedelta"], ) - def test_tolerance(self, tolerance): - - trades = self.trades - quotes = self.quotes - - result = merge_asof(trades, quotes, on="time", by="ticker", tolerance=tolerance) - expected = self.tolerance + def test_tolerance(self, tolerance_ts, trades, quotes, tolerance): + result = merge_asof( + trades, quotes, on="time", by="ticker", tolerance=tolerance_ts + ) + expected = tolerance tm.assert_frame_equal(result, expected) def test_tolerance_forward(self): @@ -702,11 +698,11 @@ def test_tolerance_float(self): result = merge_asof(left, right, on="a", direction="nearest", tolerance=0.5) tm.assert_frame_equal(result, expected) - def test_index_tolerance(self): + def test_index_tolerance(self, trades, quotes, tolerance): # GH 15135 - expected = self.tolerance.set_index("time") - trades = self.trades.set_index("time") - quotes = self.quotes.set_index("time") + expected = tolerance.set_index("time") + trades = trades.set_index("time") + quotes = quotes.set_index("time") result = merge_asof( trades, @@ -718,12 +714,12 @@ def test_index_tolerance(self): ) tm.assert_frame_equal(result, expected) - def test_allow_exact_matches(self): + def test_allow_exact_matches(self, trades, quotes, allow_exact_matches): result = merge_asof( - self.trades, self.quotes, on="time", by="ticker", allow_exact_matches=False + trades, quotes, on="time", by="ticker", allow_exact_matches=False ) - expected = self.allow_exact_matches + expected = allow_exact_matches tm.assert_frame_equal(result, expected) def test_allow_exact_matches_forward(self): @@ -756,17 +752,19 @@ def test_allow_exact_matches_nearest(self): ) tm.assert_frame_equal(result, expected) - def test_allow_exact_matches_and_tolerance(self): + def test_allow_exact_matches_and_tolerance( + self, trades, quotes, allow_exact_matches_and_tolerance + ): result = merge_asof( - self.trades, - self.quotes, + trades, + quotes, on="time", by="ticker", tolerance=Timedelta("100ms"), allow_exact_matches=False, ) - expected = self.allow_exact_matches_and_tolerance + expected = allow_exact_matches_and_tolerance tm.assert_frame_equal(result, expected) def test_allow_exact_matches_and_tolerance2(self):