diff --git a/MANIFEST.in b/MANIFEST.in index 9773019c6e6e0..b417b8890fa24 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,27 +3,39 @@ include LICENSE include RELEASE.md include README.md include setup.py -include pyproject.toml graft doc prune doc/build +graft LICENSES + graft pandas -global-exclude *.so -global-exclude *.pyd +global-exclude *.bz2 +global-exclude *.csv +global-exclude *.dta +global-exclude *.gz +global-exclude *.h5 +global-exclude *.html +global-exclude *.json +global-exclude *.msgpack +global-exclude *.pickle +global-exclude *.png global-exclude *.pyc +global-exclude *.pyd +global-exclude *.sas7bdat +global-exclude *.so +global-exclude *.xls +global-exclude *.xlsm +global-exclude *.xlsx +global-exclude *.xpt +global-exclude *.xz +global-exclude *.zip global-exclude *~ -global-exclude \#* -global-exclude .git* global-exclude .DS_Store -global-exclude *.png +global-exclude .git* +global-exclude \#* -# include examples/data/* -# recursive-include examples *.py -# recursive-include doc/source * -# recursive-include doc/sphinxext * -# recursive-include LICENSES * include versioneer.py include pandas/_version.py include pandas/io/formats/templates/*.tpl diff --git a/ci/script_single.sh b/ci/script_single.sh index f376c920ac71b..60e2fbb33ee5d 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -25,12 +25,12 @@ if [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" elif [ "$COVERAGE" ]; then - echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas - pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + echo pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas + pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas else - echo pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas - pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest + echo pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas + pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest fi diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index a603bf9f7e9e0..a41a6c31b0678 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -76,6 +76,11 @@ Documentation Changes - - +Build Changes +------------- + +- The source and binary distributions no longer include test data files, resulting in smaller download sizes. Tests relying on these data files will be skipped when using ``pandas.test()``. (:issue:`19320`) + .. 
_whatsnew_0232.bug_fixes: Bug Fixes diff --git a/pandas/conftest.py b/pandas/conftest.py index b4a599758417c..82d860b091b82 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,7 +1,9 @@ +import os import importlib import pytest +import pandas import numpy as np import pandas as pd from pandas.compat import PY3 @@ -17,6 +19,8 @@ def pytest_addoption(parser): help="run high memory tests") parser.addoption("--only-slow", action="store_true", help="run only slow tests") + parser.addoption("--strict-data-files", action="store_true", + help="Fail if a test is skipped due to a missing data file.") def pytest_runtest_setup(item): @@ -131,6 +135,43 @@ def join_type(request): return request.param +@pytest.fixture +def datapath(request): + """Get the path to a data file. + + Parameters + ---------- + path : str + Path to the file, relative to ``pandas/tests/`` + + Returns + ------- + path : str + Path including ``pandas/tests``. + + Raises + ------ + ValueError + If the path doesn't exist and the --strict-data-files option is set. + """ + def deco(*args): + path = os.path.join('pandas', 'tests', *args) + if not os.path.exists(path): + if request.config.getoption("--strict-data-files"): + msg = "Could not find file {} and --strict-data-files is set." + raise ValueError(msg.format(path)) + else: + msg = "Could not find {}." + pytest.skip(msg.format(path)) + return path + return deco + + +@pytest.fixture +def iris(datapath): + """The iris dataset as a DataFrame.""" + return pandas.read_csv(datapath('data', 'iris.csv')) + + @pytest.fixture(params=['nlargest', 'nsmallest']) def nselect_method(request): """ diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 362f917e74972..c925c4c403960 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1182,12 +1182,12 @@ def test_iter(self): ('baz', 'two'), ('qux', 'one'), ('qux', 'two')] assert result == expected - def test_legacy_pickle(self): + def test_legacy_pickle(self, datapath): if PY3: pytest.skip("testing for legacy pickles not " "supported on py3") - path = tm.get_data_path('multiindex_v1.pickle') + path = datapath('indexes', 'data', 'multiindex_v1.pickle') obj = pd.read_pickle(path) obj2 = MultiIndex.from_tuples(obj.values) @@ -1203,10 +1203,10 @@ def test_legacy_pickle(self): assert_almost_equal(res, exp) assert_almost_equal(exp, exp2) - def test_legacy_v2_unpickle(self): + def test_legacy_v2_unpickle(self, datapath): # 0.7.3 -> 0.8.0 format manage - path = tm.get_data_path('mindex_073.pickle') + path = datapath('indexes', 'data', 'mindex_073.pickle') obj = pd.read_pickle(path) obj2 = MultiIndex.from_tuples(obj.values) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 8deb51e190bab..7623587803b41 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -1,32 +1,23 @@ -import os - import pytest from pandas.io.parsers import read_table -from pandas.util import testing as tm - - -@pytest.fixture -def parser_data(request): - return os.path.join(tm.get_data_path(), '..', 'parser', 'data') @pytest.fixture -def tips_file(parser_data): +def tips_file(datapath): """Path to the tips dataset""" - return os.path.join(parser_data, 'tips.csv') + return datapath('io', 'parser', 'data', 'tips.csv') @pytest.fixture -def jsonl_file(parser_data): +def jsonl_file(datapath): """Path to a JSONL dataset""" - return os.path.join(parser_data, 'items.jsonl') + return datapath('io', 'parser', 'data', 'items.jsonl') @pytest.fixture -def 
salaries_table(parser_data): +def salaries_table(datapath): """DataFrame with the salaries dataset""" - path = os.path.join(parser_data, 'salaries.csv') - return read_table(path) + return read_table(datapath('io', 'parser', 'data', 'salaries.csv')) @pytest.fixture diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index f221df93dd412..63b7cb3459069 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -916,8 +916,8 @@ def test_unicode_problem_decoding_as_ascii(self): dm = DataFrame({u('c/\u03c3'): Series({'test': np.nan})}) compat.text_type(dm.to_string()) - def test_string_repr_encoding(self): - filepath = tm.get_data_path('unicode_series.csv') + def test_string_repr_encoding(self, datapath): + filepath = datapath('io', 'formats', 'data', 'unicode_series.csv') df = pd.read_csv(filepath, header=None, encoding='latin1') repr(df) repr(df[1]) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index c9074ca49e5be..05ceace20f5a4 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -21,11 +21,11 @@ def test_compression_roundtrip(compression): assert_frame_equal(df, pd.read_json(result)) -def test_read_zipped_json(): - uncompressed_path = tm.get_data_path("tsframe_v012.json") +def test_read_zipped_json(datapath): + uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json") uncompressed_df = pd.read_json(uncompressed_path) - compressed_path = tm.get_data_path("tsframe_v012.json.zip") + compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip") compressed_df = pd.read_json(compressed_path, compression='zip') assert_frame_equal(uncompressed_df, compressed_df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 7e497c395266f..bcbac4400c953 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -37,8 +37,9 @@ class TestPandasContainer(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(scope="function", autouse=True) + def setup(self, datapath): + self.dirpath = datapath("io", "json", "data") self.ts = tm.makeTimeSeries() self.ts.name = 'ts' @@ -59,7 +60,8 @@ def setup_method(self, method): self.mixed_frame = _mixed_frame.copy() self.categorical = _cat_frame.copy() - def teardown_method(self, method): + yield + del self.dirpath del self.ts diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index 6e1d3575a1481..9e871d27f0ce8 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -77,7 +77,7 @@ def test_read_csv(self): else: prefix = u("file://") - fname = prefix + compat.text_type(self.csv1) + fname = prefix + compat.text_type(os.path.abspath(self.csv1)) self.read_csv(fname, index_col=0, parse_dates=True) def test_1000_sep(self): @@ -651,21 +651,19 @@ def test_read_csv_parse_simple_list(self): tm.assert_frame_equal(df, expected) @tm.network - def test_url(self): + def test_url(self, datapath): # HTTP(S) url = ('https://raw.github.com/pandas-dev/pandas/master/' 'pandas/tests/io/parser/data/salaries.csv') url_table = self.read_table(url) - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salaries.csv') + localtable = datapath('io', 'parser', 'data', 'salaries.csv') local_table = self.read_table(localtable) tm.assert_frame_equal(url_table, local_table) # TODO: ftp testing @pytest.mark.slow - 
def test_file(self): - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'salaries.csv') + def test_file(self, datapath): + localtable = datapath('io', 'parser', 'data', 'salaries.csv') local_table = self.read_table(localtable) try: @@ -755,8 +753,8 @@ def test_utf16_bom_skiprows(self): tm.assert_frame_equal(result, expected) - def test_utf16_example(self): - path = tm.get_data_path('utf16_ex.txt') + def test_utf16_example(self, datapath): + path = datapath('io', 'parser', 'data', 'utf16_ex.txt') # it works! and is the right length result = self.read_table(path, encoding='utf-16') @@ -767,8 +765,8 @@ def test_utf16_example(self): result = self.read_table(buf, encoding='utf-16') assert len(result) == 50 - def test_unicode_encoding(self): - pth = tm.get_data_path('unicode_series.csv') + def test_unicode_encoding(self, datapath): + pth = datapath('io', 'parser', 'data', 'unicode_series.csv') result = self.read_csv(pth, header=None, encoding='latin-1') result = result.set_index(0) @@ -1513,10 +1511,9 @@ def test_internal_eof_byte_to_file(self): result = self.read_csv(path) tm.assert_frame_equal(result, expected) - def test_sub_character(self): + def test_sub_character(self, datapath): # see gh-16893 - dirpath = tm.get_data_path() - filename = os.path.join(dirpath, "sub_char.csv") + filename = datapath('io', 'parser', 'data', 'sub_char.csv') expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"]) result = self.read_csv(filename) diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py index e84db66561c49..e4950af19ea95 100644 --- a/pandas/tests/io/parser/compression.py +++ b/pandas/tests/io/parser/compression.py @@ -120,9 +120,9 @@ def test_read_csv_infer_compression(self): tm.assert_frame_equal(expected, df) - def test_read_csv_compressed_utf16_example(self): + def test_read_csv_compressed_utf16_example(self, datapath): # GH18071 - path = tm.get_data_path('utf16_ex_small.zip') + path = datapath('io', 'parser', 'data', 'utf16_ex_small.zip') result = self.read_csv(path, encoding='utf-16', compression='zip', sep='\t') diff --git a/pandas/tests/io/parser/dtypes.py b/pandas/tests/io/parser/dtypes.py index b91ce04673e29..8060ebf2fbcd4 100644 --- a/pandas/tests/io/parser/dtypes.py +++ b/pandas/tests/io/parser/dtypes.py @@ -125,9 +125,9 @@ def test_categorical_dtype_high_cardinality_numeric(self): np.sort(actual.a.cat.categories), ordered=True) tm.assert_frame_equal(actual, expected) - def test_categorical_dtype_encoding(self): + def test_categorical_dtype_encoding(self, datapath): # GH 10153 - pth = tm.get_data_path('unicode_series.csv') + pth = datapath('io', 'parser', 'data', 'unicode_series.csv') encoding = 'latin-1' expected = self.read_csv(pth, header=None, encoding=encoding) expected[1] = Categorical(expected[1]) @@ -135,7 +135,7 @@ def test_categorical_dtype_encoding(self): dtype={1: 'category'}) tm.assert_frame_equal(actual, expected) - pth = tm.get_data_path('utf16_ex.txt') + pth = datapath('io', 'parser', 'data', 'utf16_ex.txt') encoding = 'utf-16' expected = self.read_table(pth, encoding=encoding) expected = expected.apply(Categorical) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index fdf45f307e953..e2243b8087a5b 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -48,10 +48,16 @@ def check_compressed_urls(salaries_table, compression, extension, mode, tm.assert_frame_equal(url_table, salaries_table) +@pytest.fixture +def 
tips_df(datapath): + """DataFrame with the tips dataset.""" + return read_csv(datapath('io', 'parser', 'data', 'tips.csv')) + + @pytest.mark.usefixtures("s3_resource") class TestS3(object): - def test_parse_public_s3_bucket(self): + def test_parse_public_s3_bucket(self, tips_df): pytest.importorskip('s3fs') # more of an integration test due to the not-public contents portion # can probably mock this though. @@ -60,45 +66,40 @@ def test_parse_public_s3_bucket(self): ext, compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) # Read public file from bucket with not-public contents df = read_csv('s3://cant_get_it/tips.csv') assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv(tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3n_bucket(self): + def test_parse_public_s3n_bucket(self, tips_df): # Read from AWS s3 as "s3n" URL df = read_csv('s3n://pandas-test/tips.csv', nrows=10) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3a_bucket(self): + def test_parse_public_s3a_bucket(self, tips_df): # Read from AWS s3 as "s3a" URL df = read_csv('s3a://pandas-test/tips.csv', nrows=10) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_nrows(self): + def test_parse_public_s3_bucket_nrows(self, tips_df): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, nrows=10, compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_chunked(self): + def test_parse_public_s3_bucket_chunked(self, tips_df): # Read with a chunksize chunksize = 5 - local_tips = read_csv(tm.get_data_path('tips.csv')) for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df_reader = read_csv('s3://pandas-test/tips.csv' + ext, chunksize=chunksize, compression=comp) @@ -109,14 +110,13 @@ def test_parse_public_s3_bucket_chunked(self): df = df_reader.get_chunk() assert isinstance(df, DataFrame) assert not df.empty - true_df = local_tips.iloc[ + true_df = tips_df.iloc[ chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_chunked_python(self): + def test_parse_public_s3_bucket_chunked_python(self, tips_df): # Read with a chunksize using the Python parser chunksize = 5 - local_tips = read_csv(tm.get_data_path('tips.csv')) for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df_reader = read_csv('s3://pandas-test/tips.csv' + ext, chunksize=chunksize, compression=comp, @@ -127,36 +127,33 @@ def test_parse_public_s3_bucket_chunked_python(self): df = df_reader.get_chunk() assert isinstance(df, DataFrame) assert not df.empty - true_df = local_tips.iloc[ + true_df = tips_df.iloc[ chunksize * i_chunk: chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_python(self): + def test_parse_public_s3_bucket_python(self, tips_df): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 
'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) - def test_infer_s3_compression(self): + def test_infer_s3_compression(self, tips_df): for ext in ['', '.gz', '.bz2']: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', compression='infer') assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3_bucket_nrows_python(self): + def test_parse_public_s3_bucket_nrows_python(self, tips_df): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: df = read_csv('s3://pandas-test/tips.csv' + ext, engine='python', nrows=10, compression=comp) assert isinstance(df, DataFrame) assert not df.empty - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(tips_df.iloc[:10], df) def test_s3_fails(self): with pytest.raises(IOError): diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py index 7717102b64fc5..b6f13039641a2 100644 --- a/pandas/tests/io/parser/test_parsers.py +++ b/pandas/tests/io/parser/test_parsers.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- import os +import pytest import pandas.util.testing as tm from pandas import read_csv, read_table, DataFrame @@ -45,8 +46,9 @@ def read_table(self, *args, **kwargs): def float_precision_choices(self): raise com.AbstractMethodError(self) - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath('io', 'parser', 'data') self.csv1 = os.path.join(self.dirpath, 'test1.csv') self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index e8d9d8b52164b..c7026e3e0fc88 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -28,8 +28,9 @@ class TestTextReader(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath('io', 'parser', 'data') self.csv1 = os.path.join(self.dirpath, 'test1.csv') self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index b80263021c269..101ee3e619f5b 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -11,8 +11,9 @@ class TestSAS7BDAT(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "sas", "data") self.data = [] self.test_ix = [list(range(1, 16)), [16]] for j in 1, 2: @@ -123,9 +124,8 @@ def test_iterator_read_too_much(self): rdr.close() -def test_encoding_options(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "test1.sas7bdat") +def test_encoding_options(datapath): + fname = datapath("io", "sas", "data", "test1.sas7bdat") df1 = pd.read_sas(fname) df2 = pd.read_sas(fname, encoding='utf-8') for col in df1.columns: @@ -143,43 +143,39 @@ def test_encoding_options(): assert(x 
== y.decode()) -def test_productsales(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "productsales.sas7bdat") +def test_productsales(datapath): + fname = datapath("io", "sas", "data", "productsales.sas7bdat") df = pd.read_sas(fname, encoding='utf-8') - fname = os.path.join(dirpath, "productsales.csv") + fname = datapath("io", "sas", "data", "productsales.csv") df0 = pd.read_csv(fname, parse_dates=['MONTH']) vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"] df0[vn] = df0[vn].astype(np.float64) tm.assert_frame_equal(df, df0) -def test_12659(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "test_12659.sas7bdat") +def test_12659(datapath): + fname = datapath("io", "sas", "data", "test_12659.sas7bdat") df = pd.read_sas(fname) - fname = os.path.join(dirpath, "test_12659.csv") + fname = datapath("io", "sas", "data", "test_12659.csv") df0 = pd.read_csv(fname) df0 = df0.astype(np.float64) tm.assert_frame_equal(df, df0) -def test_airline(): - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "airline.sas7bdat") +def test_airline(datapath): + fname = datapath("io", "sas", "data", "airline.sas7bdat") df = pd.read_sas(fname) - fname = os.path.join(dirpath, "airline.csv") + fname = datapath("io", "sas", "data", "airline.csv") df0 = pd.read_csv(fname) df0 = df0.astype(np.float64) tm.assert_frame_equal(df, df0, check_exact=False) -def test_date_time(): +def test_date_time(datapath): # Support of different SAS date/datetime formats (PR #15871) - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "datetime.sas7bdat") + fname = datapath("io", "sas", "data", "datetime.sas7bdat") df = pd.read_sas(fname) - fname = os.path.join(dirpath, "datetime.csv") + fname = datapath("io", "sas", "data", "datetime.csv") df0 = pd.read_csv(fname, parse_dates=['Date1', 'Date2', 'DateTime', 'DateTimeHi', 'Taiw']) # GH 19732: Timestamps imported from sas will incur floating point errors @@ -187,9 +183,8 @@ def test_date_time(): tm.assert_frame_equal(df, df0) -def test_zero_variables(): +def test_zero_variables(datapath): # Check if the SAS file has zero variables (PR #18184) - dirpath = tm.get_data_path() - fname = os.path.join(dirpath, "zero_variables.sas7bdat") + fname = datapath("io", "sas", "data", "zero_variables.sas7bdat") with pytest.raises(EmptyDataError): pd.read_sas(fname) diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index de31c3e36a8d5..6e5b2ab067aa5 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -1,3 +1,4 @@ +import pytest import pandas as pd import pandas.util.testing as tm from pandas.io.sas.sasreader import read_sas @@ -18,8 +19,9 @@ def numeric_as_float(data): class TestXport(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "sas", "data") self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt") self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt") self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt") diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index a89156db38ae3..5c9739be73393 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -149,27 +149,22 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext): reader(path) @pytest.mark.parametrize('reader, module, path', [ - (pd.read_csv, 'os', os.path.join(HERE, 'data', 'iris.csv')), - (pd.read_table, 'os', os.path.join(HERE, 
'data', 'iris.csv')), - (pd.read_fwf, 'os', os.path.join(HERE, 'data', - 'fixed_width_format.txt')), - (pd.read_excel, 'xlrd', os.path.join(HERE, 'data', 'test1.xlsx')), - (pd.read_feather, 'feather', os.path.join(HERE, 'data', - 'feather-0_3_1.feather')), - (pd.read_hdf, 'tables', os.path.join(HERE, 'data', 'legacy_hdf', - 'datetimetz_object.h5')), - (pd.read_stata, 'os', os.path.join(HERE, 'data', 'stata10_115.dta')), - (pd.read_sas, 'os', os.path.join(HERE, 'sas', 'data', - 'test1.sas7bdat')), - (pd.read_json, 'os', os.path.join(HERE, 'json', 'data', - 'tsframe_v012.json')), - (pd.read_msgpack, 'os', os.path.join(HERE, 'msgpack', 'data', - 'frame.mp')), - (pd.read_pickle, 'os', os.path.join(HERE, 'data', - 'categorical_0_14_1.pickle')), + (pd.read_csv, 'os', ('io', 'data', 'iris.csv')), + (pd.read_table, 'os', ('io', 'data', 'iris.csv')), + (pd.read_fwf, 'os', ('io', 'data', 'fixed_width_format.txt')), + (pd.read_excel, 'xlrd', ('io', 'data', 'test1.xlsx')), + (pd.read_feather, 'feather', ('io', 'data', 'feather-0_3_1.feather')), + (pd.read_hdf, 'tables', ('io', 'data', 'legacy_hdf', + 'datetimetz_object.h5')), + (pd.read_stata, 'os', ('io', 'data', 'stata10_115.dta')), + (pd.read_sas, 'os', ('io', 'sas', 'data', 'test1.sas7bdat')), + (pd.read_json, 'os', ('io', 'json', 'data', 'tsframe_v012.json')), + (pd.read_msgpack, 'os', ('io', 'msgpack', 'data', 'frame.mp')), + (pd.read_pickle, 'os', ('io', 'data', 'categorical_0_14_1.pickle')), ]) - def test_read_fspath_all(self, reader, module, path): + def test_read_fspath_all(self, reader, module, path, datapath): pytest.importorskip(module) + path = datapath(*path) mypath = CustomFSPath(path) result = reader(mypath) @@ -232,13 +227,14 @@ def test_write_fspath_hdf5(self): tm.assert_frame_equal(result, expected) -class TestMMapWrapper(object): +@pytest.fixture +def mmap_file(datapath): + return datapath('io', 'data', 'test_mmap.csv') + - def setup_method(self, method): - self.mmap_file = os.path.join(tm.get_data_path(), - 'test_mmap.csv') +class TestMMapWrapper(object): - def test_constructor_bad_file(self): + def test_constructor_bad_file(self, mmap_file): non_file = StringIO('I am not a file') non_file.fileno = lambda: -1 @@ -252,15 +248,15 @@ def test_constructor_bad_file(self): tm.assert_raises_regex(err, msg, common.MMapWrapper, non_file) - target = open(self.mmap_file, 'r') + target = open(mmap_file, 'r') target.close() msg = "I/O operation on closed file" tm.assert_raises_regex( ValueError, msg, common.MMapWrapper, target) - def test_get_attr(self): - with open(self.mmap_file, 'r') as target: + def test_get_attr(self, mmap_file): + with open(mmap_file, 'r') as target: wrapper = common.MMapWrapper(target) attrs = dir(wrapper.mmap) @@ -273,8 +269,8 @@ def test_get_attr(self): assert not hasattr(wrapper, 'foo') - def test_next(self): - with open(self.mmap_file, 'r') as target: + def test_next(self, mmap_file): + with open(mmap_file, 'r') as target: wrapper = common.MMapWrapper(target) lines = target.readlines() diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index 2a225e6fe6a45..1fda56dbff772 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -39,8 +39,9 @@ @td.skip_if_no('xlrd', '0.9') class SharedItems(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "data") self.frame = _frame.copy() self.frame2 = _frame2.copy() self.tsframe = _tsframe.copy() @@ -49,7 
+50,6 @@ def setup_method(self, method): def get_csv_refdf(self, basename): """ Obtain the reference data from read_csv with the Python engine. - Test data path is defined by pandas.util.testing.get_data_path() Parameters ---------- @@ -68,8 +68,7 @@ def get_csv_refdf(self, basename): def get_excelfile(self, basename, ext): """ - Return test data ExcelFile instance. Test data path is defined by - pandas.util.testing.get_data_path() + Return test data ExcelFile instance. Parameters ---------- @@ -86,8 +85,7 @@ def get_excelfile(self, basename, ext): def get_exceldf(self, basename, ext, *args, **kwds): """ - Return test data DataFrame. Test data path is defined by - pandas.util.testing.get_data_path() + Return test data DataFrame. Parameters ---------- diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index a56946b82b027..9c6a8de7ed446 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1,6 +1,5 @@ from __future__ import print_function -import glob import os import re import threading @@ -25,8 +24,18 @@ import pandas.util._test_decorators as td from pandas.util.testing import makeCustomDataframe as mkdf, network +HERE = os.path.dirname(__file__) -DATA_PATH = tm.get_data_path() + +@pytest.fixture(params=[ + 'chinese_utf-16.html', + 'chinese_utf-32.html', + 'chinese_utf-8.html', + 'letz_latin1.html', +]) +def html_encoding_file(request, datapath): + """Parametrized fixture for HTML encoding test filenames.""" + return datapath('io', 'data', 'html_encoding', request.param) def assert_framelist_equal(list1, list2, *args, **kwargs): @@ -44,11 +53,11 @@ def assert_framelist_equal(list1, list2, *args, **kwargs): @td.skip_if_no('bs4') -def test_bs4_version_fails(monkeypatch): +def test_bs4_version_fails(monkeypatch, datapath): import bs4 monkeypatch.setattr(bs4, '__version__', '4.2') with tm.assert_raises_regex(ValueError, "minimum version"): - read_html(os.path.join(DATA_PATH, "spam.html"), flavor='bs4') + read_html(datapath("io", "data", "spam.html"), flavor='bs4') def test_invalid_flavor(): @@ -59,8 +68,8 @@ def test_invalid_flavor(): @td.skip_if_no('bs4') @td.skip_if_no('lxml') -def test_same_ordering(): - filename = os.path.join(DATA_PATH, 'valid_markup.html') +def test_same_ordering(datapath): + filename = datapath('io', 'data', 'valid_markup.html') dfs_lxml = read_html(filename, index_col=0, flavor=['lxml']) dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4']) assert_framelist_equal(dfs_lxml, dfs_bs4) @@ -72,11 +81,14 @@ def test_same_ordering(): pytest.param('lxml', marks=pytest.mark.skipif( not td.safe_import('lxml'), reason='No lxml'))], scope="class") class TestReadHtml(object): - spam_data = os.path.join(DATA_PATH, 'spam.html') - spam_data_kwargs = {} - if PY3: - spam_data_kwargs['encoding'] = 'UTF-8' - banklist_data = os.path.join(DATA_PATH, 'banklist.html') + + @pytest.fixture(autouse=True) + def set_files(self, datapath): + self.spam_data = datapath('io', 'data', 'spam.html') + self.spam_data_kwargs = {} + if PY3: + self.spam_data_kwargs['encoding'] = 'UTF-8' + self.banklist_data = datapath("io", "data", "banklist.html") @pytest.fixture(autouse=True, scope="function") def set_defaults(self, flavor, request): @@ -272,7 +284,8 @@ def test_invalid_url(self): @pytest.mark.slow def test_file_url(self): url = self.banklist_data - dfs = self.read_html(file_path_to_url(url), 'First', + dfs = self.read_html(file_path_to_url(os.path.abspath(url)), + 'First', attrs={'id': 'table'}) assert isinstance(dfs, list) for df in dfs: @@ -326,7 
+339,7 @@ def test_multiindex_header_index_skiprows(self): @pytest.mark.slow def test_regex_idempotency(self): url = self.banklist_data - dfs = self.read_html(file_path_to_url(url), + dfs = self.read_html(file_path_to_url(os.path.abspath(url)), match=re.compile(re.compile('Florida')), attrs={'id': 'table'}) assert isinstance(dfs, list) @@ -352,9 +365,9 @@ def test_python_docs_table(self): assert sorted(zz) == sorted(['Repo', 'What']) @pytest.mark.slow - def test_thousands_macau_stats(self): + def test_thousands_macau_stats(self, datapath): all_non_nan_table_index = -2 - macau_data = os.path.join(DATA_PATH, 'macau.html') + macau_data = datapath("io", "data", "macau.html") dfs = self.read_html(macau_data, index_col=0, attrs={'class': 'style1'}) df = dfs[all_non_nan_table_index] @@ -362,9 +375,9 @@ def test_thousands_macau_stats(self): assert not any(s.isna().any() for _, s in df.iteritems()) @pytest.mark.slow - def test_thousands_macau_index_col(self): + def test_thousands_macau_index_col(self, datapath): all_non_nan_table_index = -2 - macau_data = os.path.join(DATA_PATH, 'macau.html') + macau_data = datapath('io', 'data', 'macau.html') dfs = self.read_html(macau_data, index_col=0, header=0) df = dfs[all_non_nan_table_index] @@ -518,8 +531,8 @@ def test_countries_municipalities(self): res2 = self.read_html(data2, header=0) assert_framelist_equal(res1, res2) - def test_nyse_wsj_commas_table(self): - data = os.path.join(DATA_PATH, 'nyse_wsj.html') + def test_nyse_wsj_commas_table(self, datapath): + data = datapath('io', 'data', 'nyse_wsj.html') df = self.read_html(data, index_col=0, header=0, attrs={'class': 'mdcTable'})[0] @@ -530,7 +543,7 @@ def test_nyse_wsj_commas_table(self): tm.assert_index_equal(df.columns, columns) @pytest.mark.slow - def test_banklist_header(self): + def test_banklist_header(self, datapath): from pandas.io.html import _remove_whitespace def try_remove_ws(x): @@ -541,7 +554,7 @@ def try_remove_ws(x): df = self.read_html(self.banklist_data, 'Metcalf', attrs={'id': 'table'})[0] - ground_truth = read_csv(os.path.join(DATA_PATH, 'banklist.csv'), + ground_truth = read_csv(datapath('io', 'data', 'banklist.csv'), converters={'Updated Date': Timestamp, 'Closing Date': Timestamp}) assert df.shape == ground_truth.shape @@ -658,19 +671,19 @@ def test_parse_dates_combine(self): newdf = DataFrame({'datetime': raw_dates}) tm.assert_frame_equal(newdf, res[0]) - def test_computer_sales_page(self): - data = os.path.join(DATA_PATH, 'computer_sales_page.html') + def test_computer_sales_page(self, datapath): + data = datapath('io', 'data', 'computer_sales_page.html') with tm.assert_raises_regex(ParserError, r"Passed header=\[0,1\] are " r"too many rows for this " r"multi_index of columns"): self.read_html(data, header=[0, 1]) - data = os.path.join(DATA_PATH, 'computer_sales_page.html') + data = datapath('io', 'data', 'computer_sales_page.html') assert self.read_html(data, header=[1, 2]) - def test_wikipedia_states_table(self): - data = os.path.join(DATA_PATH, 'wikipedia_states.html') + def test_wikipedia_states_table(self, datapath): + data = datapath('io', 'data', 'wikipedia_states.html') assert os.path.isfile(data), '%r is not a file' % data assert os.path.getsize(data), '%r is an empty file' % data result = self.read_html(data, 'Arizona', header=1)[0] @@ -784,15 +797,15 @@ def test_multiple_header_rows(self): html_df = read_html(html, )[0] tm.assert_frame_equal(expected_df, html_df) - def test_works_on_valid_markup(self): - filename = os.path.join(DATA_PATH, 'valid_markup.html') + def 
test_works_on_valid_markup(self, datapath): + filename = datapath('io', 'data', 'valid_markup.html') dfs = self.read_html(filename, index_col=0) assert isinstance(dfs, list) assert isinstance(dfs[0], DataFrame) @pytest.mark.slow - def test_fallback_success(self): - banklist_data = os.path.join(DATA_PATH, 'banklist.html') + def test_fallback_success(self, datapath): + banklist_data = datapath('io', 'data', 'banklist.html') self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib']) def test_to_html_timestamp(self): @@ -835,22 +848,23 @@ def test_displayed_only(self, displayed_only, exp0, exp1): else: assert len(dfs) == 1 # Should not parse hidden table - @pytest.mark.parametrize("f", glob.glob( - os.path.join(DATA_PATH, 'html_encoding', '*.html'))) - def test_encode(self, f): - _, encoding = os.path.splitext(os.path.basename(f))[0].split('_') + def test_encode(self, html_encoding_file): + _, encoding = os.path.splitext( + os.path.basename(html_encoding_file) + )[0].split('_') try: - with open(f, 'rb') as fobj: + with open(html_encoding_file, 'rb') as fobj: from_string = self.read_html(fobj.read(), encoding=encoding, index_col=0).pop() - with open(f, 'rb') as fobj: + with open(html_encoding_file, 'rb') as fobj: from_file_like = self.read_html(BytesIO(fobj.read()), encoding=encoding, index_col=0).pop() - from_filename = self.read_html(f, encoding=encoding, + from_filename = self.read_html(html_encoding_file, + encoding=encoding, index_col=0).pop() tm.assert_frame_equal(from_string, from_file_like) tm.assert_frame_equal(from_string, from_filename) @@ -906,7 +920,7 @@ def seekable(self): assert self.read_html(bad) @pytest.mark.slow - def test_importcheck_thread_safety(self): + def test_importcheck_thread_safety(self, datapath): # see gh-16928 class ErrorThread(threading.Thread): @@ -921,7 +935,7 @@ def run(self): # force import check by reinitalising global vars in html.py reload(pandas.io.html) - filename = os.path.join(DATA_PATH, 'valid_markup.html') + filename = datapath('io', 'data', 'valid_markup.html') helper_thread1 = ErrorThread(target=self.read_html, args=(filename,)) helper_thread2 = ErrorThread(target=self.read_html, args=(filename,)) diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index 0b1c1ca178762..412e218f95c6f 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -3,6 +3,7 @@ from warnings import catch_warnings import os import datetime +import glob import numpy as np from distutils.version import LooseVersion @@ -836,13 +837,13 @@ def test_default_encoding(self): assert_frame_equal(result, frame) -def legacy_packers_versions(): - # yield the packers versions - path = tm.get_data_path('legacy_msgpack') - for v in os.listdir(path): - p = os.path.join(path, v) - if os.path.isdir(p): - yield v +files = glob.glob(os.path.join(os.path.dirname(__file__), "data", + "legacy_msgpack", "*", "*.msgpack")) + + +@pytest.fixture(params=files) +def legacy_packer(request, datapath): + return datapath(request.param) class TestMsgpack(object): @@ -919,24 +920,20 @@ def compare_frame_dt_mixed_tzs(self, result, expected, typ, version): else: tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize('version', legacy_packers_versions()) def test_msgpacks_legacy(self, current_packers_data, all_packers_data, - version): - - pth = tm.get_data_path('legacy_msgpack/{0}'.format(version)) - n = 0 - for f in os.listdir(pth): - # GH12142 0.17 files packed in P2 can't be read in P3 - if (compat.PY3 and 
version.startswith('0.17.') and - f.split('.')[-4][-1] == '2'): - continue - vf = os.path.join(pth, f) - try: - with catch_warnings(record=True): - self.compare(current_packers_data, all_packers_data, - vf, version) - except ImportError: - # blosc not installed - continue - n += 1 - assert n > 0, 'Msgpack files are not tested' + legacy_packer, datapath): + + version = os.path.basename(os.path.dirname(legacy_packer)) + + # GH12142 0.17 files packed in P2 can't be read in P3 + if (compat.PY3 and version.startswith('0.17.') and + legacy_packer.split('.')[-4][-1] == '2'): + msg = "Files packed in Py2 can't be read in Py3 ({})" + pytest.skip(msg.format(version)) + try: + with catch_warnings(record=True): + self.compare(current_packers_data, all_packers_data, + legacy_packer, version) + except ImportError: + # blosc not installed + pass diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index fbe2174e603e2..45cbbd43cd6a8 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -12,7 +12,7 @@ 3. Move the created pickle to "data/legacy_pickle/" directory. """ - +import glob import pytest from warnings import catch_warnings @@ -184,27 +184,25 @@ def compare_sp_frame_float(result, expected, typ, version): tm.assert_sp_frame_equal(result, expected) +files = glob.glob(os.path.join(os.path.dirname(__file__), "data", + "legacy_pickle", "*", "*.pickle")) + + +@pytest.fixture(params=files) +def legacy_pickle(request, datapath): + return datapath(request.param) + + # --------------------- # tests # --------------------- -def legacy_pickle_versions(): - # yield the pickle versions - path = tm.get_data_path('legacy_pickle') - for v in os.listdir(path): - p = os.path.join(path, v) - if os.path.isdir(p): - for f in os.listdir(p): - yield (v, f) - - -@pytest.mark.parametrize('version, f', legacy_pickle_versions()) -def test_pickles(current_pickle_data, version, f): +def test_pickles(current_pickle_data, legacy_pickle): if not is_platform_little_endian(): pytest.skip("known failure on non-little endian") - vf = tm.get_data_path('legacy_pickle/{}/{}'.format(version, f)) + version = os.path.basename(os.path.dirname(legacy_pickle)) with catch_warnings(record=True): - compare(current_pickle_data, vf, version) + compare(current_pickle_data, legacy_pickle, version) def test_round_trip_current(current_pickle_data): @@ -260,12 +258,11 @@ def python_unpickler(path): compare_element(result, expected, typ) -def test_pickle_v0_14_1(): +def test_pickle_v0_14_1(datapath): cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, categories=['a', 'b', 'c', 'd']) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_14_1.pickle') + pickle_path = datapath('io', 'data', 'categorical_0_14_1.pickle') # This code was executed once on v0.14.1 to generate the pickle: # # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], @@ -275,14 +272,13 @@ def test_pickle_v0_14_1(): tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) -def test_pickle_v0_15_2(): +def test_pickle_v0_15_2(datapath): # ordered -> _ordered # GH 9347 cat = pd.Categorical(values=['a', 'b', 'c'], ordered=False, categories=['a', 'b', 'c', 'd']) - pickle_path = os.path.join(tm.get_data_path(), - 'categorical_0_15_2.pickle') + pickle_path = datapath('io', 'data', 'categorical_0_15_2.pickle') # This code was executed once on v0.15.2 to generate the pickle: # # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'], diff --git a/pandas/tests/io/test_pytables.py 
b/pandas/tests/io/test_pytables.py index f96e7eeb40ea2..b95df3840b6c5 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -4449,28 +4449,27 @@ def f(): store.select('df') tm.assert_raises_regex(ClosedFileError, 'file is not open', f) - def test_pytables_native_read(self): - + def test_pytables_native_read(self, datapath): with ensure_clean_store( - tm.get_data_path('legacy_hdf/pytables_native.h5'), + datapath('io', 'data', 'legacy_hdf/pytables_native.h5'), mode='r') as store: d2 = store['detector/readout'] assert isinstance(d2, DataFrame) @pytest.mark.skipif(PY35 and is_platform_windows(), reason="native2 read fails oddly on windows / 3.5") - def test_pytables_native2_read(self): + def test_pytables_native2_read(self, datapath): with ensure_clean_store( - tm.get_data_path('legacy_hdf/pytables_native2.h5'), + datapath('io', 'data', 'legacy_hdf', 'pytables_native2.h5'), mode='r') as store: str(store) d1 = store['detector'] assert isinstance(d1, DataFrame) - def test_legacy_table_read(self): + def test_legacy_table_read(self, datapath): # legacy table types with ensure_clean_store( - tm.get_data_path('legacy_hdf/legacy_table.h5'), + datapath('io', 'data', 'legacy_hdf', 'legacy_table.h5'), mode='r') as store: with catch_warnings(record=True): @@ -5117,7 +5116,7 @@ def test_fspath(self): with pd.HDFStore(path) as store: assert os.fspath(store) == str(path) - def test_read_py2_hdf_file_in_py3(self): + def test_read_py2_hdf_file_in_py3(self, datapath): # GH 16781 # tests reading a PeriodIndex DataFrame written in Python2 in Python3 @@ -5132,8 +5131,8 @@ def test_read_py2_hdf_file_in_py3(self): ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B')) with ensure_clean_store( - tm.get_data_path( - 'legacy_hdf/periodindex_0.20.1_x86_64_darwin_2.7.13.h5'), + datapath('io', 'data', 'legacy_hdf', + 'periodindex_0.20.1_x86_64_darwin_2.7.13.h5'), mode='r') as store: result = store['p'] assert_frame_equal(result, expected) @@ -5530,14 +5529,14 @@ def test_store_timezone(self): assert_frame_equal(result, df) - def test_legacy_datetimetz_object(self): + def test_legacy_datetimetz_object(self, datapath): # legacy from < 0.17.0 # 8260 expected = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), B=Timestamp('20130603', tz='CET')), index=range(5)) with ensure_clean_store( - tm.get_data_path('legacy_hdf/datetimetz_object.h5'), + datapath('io', 'data', 'legacy_hdf', 'datetimetz_object.h5'), mode='r') as store: result = store['df'] assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index f3ab74d37a2bc..f8f742c5980ac 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -22,7 +22,6 @@ import pytest import sqlite3 import csv -import os import warnings import numpy as np @@ -184,9 +183,11 @@ class MixInBase(object): def teardown_method(self, method): - for tbl in self._get_all_tables(): - self.drop_table(tbl) - self._close_conn() + # if setup fails, there may not be a connection to close. 
+ if hasattr(self, 'conn'): + for tbl in self._get_all_tables(): + self.drop_table(tbl) + self._close_conn() class MySQLMixIn(MixInBase): @@ -253,9 +254,9 @@ def _get_exec(self): else: return self.conn.cursor() - def _load_iris_data(self): + def _load_iris_data(self, datapath): import io - iris_csv_file = os.path.join(tm.get_data_path(), 'iris.csv') + iris_csv_file = datapath('io', 'data', 'iris.csv') self.drop_table('iris') self._get_exec().execute(SQL_STRINGS['create_iris'][self.flavor]) @@ -503,9 +504,10 @@ class _TestSQLApi(PandasSQLTest): flavor = 'sqlite' mode = None - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): self.conn = self.connect() - self._load_iris_data() + self._load_iris_data(datapath) self._load_iris_view() self._load_test1_data() self._load_test2_data() @@ -1025,8 +1027,9 @@ class _EngineToConnMixin(object): A mixin that causes setup_connect to create a conn rather than an engine. """ - def setup_method(self, method): - super(_EngineToConnMixin, self).setup_method(method) + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + super(_EngineToConnMixin, self).setup_method(datapath) engine = self.conn conn = engine.connect() self.__tx = conn.begin() @@ -1034,12 +1037,14 @@ def setup_method(self, method): self.__engine = engine self.conn = conn - def teardown_method(self, method): + yield + self.__tx.rollback() self.conn.close() self.conn = self.__engine self.pandasSQL = sql.SQLDatabase(self.__engine) - super(_EngineToConnMixin, self).teardown_method(method) + # XXX: + # super(_EngineToConnMixin, self).teardown_method(method) @pytest.mark.single @@ -1136,7 +1141,7 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): """ flavor = None - @classmethod + @pytest.fixture(autouse=True, scope='class') def setup_class(cls): cls.setup_import() cls.setup_driver() @@ -1149,10 +1154,11 @@ def setup_class(cls): msg = "{0} - can't connect to {1} server".format(cls, cls.flavor) pytest.skip(msg) - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): self.setup_connect() - self._load_iris_data() + self._load_iris_data(datapath) self._load_raw_sql() self._load_test1_data() @@ -1920,11 +1926,12 @@ class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest): def connect(cls): return sqlite3.connect(':memory:') - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): self.conn = self.connect() self.pandasSQL = sql.SQLiteDatabase(self.conn) - self._load_iris_data() + self._load_iris_data(datapath) self._load_test1_data() @@ -2135,8 +2142,9 @@ def _skip_if_no_pymysql(): @pytest.mark.single class TestXSQLite(SQLiteMixIn): - def setup_method(self, method): - self.method = method + @pytest.fixture(autouse=True) + def setup_method(self, request, datapath): + self.method = request.function self.conn = sqlite3.connect(':memory:') def test_basic(self): @@ -2215,8 +2223,7 @@ def test_execute_fail(self): with pytest.raises(Exception): sql.execute('INSERT INTO test VALUES("foo", "bar", 7)', self.conn) - @tm.capture_stdout - def test_execute_closed_connection(self): + def test_execute_closed_connection(self, request, datapath): create_sql = """ CREATE TABLE test ( @@ -2236,7 +2243,7 @@ def test_execute_closed_connection(self): tquery("select * from test", con=self.conn) # Initialize connection again (needed for tearDown) - self.setup_method(self.method) + self.setup_method(request, datapath) def test_na_roundtrip(self): pass @@ -2341,7 +2348,7 @@ def 
clean_up(test_table_to_drop): "if SQLAlchemy is not installed") class TestXMySQL(MySQLMixIn): - @classmethod + @pytest.fixture(autouse=True, scope='class') def setup_class(cls): _skip_if_no_pymysql() @@ -2370,7 +2377,8 @@ def setup_class(cls): "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. ") - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, request, datapath): _skip_if_no_pymysql() import pymysql try: @@ -2396,7 +2404,7 @@ def setup_method(self, method): "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. ") - self.method = method + self.method = request.function def test_basic(self): _skip_if_no_pymysql() @@ -2501,8 +2509,7 @@ def test_execute_fail(self): with pytest.raises(Exception): sql.execute('INSERT INTO test VALUES("foo", "bar", 7)', self.conn) - @tm.capture_stdout - def test_execute_closed_connection(self): + def test_execute_closed_connection(self, request, datapath): _skip_if_no_pymysql() drop_sql = "DROP TABLE IF EXISTS test" create_sql = """ @@ -2525,7 +2532,7 @@ def test_execute_closed_connection(self): tquery("select * from test", con=self.conn) # Initialize connection again (needed for tearDown) - self.setup_method(self.method) + self.setup_method(request, datapath) def test_na_roundtrip(self): _skip_if_no_pymysql() diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index bfb72be80400e..cfe47cae7e5e1 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -25,8 +25,8 @@ @pytest.fixture -def dirpath(): - return tm.get_data_path() +def dirpath(datapath): + return datapath("io", "data") @pytest.fixture @@ -39,8 +39,9 @@ def parsed_114(dirpath): class TestStata(object): - def setup_method(self, method): - self.dirpath = tm.get_data_path() + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "data") self.dta1_114 = os.path.join(self.dirpath, 'stata1_114.dta') self.dta1_117 = os.path.join(self.dirpath, 'stata1_117.dta') diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index f65791329f2f1..09687dd97bd43 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -74,11 +74,6 @@ def setup_method(self, method): else: self.default_figsize = (8.0, 6.0) self.default_tick_position = 'left' if self.mpl_ge_2_0_0 else 'default' - # common test data - from pandas import read_csv - base = os.path.join(os.path.dirname(curpath()), os.pardir) - path = os.path.join(base, 'tests', 'data', 'iris.csv') - self.iris = read_csv(path) n = 100 with tm.RNGContext(42): diff --git a/pandas/tests/plotting/test_deprecated.py b/pandas/tests/plotting/test_deprecated.py index 2c2d371921d2f..a45b17ec98261 100644 --- a/pandas/tests/plotting/test_deprecated.py +++ b/pandas/tests/plotting/test_deprecated.py @@ -46,10 +46,9 @@ def test_boxplot_deprecated(self): by='indic') @pytest.mark.slow - def test_radviz_deprecated(self): - df = self.iris + def test_radviz_deprecated(self, iris): with tm.assert_produces_warning(FutureWarning): - plotting.radviz(frame=df, class_column='Name') + plotting.radviz(frame=iris, class_column='Name') @pytest.mark.slow def test_plot_params(self): diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index c82c939584dc7..0473610ea2f8f 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -100,11 +100,11 @@ def 
test_scatter_matrix_axis(self): axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) @pytest.mark.slow - def test_andrews_curves(self): + def test_andrews_curves(self, iris): from pandas.plotting import andrews_curves from matplotlib import cm - df = self.iris + df = iris _check_plot_works(andrews_curves, frame=df, class_column='Name') @@ -165,11 +165,11 @@ def test_andrews_curves(self): andrews_curves(data=df, class_column='Name') @pytest.mark.slow - def test_parallel_coordinates(self): + def test_parallel_coordinates(self, iris): from pandas.plotting import parallel_coordinates from matplotlib import cm - df = self.iris + df = iris ax = _check_plot_works(parallel_coordinates, frame=df, class_column='Name') @@ -234,11 +234,11 @@ def test_parallel_coordinates_with_sorted_labels(self): assert prev[1] < nxt[1] and prev[0] < nxt[0] @pytest.mark.slow - def test_radviz(self): + def test_radviz(self, iris): from pandas.plotting import radviz from matplotlib import cm - df = self.iris + df = iris _check_plot_works(radviz, frame=df, class_column='Name') rgba = ('#556270', '#4ECDC4', '#C7F464') @@ -272,8 +272,8 @@ def test_radviz(self): self._check_colors(handles, facecolors=colors) @pytest.mark.slow - def test_subplot_titles(self): - df = self.iris.drop('Name', axis=1).head() + def test_subplot_titles(self, iris): + df = iris.drop('Name', axis=1).head() # Use the column names as the subplot titles title = list(df.columns) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index cebbcc41c3e17..59b53cd23010e 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1,4 +1,3 @@ -import os import pytest import pytz @@ -13,8 +12,8 @@ class TestAsOfMerge(object): - def read_data(self, name, dedupe=False): - path = os.path.join(tm.get_data_path(), name) + def read_data(self, datapath, name, dedupe=False): + path = datapath('reshape', 'merge', 'data', name) x = read_csv(path) if dedupe: x = (x.drop_duplicates(['time', 'ticker'], keep='last') @@ -23,15 +22,17 @@ def read_data(self, name, dedupe=False): x.time = to_datetime(x.time) return x - def setup_method(self, method): + @pytest.fixture(autouse=True) + def setup_method(self, datapath): - self.trades = self.read_data('trades.csv') - self.quotes = self.read_data('quotes.csv', dedupe=True) - self.asof = self.read_data('asof.csv') - self.tolerance = self.read_data('tolerance.csv') - self.allow_exact_matches = self.read_data('allow_exact_matches.csv') + self.trades = self.read_data(datapath, 'trades.csv') + self.quotes = self.read_data(datapath, 'quotes.csv', dedupe=True) + self.asof = self.read_data(datapath, 'asof.csv') + self.tolerance = self.read_data(datapath, 'tolerance.csv') + self.allow_exact_matches = self.read_data(datapath, + 'allow_exact_matches.csv') self.allow_exact_matches_and_tolerance = self.read_data( - 'allow_exact_matches_and_tolerance.csv') + datapath, 'allow_exact_matches_and_tolerance.csv') def test_examples1(self): """ doc-string examples """ @@ -423,11 +424,11 @@ def test_multiby_indexed(self): pd.merge_asof(left, right, left_index=True, right_index=True, left_by=['k1', 'k2'], right_by=['k1']) - def test_basic2(self): + def test_basic2(self, datapath): - expected = self.read_data('asof2.csv') - trades = self.read_data('trades2.csv') - quotes = self.read_data('quotes2.csv', dedupe=True) + expected = self.read_data(datapath, 'asof2.csv') + trades = self.read_data(datapath, 'trades2.csv') + quotes = self.read_data(datapath, 
'quotes2.csv', dedupe=True) result = merge_asof(trades, quotes, on='time', @@ -467,14 +468,14 @@ def test_valid_join_keys(self): merge_asof(trades, quotes, by='ticker') - def test_with_duplicates(self): + def test_with_duplicates(self, datapath): q = pd.concat([self.quotes, self.quotes]).sort_values( ['time', 'ticker']).reset_index(drop=True) result = merge_asof(self.trades, q, on='time', by='ticker') - expected = self.read_data('asof.csv') + expected = self.read_data(datapath, 'asof.csv') assert_frame_equal(result, expected) def test_with_duplicates_no_on(self): diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index 5ea27f9e34e1c..807fb2530603a 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -282,10 +282,10 @@ def test_round_frac(self): result = tmod._round_frac(0.000123456, precision=2) assert result == 0.00012 - def test_qcut_binning_issues(self): + def test_qcut_binning_issues(self, datapath): # #1978, 1979 - path = os.path.join(tm.get_data_path(), 'cut_data.csv') - arr = np.loadtxt(path) + cut_file = datapath(os.path.join('reshape', 'data', 'cut_data.csv')) + arr = np.loadtxt(cut_file) result = qcut(arr, 20) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 74bc08ee9649b..b93a0206479ca 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -1,4 +1,3 @@ -import os from distutils.version import LooseVersion from datetime import date, datetime, timedelta @@ -518,14 +517,15 @@ def test_add(self, offset_types, tz): assert isinstance(result, Timestamp) assert result == expected_localize - def test_pickle_v0_15_2(self): + def test_pickle_v0_15_2(self, datapath): offsets = {'DateOffset': DateOffset(years=1), 'MonthBegin': MonthBegin(1), 'Day': Day(1), 'YearBegin': YearBegin(1), 'Week': Week(1)} - pickle_path = os.path.join(tm.get_data_path(), - 'dateoffset_0_15_2.pickle') + + pickle_path = datapath('tseries', 'offsets', 'data', + 'dateoffset_0_15_2.pickle') # This code was executed once on v0.15.2 to generate the pickle: # with open(pickle_path, 'wb') as f: pickle.dump(offsets, f) # @@ -1838,12 +1838,10 @@ def _check_roundtrip(obj): _check_roundtrip(self.offset2) _check_roundtrip(self.offset * 2) - def test_pickle_compat_0_14_1(self): + def test_pickle_compat_0_14_1(self, datapath): hdays = [datetime(2013, 1, 1) for ele in range(4)] - - pth = tm.get_data_path() - - cday0_14_1 = read_pickle(os.path.join(pth, 'cday-0.14.1.pickle')) + pth = datapath('tseries', 'offsets', 'data', 'cday-0.14.1.pickle') + cday0_14_1 = read_pickle(pth) cday = CDay(holidays=hdays) assert cday == cday0_14_1 diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py index ab7c4fb528452..4d34987e14f75 100644 --- a/pandas/tests/util/test_testing.py +++ b/pandas/tests/util/test_testing.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +import os import pandas as pd import pytest import numpy as np @@ -841,3 +842,15 @@ def test_locale(self): # GH9744 locales = tm.get_locales() assert len(locales) >= 1 + + +def test_datapath_missing(datapath, request): + if not request.config.getoption("--strict-data-files"): + pytest.skip("Need to set '--strict-data-files'") + + with pytest.raises(ValueError): + datapath('not_a_file') + + result = datapath('data', 'iris.csv') + expected = os.path.join('pandas', 'tests', 'data', 'iris.csv') + assert result == expected diff --git a/pandas/util/_test_decorators.py 
b/pandas/util/_test_decorators.py index 27c24e3a68079..c6ab24403d58d 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -23,7 +23,6 @@ def test_foo(): For more information, refer to the ``pytest`` documentation on ``skipif``. """ - import pytest import locale from distutils.version import LooseVersion diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 675dd94d49750..a5afcb6915034 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -6,7 +6,6 @@ import sys import tempfile import warnings -import inspect import os import subprocess import locale @@ -757,15 +756,6 @@ def ensure_clean(filename=None, return_filelike=False): print("Exception on removing file: {error}".format(error=e)) -def get_data_path(f=''): - """Return the path of a data file, these are relative to the current test - directory. - """ - # get our callers file - _, filename, _, _, _, _ = inspect.getouterframes(inspect.currentframe())[1] - base_dir = os.path.abspath(os.path.dirname(filename)) - return os.path.join(base_dir, 'data', f) - # ----------------------------------------------------------------------------- # Comparators diff --git a/setup.cfg b/setup.cfg index 6d9657737a8bd..9ec967c25e225 100644 --- a/setup.cfg +++ b/setup.cfg @@ -32,4 +32,5 @@ markers = slow: mark a test as slow network: mark a test as network high_memory: mark a test as a high-memory only -doctest_optionflags= NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL +addopts = --strict-data-files +doctest_optionflags= NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL \ No newline at end of file diff --git a/setup.py b/setup.py index dd026bd611727..0fd008612b5bd 100755 --- a/setup.py +++ b/setup.py @@ -735,11 +735,7 @@ def pxd(name): maintainer=AUTHOR, version=versioneer.get_version(), packages=find_packages(include=['pandas', 'pandas.*']), - package_data={'': ['data/*', 'templates/*', '_libs/*.dll'], - 'pandas.tests.io': ['data/legacy_hdf/*.h5', - 'data/legacy_pickle/*/*.pickle', - 'data/legacy_msgpack/*/*.msgpack', - 'data/html_encoding/*.html']}, + package_data={'': ['templates/*', '_libs/*.dll']}, ext_modules=extensions, maintainer_email=EMAIL, description=DESCRIPTION,
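
Reviewer note (not part of the diff): a minimal sketch of how a test consumes the new ``datapath`` fixture from pandas/conftest.py. The test and the file name 'example.csv' are hypothetical placeholders; the behavior described in the comments (joining onto pandas/tests, skipping on a missing file, raising under --strict-data-files) is taken from the fixture added above.

    import pandas as pd

    def test_roundtrip_example(datapath):
        # datapath(*args) returns os.path.join('pandas', 'tests', *args).
        # If the file is absent -- e.g. in a wheel built from the new
        # MANIFEST.in/setup.py, which exclude test data -- the fixture
        # calls pytest.skip(); when pytest runs with --strict-data-files
        # it raises ValueError instead.
        path = datapath('io', 'parser', 'data', 'example.csv')  # hypothetical file
        df = pd.read_csv(path)
        assert not df.empty

Since setup.cfg now sets ``addopts = --strict-data-files``, a development checkout fails loudly when a data file goes missing, while ``pandas.test()`` against a trimmed install only reports skips; the ``-r xXs`` flag added in ci/script_single.sh makes those skip reasons visible in the CI logs.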
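
A second pattern worth calling out: plain ``setup_method(self, method)`` hooks cannot request fixtures, which is why the diff converts them to autouse fixtures wherever setup needs ``datapath``. A minimal sketch of the conversion, with a hypothetical class and directory:

    import pytest

    class TestExample(object):

        @pytest.fixture(autouse=True)
        def setup(self, datapath):
            # Runs before every test in the class and, unlike
            # setup_method, may request other fixtures such as datapath.
            self.dirpath = datapath('io', 'data')  # hypothetical directory
            yield
            # Anything after the yield runs as teardown, replacing
            # teardown_method (see the test_pandas.py hunk above).
            del self.dirpath

        def test_uses_setup(self):
            assert self.dirpath.endswith('data')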
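
Finally, on the legacy pickle/msgpack parametrization: the old version-walking generators are replaced by module-level globs feeding ``@pytest.fixture(params=files)``, so each data file becomes its own test case. One subtlety makes ``datapath(request.param)`` work even though the glob typically yields absolute paths: ``os.path.join`` discards earlier components when a later one is absolute (POSIX behavior), so the fixture passes such paths through unchanged while still applying its existence check. A one-line illustration:

    import os

    # POSIX example: the absolute glob result wins over the
    # 'pandas/tests' prefix inside the datapath fixture.
    assert os.path.join('pandas', 'tests', '/tmp/x.pickle') == '/tmp/x.pickle'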