Skip to content

PKG: Exclude data test files. #19535

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 40 commits into from
Jun 26, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
4d77cd8
PKG: Exclude data test files.
TomAugspurger Feb 3, 2018
270e442
Stuff
TomAugspurger Feb 6, 2018
26e9b4b
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger Feb 22, 2018
1804bcc
Refactor data path handling
TomAugspurger Feb 25, 2018
7022152
More fixtures
TomAugspurger Feb 25, 2018
080f000
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger Mar 26, 2018
151ffda
Updated html
TomAugspurger Mar 26, 2018
d9d6570
Remove os.path.joins
TomAugspurger Mar 26, 2018
5849591
More modules
TomAugspurger Mar 27, 2018
31fb0b6
Some more
TomAugspurger Mar 27, 2018
9193f15
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger Mar 27, 2018
e897f11
Updated packers
TomAugspurger Mar 27, 2018
9cf30fd
Pickle
TomAugspurger Mar 27, 2018
95cde7a
Linting
TomAugspurger Mar 27, 2018
10ddddc
Autouse stata
TomAugspurger Mar 27, 2018
e1ea208
Remove filename
TomAugspurger Mar 27, 2018
8616878
Autouse in merge_asof
TomAugspurger Mar 27, 2018
77bf77c
Cleanup plotting
TomAugspurger Mar 27, 2018
156e14b
CLN: Simplify fspath
TomAugspurger Mar 27, 2018
f3f3662
Refactor sql tests
TomAugspurger Mar 27, 2018
2cd9706
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger Mar 27, 2018
aac3606
Fixed docstrings
TomAugspurger Mar 27, 2018
762a2d1
Moved
TomAugspurger Mar 27, 2018
7c44b77
Use fixture for iris plotting
TomAugspurger Mar 28, 2018
ad09951
Abs path for file test
TomAugspurger Mar 28, 2018
6f02d6b
Removed stdout capture from sql tests
TomAugspurger Mar 29, 2018
489f540
Merge remote-tracking branch 'origin/master' into package-size
TomAugspurger May 12, 2018
8b42d1c
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger May 12, 2018
ee4fefd
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger Jun 12, 2018
bac438c
Cleanup Manifest
TomAugspurger Jun 12, 2018
84ccdbf
fixed test test
TomAugspurger Jun 12, 2018
c4802db
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger Jun 20, 2018
7fd7660
Fixed windows
TomAugspurger Jun 20, 2018
c187f8b
whatsnew
TomAugspurger Jun 20, 2018
632a61d
Clarify note [ci skip]
TomAugspurger Jun 20, 2018
b5b70c7
TST: refactored html tests
TomAugspurger Jun 21, 2018
9954bba
Remove auto-generated html fixtures
TomAugspurger Jun 22, 2018
c771885
linting
TomAugspurger Jun 22, 2018
dd75270
Removed test test file
TomAugspurger Jun 26, 2018
dbe0c57
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger Jun 26, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 23 additions & 11 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,39 @@ include LICENSE
include RELEASE.md
include README.md
include setup.py
include pyproject.toml

graft doc
prune doc/build

graft LICENSES

graft pandas

global-exclude *.so
global-exclude *.pyd
global-exclude *.bz2
global-exclude *.csv
global-exclude *.dta
global-exclude *.gz
global-exclude *.h5
global-exclude *.html
global-exclude *.json
global-exclude *.msgpack
global-exclude *.pickle
global-exclude *.png
global-exclude *.pyc
global-exclude *.pyd
global-exclude *.sas7bdat
global-exclude *.so
global-exclude *.xls
global-exclude *.xlsm
global-exclude *.xlsx
global-exclude *.xpt
global-exclude *.xz
global-exclude *.zip
global-exclude *~
global-exclude \#*
global-exclude .git*
global-exclude .DS_Store
global-exclude *.png
global-exclude .git*
global-exclude \#*

# include examples/data/*
# recursive-include examples *.py
# recursive-include doc/source *
# recursive-include doc/sphinxext *
# recursive-include LICENSES *
include versioneer.py
include pandas/_version.py
include pandas/io/formats/templates/*.tpl
8 changes: 4 additions & 4 deletions ci/script_single.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ if [ "$DOC" ]; then
echo "We are not running pytest as this is a doc-build"

elif [ "$COVERAGE" ]; then
echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas
pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas
echo pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe should make a variable that holds all of these options for both the echo and the run to avoid duplication

pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas

else
echo pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas
pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest
echo pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas
pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest

fi

Expand Down
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v0.23.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ Documentation Changes
-
-

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a ref here as well

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

General note: IMO it is not needed to always ask this of contributors, as this ref is only needed when we actually want to make an explicit link to it from within the rst files (and chances are quite high we will never do this). The ref can always be added at the moment one adds a link.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure, but in general it's a good practice

Build Changes
-------------

- The source and binary distributions no longer include test data files, resulting in smaller download sizes. Tests relying on these data files will be skipped when using ``pandas.test()``. (:issue:`19320`)

.. _whatsnew_0232.bug_fixes:

Bug Fixes
Expand Down
41 changes: 41 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import os
import importlib

import pytest

import pandas
import numpy as np
import pandas as pd
from pandas.compat import PY3
Expand All @@ -17,6 +19,8 @@ def pytest_addoption(parser):
help="run high memory tests")
parser.addoption("--only-slow", action="store_true",
help="run only slow tests")
parser.addoption("--strict-data-files", action="store_true",
help="Fail if a test is skipped for missing data file.")


def pytest_runtest_setup(item):
Expand Down Expand Up @@ -131,6 +135,43 @@ def join_type(request):
return request.param


@pytest.fixture
def datapath(request):
    """Fixture returning a resolver for test data files.

    The returned callable joins its arguments onto ``pandas/tests`` and
    yields the resulting path.

    Parameters (of the returned callable)
    -------------------------------------
    *parts : str
        Path components relative to ``pandas/tests/``.

    Returns
    -------
    str
        The assembled path, including the ``pandas/tests`` prefix.

    Raises
    ------
    ValueError
        When the file is missing and ``--strict-data-files`` was passed;
        otherwise the test is skipped instead.
    """
    def _resolve(*parts):
        candidate = os.path.join('pandas', 'tests', *parts)
        if os.path.exists(candidate):
            return candidate
        # File is absent: fail hard under --strict-data-files, skip otherwise.
        if request.config.getoption("--strict-data-files"):
            msg = "Could not find file {} and --strict-data-files is set."
            raise ValueError(msg.format(candidate))
        msg = "Could not find {}."
        pytest.skip(msg.format(candidate))
    return _resolve


@pytest.fixture
def iris(datapath):
    """Return the iris dataset loaded as a DataFrame."""
    csv_path = datapath('data', 'iris.csv')
    return pandas.read_csv(csv_path)


@pytest.fixture(params=['nlargest', 'nsmallest'])
def nselect_method(request):
"""
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1182,12 +1182,12 @@ def test_iter(self):
('baz', 'two'), ('qux', 'one'), ('qux', 'two')]
assert result == expected

def test_legacy_pickle(self):
def test_legacy_pickle(self, datapath):
if PY3:
pytest.skip("testing for legacy pickles not "
"support on py3")

path = tm.get_data_path('multiindex_v1.pickle')
path = datapath('indexes', 'data', 'multiindex_v1.pickle')
obj = pd.read_pickle(path)

obj2 = MultiIndex.from_tuples(obj.values)
Expand All @@ -1203,10 +1203,10 @@ def test_legacy_pickle(self):
assert_almost_equal(res, exp)
assert_almost_equal(exp, exp2)

def test_legacy_v2_unpickle(self):
def test_legacy_v2_unpickle(self, datapath):

# 0.7.3 -> 0.8.0 format manage
path = tm.get_data_path('mindex_073.pickle')
path = datapath('indexes', 'data', 'mindex_073.pickle')
obj = pd.read_pickle(path)

obj2 = MultiIndex.from_tuples(obj.values)
Expand Down
21 changes: 6 additions & 15 deletions pandas/tests/io/conftest.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,23 @@
import os

import pytest
from pandas.io.parsers import read_table
from pandas.util import testing as tm


@pytest.fixture
def parser_data(request):
return os.path.join(tm.get_data_path(), '..', 'parser', 'data')


@pytest.fixture
def tips_file(parser_data):
def tips_file(datapath):
"""Path to the tips dataset"""
return os.path.join(parser_data, 'tips.csv')
return datapath('io', 'parser', 'data', 'tips.csv')


@pytest.fixture
def jsonl_file(parser_data):
def jsonl_file(datapath):
"""Path a JSONL dataset"""
return os.path.join(parser_data, 'items.jsonl')
return datapath('io', 'parser', 'data', 'items.jsonl')


@pytest.fixture
def salaries_table(parser_data):
def salaries_table(datapath):
"""DataFrame with the salaries dataset"""
path = os.path.join(parser_data, 'salaries.csv')
return read_table(path)
return read_table(datapath('io', 'parser', 'data', 'salaries.csv'))


@pytest.fixture
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/formats/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -916,8 +916,8 @@ def test_unicode_problem_decoding_as_ascii(self):
dm = DataFrame({u('c/\u03c3'): Series({'test': np.nan})})
compat.text_type(dm.to_string())

def test_string_repr_encoding(self):
filepath = tm.get_data_path('unicode_series.csv')
def test_string_repr_encoding(self, datapath):
filepath = datapath('io', 'formats', 'data', 'unicode_series.csv')
df = pd.read_csv(filepath, header=None, encoding='latin1')
repr(df)
repr(df[1])
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/io/json/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ def test_compression_roundtrip(compression):
assert_frame_equal(df, pd.read_json(result))


def test_read_zipped_json():
uncompressed_path = tm.get_data_path("tsframe_v012.json")
def test_read_zipped_json(datapath):
uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
uncompressed_df = pd.read_json(uncompressed_path)

compressed_path = tm.get_data_path("tsframe_v012.json.zip")
compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
compressed_df = pd.read_json(compressed_path, compression='zip')

assert_frame_equal(uncompressed_df, compressed_df)
Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,9 @@

class TestPandasContainer(object):

def setup_method(self, method):
self.dirpath = tm.get_data_path()
@pytest.fixture(scope="function", autouse=True)
def setup(self, datapath):
self.dirpath = datapath("io", "json", "data")

self.ts = tm.makeTimeSeries()
self.ts.name = 'ts'
Expand All @@ -59,7 +60,8 @@ def setup_method(self, method):
self.mixed_frame = _mixed_frame.copy()
self.categorical = _cat_frame.copy()

def teardown_method(self, method):
yield
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I learned that autouse fixtures can include yield, letting us do the teardown here, which is nice.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the other hand, it also makes this more "complex" to understand IMO (the teardown_method is rather explicit and easy to understand what is going on, while the yield is a less standard fancy feature of pytest ).
Has the autouse=True other advantages? (what's the reason you needed that here?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two things

  1. datapath can only be called inside a test, since it inspects the request context, to see if --strict-data-files is set.
  2. datapath is a fixture, and you can't mix xUnit style setup_method with fixtures. autouse fixtures are the easiest way to replicate xUnit stuff, while being able to use fixtures.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've done this myself a few times so wanted to chime in. autouse=True allows the fixture to apply to every test within its scope without having to explicitly decorate or pass that fixture as an arg.

This approach is more in line with how pytest suggests doing setup/teardown (see here) so I think that's a +1 for it. It also gives you potential visibility into the context of the yield tests (ex: here I think you could replace the datapath fixture usage as an argument via an assignment to self.datapath before yielding)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, that's a good reason to use an autouse fixture here in this case.

(you don't need to convince me of the benefit of fixtures in general :), however, I think many people are not that familiar with all those pytest special features and it has a steeper learning curve IMO, so there can be a balance in how fancy we go)

Copy link
Contributor Author

@TomAugspurger TomAugspurger Mar 27, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah. When documenting this, I'll recommend against autouse for cases like this. I think it'd be better to just have a categorical fixture and accept that in the test. But that would have meant a lot more potential for breaking things in this PR :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah we should really not use this pattern, rather changing to all fixtures. as a temporary workaround this ok, can you create an issue to 'fix' this properly though.


del self.dirpath

del self.ts
Expand Down
25 changes: 11 additions & 14 deletions pandas/tests/io/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def test_read_csv(self):
else:
prefix = u("file://")

fname = prefix + compat.text_type(self.csv1)
fname = prefix + compat.text_type(os.path.abspath(self.csv1))
self.read_csv(fname, index_col=0, parse_dates=True)

def test_1000_sep(self):
Expand Down Expand Up @@ -651,21 +651,19 @@ def test_read_csv_parse_simple_list(self):
tm.assert_frame_equal(df, expected)

@tm.network
def test_url(self):
def test_url(self, datapath):
# HTTP(S)
url = ('https://raw.github.com/pandas-dev/pandas/master/'
'pandas/tests/io/parser/data/salaries.csv')
url_table = self.read_table(url)
dirpath = tm.get_data_path()
localtable = os.path.join(dirpath, 'salaries.csv')
localtable = datapath('io', 'parser', 'data', 'salaries.csv')
local_table = self.read_table(localtable)
tm.assert_frame_equal(url_table, local_table)
# TODO: ftp testing

@pytest.mark.slow
def test_file(self):
dirpath = tm.get_data_path()
localtable = os.path.join(dirpath, 'salaries.csv')
def test_file(self, datapath):
localtable = datapath('io', 'parser', 'data', 'salaries.csv')
local_table = self.read_table(localtable)

try:
Expand Down Expand Up @@ -755,8 +753,8 @@ def test_utf16_bom_skiprows(self):

tm.assert_frame_equal(result, expected)

def test_utf16_example(self):
path = tm.get_data_path('utf16_ex.txt')
def test_utf16_example(self, datapath):
path = datapath('io', 'parser', 'data', 'utf16_ex.txt')

# it works! and is the right length
result = self.read_table(path, encoding='utf-16')
Expand All @@ -767,8 +765,8 @@ def test_utf16_example(self):
result = self.read_table(buf, encoding='utf-16')
assert len(result) == 50

def test_unicode_encoding(self):
pth = tm.get_data_path('unicode_series.csv')
def test_unicode_encoding(self, datapath):
pth = datapath('io', 'parser', 'data', 'unicode_series.csv')

result = self.read_csv(pth, header=None, encoding='latin-1')
result = result.set_index(0)
Expand Down Expand Up @@ -1513,10 +1511,9 @@ def test_internal_eof_byte_to_file(self):
result = self.read_csv(path)
tm.assert_frame_equal(result, expected)

def test_sub_character(self):
def test_sub_character(self, datapath):
# see gh-16893
dirpath = tm.get_data_path()
filename = os.path.join(dirpath, "sub_char.csv")
filename = datapath('io', 'parser', 'data', 'sub_char.csv')

expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])
result = self.read_csv(filename)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/parser/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,9 @@ def test_read_csv_infer_compression(self):

tm.assert_frame_equal(expected, df)

def test_read_csv_compressed_utf16_example(self):
def test_read_csv_compressed_utf16_example(self, datapath):
# GH18071
path = tm.get_data_path('utf16_ex_small.zip')
path = datapath('io', 'parser', 'data', 'utf16_ex_small.zip')

result = self.read_csv(path, encoding='utf-16',
compression='zip', sep='\t')
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/io/parser/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,17 +125,17 @@ def test_categorical_dtype_high_cardinality_numeric(self):
np.sort(actual.a.cat.categories), ordered=True)
tm.assert_frame_equal(actual, expected)

def test_categorical_dtype_encoding(self):
def test_categorical_dtype_encoding(self, datapath):
# GH 10153
pth = tm.get_data_path('unicode_series.csv')
pth = datapath('io', 'parser', 'data', 'unicode_series.csv')
encoding = 'latin-1'
expected = self.read_csv(pth, header=None, encoding=encoding)
expected[1] = Categorical(expected[1])
actual = self.read_csv(pth, header=None, encoding=encoding,
dtype={1: 'category'})
tm.assert_frame_equal(actual, expected)

pth = tm.get_data_path('utf16_ex.txt')
pth = datapath('io', 'parser', 'data', 'utf16_ex.txt')
encoding = 'utf-16'
expected = self.read_table(pth, encoding=encoding)
expected = expected.apply(Categorical)
Expand Down
Loading