pandas-dev · TomAugspurger · Jun 26, 2018 · Feb 3, 2018 · Feb 6, 2018 · Feb 22, 2018
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -10,20 +10,33 @@ prune doc/build
 
 graft pandas
 
-global-exclude *.so
-global-exclude *.pyd
+global-exclude *.bz2
+global-exclude *.csv
+global-exclude *.dta
+global-exclude *.gz
+global-exclude *.h5
+global-exclude *.html
+global-exclude *.json
+global-exclude *.msgpack
+global-exclude *.pickle
+global-exclude *.png
 global-exclude *.pyc
+global-exclude *.pyd
+global-exclude *.sas7bdat
+global-exclude *.so
+global-exclude *.xls
+global-exclude *.xlsm
+global-exclude *.xlsx
+global-exclude *.xpt
+global-exclude *.xz
+global-exclude *.zip
 global-exclude *~
-global-exclude \#*
-global-exclude .git*
 global-exclude .DS_Store
-global-exclude *.png
+global-exclude .git*
+global-exclude \#*
+
+recursive-exclude pandas/tests/io/data *
 
-# include examples/data/*
-# recursive-include examples *.py
-# recursive-include doc/source *
-# recursive-include doc/sphinxext *
-# recursive-include LICENSES *
 include versioneer.py
 include pandas/_version.py
 include pandas/io/formats/templates/*.tpl
diff --git a/ci/script_single.sh b/ci/script_single.sh
@@ -25,12 +25,12 @@ if [ "$DOC" ]; then
     echo "We are not running pytest as this is a doc-build"
 
 elif [ "$COVERAGE" ]; then
-    echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas
-    pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas
+    echo pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas
+    pytest      -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas
 
 else
-    echo pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas
-    pytest -m "single" -r xX  --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest
+    echo pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas
+    pytest      -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest
 
 fi
 

diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -16,6 +16,8 @@ def pytest_addoption(parser):
                      help="run high memory tests")
     parser.addoption("--only-slow", action="store_true",
                      help="run only slow tests")
+    parser.addoption("--strict-data-files", action="store_true",
+                     help="Fail if a test is skipped for missing data file.")
 
 
 def pytest_runtest_setup(item):

diff --git a/pandas/tests/conftest.py b/pandas/tests/conftest.py
@@ -0,0 +1,32 @@
+import os
+
+import pytest
+
+
+@pytest.fixture
+def datapath(request):
+    """Return a function that resolves the path to a test data file.
+
+    The returned callable takes ``path`` (str), the location of the file
+    relative to ``pandas/tests/``, and returns the full path to that file.
+    The base directory is taken from this conftest's own location, so the
+    lookup does not depend on the directory pytest is started from.
+
+    Raises
+    ------
+    ValueError
+        If the path doesn't exist and the --strict-data-files option is set.
+        Without that option the requesting test is skipped instead.
+    """
+    base = os.path.dirname(os.path.abspath(__file__))
+
+    def deco(path):
+        path = os.path.join(base, path)
+        if not os.path.exists(path):
+            if request.config.getoption("--strict-data-files"):
+                msg = "Could not find file {} and --strict-data-files is set."
+                raise ValueError(msg.format(path))
+            # Data files are excluded from the sdist; skip rather than fail.
+            pytest.skip("Data files not included in pandas distribution.")
+        return path
+    return deco
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
@@ -11,22 +11,22 @@ def parser_data(request):
 
 
 @pytest.fixture
-def tips_file(parser_data):
+def tips_file(datapath):
     """Path to the tips dataset"""
-    return os.path.join(parser_data, 'tips.csv')
+    return datapath(os.path.join('io', 'parser', 'data', 'tips.csv'))
 
 
 @pytest.fixture
-def jsonl_file(parser_data):
+def jsonl_file(datapath):
     """Path a JSONL dataset"""
-    return os.path.join(parser_data, 'items.jsonl')
+    return datapath(os.path.join('io', 'parser', 'data', 'items.jsonl'))
 
 
 @pytest.fixture
-def salaries_table(parser_data):
+def salaries_table(datapath):
     """DataFrame with the salaries dataset"""
-    path = os.path.join(parser_data, 'salaries.csv')
+    path = datapath(os.path.join('io', 'parser', 'data', 'salaries.csv'))
     return read_table(path)
 
 
 @pytest.fixture

diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
@@ -170,6 +170,8 @@ def test_read_non_existant(self, reader, module, error_class, fn_ext):
     ])
     def test_read_fspath_all(self, reader, module, path):
         pytest.importorskip(module)
+        if not os.path.exists(path):
+            pytest.skip("Data files not included in pandas distribution.")
 
         mypath = CustomFSPath(path)
         result = reader(mypath)
@@ -232,13 +234,14 @@ def test_write_fspath_hdf5(self):
         tm.assert_frame_equal(result, expected)
 
 
-class TestMMapWrapper(object):
+@pytest.fixture
+def mmap_file(datapath):
+    return datapath(os.path.join('io', 'data', 'test_mmap.csv'))
+
 
-    def setup_method(self, method):
-        self.mmap_file = os.path.join(tm.get_data_path(),
-                                      'test_mmap.csv')
+class TestMMapWrapper(object):
 
-    def test_constructor_bad_file(self):
+    def test_constructor_bad_file(self, mmap_file):
         non_file = StringIO('I am not a file')
         non_file.fileno = lambda: -1
 
@@ -252,15 +255,15 @@ def test_constructor_bad_file(self):
 
         tm.assert_raises_regex(err, msg, common.MMapWrapper, non_file)
 
-        target = open(self.mmap_file, 'r')
+        target = open(mmap_file, 'r')
         target.close()
 
         msg = "I/O operation on closed file"
         tm.assert_raises_regex(
             ValueError, msg, common.MMapWrapper, target)
 
-    def test_get_attr(self):
-        with open(self.mmap_file, 'r') as target:
+    def test_get_attr(self, mmap_file):
+        with open(mmap_file, 'r') as target:
             wrapper = common.MMapWrapper(target)
 
         attrs = dir(wrapper.mmap)
@@ -273,8 +276,8 @@ def test_get_attr(self):
 
         assert not hasattr(wrapper, 'foo')
 
-    def test_next(self):
-        with open(self.mmap_file, 'r') as target:
+    def test_next(self, mmap_file):
+        with open(mmap_file, 'r') as target:
             wrapper = common.MMapWrapper(target)
             lines = target.readlines()
 

diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -65,9 +65,6 @@ def _skip_if_none_of(module_names):
                 pytest.skip("Bad version of bs4: 4.2.0")
 
 
-DATA_PATH = tm.get_data_path()
-
-
 def assert_framelist_equal(list1, list2, *args, **kwargs):
     assert len(list1) == len(list2), ('lists are not of equal size '
                                       'len(list1) == {0}, '
@@ -86,8 +83,8 @@ def test_bs4_version_fails():
     _skip_if_none_of(('bs4', 'html5lib'))
     import bs4
     if LooseVersion(bs4.__version__) == LooseVersion('4.2.0'):
-        tm.assert_raises(AssertionError, read_html, os.path.join(DATA_PATH,
-                                                                 "spam.html"),
+        tm.assert_raises(AssertionError, read_html,
+                         os.path.join(tm.get_data_path(), "spam.html"),
                          flavor='bs4')
 
 
@@ -100,16 +97,17 @@ def read_html(self, *args, **kwargs):
 
 class TestReadHtml(ReadHtmlMixin):
     flavor = 'bs4'
-    spam_data = os.path.join(DATA_PATH, 'spam.html')
-    spam_data_kwargs = {}
-    if PY3:
-        spam_data_kwargs['encoding'] = 'UTF-8'
-    banklist_data = os.path.join(DATA_PATH, 'banklist.html')
 
     @classmethod
     def setup_class(cls):
         _skip_if_none_of(('bs4', 'html5lib'))
 
+        cls.spam_data = os.path.join(tm.get_data_path(), 'spam.html')
+        cls.spam_data_kwargs = {}
+        if PY3:
+            cls.spam_data_kwargs['encoding'] = 'UTF-8'
+        cls.banklist_data = os.path.join(tm.get_data_path(), 'banklist.html')
+
     def test_to_html_compat(self):
         df = mkdf(4, 3, data_gen_f=lambda *args: rand(), c_idx_names=False,
                   r_idx_names=False).applymap('{0:.3f}'.format).astype(float)
@@ -382,7 +380,7 @@ def test_python_docs_table(self):
     @pytest.mark.slow
     def test_thousands_macau_stats(self):
         all_non_nan_table_index = -2
-        macau_data = os.path.join(DATA_PATH, 'macau.html')
+        macau_data = os.path.join(tm.get_data_path(), 'macau.html')
         dfs = self.read_html(macau_data, index_col=0,
                              attrs={'class': 'style1'})
         df = dfs[all_non_nan_table_index]
@@ -392,7 +390,7 @@ def test_thousands_macau_stats(self):
     @pytest.mark.slow
     def test_thousands_macau_index_col(self):
         all_non_nan_table_index = -2
-        macau_data = os.path.join(DATA_PATH, 'macau.html')
+        macau_data = os.path.join(tm.get_data_path(), 'macau.html')
         dfs = self.read_html(macau_data, index_col=0, header=0)
         df = dfs[all_non_nan_table_index]
 
@@ -520,7 +518,7 @@ def test_countries_municipalities(self):
         assert_framelist_equal(res1, res2)
 
     def test_nyse_wsj_commas_table(self):
-        data = os.path.join(DATA_PATH, 'nyse_wsj.html')
+        data = os.path.join(tm.get_data_path(), 'nyse_wsj.html')
         df = self.read_html(data, index_col=0, header=0,
                             attrs={'class': 'mdcTable'})[0]
 
@@ -542,7 +540,8 @@ def try_remove_ws(x):
 
         df = self.read_html(self.banklist_data, 'Metcalf',
                             attrs={'id': 'table'})[0]
-        ground_truth = read_csv(os.path.join(DATA_PATH, 'banklist.csv'),
+        ground_truth = read_csv(os.path.join(tm.get_data_path(),
+                                             'banklist.csv'),
                                 converters={'Updated Date': Timestamp,
                                             'Closing Date': Timestamp})
         assert df.shape == ground_truth.shape
@@ -660,15 +659,15 @@ def test_parse_dates_combine(self):
         tm.assert_frame_equal(newdf, res[0])
 
     def test_computer_sales_page(self):
-        data = os.path.join(DATA_PATH, 'computer_sales_page.html')
+        data = os.path.join(tm.get_data_path(), 'computer_sales_page.html')
         with tm.assert_raises_regex(ParserError,
                                     r"Passed header=\[0,1\] are "
                                     r"too many rows for this "
                                     r"multi_index of columns"):
             self.read_html(data, header=[0, 1])
 
     def test_wikipedia_states_table(self):
-        data = os.path.join(DATA_PATH, 'wikipedia_states.html')
+        data = os.path.join(tm.get_data_path(), 'wikipedia_states.html')
         assert os.path.isfile(data), '%r is not a file' % data
         assert os.path.getsize(data), '%r is an empty file' % data
         result = self.read_html(data, 'Arizona', header=1)[0]
@@ -788,11 +787,14 @@ def _lang_enc(filename):
 
 
 class TestReadHtmlEncoding(object):
-    files = glob.glob(os.path.join(DATA_PATH, 'html_encoding', '*.html'))
     flavor = 'bs4'
 
     @classmethod
     def setup_class(cls):
+        cls.files = glob.glob(os.path.join(tm.get_data_path(),
+                                           'html_encoding',
+                                           '*.html'))
+
         _skip_if_none_of((cls.flavor, 'html5lib'))
 
     def read_html(self, *args, **kwargs):
@@ -847,8 +849,8 @@ def setup_class(cls):
 
     def test_data_fail(self):
         from lxml.etree import XMLSyntaxError
-        spam_data = os.path.join(DATA_PATH, 'spam.html')
-        banklist_data = os.path.join(DATA_PATH, 'banklist.html')
+        spam_data = os.path.join(tm.get_data_path(), 'spam.html')
+        banklist_data = os.path.join(tm.get_data_path(), 'banklist.html')
 
         with pytest.raises(XMLSyntaxError):
             self.read_html(spam_data)
@@ -857,15 +859,15 @@ def test_data_fail(self):
             self.read_html(banklist_data)
 
     def test_works_on_valid_markup(self):
-        filename = os.path.join(DATA_PATH, 'valid_markup.html')
+        filename = os.path.join(tm.get_data_path(), 'valid_markup.html')
         dfs = self.read_html(filename, index_col=0)
         assert isinstance(dfs, list)
         assert isinstance(dfs[0], DataFrame)
 
     @pytest.mark.slow
     def test_fallback_success(self):
         _skip_if_none_of(('bs4', 'html5lib'))
-        banklist_data = os.path.join(DATA_PATH, 'banklist.html')
+        banklist_data = os.path.join(tm.get_data_path(), 'banklist.html')
         self.read_html(banklist_data, '.*Water.*', flavor=['lxml', 'html5lib'])
 
     def test_to_html_timestamp(self):
@@ -893,7 +895,7 @@ def test_parse_dates_combine(self):
         tm.assert_frame_equal(newdf, res[0])
 
     def test_computer_sales_page(self):
-        data = os.path.join(DATA_PATH, 'computer_sales_page.html')
+        data = os.path.join(tm.get_data_path(), 'computer_sales_page.html')
         self.read_html(data, header=[0, 1])
 
 
@@ -914,7 +916,7 @@ def get_elements_from_file(url, element='table'):
 
 @pytest.mark.slow
 def test_bs4_finds_tables():
-    filepath = os.path.join(DATA_PATH, "spam.html")
+    filepath = os.path.join(tm.get_data_path(), "spam.html")
     with warnings.catch_warnings():
         warnings.filterwarnings('ignore')
         assert get_elements_from_file(filepath, 'table')
@@ -929,19 +931,19 @@ def get_lxml_elements(url, element):
 
 @pytest.mark.slow
 def test_lxml_finds_tables():
-    filepath = os.path.join(DATA_PATH, "spam.html")
+    filepath = os.path.join(tm.get_data_path(), "spam.html")
     assert get_lxml_elements(filepath, 'table')
 
 
 @pytest.mark.slow
 def test_lxml_finds_tbody():
-    filepath = os.path.join(DATA_PATH, "spam.html")
+    filepath = os.path.join(tm.get_data_path(), "spam.html")
     assert get_lxml_elements(filepath, 'tbody')
 
 
 def test_same_ordering():
     _skip_if_none_of(['bs4', 'lxml', 'html5lib'])
-    filename = os.path.join(DATA_PATH, 'valid_markup.html')
+    filename = os.path.join(tm.get_data_path(), 'valid_markup.html')
     dfs_lxml = read_html(filename, index_col=0, flavor=['lxml'])
     dfs_bs4 = read_html(filename, index_col=0, flavor=['bs4'])
     assert_framelist_equal(dfs_lxml, dfs_bs4)
@@ -965,7 +967,7 @@ def test_importcheck_thread_safety():
     pytest.importorskip('lxml')
     reload(pandas.io.html)
 
-    filename = os.path.join(DATA_PATH, 'valid_markup.html')
+    filename = os.path.join(tm.get_data_path(), 'valid_markup.html')
     helper_thread1 = ErrorThread(target=read_html, args=(filename,))
     helper_thread2 = ErrorThread(target=read_html, args=(filename,))
 

diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py
@@ -830,6 +830,8 @@ def test_default_encoding(self):
 def legacy_packers_versions():
     # yield the packers versions
     path = tm.get_data_path('legacy_msgpack')
+    if not os.path.exists(path):
+        raise pytest.skip("Data file {} does not exist.".format(path))
     for v in os.listdir(path):
         p = os.path.join(path, v)
         if os.path.isdir(p):