Skip to content

PKG: Exclude data test files. #19535

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 40 commits into from
Jun 26, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
4d77cd8
PKG: Exclude data test files.
TomAugspurger Feb 3, 2018
270e442
Stuff
TomAugspurger Feb 6, 2018
26e9b4b
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger Feb 22, 2018
1804bcc
Refactor data path handling
TomAugspurger Feb 25, 2018
7022152
More fixtures
TomAugspurger Feb 25, 2018
080f000
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger Mar 26, 2018
151ffda
Updated html
TomAugspurger Mar 26, 2018
d9d6570
Remove os.path.joins
TomAugspurger Mar 26, 2018
5849591
More modules
TomAugspurger Mar 27, 2018
31fb0b6
Some more
TomAugspurger Mar 27, 2018
9193f15
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger Mar 27, 2018
e897f11
Updated packers
TomAugspurger Mar 27, 2018
9cf30fd
Pickle
TomAugspurger Mar 27, 2018
95cde7a
Linting
TomAugspurger Mar 27, 2018
10ddddc
Autouse stata
TomAugspurger Mar 27, 2018
e1ea208
Remove filename
TomAugspurger Mar 27, 2018
8616878
Autouse in merge_asof
TomAugspurger Mar 27, 2018
77bf77c
Cleanup plotting
TomAugspurger Mar 27, 2018
156e14b
CLN: Simplify fspath
TomAugspurger Mar 27, 2018
f3f3662
Refactor sql tests
TomAugspurger Mar 27, 2018
2cd9706
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger Mar 27, 2018
aac3606
Fixed docstrings
TomAugspurger Mar 27, 2018
762a2d1
Moved
TomAugspurger Mar 27, 2018
7c44b77
Use fixture for iris plotting
TomAugspurger Mar 28, 2018
ad09951
Abs path for file test
TomAugspurger Mar 28, 2018
6f02d6b
Removed stdout capture from sql tests
TomAugspurger Mar 29, 2018
489f540
Merge remote-tracking branch 'origin/master' into package-size
TomAugspurger May 12, 2018
8b42d1c
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger May 12, 2018
ee4fefd
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger Jun 12, 2018
bac438c
Cleanup Manifest
TomAugspurger Jun 12, 2018
84ccdbf
fixed test test
TomAugspurger Jun 12, 2018
c4802db
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger Jun 20, 2018
7fd7660
Fixed windows
TomAugspurger Jun 20, 2018
c187f8b
whatsnew
TomAugspurger Jun 20, 2018
632a61d
Clarify note [ci skip]
TomAugspurger Jun 20, 2018
b5b70c7
TST: refactored html tests
TomAugspurger Jun 21, 2018
9954bba
Remove auto-generated html fixtures
TomAugspurger Jun 22, 2018
c771885
linting
TomAugspurger Jun 22, 2018
dd75270
Removed test test file
TomAugspurger Jun 26, 2018
dbe0c57
Merge remote-tracking branch 'upstream/master' into package-size
TomAugspurger Jun 26, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 23 additions & 11 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,39 @@ include LICENSE
include RELEASE.md
include README.md
include setup.py
include pyproject.toml

graft doc
prune doc/build

graft LICENSES

graft pandas

global-exclude *.so
global-exclude *.pyd
global-exclude *.bz2
global-exclude *.csv
global-exclude *.dta
global-exclude *.gz
global-exclude *.h5
global-exclude *.html
global-exclude *.json
global-exclude *.msgpack
global-exclude *.pickle
global-exclude *.png
global-exclude *.pyc
global-exclude *.pyd
global-exclude *.sas7bdat
global-exclude *.so
global-exclude *.xls
global-exclude *.xlsm
global-exclude *.xlsx
global-exclude *.xpt
global-exclude *.xz
global-exclude *.zip
global-exclude *~
global-exclude \#*
global-exclude .git*
global-exclude .DS_Store
global-exclude *.png
global-exclude .git*
global-exclude \#*

# include examples/data/*
# recursive-include examples *.py
# recursive-include doc/source *
# recursive-include doc/sphinxext *
# recursive-include LICENSES *
include versioneer.py
include pandas/_version.py
include pandas/io/formats/templates/*.tpl
8 changes: 4 additions & 4 deletions ci/script_single.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ if [ "$DOC" ]; then
echo "We are not running pytest as this is a doc-build"

elif [ "$COVERAGE" ]; then
echo pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas
pytest -s -m "single" --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas
echo pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe should make a variable that holds all of these options for both the echo and the run to avoid duplication

pytest -s -m "single" -r xXs --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas

else
echo pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas
pytest -m "single" -r xX --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest
echo pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas
pytest -m "single" -r xXs --junitxml=/tmp/single.xml --strict $TEST_ARGS pandas # TODO: doctest

fi

Expand Down
5 changes: 5 additions & 0 deletions doc/source/whatsnew/v0.23.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ Documentation Changes
-
-

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a ref here as well

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

General note: IMO it is not needed to always ask this of contributors, as this ref is only needed when we actually want to make an explicit link to it from within the rst files (and chances are quite high we will never do this). The ref can always be added at the moment one adds a link.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure, but in general it's a good practice

Build Changes
-------------

- The source and binary distributions no longer include test data files, resulting in smaller download sizes. Tests relying on these data files will be skipped when using ``pandas.test()``. (:issue:`19320`)

.. _whatsnew_0232.bug_fixes:

Bug Fixes
Expand Down
41 changes: 41 additions & 0 deletions pandas/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import os
import importlib

import pytest

import pandas
import numpy as np
import pandas as pd
from pandas.compat import PY3
Expand All @@ -17,6 +19,8 @@ def pytest_addoption(parser):
help="run high memory tests")
parser.addoption("--only-slow", action="store_true",
help="run only slow tests")
parser.addoption("--strict-data-files", action="store_true",
help="Fail if a test is skipped for missing data file.")


def pytest_runtest_setup(item):
Expand Down Expand Up @@ -131,6 +135,43 @@ def join_type(request):
return request.param


@pytest.fixture
def datapath(request):
    """Fixture returning a resolver for test data files.

    The returned callable joins its arguments onto ``pandas/tests`` and
    yields the resulting path.

    Parameters (of the returned callable)
    -------------------------------------
    *parts : str
        Path components relative to ``pandas/tests/``.

    Returns
    -------
    str
        The assembled path, including the ``pandas/tests`` prefix.

    Raises
    ------
    ValueError
        When the file is missing and ``--strict-data-files`` was passed;
        otherwise the test is skipped instead.
    """
    def _resolve(*parts):
        candidate = os.path.join('pandas', 'tests', *parts)
        if os.path.exists(candidate):
            return candidate
        # File is absent: fail hard under --strict-data-files, skip otherwise.
        if request.config.getoption("--strict-data-files"):
            msg = "Could not find file {} and --strict-data-files is set."
            raise ValueError(msg.format(candidate))
        msg = "Could not find {}."
        pytest.skip(msg.format(candidate))
    return _resolve


@pytest.fixture
def iris(datapath):
    """Return the iris dataset loaded as a DataFrame."""
    csv_path = datapath('data', 'iris.csv')
    return pandas.read_csv(csv_path)


@pytest.fixture(params=['nlargest', 'nsmallest'])
def nselect_method(request):
"""
Expand Down
8 changes: 4 additions & 4 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1182,12 +1182,12 @@ def test_iter(self):
('baz', 'two'), ('qux', 'one'), ('qux', 'two')]
assert result == expected

def test_legacy_pickle(self):
def test_legacy_pickle(self, datapath):
if PY3:
pytest.skip("testing for legacy pickles not "
"support on py3")

path = tm.get_data_path('multiindex_v1.pickle')
path = datapath('indexes', 'data', 'multiindex_v1.pickle')
obj = pd.read_pickle(path)

obj2 = MultiIndex.from_tuples(obj.values)
Expand All @@ -1203,10 +1203,10 @@ def test_legacy_pickle(self):
assert_almost_equal(res, exp)
assert_almost_equal(exp, exp2)

def test_legacy_v2_unpickle(self):
def test_legacy_v2_unpickle(self, datapath):

# 0.7.3 -> 0.8.0 format manage
path = tm.get_data_path('mindex_073.pickle')
path = datapath('indexes', 'data', 'mindex_073.pickle')
obj = pd.read_pickle(path)

obj2 = MultiIndex.from_tuples(obj.values)
Expand Down
21 changes: 6 additions & 15 deletions pandas/tests/io/conftest.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,23 @@
import os

import pytest
from pandas.io.parsers import read_table
from pandas.util import testing as tm


@pytest.fixture
def parser_data(request):
return os.path.join(tm.get_data_path(), '..', 'parser', 'data')


@pytest.fixture
def tips_file(parser_data):
def tips_file(datapath):
"""Path to the tips dataset"""
return os.path.join(parser_data, 'tips.csv')
return datapath('io', 'parser', 'data', 'tips.csv')


@pytest.fixture
def jsonl_file(parser_data):
def jsonl_file(datapath):
"""Path a JSONL dataset"""
return os.path.join(parser_data, 'items.jsonl')
return datapath('io', 'parser', 'data', 'items.jsonl')


@pytest.fixture
def salaries_table(parser_data):
def salaries_table(datapath):
"""DataFrame with the salaries dataset"""
path = os.path.join(parser_data, 'salaries.csv')
return read_table(path)
return read_table(datapath('io', 'parser', 'data', 'salaries.csv'))


@pytest.fixture
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/formats/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -916,8 +916,8 @@ def test_unicode_problem_decoding_as_ascii(self):
dm = DataFrame({u('c/\u03c3'): Series({'test': np.nan})})
compat.text_type(dm.to_string())

def test_string_repr_encoding(self):
filepath = tm.get_data_path('unicode_series.csv')
def test_string_repr_encoding(self, datapath):
filepath = datapath('io', 'formats', 'data', 'unicode_series.csv')
df = pd.read_csv(filepath, header=None, encoding='latin1')
repr(df)
repr(df[1])
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/io/json/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ def test_compression_roundtrip(compression):
assert_frame_equal(df, pd.read_json(result))


def test_read_zipped_json():
uncompressed_path = tm.get_data_path("tsframe_v012.json")
def test_read_zipped_json(datapath):
uncompressed_path = datapath("io", "json", "data", "tsframe_v012.json")
uncompressed_df = pd.read_json(uncompressed_path)

compressed_path = tm.get_data_path("tsframe_v012.json.zip")
compressed_path = datapath("io", "json", "data", "tsframe_v012.json.zip")
compressed_df = pd.read_json(compressed_path, compression='zip')

assert_frame_equal(uncompressed_df, compressed_df)
Expand Down
8 changes: 5 additions & 3 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,9 @@

class TestPandasContainer(object):

def setup_method(self, method):
self.dirpath = tm.get_data_path()
@pytest.fixture(scope="function", autouse=True)
def setup(self, datapath):
self.dirpath = datapath("io", "json", "data")

self.ts = tm.makeTimeSeries()
self.ts.name = 'ts'
Expand All @@ -59,7 +60,8 @@ def setup_method(self, method):
self.mixed_frame = _mixed_frame.copy()
self.categorical = _cat_frame.copy()

def teardown_method(self, method):
yield
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I learned that autouse fixtures can include yield, letting us do the teardown here, which is nice.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the other hand, it also makes this more "complex" to understand IMO (the teardown_method is rather explicit and easy to understand what is going on, while the yield is a less standard fancy feature of pytest ).
Has the autouse=True other advantages? (what's the reason you needed that here?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two things

  1. datapath can only be called inside a test, since it inspects the request context, to see if --strict-data-files is set.
  2. datapath is a fixture, and you can't mix xUnit style setup_method with fixtures. autouse fixtures are the easiest way to replicate xUnit stuff, while being able to use fixtures.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've done this myself a few times so wanted to chime in. autouse=True allows the fixture to apply to every test within its scope without having to explicitly decorate or pass that fixture as an arg.

This approach is more in line with how pytest suggests doing setup/teardown (see here) so I think that's a +1 for it. It also gives you potential visibility into the context of the yield tests (ex: here I think you could replace the datapath fixture usage as an argument via an assignment to self.datapath before yielding)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, that's a good reason to use an autouse fixture here in this case.

(you don't need to convince me of the benefit of fixtures in general :), however, I think many people are not that familiar with all those pytest special features and it has a steeper learning curve IMO, so there can be a balance in how fancy we go)

Copy link
Contributor Author

@TomAugspurger TomAugspurger Mar 27, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah. When documenting this, I'll recommend against autouse for cases like this. I think it'd be better to just have a categorical fixture and accept that in the test. But that would have meant a lot more potential for breaking things in this PR :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah we should really not use this pattern, rather changing to all fixtures. as a temporary workaround this ok, can you create an issue to 'fix' this properly though.


del self.dirpath

del self.ts
Expand Down
25 changes: 11 additions & 14 deletions pandas/tests/io/parser/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def test_read_csv(self):
else:
prefix = u("file://")

fname = prefix + compat.text_type(self.csv1)
fname = prefix + compat.text_type(os.path.abspath(self.csv1))
self.read_csv(fname, index_col=0, parse_dates=True)

def test_1000_sep(self):
Expand Down Expand Up @@ -651,21 +651,19 @@ def test_read_csv_parse_simple_list(self):
tm.assert_frame_equal(df, expected)

@tm.network
def test_url(self):
def test_url(self, datapath):
# HTTP(S)
url = ('https://raw.github.com/pandas-dev/pandas/master/'
'pandas/tests/io/parser/data/salaries.csv')
url_table = self.read_table(url)
dirpath = tm.get_data_path()
localtable = os.path.join(dirpath, 'salaries.csv')
localtable = datapath('io', 'parser', 'data', 'salaries.csv')
local_table = self.read_table(localtable)
tm.assert_frame_equal(url_table, local_table)
# TODO: ftp testing

@pytest.mark.slow
def test_file(self):
dirpath = tm.get_data_path()
localtable = os.path.join(dirpath, 'salaries.csv')
def test_file(self, datapath):
localtable = datapath('io', 'parser', 'data', 'salaries.csv')
local_table = self.read_table(localtable)

try:
Expand Down Expand Up @@ -755,8 +753,8 @@ def test_utf16_bom_skiprows(self):

tm.assert_frame_equal(result, expected)

def test_utf16_example(self):
path = tm.get_data_path('utf16_ex.txt')
def test_utf16_example(self, datapath):
path = datapath('io', 'parser', 'data', 'utf16_ex.txt')

# it works! and is the right length
result = self.read_table(path, encoding='utf-16')
Expand All @@ -767,8 +765,8 @@ def test_utf16_example(self):
result = self.read_table(buf, encoding='utf-16')
assert len(result) == 50

def test_unicode_encoding(self):
pth = tm.get_data_path('unicode_series.csv')
def test_unicode_encoding(self, datapath):
pth = datapath('io', 'parser', 'data', 'unicode_series.csv')

result = self.read_csv(pth, header=None, encoding='latin-1')
result = result.set_index(0)
Expand Down Expand Up @@ -1513,10 +1511,9 @@ def test_internal_eof_byte_to_file(self):
result = self.read_csv(path)
tm.assert_frame_equal(result, expected)

def test_sub_character(self):
def test_sub_character(self, datapath):
# see gh-16893
dirpath = tm.get_data_path()
filename = os.path.join(dirpath, "sub_char.csv")
filename = datapath('io', 'parser', 'data', 'sub_char.csv')

expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])
result = self.read_csv(filename)
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/parser/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,9 @@ def test_read_csv_infer_compression(self):

tm.assert_frame_equal(expected, df)

def test_read_csv_compressed_utf16_example(self):
def test_read_csv_compressed_utf16_example(self, datapath):
# GH18071
path = tm.get_data_path('utf16_ex_small.zip')
path = datapath('io', 'parser', 'data', 'utf16_ex_small.zip')

result = self.read_csv(path, encoding='utf-16',
compression='zip', sep='\t')
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/io/parser/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,17 +125,17 @@ def test_categorical_dtype_high_cardinality_numeric(self):
np.sort(actual.a.cat.categories), ordered=True)
tm.assert_frame_equal(actual, expected)

def test_categorical_dtype_encoding(self):
def test_categorical_dtype_encoding(self, datapath):
# GH 10153
pth = tm.get_data_path('unicode_series.csv')
pth = datapath('io', 'parser', 'data', 'unicode_series.csv')
encoding = 'latin-1'
expected = self.read_csv(pth, header=None, encoding=encoding)
expected[1] = Categorical(expected[1])
actual = self.read_csv(pth, header=None, encoding=encoding,
dtype={1: 'category'})
tm.assert_frame_equal(actual, expected)

pth = tm.get_data_path('utf16_ex.txt')
pth = datapath('io', 'parser', 'data', 'utf16_ex.txt')
encoding = 'utf-16'
expected = self.read_table(pth, encoding=encoding)
expected = expected.apply(Categorical)
Expand Down
Loading