diff --git a/RELEASE.rst b/RELEASE.rst
index a542a406fcfaa..ce5ded5cd61c8 100644
--- a/RELEASE.rst
+++ b/RELEASE.rst
@@ -293,6 +293,7 @@ pandas 0.11.0
   - fixed pretty priniting of sets (GH3294_)
   - Panel() and Panel.from_dict() now respects ordering when give OrderedDict (GH3303_)
   - DataFrame where with a datetimelike incorrectly selecting (GH3311_)
+  - Ensure pickles created in py2 can be read in py3

 .. _GH3294: https://github.com/pydata/pandas/issues/3294
 .. _GH622: https://github.com/pydata/pandas/issues/622
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 2ec2fc97258de..4acaa3f421e3a 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -1575,12 +1575,12 @@ def load(path):
     -------
     unpickled : type of object stored in file
     """
-    f = open(path, 'rb')
     try:
-        return pickle.load(f)
-    finally:
-        f.close()
-
+        with open(path,'rb') as fh:
+            return pickle.load(fh)
+    except:
+        with open(path,'rb') as fh:
+            return pickle.load(fh, encoding='latin1')

 class UTF8Recoder:
     """
diff --git a/pandas/io/tests/legacy.h5 b/pandas/io/tests/data/legacy_hdf/legacy.h5
similarity index 100%
rename from pandas/io/tests/legacy.h5
rename to pandas/io/tests/data/legacy_hdf/legacy.h5
diff --git a/pandas/io/tests/legacy_0.10.h5 b/pandas/io/tests/data/legacy_hdf/legacy_0.10.h5
similarity index 100%
rename from pandas/io/tests/legacy_0.10.h5
rename to pandas/io/tests/data/legacy_hdf/legacy_0.10.h5
diff --git a/pandas/io/tests/legacy_table.h5 b/pandas/io/tests/data/legacy_hdf/legacy_table.h5
similarity index 100%
rename from pandas/io/tests/legacy_table.h5
rename to pandas/io/tests/data/legacy_hdf/legacy_table.h5
diff --git a/pandas/io/tests/pytables_native.h5 b/pandas/io/tests/data/legacy_hdf/pytables_native.h5
similarity index 100%
rename from pandas/io/tests/pytables_native.h5
rename to pandas/io/tests/data/legacy_hdf/pytables_native.h5
diff --git a/pandas/io/tests/pytables_native2.h5 b/pandas/io/tests/data/legacy_hdf/pytables_native2.h5
similarity index 100%
rename from pandas/io/tests/pytables_native2.h5
rename to pandas/io/tests/data/legacy_hdf/pytables_native2.h5
diff --git a/pandas/io/tests/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle b/pandas/io/tests/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle
new file mode 100644
index 0000000000000..84fbd0d989569
Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.10.1/AMD64_windows_2.7.3.pickle differ
diff --git a/pandas/io/tests/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle b/pandas/io/tests/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle
new file mode 100644
index 0000000000000..f0787f30c1453
Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.10.1/x86_64_linux_2.7.3.pickle differ
diff --git a/pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle b/pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle
new file mode 100644
index 0000000000000..f0787f30c1453
Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_2.7.3.pickle differ
diff --git a/pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle b/pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle
new file mode 100644
index 0000000000000..e6ed07d75da64
Binary files /dev/null and b/pandas/io/tests/data/legacy_pickle/0.11.0/x86_64_linux_3.3.0.pickle differ
diff --git a/pandas/io/tests/salary.table b/pandas/io/tests/data/salary.table
similarity index 100%
rename from pandas/io/tests/salary.table
rename to pandas/io/tests/data/salary.table
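The change to load() in pandas/core/common.py above is the py3 compatibility fix named in RELEASE.rst: if a pickle cannot be read with the default loader (typically because it was written under Python 2), the file is re-read with encoding='latin1'. Below is a minimal standalone sketch of that pattern; the helper name and the narrowed exception tuple are assumptions made for illustration (the patch itself simply retries on any exception)::

    # sketch: read a pickle that may have been written by Python 2, under Python 3
    import pickle

    def load_compat(path):
        try:
            # default loader; Python 2 byte strings can fail to decode here
            with open(path, 'rb') as fh:
                return pickle.load(fh)
        except (UnicodeDecodeError, ValueError):
            # encoding='latin1' (a Python 3-only keyword) maps every byte 0-255
            # to a code point, so Python 2 str payloads survive the str/bytes split
            with open(path, 'rb') as fh:
                return pickle.load(fh, encoding='latin1')

latin1 is the usual fallback here because it can decode any byte sequence; it guarantees a lossless byte-to-codepoint mapping rather than correct text for arbitrary encodings.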
diff --git a/pandas/io/tests/test.xls b/pandas/io/tests/data/test.xls
similarity index 100%
rename from pandas/io/tests/test.xls
rename to pandas/io/tests/data/test.xls
diff --git a/pandas/io/tests/test.xlsx b/pandas/io/tests/data/test.xlsx
similarity index 100%
rename from pandas/io/tests/test.xlsx
rename to pandas/io/tests/data/test.xlsx
diff --git a/pandas/io/tests/test1.csv b/pandas/io/tests/data/test1.csv
similarity index 100%
rename from pandas/io/tests/test1.csv
rename to pandas/io/tests/data/test1.csv
diff --git a/pandas/io/tests/test2.csv b/pandas/io/tests/data/test2.csv
similarity index 100%
rename from pandas/io/tests/test2.csv
rename to pandas/io/tests/data/test2.csv
diff --git a/pandas/io/tests/test2.xls b/pandas/io/tests/data/test2.xls
similarity index 100%
rename from pandas/io/tests/test2.xls
rename to pandas/io/tests/data/test2.xls
diff --git a/pandas/io/tests/test3.xls b/pandas/io/tests/data/test3.xls
similarity index 100%
rename from pandas/io/tests/test3.xls
rename to pandas/io/tests/data/test3.xls
diff --git a/pandas/io/tests/data/unicode_series.csv b/pandas/io/tests/data/unicode_series.csv
new file mode 100644
index 0000000000000..2485e149edb06
--- /dev/null
+++ b/pandas/io/tests/data/unicode_series.csv
@@ -0,0 +1,18 @@
+1617,King of New York (1990)
+1618,All Things Fair (1996)
+1619,"Sixth Man, The (1997)"
+1620,Butterfly Kiss (1995)
+1621,"Paris, France (1993)"
+1622,"Cérémonie, La (1995)"
+1623,Hush (1998)
+1624,Nightwatch (1997)
+1625,Nobody Loves Me (Keiner liebt mich) (1994)
+1626,"Wife, The (1995)"
+1627,Lamerica (1994)
+1628,Nico Icon (1995)
+1629,"Silence of the Palace, The (Saimt el Qusur) (1994)"
+1630,"Slingshot, The (1993)"
+1631,Land and Freedom (Tierra y libertad) (1995)
+1632,Á köldum klaka (Cold Fever) (1994)
+1633,Etz Hadomim Tafus (Under the Domin Tree) (1994)
+1634,Two Friends (1986)
diff --git a/pandas/io/tests/utf16_ex.txt b/pandas/io/tests/data/utf16_ex.txt
similarity index 100%
rename from pandas/io/tests/utf16_ex.txt
rename to pandas/io/tests/data/utf16_ex.txt
diff --git a/pandas/io/tests/generate_legacy_pickles.py b/pandas/io/tests/generate_legacy_pickles.py
new file mode 100644
index 0000000000000..1838e0907233c
--- /dev/null
+++ b/pandas/io/tests/generate_legacy_pickles.py
@@ -0,0 +1,119 @@
+""" self-contained to write legacy pickle files """
+
+def _create_sp_series():
+
+    import numpy as np
+    from pandas import bdate_range, SparseSeries
+
+    nan = np.nan
+
+    # nan-based
+    arr = np.arange(15, dtype=float)
+    index = np.arange(15)
+    arr[7:12] = nan
+    arr[-1:] = nan
+
+    date_index = bdate_range('1/1/2011', periods=len(index))
+    bseries = SparseSeries(arr, index=index, kind='block')
+    bseries.name = 'bseries'
+    return bseries
+
+def _create_sp_frame():
+    import numpy as np
+    from pandas import bdate_range, SparseDataFrame
+
+    nan = np.nan
+
+    data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
+            'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
+            'C': np.arange(10),
+            'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}
+
+    dates = bdate_range('1/1/2011', periods=10)
+    return SparseDataFrame(data, index=dates)
+
+def create_data():
+    """ create the pickle data """
+
+    import numpy as np
+    import pandas
+    from pandas import (Series,DataFrame,Panel,
+                        SparseSeries,SparseDataFrame,SparsePanel,
+                        Index,MultiIndex,PeriodIndex,
+                        date_range,bdate_range,Timestamp)
+    nan = np.nan
+
+    data = {
+        'A': [0., 1., 2., 3., np.nan],
+        'B': [0, 1, 0, 1, 0],
+        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
+        'D': date_range('1/1/2009', periods=5),
+        'E' : [0., 1, Timestamp('20100101'),'foo',2.],
+        }
+
+    index = dict(int = Index(np.arange(10)),
+                 date = date_range('20130101',periods=10))
+    mi = dict(reg = MultiIndex.from_tuples(zip([['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
+                                                ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]),
+                                           names=['first', 'second']))
+    series = dict(float = Series(data['A']),
+                  int = Series(data['B']),
+                  mixed = Series(data['E']))
+    frame = dict(float = DataFrame(dict(A = series['float'], B = series['float'] + 1)),
+                 int = DataFrame(dict(A = series['int'] , B = series['int'] + 1)),
+                 mixed = DataFrame(dict([ (k,data[k]) for k in ['A','B','C','D']])))
+    panel = dict(float = Panel(dict(ItemA = frame['float'], ItemB = frame['float']+1)))
+
+
+
+    return dict( series = series,
+                 frame = frame,
+                 panel = panel,
+                 index = index,
+                 mi = mi,
+                 sp_series = dict(float = _create_sp_series()),
+                 sp_frame = dict(float = _create_sp_frame())
+                 )
+
+def write_legacy_pickles():
+
+    # force our cwd to be the first searched
+    import sys
+    sys.path.insert(0,'.')
+
+    import os
+    import numpy as np
+    import pandas
+    import pandas.util.testing as tm
+    import platform as pl
+    import cPickle as pickle
+
+    print("This script generates a pickle file for the current arch, system, and python version")
+
+    base_dir, _ = os.path.split(os.path.abspath(__file__))
+    base_dir = os.path.join(base_dir,'data/legacy_pickle')
+
+    # could make this a parameter?
+    version = None
+
+
+    if version is None:
+        version = pandas.__version__
+    pth = os.path.join(base_dir, str(version))
+    try:
+        os.mkdir(pth)
+    except:
+        pass
+
+    # construct a reasonable platform name
+    f = '_'.join([ str(pl.machine()), str(pl.system().lower()), str(pl.python_version()) ])
+    pth = os.path.abspath(os.path.join(pth,'%s.pickle' % f))
+
+    fh = open(pth,'wb')
+    pickle.dump(create_data(),fh,pickle.HIGHEST_PROTOCOL)
+    fh.close()
+
+    print("created pickle file: %s" % pth)
+
+if __name__ == '__main__':
+    write_legacy_pickles()
diff --git a/pandas/io/tests/test_cparser.py b/pandas/io/tests/test_cparser.py
index 5a7e646eca0eb..b352b189a74b8 100644
--- a/pandas/io/tests/test_cparser.py
+++ b/pandas/io/tests/test_cparser.py
@@ -31,15 +31,10 @@
 import pandas._parser as parser


-def curpath():
-    pth, _ = os.path.split(os.path.abspath(__file__))
-    return pth
-
-
 class TestCParser(unittest.TestCase):

     def setUp(self):
-        self.dirpath = curpath()
+        self.dirpath = tm.get_data_path('/')
         self.csv1 = os.path.join(self.dirpath, 'test1.csv')
         self.csv2 = os.path.join(self.dirpath, 'test2.csv')
         self.xls1 = os.path.join(self.dirpath, 'test.xls')
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
index 00005c7570a28..ccd9cbc56b2a5 100644
--- a/pandas/io/tests/test_excel.py
+++ b/pandas/io/tests/test_excel.py
@@ -65,10 +65,6 @@ def _skip_if_no_excelsuite():
     _skip_if_no_openpyxl()


-def curpath():
-    pth, _ = os.path.split(os.path.abspath(__file__))
-    return pth
-
 _seriesd = tm.getSeriesData()
 _tsd = tm.getTimeSeriesData()
 _frame = DataFrame(_seriesd)[:10]
@@ -81,7 +77,7 @@ def curpath():
 class ExcelTests(unittest.TestCase):

     def setUp(self):
-        self.dirpath = curpath()
+        self.dirpath = tm.get_data_path()
         self.csv1 = os.path.join(self.dirpath, 'test1.csv')
         self.csv2 = os.path.join(self.dirpath, 'test2.csv')
         self.xls1 = os.path.join(self.dirpath, 'test.xls')
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 358f64df685d7..0e64211163ae3 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -59,7 +59,7 @@ def setUp(self):
         import warnings
         warnings.filterwarnings(action='ignore', category=FutureWarning)
-        self.dirpath = curpath()
+        self.dirpath = tm.get_data_path()
         self.csv1 = os.path.join(self.dirpath, 'test1.csv')
         self.csv2 = os.path.join(self.dirpath, 'test2.csv')
         self.xls1 = os.path.join(self.dirpath, 'test.xls')
@@ -1208,7 +1208,7 @@ def test_url(self):
             url = ('https://raw.github.com/pydata/pandas/master/'
                    'pandas/io/tests/salary.table')
             url_table = self.read_table(url)
-            dirpath = curpath()
+            dirpath = tm.get_data_path()
             localtable = os.path.join(dirpath, 'salary.table')
             local_table = self.read_table(localtable)
             tm.assert_frame_equal(url_table, local_table)
@@ -1229,7 +1229,7 @@ def test_file(self):
         # FILE
         if sys.version_info[:2] < (2, 6):
             raise nose.SkipTest("file:// not supported with Python < 2.6")
-        dirpath = curpath()
+        dirpath = tm.get_data_path()
         localtable = os.path.join(dirpath, 'salary.table')
         local_table = self.read_table(localtable)

@@ -1404,7 +1404,7 @@ def test_utf16_bom_skiprows(self):
             tm.assert_frame_equal(result, expected)

     def test_utf16_example(self):
-        path = os.path.join(self.dirpath, 'utf16_ex.txt')
+        path = tm.get_data_path('utf16_ex.txt')

         # it works! and is the right length
         result = self.read_table(path, encoding='utf-16')
@@ -1476,8 +1476,7 @@ def convert_score(x):
         tm.assert_frame_equal(result, result2)

     def test_unicode_encoding(self):
-        pth = psplit(psplit(curpath())[0])[0]
-        pth = os.path.join(pth, 'tests/data/unicode_series.csv')
+        pth = tm.get_data_path('unicode_series.csv')

         result = self.read_csv(pth, header=None, encoding='latin-1')
         result = result.set_index(0)
@@ -2185,11 +2184,6 @@ def assert_same_values_and_dtype(res, exp):
     assert_almost_equal(res, exp)


-def curpath():
-    pth, _ = os.path.split(os.path.abspath(__file__))
-    return pth
-
-
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)
diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py
new file mode 100644
index 0000000000000..b8c3da039cc82
--- /dev/null
+++ b/pandas/io/tests/test_pickle.py
@@ -0,0 +1,74 @@
+# pylint: disable=E1101,E1103,W0232
+
+""" manage legacy pickle tests """
+
+from datetime import datetime, timedelta
+import operator
+import pickle
+import unittest
+import nose
+import os
+
+import numpy as np
+import pandas.util.testing as tm
+import pandas as pd
+from pandas import Index
+from pandas.sparse.tests import test_sparse
+
+class TestPickle(unittest.TestCase):
+    _multiprocess_can_split_ = True
+
+    def setUp(self):
+        from pandas.io.tests.generate_legacy_pickles import create_data
+        self.data = create_data()
+
+    def compare(self, vf):
+
+        # py3 compat when reading py2 pickle
+
+        try:
+            with open(vf,'rb') as fh:
+                data = pickle.load(fh)
+        except (ValueError):
+
+            # we are trying to read a py3 pickle in py2.....
+            return
+        except:
+            with open(vf,'rb') as fh:
+                data = pickle.load(fh, encoding='latin1')
+
+        for typ, dv in data.items():
+            for dt, result in dv.items():
+
+                expected = self.data[typ][dt]
+
+                if isinstance(expected,Index):
+                    self.assert_(expected.equals(result))
+                    continue
+
+                if typ.startswith('sp_'):
+                    comparator = getattr(test_sparse,"assert_%s_equal" % typ)
+                    comparator(result,expected,exact_indices=False)
+                else:
+                    comparator = getattr(tm,"assert_%s_equal" % typ)
+                    comparator(result,expected)
+
+    def test_read_pickles_0_10_1(self):
+
+        pth = tm.get_data_path('legacy_pickle/0.10.1')
+        for f in os.listdir(pth):
+            vf = os.path.join(pth,f)
+            self.compare(vf)
+
+    def test_read_pickles_0_11_0(self):
+
+        pth = tm.get_data_path('legacy_pickle/0.11.0')
+        for f in os.listdir(pth):
+            vf = os.path.join(pth,f)
+            self.compare(vf)
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
+                   # '--with-coverage', '--cover-package=pandas.core'],
+                   exit=False)
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
index 1973c578cb9e6..6acf17b1220a7 100644
--- a/pandas/io/tests/test_pytables.py
+++ b/pandas/io/tests/test_pytables.py
@@ -2324,25 +2324,23 @@ def _check_roundtrip_table(self, obj, comparator, compression=False):
         comparator(retrieved, obj)

     def test_pytables_native_read(self):
-        pth = curpath()
         try:
-            store = HDFStore(os.path.join(pth, 'pytables_native.h5'), 'r')
+            store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native.h5'), 'r')
             d2 = store['detector/readout']
         finally:
             safe_close(store)

         try:
-            store = HDFStore(os.path.join(pth, 'pytables_native2.h5'), 'r')
+            store = HDFStore(tm.get_data_path('legacy_hdf/pytables_native2.h5'), 'r')
             str(store)
             d1 = store['detector']
         finally:
             safe_close(store)

     def test_legacy_read(self):
-        pth = curpath()
         try:
-            store = HDFStore(os.path.join(pth, 'legacy.h5'), 'r')
+            store = HDFStore(tm.get_data_path('legacy_hdf/legacy.h5'), 'r')
             store['a']
             store['b']
             store['c']
@@ -2352,9 +2350,8 @@ def test_legacy_read(self):

     def test_legacy_table_read(self):
         # legacy table types
-        pth = curpath()
         try:
-            store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'r')
+            store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table.h5'), 'r')
             store.select('df1')
             store.select('df2')
             store.select('wp1')
@@ -2376,23 +2373,21 @@ def test_legacy_table_read(self):

     def test_legacy_0_10_read(self):
         # legacy from 0.10
-        pth = curpath()
         try:
-            store = HDFStore(os.path.join(pth, 'legacy_0.10.h5'), 'r')
+            store = HDFStore(tm.get_data_path('legacy_hdf/legacy_0.10.h5'), 'r')
             for k in store.keys():
                 store.select(k)
         finally:
             safe_close(store)

     def test_copy(self):
-        pth = curpath()

         def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs):
             try:
                 import os

                 if f is None:
-                    f = os.path.join(pth, 'legacy_0.10.h5')
+                    f = tm.get_data_path('legacy_hdf/legacy_0.10.h5')

                 store = HDFStore(f, 'r')
@@ -2446,11 +2441,10 @@ def test_legacy_table_write(self):
         raise nose.SkipTest

         # legacy table types
-        pth = curpath()
         df = tm.makeDataFrame()
         wp = tm.makePanel()

-        store = HDFStore(os.path.join(pth, 'legacy_table.h5'), 'a')
+        store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table.h5'), 'a')

         self.assertRaises(Exception, store.append, 'df1', df)
         self.assertRaises(Exception, store.append, 'wp1', wp)
@@ -2532,11 +2526,6 @@ def test_store_datetime_mixed(self):
     #     self.assertRaises(Exception, store.put, 'foo', df, table=True)


-def curpath():
-    pth, _ = os.path.split(os.path.abspath(__file__))
-    return pth
-
-
 def _test_sort(obj):
     if isinstance(obj, DataFrame):
         return obj.reindex(sorted(obj.index))
diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py
index 8cc6e01aeeb48..c18e0173b4589 100644
--- a/pandas/sparse/tests/test_sparse.py
+++ b/pandas/sparse/tests/test_sparse.py
@@ -74,8 +74,7 @@ def _test_data2_zero():
     arr[np.isnan(arr)] = 0
     return arr, index

-
-def assert_sp_series_equal(a, b):
+def assert_sp_series_equal(a, b, exact_indices=True):
     assert(a.index.equals(b.index))
     assert_sp_array_equal(a, b)
diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py
index cb55061ae7c3e..1cfcb465ce6a0 100644
--- a/pandas/tests/test_format.py
+++ b/pandas/tests/test_format.py
@@ -27,11 +27,6 @@
 _frame = DataFrame(tm.getSeriesData())


-def curpath():
-    pth, _ = os.path.split(os.path.abspath(__file__))
-    return pth
-
-
 class TestDataFrameFormatting(unittest.TestCase):
     _multiprocess_can_split_ = True

@@ -491,8 +486,7 @@ def test_unicode_problem_decoding_as_ascii(self):
         unicode(dm.to_string())

     def test_string_repr_encoding(self):
-        pth = curpath()
-        filepath = os.path.join(pth, 'data', 'unicode_series.csv')
+        filepath = tm.get_data_path('unicode_series.csv')
         df = pandas.read_csv(filepath, header=None, encoding='latin1')
         repr(df)
         repr(df[1])
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index bc2aa7628bf28..a3205f3834be7 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -93,6 +93,20 @@ def ensure_clean(filename = None):
         except:
             pass

+def get_data_path(f = None):
+    """ return the path of a data file, these are relative to the current test dir """
+
+    if f is None:
+        f = ''
+    import inspect, os
+
+    # get our callers file
+    frame,filename,line_number,function_name,lines,index = \
+        inspect.getouterframes(inspect.currentframe())[1]
+
+    base_dir = os.path.abspath(os.path.dirname(filename))
+    return os.path.join(base_dir, 'data/%s' % f)
+
 #------------------------------------------------------------------------------
 # Comparators
diff --git a/setup.py b/setup.py
index 8cc08c2a38d8b..707e9e0efc55c 100755
--- a/setup.py
+++ b/setup.py
@@ -666,12 +666,14 @@ def pxd(name):
           'pandas.io.tests',
           'pandas.stats.tests',
           ],
-      package_data={'pandas.io': ['tests/*.h5',
-                                  'tests/*.csv',
-                                  'tests/*.txt',
-                                  'tests/*.xls',
-                                  'tests/*.xlsx',
-                                  'tests/*.table'],
+      package_data={'pandas.io': ['tests/data/legacy_hdf/*.h5',
+                                  'tests/data/legacy_pickle/0.10.1/*.pickle',
+                                  'tests/data/legacy_pickle/0.11.0/*.pickle',
+                                  'tests/data/*.csv',
+                                  'tests/data/*.txt',
+                                  'tests/data/*.xls',
+                                  'tests/data/*.xlsx',
+                                  'tests/data/*.table'],
                     'pandas.tools': ['tests/*.csv'],
                     'pandas.tests': ['data/*.pickle',
                                      'data/*.csv'],
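The new tm.get_data_path helper above (pandas/util/testing.py) resolves paths relative to the source file of its caller via frame introspection, which is what lets each test module replace its removed curpath() helper with a one-liner after the data files moved under tests/data. A minimal sketch of the same frame-introspection idea, independent of pandas; the function and file names here are assumptions for illustration, not the pandas API itself::

    # sketch: resolve a file under a 'data' directory next to the caller's source file
    import inspect
    import os

    def get_data_path(f=''):
        # record 0 is this function's frame, record 1 is the caller's;
        # field [1] of a frame record is that frame's filename
        caller_file = inspect.getouterframes(inspect.currentframe())[1][1]
        base_dir = os.path.abspath(os.path.dirname(caller_file))
        return os.path.join(base_dir, 'data', f)

    # usage from a hypothetical test module, e.g. pandas/io/tests/test_foo.py:
    #   csv1 = get_data_path('test1.csv')   # -> .../pandas/io/tests/data/test1.csv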