diff --git a/pandas/io/tests/data/blank.xlsm b/pandas/io/tests/data/blank.xlsm new file mode 100755 index 0000000000000..c3c2074276d12 Binary files /dev/null and b/pandas/io/tests/data/blank.xlsm differ diff --git a/pandas/io/tests/data/blank_with_header.xlsm b/pandas/io/tests/data/blank_with_header.xlsm new file mode 100755 index 0000000000000..5b2ed0055b243 Binary files /dev/null and b/pandas/io/tests/data/blank_with_header.xlsm differ diff --git a/pandas/io/tests/data/test.xls b/pandas/io/tests/data/test1.xls similarity index 100% rename from pandas/io/tests/data/test.xls rename to pandas/io/tests/data/test1.xls diff --git a/pandas/io/tests/data/test.xlsm b/pandas/io/tests/data/test1.xlsm similarity index 100% rename from pandas/io/tests/data/test.xlsm rename to pandas/io/tests/data/test1.xlsm diff --git a/pandas/io/tests/data/test.xlsx b/pandas/io/tests/data/test1.xlsx similarity index 100% rename from pandas/io/tests/data/test.xlsx rename to pandas/io/tests/data/test1.xlsx diff --git a/pandas/io/tests/data/test2.xlsm b/pandas/io/tests/data/test2.xlsm new file mode 100644 index 0000000000000..31cfba7ede082 Binary files /dev/null and b/pandas/io/tests/data/test2.xlsm differ diff --git a/pandas/io/tests/data/test2.xlsx b/pandas/io/tests/data/test2.xlsx index 441db5e55e666..94dd951e0bb84 100644 Binary files a/pandas/io/tests/data/test2.xlsx and b/pandas/io/tests/data/test2.xlsx differ diff --git a/pandas/io/tests/data/test3.xlsm b/pandas/io/tests/data/test3.xlsm new file mode 100644 index 0000000000000..54b7ef456a9ea Binary files /dev/null and b/pandas/io/tests/data/test3.xlsm differ diff --git a/pandas/io/tests/data/test3.xlsx b/pandas/io/tests/data/test3.xlsx new file mode 100644 index 0000000000000..c16755c25fabd Binary files /dev/null and b/pandas/io/tests/data/test3.xlsx differ diff --git a/pandas/io/tests/data/test4.xls b/pandas/io/tests/data/test4.xls new file mode 100644 index 0000000000000..0e6f4331e2547 Binary files /dev/null and b/pandas/io/tests/data/test4.xls differ diff --git a/pandas/io/tests/data/test4.xlsm b/pandas/io/tests/data/test4.xlsm new file mode 100644 index 0000000000000..52328c7b28be9 Binary files /dev/null and b/pandas/io/tests/data/test4.xlsm differ diff --git a/pandas/io/tests/data/test4.xlsx b/pandas/io/tests/data/test4.xlsx new file mode 100644 index 0000000000000..441db5e55e666 Binary files /dev/null and b/pandas/io/tests/data/test4.xlsx differ diff --git a/pandas/io/tests/data/test_converters.xlsm b/pandas/io/tests/data/test_converters.xlsm new file mode 100644 index 0000000000000..eaf0b1d0219c5 Binary files /dev/null and b/pandas/io/tests/data/test_converters.xlsm differ diff --git a/pandas/io/tests/data/test_multisheet.xls b/pandas/io/tests/data/test_multisheet.xls new file mode 100644 index 0000000000000..fa37723fcdefb Binary files /dev/null and b/pandas/io/tests/data/test_multisheet.xls differ diff --git a/pandas/io/tests/data/test_multisheet.xlsm b/pandas/io/tests/data/test_multisheet.xlsm new file mode 100644 index 0000000000000..694f8e07d5e29 Binary files /dev/null and b/pandas/io/tests/data/test_multisheet.xlsm differ diff --git a/pandas/io/tests/data/test_types.xlsm b/pandas/io/tests/data/test_types.xlsm new file mode 100644 index 0000000000000..c66fdc82dfb67 Binary files /dev/null and b/pandas/io/tests/data/test_types.xlsm differ diff --git a/pandas/io/tests/data/times_1900.xlsm b/pandas/io/tests/data/times_1900.xlsm new file mode 100644 index 0000000000000..1ffdbe223453b Binary files /dev/null and b/pandas/io/tests/data/times_1900.xlsm differ diff --git a/pandas/io/tests/data/times_1900.xlsx b/pandas/io/tests/data/times_1900.xlsx new file mode 100644 index 0000000000000..3702289b256fd Binary files /dev/null and b/pandas/io/tests/data/times_1900.xlsx differ diff --git a/pandas/io/tests/data/times_1904.xlsm b/pandas/io/tests/data/times_1904.xlsm new file mode 100644 index 0000000000000..e884eca1e7c74 Binary files /dev/null and b/pandas/io/tests/data/times_1904.xlsm differ diff --git a/pandas/io/tests/data/times_1904.xlsx b/pandas/io/tests/data/times_1904.xlsx new file mode 100644 index 0000000000000..1a13468e59d1c Binary files /dev/null and b/pandas/io/tests/data/times_1904.xlsx differ diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 83db59f9d9029..073fc55357df7 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -24,7 +24,6 @@ from pandas.util.testing import ensure_clean from pandas.core.config import set_option, get_option import pandas.util.testing as tm -import pandas as pd def _skip_if_no_xlrd(): @@ -76,281 +75,215 @@ def _skip_if_no_excelsuite(): class SharedItems(object): def setUp(self): self.dirpath = tm.get_data_path() - self.csv1 = os.path.join(self.dirpath, 'test1.csv') - self.csv2 = os.path.join(self.dirpath, 'test2.csv') - self.xls1 = os.path.join(self.dirpath, 'test.xls') - self.xlsx1 = os.path.join(self.dirpath, 'test.xlsx') - self.multisheet = os.path.join(self.dirpath, 'test_multisheet.xlsx') self.frame = _frame.copy() self.frame2 = _frame2.copy() self.tsframe = _tsframe.copy() self.mixed_frame = _mixed_frame.copy() - def read_csv(self, *args, **kwds): - kwds = kwds.copy() - kwds['engine'] = 'python' - return read_csv(*args, **kwds) + def get_csv_refdf(self, basename): + """ + Obtain the reference data from read_csv with the Python engine. + Test data path is defined by pandas.util.testing.get_data_path() + Parameters + ---------- -class ExcelReaderTests(SharedItems, tm.TestCase): - def test_parse_cols_int(self): - _skip_if_no_openpyxl() - _skip_if_no_xlrd() + basename : str + File base name, excluding file extension. - suffix = ['xls', 'xlsx', 'xlsm'] - - for s in suffix: - pth = os.path.join(self.dirpath, 'test.%s' % s) - xls = ExcelFile(pth) - df = xls.parse('Sheet1', index_col=0, parse_dates=True, - parse_cols=3) - df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True) - df2 = df2.reindex(columns=['A', 'B', 'C']) - df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, - parse_dates=True, parse_cols=3) - # TODO add index to xls file) - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) + Returns + ------- - def test_parse_cols_list(self): - _skip_if_no_openpyxl() - _skip_if_no_xlrd() + dfref : DataFrame + """ + pref = os.path.join(self.dirpath, basename + '.csv') + dfref = read_csv(pref, index_col=0, parse_dates=True, engine='python') + return dfref - suffix = ['xls', 'xlsx', 'xlsm'] - - for s in suffix: - pth = os.path.join(self.dirpath, 'test.%s' % s) - xls = ExcelFile(pth) - df = xls.parse('Sheet1', index_col=0, parse_dates=True, - parse_cols=[0, 2, 3]) - df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True) - df2 = df2.reindex(columns=['B', 'C']) - df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, - parse_dates=True, - parse_cols=[0, 2, 3]) - # TODO add index to xls file) - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) + def get_excelfile(self, basename): + """ + Return test data ExcelFile instance. Test data path is defined by + pandas.util.testing.get_data_path() - def test_parse_cols_str(self): - _skip_if_no_openpyxl() - _skip_if_no_xlrd() + Parameters + ---------- - suffix = ['xls', 'xlsx', 'xlsm'] - - for s in suffix: - - pth = os.path.join(self.dirpath, 'test.%s' % s) - xls = ExcelFile(pth) - - df = xls.parse('Sheet1', index_col=0, parse_dates=True, - parse_cols='A:D') - df2 = read_csv(self.csv1, index_col=0, parse_dates=True) - df2 = df2.reindex(columns=['A', 'B', 'C']) - df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, - parse_dates=True, parse_cols='A:D') - # TODO add index to xls, read xls ignores index name ? - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) - del df, df2, df3 - - df = xls.parse('Sheet1', index_col=0, parse_dates=True, - parse_cols='A,C,D') - df2 = read_csv(self.csv1, index_col=0, parse_dates=True) - df2 = df2.reindex(columns=['B', 'C']) - df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, - parse_dates=True, - parse_cols='A,C,D') - # TODO add index to xls file - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) - del df, df2, df3 - - df = xls.parse('Sheet1', index_col=0, parse_dates=True, - parse_cols='A,C:D') - df2 = read_csv(self.csv1, index_col=0, parse_dates=True) - df2 = df2.reindex(columns=['B', 'C']) - df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, - parse_dates=True, - parse_cols='A,C:D') - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) + basename : str + File base name, excluding file extension. - def test_excel_stop_iterator(self): - _skip_if_no_xlrd() + Returns + ------- - excel_data = ExcelFile(os.path.join(self.dirpath, 'test2.xls')) - parsed = excel_data.parse('Sheet1') - expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1']) - tm.assert_frame_equal(parsed, expected) + excel : io.excel.ExcelFile + """ + return ExcelFile(os.path.join(self.dirpath, basename + self.ext)) - def test_excel_cell_error_na(self): - _skip_if_no_xlrd() + def get_exceldf(self, basename, *args, **kwds): + """ + Return test data DataFrame. Test data path is defined by + pandas.util.testing.get_data_path() - excel_data = ExcelFile(os.path.join(self.dirpath, 'test3.xls')) - parsed = excel_data.parse('Sheet1') - expected = DataFrame([[np.nan]], columns=['Test']) - tm.assert_frame_equal(parsed, expected) + Parameters + ---------- - def test_excel_passes_na(self): - _skip_if_no_xlrd() + basename : str + File base name, excluding file extension. - excel_data = ExcelFile(os.path.join(self.dirpath, 'test2.xlsx')) - parsed = excel_data.parse('Sheet1', keep_default_na=False, - na_values=['apple']) - expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], - columns=['Test']) - tm.assert_frame_equal(parsed, expected) + Returns + ------- - parsed = excel_data.parse('Sheet1', keep_default_na=True, - na_values=['apple']) - expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], - columns=['Test']) - tm.assert_frame_equal(parsed, expected) + df : DataFrame + """ + pth = os.path.join(self.dirpath, basename + self.ext) + return read_excel(pth, *args, **kwds) - def check_excel_table_sheet_by_index(self, filename, csvfile): - import xlrd - pth = os.path.join(self.dirpath, filename) - xls = ExcelFile(pth) - df = xls.parse(0, index_col=0, parse_dates=True) - df2 = self.read_csv(csvfile, index_col=0, parse_dates=True) - df3 = xls.parse(1, skiprows=[1], index_col=0, parse_dates=True) - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) +class ReadingTestsBase(SharedItems): + # This is based on ExcelWriterBase + # + # Base class for test cases to run with different Excel readers. + # To add a reader test, define the following: + # 1. A check_skip function that skips your tests if your reader isn't + # installed. + # 2. Add a property ext, which is the file extension that your reader + # reades from. (needs to start with '.' so it's a valid path) + # 3. Add a property engine_name, which is the name of the reader class. + # For the reader this is not used for anything at the moment. - df4 = xls.parse(0, index_col=0, parse_dates=True, skipfooter=1) - df5 = xls.parse(0, index_col=0, parse_dates=True, skip_footer=1) - tm.assert_frame_equal(df4, df.ix[:-1]) - tm.assert_frame_equal(df4, df5) + def setUp(self): + self.check_skip() + super(ReadingTestsBase, self).setUp() - self.assertRaises(xlrd.XLRDError, xls.parse, 'asdf') + def test_parse_cols_int(self): - def test_excel_table_sheet_by_index(self): - _skip_if_no_xlrd() - for filename, csvfile in [(self.xls1, self.csv1), - (self.xlsx1, self.csv1)]: - self.check_excel_table_sheet_by_index(filename, csvfile) + dfref = self.get_csv_refdf('test1') + excel = self.get_excelfile('test1') + dfref = dfref.reindex(columns=['A', 'B', 'C']) + df1 = excel.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols=3) + df2 = excel.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True, parse_cols=3) + # TODO add index to xls file) + tm.assert_frame_equal(df1, dfref, check_names=False) + tm.assert_frame_equal(df2, dfref, check_names=False) - def test_excel_table(self): - _skip_if_no_xlrd() - - pth = os.path.join(self.dirpath, 'test.xls') - xls = ExcelFile(pth) - df = xls.parse('Sheet1', index_col=0, parse_dates=True) - df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True) - df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, parse_dates=True) - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) - - df4 = xls.parse('Sheet1', index_col=0, parse_dates=True, - skipfooter=1) - df5 = xls.parse('Sheet1', index_col=0, parse_dates=True, - skip_footer=1) - tm.assert_frame_equal(df4, df.ix[:-1]) - tm.assert_frame_equal(df4, df5) + def test_parse_cols_list(self): - def test_excel_read_buffer(self): - _skip_if_no_xlrd() - _skip_if_no_openpyxl() + excel = self.get_excelfile('test1') + dfref = self.get_csv_refdf('test1') + dfref = dfref.reindex(columns=['B', 'C']) + df1 = excel.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols=[0, 2, 3]) + df2 = excel.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True, + parse_cols=[0, 2, 3]) + # TODO add index to xls file) + tm.assert_frame_equal(df1, dfref, check_names=False) + tm.assert_frame_equal(df2, dfref, check_names=False) - pth = os.path.join(self.dirpath, 'test.xls') - f = open(pth, 'rb') - xls = ExcelFile(f) - # it works - xls.parse('Sheet1', index_col=0, parse_dates=True) + def test_parse_cols_str(self): - pth = os.path.join(self.dirpath, 'test.xlsx') - f = open(pth, 'rb') - xl = ExcelFile(f) - xl.parse('Sheet1', index_col=0, parse_dates=True) + excel = self.get_excelfile('test1') + dfref = self.get_csv_refdf('test1') + + df1 = dfref.reindex(columns=['A', 'B', 'C']) + df2 = excel.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols='A:D') + df3 = excel.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True, parse_cols='A:D') + # TODO add index to xls, read xls ignores index name ? + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + + df1 = dfref.reindex(columns=['B', 'C']) + df2 = excel.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols='A,C,D') + df3 = excel.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True, + parse_cols='A,C,D') + # TODO add index to xls file + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + + df1 = dfref.reindex(columns=['B', 'C']) + df2 = excel.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols='A,C:D') + df3 = excel.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True, + parse_cols='A,C:D') + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) - def test_read_xlrd_Book(self): - _skip_if_no_xlrd() - _skip_if_no_xlwt() + def test_excel_stop_iterator(self): - import xlrd + excel = self.get_excelfile('test2') - df = self.frame + parsed = excel.parse('Sheet1') + expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1']) + tm.assert_frame_equal(parsed, expected) - with ensure_clean('.xls') as pth: - df.to_excel(pth, "SheetA") - book = xlrd.open_workbook(pth) + def test_excel_cell_error_na(self): - with ExcelFile(book, engine="xlrd") as xl: - result = xl.parse("SheetA") - tm.assert_frame_equal(df, result) + excel = self.get_excelfile('test3') - result = read_excel(book, sheetname="SheetA", engine="xlrd") - tm.assert_frame_equal(df, result) + parsed = excel.parse('Sheet1') + expected = DataFrame([[np.nan]], columns=['Test']) + tm.assert_frame_equal(parsed, expected) - @tm.network - def test_read_from_http_url(self): - _skip_if_no_xlrd() + def test_excel_passes_na(self): - url = ('https://raw.github.com/pydata/pandas/master/' - 'pandas/io/tests/data/test.xlsx') - url_table = read_excel(url) - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'test.xlsx') - local_table = read_excel(localtable) - tm.assert_frame_equal(url_table, local_table) + excel = self.get_excelfile('test4') - @slow - def test_read_from_file_url(self): - _skip_if_no_xlrd() + parsed = excel.parse('Sheet1', keep_default_na=False, + na_values=['apple']) + expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) - # FILE - if sys.version_info[:2] < (2, 6): - raise nose.SkipTest("file:// not supported with Python < 2.6") - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'test.xlsx') - local_table = read_excel(localtable) + parsed = excel.parse('Sheet1', keep_default_na=True, + na_values=['apple']) + expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) - try: - url_table = read_excel('file://localhost/' + localtable) - except URLError: - # fails on some systems - raise nose.SkipTest("failing on %s" % - ' '.join(platform.uname()).strip()) + def test_excel_table_sheet_by_index(self): - tm.assert_frame_equal(url_table, local_table) + excel = self.get_excelfile('test1') + dfref = self.get_csv_refdf('test1') - def test_xlsx_table(self): - _skip_if_no_xlrd() - _skip_if_no_openpyxl() + df1 = excel.parse(0, index_col=0, parse_dates=True) + df2 = excel.parse(1, skiprows=[1], index_col=0, parse_dates=True) + tm.assert_frame_equal(df1, dfref, check_names=False) + tm.assert_frame_equal(df2, dfref, check_names=False) - pth = os.path.join(self.dirpath, 'test.xlsx') - xlsx = ExcelFile(pth) - df = xlsx.parse('Sheet1', index_col=0, parse_dates=True) - df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True) - df3 = xlsx.parse('Sheet2', skiprows=[1], index_col=0, parse_dates=True) + df3 = excel.parse(0, index_col=0, parse_dates=True, skipfooter=1) + df4 = excel.parse(0, index_col=0, parse_dates=True, skip_footer=1) + tm.assert_frame_equal(df3, df1.ix[:-1]) + tm.assert_frame_equal(df3, df4) - # TODO add index to xlsx file - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) + import xlrd + self.assertRaises(xlrd.XLRDError, excel.parse, 'asdf') - df4 = xlsx.parse('Sheet1', index_col=0, parse_dates=True, - skipfooter=1) - df5 = xlsx.parse('Sheet1', index_col=0, parse_dates=True, - skip_footer=1) - tm.assert_frame_equal(df4, df.ix[:-1]) - tm.assert_frame_equal(df4, df5) + def test_excel_table(self): - def test_reader_closes_file(self): - _skip_if_no_xlrd() - _skip_if_no_openpyxl() + excel = self.get_excelfile('test1') + dfref = self.get_csv_refdf('test1') - pth = os.path.join(self.dirpath, 'test.xlsx') - f = open(pth, 'rb') - with ExcelFile(f) as xlsx: - # parses okay - xlsx.parse('Sheet1', index_col=0) + df1 = excel.parse('Sheet1', index_col=0, parse_dates=True) + df2 = excel.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True) + # TODO add index to file + tm.assert_frame_equal(df1, dfref, check_names=False) + tm.assert_frame_equal(df2, dfref, check_names=False) - self.assertTrue(f.closed) + df3 = excel.parse('Sheet1', index_col=0, parse_dates=True, + skipfooter=1) + df4 = excel.parse('Sheet1', index_col=0, parse_dates=True, + skip_footer=1) + tm.assert_frame_equal(df3, df1.ix[:-1]) + tm.assert_frame_equal(df3, df4) def test_reader_special_dtypes(self): - _skip_if_no_xlrd() expected = DataFrame.from_items([ ("IntCol", [1, 2, -3, 4, 0]), @@ -364,44 +297,40 @@ def test_reader_special_dtypes(self): datetime(2015, 3, 14)]) ]) - xlsx_path = os.path.join(self.dirpath, 'test_types.xlsx') - xls_path = os.path.join(self.dirpath, 'test_types.xls') + basename = 'test_types' # should read in correctly and infer types - for path in (xls_path, xlsx_path): - actual = read_excel(path, 'Sheet1') - tm.assert_frame_equal(actual, expected) + actual = self.get_exceldf(basename, 'Sheet1') + tm.assert_frame_equal(actual, expected) # if not coercing number, then int comes in as float float_expected = expected.copy() float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[1, "Str2Col"] = 3.0 - for path in (xls_path, xlsx_path): - actual = read_excel(path, 'Sheet1', convert_float=False) - tm.assert_frame_equal(actual, float_expected) + actual = self.get_exceldf(basename, 'Sheet1', convert_float=False) + tm.assert_frame_equal(actual, float_expected) # check setting Index (assuming xls and xlsx are the same here) for icol, name in enumerate(expected.columns): - actual = read_excel(xlsx_path, 'Sheet1', index_col=icol) - actual2 = read_excel(xlsx_path, 'Sheet1', index_col=name) + actual = self.get_exceldf(basename, 'Sheet1', index_col=icol) exp = expected.set_index(name) tm.assert_frame_equal(actual, exp) - tm.assert_frame_equal(actual2, exp) # convert_float and converters should be different but both accepted expected["StrCol"] = expected["StrCol"].apply(str) - actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str}) + actual = self.get_exceldf(basename, 'Sheet1', converters={"StrCol": str}) tm.assert_frame_equal(actual, expected) no_convert_float = float_expected.copy() no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) - actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str}, - convert_float=False) + actual = self.get_exceldf(basename, 'Sheet1', convert_float=False, + converters={"StrCol": str}) tm.assert_frame_equal(actual, no_convert_float) # GH8212 - support for converters and missing values def test_reader_converters(self): - _skip_if_no_xlrd() + + basename = 'test_converters' expected = DataFrame.from_items([ ("IntCol", [1, 2, -3, -1000, 0]), @@ -416,48 +345,122 @@ def test_reader_converters(self): 3: lambda x: str(x) if x else '', } - xlsx_path = os.path.join(self.dirpath, 'test_converters.xlsx') - xls_path = os.path.join(self.dirpath, 'test_converters.xls') - # should read in correctly and set types of single cells (not array dtypes) - for path in (xls_path, xlsx_path): - actual = read_excel(path, 'Sheet1', converters=converters) - tm.assert_frame_equal(actual, expected) + actual = self.get_exceldf(basename, 'Sheet1', converters=converters) + tm.assert_frame_equal(actual, expected) def test_reading_all_sheets(self): # Test reading all sheetnames by setting sheetname to None, # Ensure a dict is returned. # See PR #9450 - - _skip_if_no_xlrd() - - dfs = read_excel(self.multisheet,sheetname=None) - expected_keys = ['Alpha','Beta','Charlie'] - tm.assert_contains_all(expected_keys,dfs.keys()) + basename = 'test_multisheet' + dfs = self.get_exceldf(basename, sheetname=None) + expected_keys = ['Alpha', 'Beta', 'Charlie'] + tm.assert_contains_all(expected_keys, dfs.keys()) def test_reading_multiple_specific_sheets(self): # Test reading specific sheetnames by specifying a mixed list # of integers and strings, and confirm that duplicated sheet # references (positions/names) are removed properly. - # Ensure a dict is returned # See PR #9450 - _skip_if_no_xlrd() - - #Explicitly request duplicates. Only the set should be returned. - expected_keys = [2,'Charlie','Charlie'] - dfs = read_excel(self.multisheet,sheetname=expected_keys) + basename = 'test_multisheet' + # Explicitly request duplicates. Only the set should be returned. + expected_keys = [2, 'Charlie', 'Charlie'] + dfs = self.get_exceldf(basename, sheetname=expected_keys) expected_keys = list(set(expected_keys)) - tm.assert_contains_all(expected_keys,dfs.keys()) + tm.assert_contains_all(expected_keys, dfs.keys()) assert len(expected_keys) == len(dfs.keys()) + # GH6403 + def test_read_excel_blank(self): + actual = self.get_exceldf('blank', 'Sheet1') + tm.assert_frame_equal(actual, DataFrame()) + + def test_read_excel_blank_with_header(self): + expected = DataFrame(columns=['col_1', 'col_2']) + actual = self.get_exceldf('blank_with_header', 'Sheet1') + tm.assert_frame_equal(actual, expected) + + +class XlrdTests(ReadingTestsBase): + """ + This is the base class for the xlrd tests, and 3 different file formats + are supported: xls, xlsx, xlsm + """ + + def test_excel_read_buffer(self): + + pth = os.path.join(self.dirpath, 'test1' + self.ext) + f = open(pth, 'rb') + xls = ExcelFile(f) + # it works + xls.parse('Sheet1', index_col=0, parse_dates=True) + + def test_read_xlrd_Book(self): + _skip_if_no_xlwt() + + import xlrd + df = self.frame + with ensure_clean('.xls') as pth: + df.to_excel(pth, "SheetA") + book = xlrd.open_workbook(pth) + + with ExcelFile(book, engine="xlrd") as xl: + result = xl.parse("SheetA") + tm.assert_frame_equal(df, result) + + result = read_excel(book, sheetname="SheetA", engine="xlrd") + tm.assert_frame_equal(df, result) + + @tm.network + def test_read_from_http_url(self): + # TODO: remove this when merging into master + url = ('https://raw.github.com/davidovitch/pandas/master/' + 'pandas/io/tests/data/test1' + self.ext) +# url = ('https://raw.github.com/pydata/pandas/master/' +# 'pandas/io/tests/data/test' + self.ext) + url_table = read_excel(url) + local_table = self.get_exceldf('test1') + tm.assert_frame_equal(url_table, local_table) + + @slow + def test_read_from_file_url(self): + + # FILE + if sys.version_info[:2] < (2, 6): + raise nose.SkipTest("file:// not supported with Python < 2.6") + + localtable = os.path.join(self.dirpath, 'test1' + self.ext) + local_table = read_excel(localtable) + + try: + url_table = read_excel('file://localhost/' + localtable) + except URLError: + # fails on some systems + import platform + raise nose.SkipTest("failing on %s" % + ' '.join(platform.uname()).strip()) + + tm.assert_frame_equal(url_table, local_table) + + def test_reader_closes_file(self): + + pth = os.path.join(self.dirpath, 'test1' + self.ext) + f = open(pth, 'rb') + with ExcelFile(f) as xlsx: + # parses okay + xlsx.parse('Sheet1', index_col=0) + + self.assertTrue(f.closed) + def test_creating_and_reading_multiple_sheets(self): # Test reading multiple sheets, from a runtime created excel file # with multiple sheets. # See PR #9450 - _skip_if_no_xlrd() _skip_if_no_xlwt() + _skip_if_no_openpyxl() def tdf(sheetname): d, i = [11,22,33], [1,2,3] @@ -468,17 +471,16 @@ def tdf(sheetname): dfs = [tdf(s) for s in sheets] dfs = dict(zip(sheets,dfs)) - with ensure_clean('.xlsx') as pth: + with ensure_clean(self.ext) as pth: with ExcelWriter(pth) as ew: for sheetname, df in iteritems(dfs): df.to_excel(ew,sheetname) - dfs_returned = pd.read_excel(pth,sheetname=sheets) + dfs_returned = read_excel(pth,sheetname=sheets) for s in sheets: tm.assert_frame_equal(dfs[s],dfs_returned[s]) def test_reader_seconds(self): # Test reading times with and without milliseconds. GH5945. - _skip_if_no_xlrd() import xlrd if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): @@ -510,38 +512,30 @@ def test_reader_seconds(self): time(16, 37, 1), time(18, 20, 54)])]) - epoch_1900 = os.path.join(self.dirpath, 'times_1900.xls') - epoch_1904 = os.path.join(self.dirpath, 'times_1904.xls') - - actual = read_excel(epoch_1900, 'Sheet1') + actual = self.get_exceldf('times_1900', 'Sheet1') tm.assert_frame_equal(actual, expected) - actual = read_excel(epoch_1904, 'Sheet1') + actual = self.get_exceldf('times_1904', 'Sheet1') tm.assert_frame_equal(actual, expected) - # GH6403 - def test_read_excel_blank(self): - _skip_if_no_xlrd() - blank = os.path.join(self.dirpath, 'blank.xls') - actual = read_excel(blank, 'Sheet1') - tm.assert_frame_equal(actual, DataFrame()) +class XlsReaderTests(XlrdTests, tm.TestCase): + ext = '.xls' + engine_name = 'xlrd' + check_skip = staticmethod(_skip_if_no_xlrd) - blank = os.path.join(self.dirpath, 'blank.xlsx') - actual = read_excel(blank, 'Sheet1') - tm.assert_frame_equal(actual, DataFrame()) - def test_read_excel_blank_with_header(self): - _skip_if_no_xlrd() +class XlsxReaderTests(XlrdTests, tm.TestCase): + ext = '.xlsx' + engine_name = 'xlrd' + check_skip = staticmethod(_skip_if_no_xlrd) - expected = DataFrame(columns=['col_1', 'col_2']) - blank = os.path.join(self.dirpath, 'blank_with_header.xls') - actual = read_excel(blank, 'Sheet1') - tm.assert_frame_equal(actual, expected) - blank = os.path.join(self.dirpath, 'blank_with_header.xlsx') - actual = read_excel(blank, 'Sheet1') - tm.assert_frame_equal(actual, expected) +class XlsmReaderTests(XlrdTests, tm.TestCase): + ext = '.xlsm' + engine_name = 'xlrd' + check_skip = staticmethod(_skip_if_no_xlrd) + class ExcelWriterBase(SharedItems): # Base class for test cases to run with different Excel writers. @@ -961,11 +955,11 @@ def test_to_excel_multiindex_no_write_index(self): # Test writing and re-reading a MI witout the index. GH 5616. # Initial non-MI frame. - frame1 = pd.DataFrame({'a': [10, 20], 'b': [30, 40], 'c': [50, 60]}) + frame1 = DataFrame({'a': [10, 20], 'b': [30, 40], 'c': [50, 60]}) # Add a MI. frame2 = frame1.copy() - multi_index = pd.MultiIndex.from_tuples([(70, 80), (90, 100)]) + multi_index = MultiIndex.from_tuples([(70, 80), (90, 100)]) frame2.index = multi_index with ensure_clean(self.ext) as path: @@ -1148,7 +1142,7 @@ def roundtrip(df, header=True, parser_hdr=0, index=True): with ensure_clean(self.ext) as path: df.to_excel(path, header=header, merge_cells=self.merge_cells, index=index) - xf = pd.ExcelFile(path) + xf = ExcelFile(path) res = xf.parse(xf.sheet_names[0], header=parser_hdr) return res @@ -1202,7 +1196,7 @@ def roundtrip2(df, header=True, parser_hdr=0, index=True): with ensure_clean(self.ext) as path: df.to_excel(path, header=header, merge_cells=self.merge_cells, index=index) - xf = pd.ExcelFile(path) + xf = ExcelFile(path) res = xf.parse(xf.sheet_names[0], header=parser_hdr) return res @@ -1269,20 +1263,24 @@ def test_datetimes(self): # GH7074 def test_bytes_io(self): + _skip_if_no_xlrd() + bio = BytesIO() df = DataFrame(np.random.randn(10, 2)) writer = ExcelWriter(bio) df.to_excel(writer) writer.save() bio.seek(0) - reread_df = pd.read_excel(bio) + reread_df = read_excel(bio) tm.assert_frame_equal(df, reread_df) # GH8188 def test_write_lists_dict(self): - df = pd.DataFrame({'mixed': ['a', ['b', 'c'], {'d': 'e', 'f': 2}], - 'numeric': [1, 2, 3.0], - 'str': ['apple', 'banana', 'cherry']}) + _skip_if_no_xlrd() + + df = DataFrame({'mixed': ['a', ['b', 'c'], {'d': 'e', 'f': 2}], + 'numeric': [1, 2, 3.0], + 'str': ['apple', 'banana', 'cherry']}) expected = df.copy() expected.mixed = expected.mixed.apply(str) expected.numeric = expected.numeric.astype('int64') @@ -1291,6 +1289,7 @@ def test_write_lists_dict(self): read = read_excel(path, 'Sheet1', header=0) tm.assert_frame_equal(read, expected) + def raise_wrapper(major_ver): def versioned_raise_wrapper(orig_method): @functools.wraps(orig_method) @@ -1468,20 +1467,20 @@ class XlwtTests(ExcelWriterBase, tm.TestCase): def test_excel_raise_error_on_multiindex_columns_and_no_index(self): _skip_if_no_xlwt() # MultiIndex as columns is not yet implemented 9794 - cols = pd.MultiIndex.from_tuples([('site', ''), + cols = MultiIndex.from_tuples([('site', ''), ('2014', 'height'), ('2014', 'weight')]) - df = pd.DataFrame(np.random.randn(10, 3), columns=cols) + df = DataFrame(np.random.randn(10, 3), columns=cols) with tm.assertRaises(NotImplementedError): with ensure_clean(self.ext) as path: df.to_excel(path, index=False) def test_excel_warns_verbosely_on_multiindex_columns_and_index_true(self): _skip_if_no_xlwt() - cols = pd.MultiIndex.from_tuples([('site', ''), + cols = MultiIndex.from_tuples([('site', ''), ('2014', 'height'), ('2014', 'weight')]) - df = pd.DataFrame(np.random.randn(10, 3), columns=cols) + df = DataFrame(np.random.randn(10, 3), columns=cols) with tm.assert_produces_warning(UserWarning): with ensure_clean(self.ext) as path: df.to_excel(path, index=True) @@ -1489,10 +1488,10 @@ def test_excel_warns_verbosely_on_multiindex_columns_and_index_true(self): def test_excel_multiindex_index(self): _skip_if_no_xlwt() # MultiIndex as index works so assert no error #9794 - cols = pd.MultiIndex.from_tuples([('site', ''), + cols = MultiIndex.from_tuples([('site', ''), ('2014', 'height'), ('2014', 'weight')]) - df = pd.DataFrame(np.random.randn(3, 10), index=cols) + df = DataFrame(np.random.randn(3, 10), index=cols) with ensure_clean(self.ext) as path: df.to_excel(path, index=False)