From f41e9c1e87ea58c91da62d2301269c85f75a487a Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 11 Jun 2016 23:04:08 +0900 Subject: [PATCH] ENH: Retrieve zipped data from eurostat --- docs/source/whatsnew/v0.2.3.txt | 5 +++ pandas_datareader/_utils.py | 6 +++ pandas_datareader/eurostat.py | 11 ++++- pandas_datareader/io/sdmx.py | 45 ++++++++++++++++++- pandas_datareader/tests/test_eurostat.py | 55 ++++++++++++++++++------ 5 files changed, 106 insertions(+), 16 deletions(-) diff --git a/docs/source/whatsnew/v0.2.3.txt b/docs/source/whatsnew/v0.2.3.txt index e019e20a..629c51a8 100644 --- a/docs/source/whatsnew/v0.2.3.txt +++ b/docs/source/whatsnew/v0.2.3.txt @@ -22,6 +22,11 @@ New features - ``DataReader`` now supports Google options data source, see :ref:`here` (:issue:`148`). - ``DataReader`` now supports Google quotes, see :ref:`here` (:pull:`188`). +Other enhancements +^^^^^^^^^^^^^^^^^^ + +- Eurostat reader now supports larger data returned from API via zip format (:pull:`205`) + .. _whatsnew_023.api_breaking: Backwards incompatible API changes diff --git a/pandas_datareader/_utils.py b/pandas_datareader/_utils.py index 11402be0..d40d375a 100644 --- a/pandas_datareader/_utils.py +++ b/pandas_datareader/_utils.py @@ -3,6 +3,12 @@ from distutils.version import LooseVersion +if pd.compat.PY3: + from urllib.error import HTTPError # noqa +else: + from urllib2 import HTTPError # noqa + + class SymbolWarning(UserWarning): pass diff --git a/pandas_datareader/eurostat.py b/pandas_datareader/eurostat.py index 915f23d4..b5882236 100644 --- a/pandas_datareader/eurostat.py +++ b/pandas_datareader/eurostat.py @@ -18,7 +18,9 @@ def url(self): if not isinstance(self.symbols, compat.string_types): raise ValueError('data name must be string') - return '{0}/data/{1}/?'.format(self._URL, self.symbols) + q = '{0}/data/{1}/?startperiod={2}&endperiod={3}' + return q.format(self._URL, self.symbols, + self.start.year, self.end.year) @property def dsd_url(self): @@ -37,7 +39,12 @@ def 
_read_one_data(self, url, params): try: data.index = pd.to_datetime(data.index) data = data.sort_index() - data = data.truncate(self.start, self.end) except ValueError: pass + + try: + data = data.truncate(self.start, self.end) + except TypeError: + pass + return data diff --git a/pandas_datareader/io/sdmx.py b/pandas_datareader/io/sdmx.py index 753a26af..fcc80a31 100644 --- a/pandas_datareader/io/sdmx.py +++ b/pandas_datareader/io/sdmx.py @@ -1,10 +1,14 @@ from __future__ import unicode_literals import collections +import time +import zipfile import pandas as pd +import pandas.compat as compat from pandas_datareader.io.util import _read_content +from pandas_datareader._utils import HTTPError _STRUCTURE = '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure}' @@ -47,7 +51,27 @@ def read_sdmx(path_or_buf, dtype='float64', dsd=None): import xml.etree.ElementTree as ET root = ET.fromstring(xdata) - structure = _get_child(root, _MESSAGE + 'Structure') + try: + structure = _get_child(root, _MESSAGE + 'Structure') + except ValueError: + # get zipped path + result = list(root.iter(_COMMON + 'Text'))[1].text + if not result.startswith('http'): + raise ValueError(result) + + for _ in range(60): + # zipped data may not be prepared yet; on HTTPError fall through to the sleep and retry + try: + data = _read_zipped_sdmx(result) + return read_sdmx(data, dtype=dtype, dsd=dsd) + except HTTPError: + pass + + time.sleep(1) + msg = ('Unable to download zipped data within 60 secs, ' + 'please download it manually from: {0}') + raise ValueError(msg.format(result)) + idx_name = structure.get('dimensionAtObservation') dataset = _get_child(root, _DATASET) @@ -81,7 +105,12 @@ def _construct_series(values, name, dsd=None): for value in values: if name in times: - idx = pd.DatetimeIndex([v[0] for v in value], name=name) + tvalue = [v[0] for v in value] + try: + idx = pd.DatetimeIndex(tvalue, name=name) + except ValueError: + # time may be unsupported format, like '2015-B1' + idx = pd.Index(tvalue, name=name) else: idx = pd.Index([v[0] 
for v in value], name=name) @@ -197,3 +226,15 @@ def _read_sdmx_dsd(path_or_buf): result = SDMXCode(codes=code_results, ts=times) return result + + +def _read_zipped_sdmx(path_or_buf): + """ Unzip data containing SDMX-XML """ + data = _read_content(path_or_buf) + + zp = compat.BytesIO() + zp.write(compat.str_to_bytes(data)) + f = zipfile.ZipFile(zp) + files = f.namelist() + assert len(files) == 1 + return f.open(files[0]) diff --git a/pandas_datareader/tests/test_eurostat.py b/pandas_datareader/tests/test_eurostat.py index a9785562..859b2c3e 100644 --- a/pandas_datareader/tests/test_eurostat.py +++ b/pandas_datareader/tests/test_eurostat.py @@ -6,6 +6,8 @@ import pandas.util.testing as tm import pandas_datareader.data as web +from pandas_datareader._utils import PANDAS_0170 + class TestEurostat(tm.TestCase): @@ -40,33 +42,29 @@ def test_get_cdh_e_fos(self): tm.assert_frame_equal(df, expected) def test_get_sts_cobp_a(self): - - raise nose.SkipTest("This raises error because of data amount, " - "should be fixed in #149") - # Building permits - annual data (2010 = 100) df = web.DataReader('sts_cobp_a', 'eurostat', - start=pd.Timestamp('1992-01-01'), + start=pd.Timestamp('2000-01-01'), end=pd.Timestamp('2013-01-01')) - idx = pd.date_range('1992-01-01', '2013-01-01', freq='AS', + idx = pd.date_range('2000-01-01', '2013-01-01', freq='AS', name='TIME_PERIOD') - ne_name = ('Building permits - m2 of useful floor area', + ne_name = ('Index, 2010=100', + 'Building permits - m2 of useful floor area', 'Unadjusted data (i.e. 
neither seasonally adjusted nor calendar adjusted data)', 'Non-residential buildings, except office buildings', 'Netherlands', 'Annual') - ne_values = [np.nan, np.nan, np.nan, 144.53, 136.97, 180.02, 198.36, - 215.12, 200.05, 186.46, 127.33, 130.67, 143.26, 147.83, + ne_values = [200.05, 186.46, 127.33, 130.67, 143.26, 147.83, 176.69, 227.36, 199.45, 128.49, 100.0, 113.83, 89.33, 77.57] ne = pd.Series(ne_values, name=ne_name, index=idx) - uk_name = ('Building permits - m2 of useful floor area', + uk_name = ('Index, 2010=100', + 'Building permits - m2 of useful floor area', 'Unadjusted data (i.e. neither seasonally adjusted nor calendar adjusted data)', 'Non-residential buildings, except office buildings', 'United Kingdom', 'Annual') - uk_values = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 120.37, - 115.93, 112.53, 113.32, 110.18, 112.14, 119.06, 112.66, + uk_values = [112.53, 113.32, 110.18, 112.14, 119.06, 112.66, 113.05, 121.8, 113.97, 105.88, 100.0, 98.56, 103.69, 81.32] uk = pd.Series(uk_values, name=uk_name, index=idx) @@ -75,6 +73,39 @@ def test_get_sts_cobp_a(self): result = df[expected.name] tm.assert_series_equal(result, expected) + def test_get_nrg_pc_202(self): + # GH 149 + + if not PANDAS_0170: + raise nose.SkipTest("skip because of comparison failure") + + df = web.DataReader('nrg_pc_202', 'eurostat', + start=pd.Timestamp('2010-01-01'), + end=pd.Timestamp('2013-01-01')) + + name = ('All taxes and levies included', + 'Gigajoules (Gross calorific value = GCV)', + 'Euro', + 'Band D1 : Consumption < 20 GJ', + 'Natural gas', 'Denmark', 'Semi-annual') + + exp_index = pd.Index(['2013-B2', '2013-B1', '2012-B2', '2012-B1', + '2011-B2', '2011-B1', '2010-B2', '2010-B1'], + name='TIME_PERIOD') + exp = pd.Series([27.1403, 27.5854, 26.5285, 27.2187, + 28.5862, 28.6448, 26.8147, 26.4979], + index=exp_index, name=name) + tm.assert_series_equal(df[name], exp) + + def test_get_prc_hicp_manr_exceeds_limit(self): + # GH 149 + msg = 'Query size exceeds maximum limit' 
+ with tm.assertRaisesRegexp(ValueError, msg): + web.DataReader('prc_hicp_manr', 'eurostat', + start=pd.Timestamp('2000-01-01'), + end=pd.Timestamp('2013-01-01')) + + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)