Skip to content

ENH: Retrieve zipped data from eurostat #206

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 28, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/source/whatsnew/v0.2.3.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ New features
- ``DataReader`` now supports Google options data source, see :ref:`here<remote_data.google_options>` (:issue:`148`).
- ``DataReader`` now supports Google quotes, see :ref:`here<remote_data.google_quotes>` (:pull:`188`).

Other enhancements
^^^^^^^^^^^^^^^^^^

- Eurostat reader now supports larger data returned from API via zip format (:pull:`205`)

.. _whatsnew_023.api_breaking:

Backwards incompatible API changes
Expand Down
6 changes: 6 additions & 0 deletions pandas_datareader/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
from distutils.version import LooseVersion


if pd.compat.PY3:
from urllib.error import HTTPError # noqa
else:
from urllib2 import HTTPError # noqa


class SymbolWarning(UserWarning):
    # Warning category for per-symbol data-retrieval problems.
    # NOTE(review): emitted elsewhere in the package; exact usage is not
    # visible in this chunk — confirm against callers.
    pass

Expand Down
11 changes: 9 additions & 2 deletions pandas_datareader/eurostat.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ def url(self):
if not isinstance(self.symbols, compat.string_types):
raise ValueError('data name must be string')

return '{0}/data/{1}/?'.format(self._URL, self.symbols)
q = '{0}/data/{1}/?startperiod={2}&endperiod={3}'
return q.format(self._URL, self.symbols,
self.start.year, self.end.year)

@property
def dsd_url(self):
Expand All @@ -37,7 +39,12 @@ def _read_one_data(self, url, params):
try:
data.index = pd.to_datetime(data.index)
data = data.sort_index()
data = data.truncate(self.start, self.end)
except ValueError:
pass

try:
data = data.truncate(self.start, self.end)
except TypeError:
pass

return data
45 changes: 43 additions & 2 deletions pandas_datareader/io/sdmx.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
from __future__ import unicode_literals

import collections
import time
import zipfile

import pandas as pd
import pandas.compat as compat

from pandas_datareader.io.util import _read_content
from pandas_datareader._utils import HTTPError


_STRUCTURE = '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure}'
Expand Down Expand Up @@ -47,7 +51,27 @@ def read_sdmx(path_or_buf, dtype='float64', dsd=None):
import xml.etree.ElementTree as ET
root = ET.fromstring(xdata)

structure = _get_child(root, _MESSAGE + 'Structure')
try:
structure = _get_child(root, _MESSAGE + 'Structure')
except ValueError:
# get zipped path
result = list(root.iter(_COMMON + 'Text'))[1].text
if not result.startswith('http'):
raise ValueError(result)

for _ in range(60):
# wait zipped data is prepared
try:
data = _read_zipped_sdmx(result)
return read_sdmx(data, dtype=dtype, dsd=dsd)
except HTTPError:
continue

time.sleep(1)
msg = ('Unable to download zipped data within 60 secs, '
'please download it manually from: {0}')
raise ValueError(msg.format(result))

idx_name = structure.get('dimensionAtObservation')
dataset = _get_child(root, _DATASET)

Expand Down Expand Up @@ -81,7 +105,12 @@ def _construct_series(values, name, dsd=None):
for value in values:

if name in times:
idx = pd.DatetimeIndex([v[0] for v in value], name=name)
tvalue = [v[0] for v in value]
try:
idx = pd.DatetimeIndex(tvalue, name=name)
except ValueError:
# time may be unsupported format, like '2015-B1'
idx = pd.Index(tvalue, name=name)
else:
idx = pd.Index([v[0] for v in value], name=name)

Expand Down Expand Up @@ -197,3 +226,15 @@ def _read_sdmx_dsd(path_or_buf):

result = SDMXCode(codes=code_results, ts=times)
return result


def _read_zipped_sdmx(path_or_buf):
    """Unzip a zip archive containing a single SDMX-XML document.

    Parameters
    ----------
    path_or_buf : str or file-like
        URL, path, or buffer pointing at a zip archive that holds
        exactly one SDMX-XML file.

    Returns
    -------
    file-like
        An open handle on the single XML member of the archive.

    Raises
    ------
    ValueError
        If the archive does not contain exactly one member.
    """
    data = _read_content(path_or_buf)

    # ZipFile requires a seekable binary buffer, so copy the downloaded
    # payload into an in-memory bytes buffer first.
    zp = compat.BytesIO()
    zp.write(compat.str_to_bytes(data))
    f = zipfile.ZipFile(zp)
    files = f.namelist()
    # The API is expected to deliver one XML document per archive; raise
    # explicitly rather than via `assert`, which is stripped under
    # `python -O` and would let a malformed archive slip through.
    if len(files) != 1:
        raise ValueError('Expected zipped data to contain a single file, '
                         'got {0}'.format(len(files)))
    return f.open(files[0])
55 changes: 43 additions & 12 deletions pandas_datareader/tests/test_eurostat.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import pandas.util.testing as tm
import pandas_datareader.data as web

from pandas_datareader._utils import PANDAS_0170


class TestEurostat(tm.TestCase):

Expand Down Expand Up @@ -40,33 +42,29 @@ def test_get_cdh_e_fos(self):
tm.assert_frame_equal(df, expected)

def test_get_sts_cobp_a(self):

raise nose.SkipTest("This raises error because of data amount, "
"should be fixed in #149")

# Building permits - annual data (2010 = 100)
df = web.DataReader('sts_cobp_a', 'eurostat',
start=pd.Timestamp('1992-01-01'),
start=pd.Timestamp('2000-01-01'),
end=pd.Timestamp('2013-01-01'))

idx = pd.date_range('1992-01-01', '2013-01-01', freq='AS',
idx = pd.date_range('2000-01-01', '2013-01-01', freq='AS',
name='TIME_PERIOD')
ne_name = ('Building permits - m2 of useful floor area',
ne_name = ('Index, 2010=100',
'Building permits - m2 of useful floor area',
'Unadjusted data (i.e. neither seasonally adjusted nor calendar adjusted data)',
'Non-residential buildings, except office buildings',
'Netherlands', 'Annual')
ne_values = [np.nan, np.nan, np.nan, 144.53, 136.97, 180.02, 198.36,
215.12, 200.05, 186.46, 127.33, 130.67, 143.26, 147.83,
ne_values = [200.05, 186.46, 127.33, 130.67, 143.26, 147.83,
176.69, 227.36, 199.45, 128.49, 100.0, 113.83, 89.33,
77.57]
ne = pd.Series(ne_values, name=ne_name, index=idx)

uk_name = ('Building permits - m2 of useful floor area',
uk_name = ('Index, 2010=100',
'Building permits - m2 of useful floor area',
'Unadjusted data (i.e. neither seasonally adjusted nor calendar adjusted data)',
'Non-residential buildings, except office buildings',
'United Kingdom', 'Annual')
uk_values = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 120.37,
115.93, 112.53, 113.32, 110.18, 112.14, 119.06, 112.66,
uk_values = [112.53, 113.32, 110.18, 112.14, 119.06, 112.66,
113.05, 121.8, 113.97, 105.88, 100.0, 98.56, 103.69,
81.32]
uk = pd.Series(uk_values, name=uk_name, index=idx)
Expand All @@ -75,6 +73,39 @@ def test_get_sts_cobp_a(self):
result = df[expected.name]
tm.assert_series_equal(result, expected)

def test_get_nrg_pc_202(self):
    # GH 149
    # Semi-annual gas-price dataset whose 'YYYY-Bn' periods cannot be
    # parsed into a DatetimeIndex; the reader should keep them as a
    # plain string Index.

    if not PANDAS_0170:
        raise nose.SkipTest("skip because of comparison failure")

    df = web.DataReader('nrg_pc_202', 'eurostat',
                        start=pd.Timestamp('2010-01-01'),
                        end=pd.Timestamp('2013-01-01'))

    key = ('All taxes and levies included',
           'Gigajoules (Gross calorific value = GCV)',
           'Euro',
           'Band D1 : Consumption < 20 GJ',
           'Natural gas', 'Denmark', 'Semi-annual')

    periods = ['2013-B2', '2013-B1', '2012-B2', '2012-B1',
               '2011-B2', '2011-B1', '2010-B2', '2010-B1']
    values = [27.1403, 27.5854, 26.5285, 27.2187,
              28.5862, 28.6448, 26.8147, 26.4979]

    expected = pd.Series(values, name=key,
                         index=pd.Index(periods, name='TIME_PERIOD'))
    tm.assert_series_equal(df[key], expected)

def test_get_prc_hicp_manr_exceeds_limit(self):
    # GH 149
    # Requesting this dataset over the full range is expected to exceed
    # the API's size limit and surface as a ValueError with this message.
    expected_msg = 'Query size exceeds maximum limit'
    with tm.assertRaisesRegexp(ValueError, expected_msg):
        web.DataReader('prc_hicp_manr', 'eurostat',
                       start=pd.Timestamp('2000-01-01'),
                       end=pd.Timestamp('2013-01-01'))


if __name__ == '__main__':
    # Run this module's tests directly via nose: verbose output (-vvs),
    # stop on first failure (-x), and drop into pdb on errors/failures.
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                   exit=False)