
Commit 6e257a1

Merge pull request #206 from sinhrks/eurostat
ENH: Retrieve zipped data from eurostat
2 parents 1a50ca7 + f41e9c1 commit 6e257a1

File tree

5 files changed, +106 -16 lines

docs/source/whatsnew/v0.2.3.txt

+5

@@ -22,6 +22,11 @@ New features
 - ``DataReader`` now supports Google options data source, see :ref:`here<remote_data.google_options>` (:issue:`148`).
 - ``DataReader`` now supports Google quotes, see :ref:`here<remote_data.google_quotes>` (:pull:`188`).
 
+Other enhancements
+^^^^^^^^^^^^^^^^^^
+
+- Eurostat reader now supports larger data returned from API via zip format (:pull:`205`)
+
 .. _whatsnew_023.api_breaking:
 
 Backwards incompatible API changes
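
For reference, a minimal usage sketch of the enhancement described in the whatsnew entry above, matching the call exercised in the updated tests (dataset code and date range taken from test_get_sts_cobp_a):

    import pandas as pd
    import pandas_datareader.data as web

    # Large Eurostat datasets are now delivered by the API as a link to a
    # zipped SDMX file; the reader downloads and parses it transparently.
    df = web.DataReader('sts_cobp_a', 'eurostat',
                        start=pd.Timestamp('2000-01-01'),
                        end=pd.Timestamp('2013-01-01'))
    print(df.head())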

pandas_datareader/_utils.py

+6

@@ -3,6 +3,12 @@
 from distutils.version import LooseVersion
 
 
+if pd.compat.PY3:
+    from urllib.error import HTTPError  # noqa
+else:
+    from urllib2 import HTTPError  # noqa
+
+
 class SymbolWarning(UserWarning):
     pass
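
The compat shim above is consumed elsewhere in this commit (io/sdmx.py), so downstream code can catch one exception class on both Python versions. A minimal sketch of that intended usage, with a placeholder URL:

    from pandas_datareader._utils import HTTPError
    from pandas_datareader.io.util import _read_content

    url = 'http://example.invalid/eurostat-result.zip'  # placeholder URL, not from this diff
    try:
        data = _read_content(url)
    except HTTPError:
        # same exception class on Python 2 (urllib2) and Python 3 (urllib.error)
        data = None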

pandas_datareader/eurostat.py

+9 -2

@@ -18,7 +18,9 @@ def url(self):
         if not isinstance(self.symbols, compat.string_types):
             raise ValueError('data name must be string')
 
-        return '{0}/data/{1}/?'.format(self._URL, self.symbols)
+        q = '{0}/data/{1}/?startperiod={2}&endperiod={3}'
+        return q.format(self._URL, self.symbols,
+                        self.start.year, self.end.year)
 
     @property
     def dsd_url(self):
@@ -37,7 +39,12 @@ def _read_one_data(self, url, params):
         try:
             data.index = pd.to_datetime(data.index)
             data = data.sort_index()
-            data = data.truncate(self.start, self.end)
         except ValueError:
             pass
+
+        try:
+            data = data.truncate(self.start, self.end)
+        except TypeError:
+            pass
+
         return data
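
For context, a rough sketch of the query URL the new format string produces. The base URL value below is illustrative only (self._URL is defined elsewhere in eurostat.py, not in this diff):

    # Illustrative values; self._URL and self.symbols come from the reader instance.
    _URL = 'http://ec.europa.eu/eurostat/SDMX/diss-web/rest'  # assumed base URL, not from this diff
    symbols = 'nrg_pc_202'

    q = '{0}/data/{1}/?startperiod={2}&endperiod={3}'
    print(q.format(_URL, symbols, 2010, 2013))
    # -> http://ec.europa.eu/eurostat/SDMX/diss-web/rest/data/nrg_pc_202/?startperiod=2010&endperiod=2013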

pandas_datareader/io/sdmx.py

+43 -2

@@ -1,10 +1,14 @@
 from __future__ import unicode_literals
 
 import collections
+import time
+import zipfile
 
 import pandas as pd
+import pandas.compat as compat
 
 from pandas_datareader.io.util import _read_content
+from pandas_datareader._utils import HTTPError
 
 
 _STRUCTURE = '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure}'
@@ -47,7 +51,27 @@ def read_sdmx(path_or_buf, dtype='float64', dsd=None):
     import xml.etree.ElementTree as ET
     root = ET.fromstring(xdata)
 
-    structure = _get_child(root, _MESSAGE + 'Structure')
+    try:
+        structure = _get_child(root, _MESSAGE + 'Structure')
+    except ValueError:
+        # get zipped path
+        result = list(root.iter(_COMMON + 'Text'))[1].text
+        if not result.startswith('http'):
+            raise ValueError(result)
+
+        for _ in range(60):
+            # wait until the zipped data is prepared
+            try:
+                data = _read_zipped_sdmx(result)
+                return read_sdmx(data, dtype=dtype, dsd=dsd)
+            except HTTPError:
+                continue
+
+            time.sleep(1)
+        msg = ('Unable to download zipped data within 60 secs, '
+               'please download it manually from: {0}')
+        raise ValueError(msg.format(result))
+
     idx_name = structure.get('dimensionAtObservation')
     dataset = _get_child(root, _DATASET)
 
@@ -81,7 +105,12 @@ def _construct_series(values, name, dsd=None):
     for value in values:
 
         if name in times:
-            idx = pd.DatetimeIndex([v[0] for v in value], name=name)
+            tvalue = [v[0] for v in value]
+            try:
+                idx = pd.DatetimeIndex(tvalue, name=name)
+            except ValueError:
+                # time may be an unsupported format, like '2015-B1'
+                idx = pd.Index(tvalue, name=name)
         else:
            idx = pd.Index([v[0] for v in value], name=name)
 
@@ -197,3 +226,15 @@ def _read_sdmx_dsd(path_or_buf):
 
     result = SDMXCode(codes=code_results, ts=times)
     return result
+
+
+def _read_zipped_sdmx(path_or_buf):
+    """ Unzip data containing SDMX-XML """
+    data = _read_content(path_or_buf)
+
+    zp = compat.BytesIO()
+    zp.write(compat.str_to_bytes(data))
+    f = zipfile.ZipFile(zp)
+    files = f.namelist()
+    assert len(files) == 1
+    return f.open(files[0])
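
As a reference for the new _read_zipped_sdmx helper above, a self-contained sketch of the same in-memory unzip pattern, using a locally built one-member archive in place of the downloaded payload:

    import io
    import zipfile

    # Build a one-member zip in memory to stand in for the downloaded result.
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, 'w') as zf:
        zf.writestr('data.xml', '<message>stub SDMX payload</message>')

    # Mirror _read_zipped_sdmx: wrap the raw bytes, open the archive, and
    # return a file-like handle to its single member.
    zp = io.BytesIO(buf.getvalue())
    f = zipfile.ZipFile(zp)
    files = f.namelist()
    assert len(files) == 1
    print(f.open(files[0]).read())  # b'<message>stub SDMX payload</message>'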

pandas_datareader/tests/test_eurostat.py

+43 -12

@@ -6,6 +6,8 @@
 import pandas.util.testing as tm
 import pandas_datareader.data as web
 
+from pandas_datareader._utils import PANDAS_0170
+
 
 class TestEurostat(tm.TestCase):
 
@@ -40,33 +42,29 @@ def test_get_cdh_e_fos(self):
         tm.assert_frame_equal(df, expected)
 
     def test_get_sts_cobp_a(self):
-
-        raise nose.SkipTest("This raises error because of data amount, "
-                            "should be fixed in #149")
-
         # Building permits - annual data (2010 = 100)
         df = web.DataReader('sts_cobp_a', 'eurostat',
-                            start=pd.Timestamp('1992-01-01'),
+                            start=pd.Timestamp('2000-01-01'),
                             end=pd.Timestamp('2013-01-01'))
 
-        idx = pd.date_range('1992-01-01', '2013-01-01', freq='AS',
+        idx = pd.date_range('2000-01-01', '2013-01-01', freq='AS',
                             name='TIME_PERIOD')
-        ne_name = ('Building permits - m2 of useful floor area',
+        ne_name = ('Index, 2010=100',
+                   'Building permits - m2 of useful floor area',
                    'Unadjusted data (i.e. neither seasonally adjusted nor calendar adjusted data)',
                    'Non-residential buildings, except office buildings',
                    'Netherlands', 'Annual')
-        ne_values = [np.nan, np.nan, np.nan, 144.53, 136.97, 180.02, 198.36,
-                     215.12, 200.05, 186.46, 127.33, 130.67, 143.26, 147.83,
+        ne_values = [200.05, 186.46, 127.33, 130.67, 143.26, 147.83,
                      176.69, 227.36, 199.45, 128.49, 100.0, 113.83, 89.33,
                      77.57]
         ne = pd.Series(ne_values, name=ne_name, index=idx)
 
-        uk_name = ('Building permits - m2 of useful floor area',
+        uk_name = ('Index, 2010=100',
+                   'Building permits - m2 of useful floor area',
                    'Unadjusted data (i.e. neither seasonally adjusted nor calendar adjusted data)',
                    'Non-residential buildings, except office buildings',
                    'United Kingdom', 'Annual')
-        uk_values = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 120.37,
-                     115.93, 112.53, 113.32, 110.18, 112.14, 119.06, 112.66,
+        uk_values = [112.53, 113.32, 110.18, 112.14, 119.06, 112.66,
                      113.05, 121.8, 113.97, 105.88, 100.0, 98.56, 103.69,
                      81.32]
         uk = pd.Series(uk_values, name=uk_name, index=idx)
@@ -75,6 +73,39 @@ def test_get_sts_cobp_a(self):
         result = df[expected.name]
         tm.assert_series_equal(result, expected)
 
+    def test_get_nrg_pc_202(self):
+        # GH 149
+
+        if not PANDAS_0170:
+            raise nose.SkipTest("skip because of comparison failure")
+
+        df = web.DataReader('nrg_pc_202', 'eurostat',
+                            start=pd.Timestamp('2010-01-01'),
+                            end=pd.Timestamp('2013-01-01'))
+
+        name = ('All taxes and levies included',
+                'Gigajoules (Gross calorific value = GCV)',
+                'Euro',
+                'Band D1 : Consumption < 20 GJ',
+                'Natural gas', 'Denmark', 'Semi-annual')
+
+        exp_index = pd.Index(['2013-B2', '2013-B1', '2012-B2', '2012-B1',
+                              '2011-B2', '2011-B1', '2010-B2', '2010-B1'],
+                             name='TIME_PERIOD')
+        exp = pd.Series([27.1403, 27.5854, 26.5285, 27.2187,
+                         28.5862, 28.6448, 26.8147, 26.4979],
+                        index=exp_index, name=name)
+        tm.assert_series_equal(df[name], exp)
+
+    def test_get_prc_hicp_manr_exceeds_limit(self):
+        # GH 149
+        msg = 'Query size exceeds maximum limit'
+        with tm.assertRaisesRegexp(ValueError, msg):
+            web.DataReader('prc_hicp_manr', 'eurostat',
+                           start=pd.Timestamp('2000-01-01'),
+                           end=pd.Timestamp('2013-01-01'))
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)
