
Commit 6e257a1

Merge pull request #206 from sinhrks/eurostat
ENH: Retrieve zipped data from eurostat
2 parents 1a50ca7 + f41e9c1 commit 6e257a1

File tree

5 files changed, +106 -16 lines

docs/source/whatsnew/v0.2.3.txt

+5

@@ -22,6 +22,11 @@ New features
 - ``DataReader`` now supports Google options data source, see :ref:`here<remote_data.google_options>` (:issue:`148`).
 - ``DataReader`` now supports Google quotes, see :ref:`here<remote_data.google_quotes>` (:pull:`188`).
 
+Other enhancements
+^^^^^^^^^^^^^^^^^^
+
+- Eurostat reader now supports larger data returned from API via zip format (:pull:`205`)
+
 .. _whatsnew_023.api_breaking:
 
 Backwards incompatible API changes
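
For reference, a minimal usage sketch of the enhancement described in the whatsnew entry above, matching the call exercised in the updated tests (dataset code and date range taken from test_get_sts_cobp_a):

    import pandas as pd
    import pandas_datareader.data as web

    # Large Eurostat datasets are now delivered by the API as a link to a
    # zipped SDMX file; the reader downloads and parses it transparently.
    df = web.DataReader('sts_cobp_a', 'eurostat',
                        start=pd.Timestamp('2000-01-01'),
                        end=pd.Timestamp('2013-01-01'))
    print(df.head())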

pandas_datareader/_utils.py

+6

@@ -3,6 +3,12 @@
 from distutils.version import LooseVersion
 
 
+if pd.compat.PY3:
+    from urllib.error import HTTPError  # noqa
+else:
+    from urllib2 import HTTPError  # noqa
+
+
 class SymbolWarning(UserWarning):
     pass
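
The compat shim above is consumed elsewhere in this commit (io/sdmx.py), so downstream code can catch one exception class on both Python versions. A minimal sketch of that intended usage, with a placeholder URL:

    from pandas_datareader._utils import HTTPError
    from pandas_datareader.io.util import _read_content

    url = 'http://example.invalid/eurostat-result.zip'  # placeholder URL, not from this diff
    try:
        data = _read_content(url)
    except HTTPError:
        # same exception class on Python 2 (urllib2) and Python 3 (urllib.error)
        data = None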

pandas_datareader/eurostat.py

+9 -2

@@ -18,7 +18,9 @@ def url(self):
         if not isinstance(self.symbols, compat.string_types):
             raise ValueError('data name must be string')
 
-        return '{0}/data/{1}/?'.format(self._URL, self.symbols)
+        q = '{0}/data/{1}/?startperiod={2}&endperiod={3}'
+        return q.format(self._URL, self.symbols,
+                        self.start.year, self.end.year)
 
     @property
     def dsd_url(self):
@@ -37,7 +39,12 @@ def _read_one_data(self, url, params):
         try:
             data.index = pd.to_datetime(data.index)
             data = data.sort_index()
-            data = data.truncate(self.start, self.end)
         except ValueError:
             pass
+
+        try:
+            data = data.truncate(self.start, self.end)
+        except TypeError:
+            pass
+
         return data
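
For context, a rough sketch of the query URL the new format string produces. The base URL value below is illustrative only (self._URL is defined elsewhere in eurostat.py, not in this diff):

    # Illustrative values; self._URL and self.symbols come from the reader instance.
    _URL = 'http://ec.europa.eu/eurostat/SDMX/diss-web/rest'  # assumed base URL, not from this diff
    symbols = 'nrg_pc_202'

    q = '{0}/data/{1}/?startperiod={2}&endperiod={3}'
    print(q.format(_URL, symbols, 2010, 2013))
    # -> http://ec.europa.eu/eurostat/SDMX/diss-web/rest/data/nrg_pc_202/?startperiod=2010&endperiod=2013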

pandas_datareader/io/sdmx.py

+43 -2

@@ -1,10 +1,14 @@
 from __future__ import unicode_literals
 
 import collections
+import time
+import zipfile
 
 import pandas as pd
+import pandas.compat as compat
 
 from pandas_datareader.io.util import _read_content
+from pandas_datareader._utils import HTTPError
 
 
 _STRUCTURE = '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure}'
@@ -47,7 +51,27 @@ def read_sdmx(path_or_buf, dtype='float64', dsd=None):
     import xml.etree.ElementTree as ET
     root = ET.fromstring(xdata)
 
-    structure = _get_child(root, _MESSAGE + 'Structure')
+    try:
+        structure = _get_child(root, _MESSAGE + 'Structure')
+    except ValueError:
+        # get zipped path
+        result = list(root.iter(_COMMON + 'Text'))[1].text
+        if not result.startswith('http'):
+            raise ValueError(result)
+
+        for _ in range(60):
+            # wait until the zipped data is prepared
+            try:
+                data = _read_zipped_sdmx(result)
+                return read_sdmx(data, dtype=dtype, dsd=dsd)
+            except HTTPError:
+                continue
+
+            time.sleep(1)
+        msg = ('Unable to download zipped data within 60 secs, '
+               'please download it manually from: {0}')
+        raise ValueError(msg.format(result))
+
     idx_name = structure.get('dimensionAtObservation')
     dataset = _get_child(root, _DATASET)
 
@@ -81,7 +105,12 @@ def _construct_series(values, name, dsd=None):
     for value in values:
 
         if name in times:
-            idx = pd.DatetimeIndex([v[0] for v in value], name=name)
+            tvalue = [v[0] for v in value]
+            try:
+                idx = pd.DatetimeIndex(tvalue, name=name)
+            except ValueError:
+                # time may be an unsupported format, like '2015-B1'
+                idx = pd.Index(tvalue, name=name)
         else:
            idx = pd.Index([v[0] for v in value], name=name)
 
@@ -197,3 +226,15 @@ def _read_sdmx_dsd(path_or_buf):
 
     result = SDMXCode(codes=code_results, ts=times)
     return result
+
+
+def _read_zipped_sdmx(path_or_buf):
+    """ Unzip data containing SDMX-XML """
+    data = _read_content(path_or_buf)
+
+    zp = compat.BytesIO()
+    zp.write(compat.str_to_bytes(data))
+    f = zipfile.ZipFile(zp)
+    files = f.namelist()
+    assert len(files) == 1
+    return f.open(files[0])
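
As a reference for the new _read_zipped_sdmx helper above, a self-contained sketch of the same in-memory unzip pattern, using a locally built one-member archive in place of the downloaded payload:

    import io
    import zipfile

    # Build a one-member zip in memory to stand in for the downloaded result.
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, 'w') as zf:
        zf.writestr('data.xml', '<message>stub SDMX payload</message>')

    # Mirror _read_zipped_sdmx: wrap the raw bytes, open the archive, and
    # return a file-like handle to its single member.
    zp = io.BytesIO(buf.getvalue())
    f = zipfile.ZipFile(zp)
    files = f.namelist()
    assert len(files) == 1
    print(f.open(files[0]).read())  # b'<message>stub SDMX payload</message>'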

pandas_datareader/tests/test_eurostat.py

+43 -12

@@ -6,6 +6,8 @@
 import pandas.util.testing as tm
 import pandas_datareader.data as web
 
+from pandas_datareader._utils import PANDAS_0170
+
 
 class TestEurostat(tm.TestCase):
 
@@ -40,33 +42,29 @@ def test_get_cdh_e_fos(self):
         tm.assert_frame_equal(df, expected)
 
     def test_get_sts_cobp_a(self):
-
-        raise nose.SkipTest("This raises error because of data amount, "
-                            "should be fixed in #149")
-
         # Building permits - annual data (2010 = 100)
         df = web.DataReader('sts_cobp_a', 'eurostat',
-                            start=pd.Timestamp('1992-01-01'),
+                            start=pd.Timestamp('2000-01-01'),
                             end=pd.Timestamp('2013-01-01'))
 
-        idx = pd.date_range('1992-01-01', '2013-01-01', freq='AS',
+        idx = pd.date_range('2000-01-01', '2013-01-01', freq='AS',
                             name='TIME_PERIOD')
-        ne_name = ('Building permits - m2 of useful floor area',
+        ne_name = ('Index, 2010=100',
+                   'Building permits - m2 of useful floor area',
                    'Unadjusted data (i.e. neither seasonally adjusted nor calendar adjusted data)',
                    'Non-residential buildings, except office buildings',
                    'Netherlands', 'Annual')
-        ne_values = [np.nan, np.nan, np.nan, 144.53, 136.97, 180.02, 198.36,
-                     215.12, 200.05, 186.46, 127.33, 130.67, 143.26, 147.83,
+        ne_values = [200.05, 186.46, 127.33, 130.67, 143.26, 147.83,
                      176.69, 227.36, 199.45, 128.49, 100.0, 113.83, 89.33,
                      77.57]
         ne = pd.Series(ne_values, name=ne_name, index=idx)
 
-        uk_name = ('Building permits - m2 of useful floor area',
+        uk_name = ('Index, 2010=100',
+                   'Building permits - m2 of useful floor area',
                    'Unadjusted data (i.e. neither seasonally adjusted nor calendar adjusted data)',
                    'Non-residential buildings, except office buildings',
                    'United Kingdom', 'Annual')
-        uk_values = [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 120.37,
-                     115.93, 112.53, 113.32, 110.18, 112.14, 119.06, 112.66,
+        uk_values = [112.53, 113.32, 110.18, 112.14, 119.06, 112.66,
                      113.05, 121.8, 113.97, 105.88, 100.0, 98.56, 103.69,
                      81.32]
         uk = pd.Series(uk_values, name=uk_name, index=idx)
@@ -75,6 +73,39 @@ def test_get_sts_cobp_a(self):
         result = df[expected.name]
         tm.assert_series_equal(result, expected)
 
+    def test_get_nrg_pc_202(self):
+        # GH 149
+
+        if not PANDAS_0170:
+            raise nose.SkipTest("skip because of comparison failure")
+
+        df = web.DataReader('nrg_pc_202', 'eurostat',
+                            start=pd.Timestamp('2010-01-01'),
+                            end=pd.Timestamp('2013-01-01'))
+
+        name = ('All taxes and levies included',
+                'Gigajoules (Gross calorific value = GCV)',
+                'Euro',
+                'Band D1 : Consumption < 20 GJ',
+                'Natural gas', 'Denmark', 'Semi-annual')
+
+        exp_index = pd.Index(['2013-B2', '2013-B1', '2012-B2', '2012-B1',
+                              '2011-B2', '2011-B1', '2010-B2', '2010-B1'],
+                             name='TIME_PERIOD')
+        exp = pd.Series([27.1403, 27.5854, 26.5285, 27.2187,
+                         28.5862, 28.6448, 26.8147, 26.4979],
+                        index=exp_index, name=name)
+        tm.assert_series_equal(df[name], exp)
+
+    def test_get_prc_hicp_manr_exceeds_limit(self):
+        # GH 149
+        msg = 'Query size exceeds maximum limit'
+        with tm.assertRaisesRegexp(ValueError, msg):
+            web.DataReader('prc_hicp_manr', 'eurostat',
+                           start=pd.Timestamp('2000-01-01'),
+                           end=pd.Timestamp('2013-01-01'))
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)
