pydata · rgkimball · May 17, 2017 · May 17, 2017 · May 17, 2017 · May 17, 2017
diff --git a/pandas_datareader/base.py b/pandas_datareader/base.py
@@ -53,6 +53,7 @@ def __init__(self, symbols, start=None, end=None,
         self.retry_count = retry_count
         self.pause = pause
         self.timeout = timeout
+        self.pause_multiplier = 1
         self.session = _init_session(session, retry_count)
 
     @property
@@ -85,6 +86,10 @@ def _read_url_as_StringIO(self, url, params=None):
         response = self._get_response(url, params=params)
         text = self._sanitize_response(response)
         out = StringIO()
+        if len(text) == 0:
+            service = self.__class__.__name__
+            raise IOError("{} request returned no data; check URL for invalid "
+                          "inputs: {}".format(service, self.url))
         if isinstance(text, compat.binary_type):
             out.write(bytes_to_str(text))
         else:
@@ -99,7 +104,7 @@ def _sanitize_response(response):
         """
         return response.content
 
-    def _get_response(self, url, params=None):
+    def _get_response(self, url, params=None, headers=None):
         """ send raw HTTP request to get requests.Response from the specified url
         Parameters
         ----------
@@ -110,15 +115,26 @@ def _get_response(self, url, params=None):
         """
 
         # initial attempt + retry
+        pause = self.pause
         for i in range(self.retry_count + 1):
-            response = self.session.get(url, params=params)
+            response = self.session.get(url, params=params, headers=headers)
             if response.status_code == requests.codes.ok:
                 return response
-            time.sleep(self.pause)
+            time.sleep(pause)
+
+            # Increase time between subsequent requests, per subclass.
+            pause *= self.pause_multiplier
+            # Params are a dict; refresh its crumb in case ours is invalidated
+            if isinstance(params, dict) and 'crumb' in params:
+                params['crumb'] = self._get_crumb(self.retry_count)
         if params is not None and len(params) > 0:
             url = url + "?" + urlencode(params)
         raise RemoteDataError('Unable to read URL: {0}'.format(url))
 
+    def _get_crumb(self, *args):
+        """ To be implemented by subclass """
+        raise NotImplementedError("Subclass has not implemented method.")
+
     def _read_lines(self, out):
         rs = read_csv(out, index_col=0, parse_dates=True, na_values='-')[::-1]
         # Yahoo! Finance sometimes does this awesome thing where they

diff --git a/pandas_datareader/data.py b/pandas_datareader/data.py
@@ -9,7 +9,7 @@
 
 from pandas_datareader.yahoo.daily import YahooDailyReader
 from pandas_datareader.yahoo.quotes import YahooQuotesReader
-from pandas_datareader.yahoo.actions import YahooActionReader
+from pandas_datareader.yahoo.actions import (YahooActionReader, YahooDivReader)
 from pandas_datareader.yahoo.components import _get_data as get_components_yahoo  # noqa
 from pandas_datareader.yahoo.options import Options as YahooOptions
 from pandas_datareader.google.options import Options as GoogleOptions
@@ -121,10 +121,10 @@ def DataReader(name, data_source=None, start=None, end=None,
                                  retry_count=retry_count, pause=pause,
                                  session=session).read()
     elif data_source == "yahoo-dividends":
-        return YahooDailyReader(symbols=name, start=start, end=end,
-                                adjust_price=False, chunksize=25,
-                                retry_count=retry_count, pause=pause,
-                                session=session, interval='v').read()
+        return YahooDivReader(symbols=name, start=start, end=end,
+                              adjust_price=False, chunksize=25,
+                              retry_count=retry_count, pause=pause,
+                              session=session, interval='d').read()
 
     elif data_source == "google":
         return GoogleDailyReader(symbols=name, start=start, end=end,

diff --git a/pandas_datareader/tests/yahoo/test_yahoo.py b/pandas_datareader/tests/yahoo/test_yahoo.py
@@ -108,18 +108,13 @@ def test_get_data_interval(self):
         # weekly interval data
         pan = web.get_data_yahoo('XOM', '2013-01-01',
                                  '2013-12-31', interval='w')
-        assert len(pan) == 53
+        assert len(pan) == 52
 
-        # montly interval data
-        pan = web.get_data_yahoo('XOM', '2013-01-01',
+        # monthly interval data
+        pan = web.get_data_yahoo('XOM', '2012-12-31',
                                  '2013-12-31', interval='m')
         assert len(pan) == 12
 
-        # dividend data
-        pan = web.get_data_yahoo('XOM', '2013-01-01',
-                                 '2013-12-31', interval='v')
-        assert len(pan) == 4
-
         # test fail on invalid interval
         with pytest.raises(ValueError):
             web.get_data_yahoo('XOM', interval='NOT VALID')
@@ -132,17 +127,18 @@ def test_get_data_multiple_symbols(self):
     def test_get_data_multiple_symbols_two_dates(self):
         pan = web.get_data_yahoo(['GE', 'MSFT', 'INTC'], 'JAN-01-12',
                                  'JAN-31-12')
-        result = pan.Close.ix['01-18-12']
-        assert len(result) == 3
+        result = pan.Close['01-18-12'].T
+        assert result.size == 3
 
         # sanity checking: every column must hold floating point data
-        assert np.issubdtype(result.dtype, np.floating)
+        assert all(np.issubdtype(dt, np.floating) for dt in result.dtypes)
 
         expected = np.array([[18.99, 28.4, 25.18],
                              [18.58, 28.31, 25.13],
                              [19.03, 28.16, 25.52],
                              [18.81, 28.82, 25.87]])
-        result = pan.Open.ix['Jan-15-12':'Jan-20-12']
+        df = pan.Open
+        result = df[(df.index >= 'Jan-15-12') & (df.index <= 'Jan-20-12')]
         assert expected.shape == result.shape
 
     def test_get_date_ret_index(self):
@@ -212,6 +208,8 @@ def test_yahoo_DataReader(self):
                                       0.47, 0.43571, 0.43571, 0.43571,
                                       0.43571, 0.37857, 0.37857, 0.37857]},
                            index=exp_idx)
+        exp.index.name = 'Date'
+
         tm.assert_frame_equal(result, exp)
 
     def test_yahoo_DataReader_multi(self):

diff --git a/pandas_datareader/yahoo/actions.py b/pandas_datareader/yahoo/actions.py
@@ -1,61 +1,53 @@
-import csv
-from pandas import to_datetime, DataFrame
+from pandas import (concat, DataFrame)
+from pandas_datareader.yahoo.daily import YahooDailyReader
 
-from pandas_datareader.base import _DailyBaseReader
-
-
-class YahooActionReader(_DailyBaseReader):
 
+class YahooActionReader(YahooDailyReader):
     """
     Returns DataFrame of historical corporate actions (dividends and stock
     splits) from symbols, over date range, start to end. All dates in the
     resulting DataFrame correspond with dividend and stock split ex-dates.
     """
+    def read(self):
+        dividends = YahooDivReader(symbols=self.symbols,
+                                   start=self.start,
+                                   end=self.end,
+                                   retry_count=self.retry_count,
+                                   pause=self.pause,
+                                   session=self.session).read()
+        # Add a label column so we can combine our two DFs
+        if isinstance(dividends, DataFrame):
+            dividends["action"] = "DIVIDEND"
+            dividends = dividends.rename(columns={'Dividends': 'value'})
+
+        splits = YahooSplitReader(symbols=self.symbols,
+                                  start=self.start,
+                                  end=self.end,
+                                  retry_count=self.retry_count,
+                                  pause=self.pause,
+                                  session=self.session).read()
+        # Add a label column so we can combine our two DFs
+        if isinstance(splits, DataFrame):
+            splits["action"] = "SPLIT"
+            splits = splits.rename(columns={'Stock Splits': 'value'})
+            # Fractional split text (i.e. "2/1") becomes its reciprocal ratio;
+            # parsed by hand, never eval()'d, as it is untrusted remote data
+            splits['value'] = [float(den) / float(num) for num, den in (s.split('/') for s in splits['value'])]  # noqa
+
+        output = concat([dividends, splits]).sort_index(ascending=False)
+
+        return output
+
+
+class YahooDivReader(YahooDailyReader):
+
+    @property
+    def service(self):
+        return 'div'
+
+
+class YahooSplitReader(YahooDailyReader):
 
     @property
-    def url(self):
-        return 'http://ichart.finance.yahoo.com/x'
-
-    def _get_params(self, symbols=None):
-        params = {
-            's': self.symbols,
-            'a': self.start.month - 1,
-            'b': self.start.day,
-            'c': self.start.year,
-            'd': self.end.month - 1,
-            'e': self.end.day,
-            'f': self.end.year,
-            'g': 'v'
-        }
-        return params
-
-    def _read_lines(self, out):
-        actions_index = []
-        actions_entries = []
-
-        for line in csv.reader(out.readlines()):
-            # Ignore lines that aren't dividends or splits (Yahoo
-            # add a bunch of irrelevant fields.)
-            if len(line) != 3 or line[0] not in ('DIVIDEND', 'SPLIT'):
-                continue
-
-            action, date, value = line
-            if action == 'DIVIDEND':
-                actions_index.append(to_datetime(date))
-                actions_entries.append({
-                    'action': action,
-                    'value': float(value)
-                })
-            elif action == 'SPLIT' and ':' in value:
-                # Convert the split ratio to a fraction. For example a
-                # 4:1 split expressed as a fraction is 1/4 = 0.25.
-                denominator, numerator = value.split(':', 1)
-                split_fraction = float(numerator) / float(denominator)
-
-                actions_index.append(to_datetime(date))
-                actions_entries.append({
-                    'action': action,
-                    'value': split_fraction
-                })
-
-        return DataFrame(actions_entries, index=actions_index)
+    def service(self):
+        return 'split'
diff --git a/pandas_datareader/yahoo/daily.py b/pandas_datareader/yahoo/daily.py
@@ -1,4 +1,10 @@
-from pandas_datareader.base import _DailyBaseReader
+import re
+import time
+import warnings
+import numpy as np
+from pandas import Panel
+from pandas_datareader.base import (_DailyBaseReader, _in_chunks)
+from pandas_datareader._utils import (RemoteDataError, SymbolWarning)
 
 
 class YahooDailyReader(_DailyBaseReader):
@@ -39,36 +45,66 @@ class YahooDailyReader(_DailyBaseReader):
     """
 
     def __init__(self, symbols=None, start=None, end=None, retry_count=3,
-                 pause=0.001, session=None, adjust_price=False,
+                 pause=0.35, session=None, adjust_price=False,
                  ret_index=False, chunksize=25, interval='d'):
         super(YahooDailyReader, self).__init__(symbols=symbols,
                                                start=start, end=end,
                                                retry_count=retry_count,
                                                pause=pause, session=session,
                                                chunksize=chunksize)
+        # Ladder up the wait time between subsequent requests to improve
+        # probability of a successful retry
+        self.pause_multiplier = 2.5
+
+        self.headers = {
+            'Connection': 'keep-alive',
+            'Expires': str(-1),
+            'Upgrade-Insecure-Requests': str(1),
+            # Google Chrome:
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'  # noqa
+        }
+
         self.adjust_price = adjust_price
         self.ret_index = ret_index
-
-        if interval not in ['d', 'w', 'm', 'v']:
-            raise ValueError("Invalid interval: valid values are "
-                             "'d', 'w', 'm' and 'v'")
         self.interval = interval
 
+        if self.interval not in ['d', 'wk', 'mo', 'm', 'w']:
+            raise ValueError("Invalid interval: valid values are  'd', 'wk' and 'mo'. 'm' and 'w' have been implemented for "  # noqa
+                             "backward compatibility. 'v' has been moved to the yahoo-actions or yahoo-dividends APIs.")  # noqa
+        elif self.interval in ['m', 'mo']:
+            self.pdinterval = 'm'
+            self.interval = 'mo'
+        elif self.interval in ['w', 'wk']:
+            self.pdinterval = 'w'
+            self.interval = 'wk'
+
+        self.interval = '1' + self.interval
+        self.crumb = self._get_crumb(retry_count)
+
+    @property
+    def service(self):
+        return 'history'
+
     @property
     def url(self):
-        return 'http://ichart.finance.yahoo.com/table.csv'
+        return 'https://query1.finance.yahoo.com/v7/finance/download/{}'\
+            .format(self.symbols)
+
+    @staticmethod
+    def yurl(symbol):
+        return 'https://query1.finance.yahoo.com/v7/finance/download/{}'\
+            .format(symbol)
 
     def _get_params(self, symbol):
+        unix_start = int(time.mktime(self.start.timetuple()))
+        unix_end = int(time.mktime(self.end.timetuple()))
+
         params = {
-            's': symbol,
-            'a': self.start.month - 1,
-            'b': self.start.day,
-            'c': self.start.year,
-            'd': self.end.month - 1,
-            'e': self.end.day,
-            'f': self.end.year,
-            'g': self.interval,
-            'ignore': '.csv'
+            'period1': unix_start,
+            'period2': unix_end,
+            'interval': self.interval,
+            'events': self.service,
+            'crumb': self.crumb
         }
         return params
 
@@ -79,7 +115,49 @@ def read(self):
             df['Ret_Index'] = _calc_return_index(df['Adj Close'])
         if self.adjust_price:
             df = _adjust_prices(df)
-        return df
+        return df.sort_index()
+
+    def _dl_mult_symbols(self, symbols):
+        stocks = {}
+        failed = []
+        passed = []
+        for sym_group in _in_chunks(symbols, self.chunksize):
+            for sym in sym_group:
+                try:
+                    stocks[sym] = self._read_one_data(self.yurl(sym),
+                                                      self._get_params(sym))
+                    passed.append(sym)
+                except IOError:
+                    msg = 'Failed to read symbol: {0!r}, replacing with NaN.'
+                    warnings.warn(msg.format(sym), SymbolWarning)
+                    failed.append(sym)
+
+        if len(passed) == 0:
+            msg = "No data fetched using {0!r}"
+            raise RemoteDataError(msg.format(self.__class__.__name__))
+        try:
+            if len(stocks) > 0 and len(failed) > 0 and len(passed) > 0:
+                df_na = stocks[passed[0]].copy()
+                df_na[:] = np.nan
+                for sym in failed:
+                    stocks[sym] = df_na
+            return Panel(stocks).swapaxes('items', 'minor')
+        except AttributeError:
+            # cannot construct a panel with just 1D nans indicating no data
+            msg = "No data fetched using {0!r}"
+            raise RemoteDataError(msg.format(self.__class__.__name__))
+
+    def _get_crumb(self, retries):
+        """ Scrape a Yahoo! history page for a valid crumb ID """
+        tu = "https://finance.yahoo.com/quote/{}/history".format(self.symbols)
+        response = self._get_response(tu,
+                                      params=self.params, headers=self.headers)
+        out = str(self._sanitize_response(response))
+        # Matches: {"crumb":"AlphaNumeric"}; a page without one is an error
+        match = re.search('"CrumbStore":{"crumb":"([^"]+)"}', out)
+        if match is None:
+            raise RemoteDataError('Unable to find crumb in: {0}'.format(tu))
+        return match.group(1).encode('ascii').decode('unicode-escape')
 
 
 def _adjust_prices(hist_data, price_list=None):