pandas-dev · nehalecky · Jan 29, 2013 · Feb 2, 2013 · Feb 5, 2013
diff --git a/pandas/io/data.py b/pandas/io/data.py
@@ -3,6 +3,7 @@
 
 
 """
+import warnings
 
 import numpy as np
 import datetime as dt
@@ -13,7 +14,7 @@
 from zipfile import ZipFile
 from pandas.util.py3compat import StringIO, BytesIO, bytes_to_str
 
-from pandas import DataFrame, read_csv, concat
+from pandas import Panel, DataFrame, Series, read_csv, concat
 from pandas.io.parsers import TextParser
 
 
@@ -54,7 +55,8 @@ def DataReader(name, data_source=None, start=None, end=None,
     start, end = _sanitize_dates(start, end)
 
     if(data_source == "yahoo"):
-        return get_data_yahoo(name=name, start=start, end=end,
+        return get_data_yahoo(symbols=name, start=start, end=end,
+                              adjust_price=False, chunk=25,
                               retry_count=retry_count, pause=pause)
     elif(data_source == "fred"):
         return get_data_fred(name=name, start=start, end=end)
@@ -73,14 +75,27 @@ def _sanitize_dates(start, end):
     return start, end
 
 
+def _in_chunks(seq, size):
+    """
+    Return a generator of consecutive slices of seq, each at most size long.
+    """
+    return (seq[pos:pos + size] for pos in xrange(0, len(seq), size))
+
+
 def get_quote_yahoo(symbols):
     """
     Get current yahoo quote
 
     Returns a DataFrame
     """
-    if not isinstance(symbols, list):
-        raise TypeError("symbols must be a list")
+    if isinstance(symbols, str):
+        sym_list = symbols
+    elif not isinstance(symbols, Series):
+        symbols  = Series(symbols)
+        sym_list = str.join('+', symbols)
+    else:
+        sym_list = str.join('+', symbols)
+
     # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm
     codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r',
              'time': 't1', 'short_ratio': 's7'}
@@ -90,7 +105,7 @@ def get_quote_yahoo(symbols):
     data = dict(zip(codes.keys(), [[] for i in range(len(codes))]))
 
     urlStr = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % (
-        str.join('+', symbols), request)
+        sym_list, request)
 
     try:
         lines = urllib2.urlopen(urlStr).readlines()
@@ -117,22 +132,23 @@ def get_quote_yahoo(symbols):
     return DataFrame(data, index=idx)
 
 
-def get_data_yahoo(name=None, start=None, end=None, retry_count=3, pause=0):
+def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
+                    pause=0):
     """
     Get historical data for the given name from yahoo.
     Date format is datetime
 
     Returns a DataFrame.
     """
-    start, end = _sanitize_dates(start, end)
-
-    if(name is None):
-        print "Need to provide a name"
+    if(sym is None):
+        warnings.warn("Need to provide a name.")
         return None
 
+    start, end = _sanitize_dates(start, end)
+
     yahoo_URL = 'http://ichart.yahoo.com/table.csv?'
 
-    url = yahoo_URL + 's=%s' % name + \
+    url = yahoo_URL + 's=%s' % sym + \
         '&a=%s' % (start.month - 1) + \
         '&b=%s' % start.day + \
         '&c=%s' % start.year + \
@@ -162,6 +178,164 @@ def get_data_yahoo(name=None, start=None, end=None, retry_count=3, pause=0):
                     "return a 200 for url %s" % (pause, url))
 
 
+def _adjust_prices(hist_data, price_list=['Open', 'High', 'Low', 'Close']):
+    """
+    Return a copy of hist_data with every item in price_list multiplied by
+    'Adj Close' / 'Close'. Adds 'Adj_Ratio' and removes 'Adj Close'.
+    """
+    adj_ratio = hist_data['Adj Close'] / hist_data['Close']
+
+    data = hist_data.copy()
+    for item in price_list:  # price_list is only read here, never mutated
+        data[item] = hist_data[item] * adj_ratio
+    data['Adj_Ratio'] = adj_ratio
+    del data['Adj Close']
+    return data
+
+
+def _calc_return_index(price_df):
+    """
+    Return cumprod(1 + pct_change) of a price DataFrame or Series.
+    """
+
+    ret_index =  price_df.pct_change().add(1).cumprod()
+    ret_index.ix[0] = 1  # overwrite the first row so the index starts at 1
+    return ret_index
+
+
+def get_components_yahoo(idx_sym):
+    """
+    Returns DataFrame containing list of component information for
+    index represented in idx_sym from yahoo. Includes component symbol
+    (ticker), exchange, and name.
+
+    Parameters
+    ----------
+    idx_sym : str
+        Stock index symbol
+        Examples:
+        '^DJI' (Dow Jones Industrial Average)
+        '^NYA' (NYSE Composite)
+        '^IXIC' (NASDAQ Composite)
+
+        See: http://finance.yahoo.com/indices for other index symbols
+
+    Returns
+    -------
+    idx_df : DataFrame indexed by ticker, with 'name' and 'exchange' columns
+    """
+    stats = 'snx'  # sent as f=; each row parses to ticker, name, exchange
+    # URL of form:
+    # http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv
+    url = 'http://download.finance.yahoo.com/d/quotes.csv?s={0}&f={1}' \
+          '&e=.csv&h={2}'
+
+    idx_mod = idx_sym.replace('^', '@%5E')  # '^' becomes '@' + URL-encoded '^'
+    urlStr = url.format(idx_mod, stats, 1)  # NOTE(review): reassigned in loop
+
+    idx_df = DataFrame()
+    mask = [True]
+    comp_idx = 1
+
+    # Page through the components 50 at a time (comp_idx is sent as h=),
+    # stopping once a page contributes no ticker not already in idx_df.
+    while (True in mask):
+        urlStr = url.format(idx_mod, stats,  comp_idx)
+        lines = (urllib.urlopen(urlStr).read().strip().
+                 strip('"').split('"\r\n"'))  # NOTE(review): confirm urllib import
+
+        lines = [line.strip().split('","') for line in lines]
+
+        temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange'])
+        temp_df = temp_df.drop_duplicates()
+        temp_df = temp_df.set_index('ticker')
+        mask = ~temp_df.index.isin(idx_df.index)  # True for unseen tickers
+
+        comp_idx = comp_idx + 50
+        idx_df = idx_df.append(temp_df[mask])
+
+    return idx_df
+
+
+def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, pause=0,
+                   adjust_price=False, ret_index=False, chunksize=25, **kwargs):
+    """
+    Returns DataFrame/Panel of historical stock prices from symbols, over date
+    range, start to end. To avoid being penalized by Yahoo! Finance servers,
+    pauses between downloading 'chunks' of symbols can be specified.
+
+    Parameters
+    ----------
+    symbols : string, list-like object (list, tuple, Series), or DataFrame
+        Single stock symbol (ticker), list-like object of symbols or
+        DataFrame with index containing stock symbols.
+    start : string, (defaults to '1/1/2010')
+        Starting date, timestamp. Parses many different kind of date
+        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
+    end : string, (defaults to today)
+        Ending date, timestamp. Same format as starting date.
+    retry_count : int, default 3
+        Number of times to retry a query. NOTE(review): unused in this body.
+    pause : int, default 0
+        Time, in seconds, to sleep after each chunk of symbols. NOTE(review):
+        not forwarded to the single-symbol download; confirm intent.
+    adjust_price : bool, default False
+        If True, adjusts all prices in hist_data ('Open', 'High', 'Low', 'Close')
+        based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops
+        'Adj Close'.
+    ret_index : bool, default False
+        If True, includes a simple return index 'Ret_Index' in hist_data.
+    chunksize : int, default 25
+        Number of symbols to download consecutively before initiating pause.
+
+    Returns
+    -------
+    hist_data : DataFrame (str) or Panel (list-like object, DataFrame)
+    """
+
+    def dl_mult_symbols(symbols):
+        stocks = {}
+        for sym_group in _in_chunks(symbols, chunksize):
+            for sym in sym_group:
+                try:
+                    stocks[sym] = _get_hist_yahoo(sym, start=start,
+                                                  end=end, **kwargs)
+                except:  # NOTE(review): bare except hides every failure
+                    warnings.warn('Error with sym: ' + sym + '... skipping.')
+
+            time.sleep(pause)  # throttle once per chunk of symbols
+
+        return Panel(stocks).swapaxes('items', 'minor')
+
+    if 'name' in kwargs:
+        warnings.warn("Arg 'name' is deprecated, please use 'symbols' instead.",
+                      FutureWarning)
+        symbols = kwargs['name']  # NOTE(review): 'name' is left in kwargs
+
+    #If a single symbol, (e.g., 'GOOG')
+    if isinstance(symbols, (str, int)):
+        sym = symbols
+        hist_data = _get_hist_yahoo(sym, start=start, end=end)
+    #Or a DataFrame whose index holds the symbols
+    elif isinstance(symbols, DataFrame):
+        try:
+            hist_data = dl_mult_symbols(Series(symbols.index))
+        except ValueError:
+            raise  # re-raised unchanged
+    else: #Guess a Series; on TypeError retry with the input wrapped in Series
+        try:
+            hist_data = dl_mult_symbols(symbols)
+        except TypeError:
+            hist_data = dl_mult_symbols(Series(symbols))
+
+    if(ret_index):
+        hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close'])
+    if(adjust_price):
+        hist_data = _adjust_prices(hist_data)
+
+    return hist_data
+
+
 def get_data_fred(name=None, start=dt.datetime(2010, 1, 1),
                   end=dt.datetime.today()):
     """

diff --git a/pandas/io/tests/test_yahoo.py b/pandas/io/tests/test_yahoo.py
@@ -1,14 +1,16 @@
-from pandas.util.py3compat import StringIO, BytesIO
-from datetime import datetime
-import csv
-import os
-import sys
-import re
 import unittest
-import pandas.io.data as pd
 import nose
-from pandas.util.testing import network
+from datetime import datetime
+
+from pandas.util.py3compat import StringIO, BytesIO
+
+import pandas as pd
+import pandas.io.data as web
+from pandas.util.testing import (network, assert_frame_equal,
+                                 assert_series_equal,
+                                 assert_almost_equal)
 from numpy.testing.decorators import slow
+
 import urllib2
 
 
@@ -21,16 +23,16 @@ def test_yahoo(self):
         # an excecption when DataReader can't get a 200 response from
         # yahoo
         start = datetime(2010, 1, 1)
-        end = datetime(2012, 1, 24)
+        end = datetime(2013, 01, 27)
 
         try:
             self.assertEquals(
-                pd.DataReader("F", 'yahoo', start, end)['Close'][-1],
-                12.82)
+                web.DataReader("F", 'yahoo', start, end)['Close'][-1],
+                13.68)
 
             self.assertRaises(
                 Exception,
-                lambda: pd.DataReader("NON EXISTENT TICKER", 'yahoo',
+                lambda: web.DataReader("NON EXISTENT TICKER", 'yahoo',
                                       start, end))
         except urllib2.URLError:
             try:
@@ -40,7 +42,71 @@ def test_yahoo(self):
             else:
                 raise
 
+
+    @slow
+    @network
+    def test_get_quote(self):
+        df = web.get_quote_yahoo(pd.Series(['GOOG', 'AAPL', 'GOOG']))
+        assert_series_equal(df.ix[0], df.ix[2])  # rows 0 and 2 are both GOOG
+
+
+    @slow
+    @network
+    def test_get_components(self):
+
+        df = web.get_components_yahoo('^DJI') #Dow Jones
+        assert isinstance(df, pd.DataFrame)
+        assert len(df) == 30
+
+        df = web.get_components_yahoo('^GDAXI') #DAX
+        assert isinstance(df, pd.DataFrame)
+        assert len(df) == 30
+        assert df[df.name.str.contains('adidas', case=False)].index == 'ADS.DE'
+
+        df = web.get_components_yahoo('^NDX') #NASDAQ-100
+        assert isinstance(df, pd.DataFrame)
+        #assert len(df) == 100
+        # Spot-check tickers expected to stay in the index for a while
+        assert 'AAPL' in df.index
+        assert 'GOOG' in df.index
+        assert 'AMZN' in df.index
+
+    @slow
+    @network
+    def test_get_data(self):
+        # Single symbol: returns a DataFrame
+        #http://finance.yahoo.com/q/hp?s=GOOG&a=09&b=08&c=2010&d=09&e=10&f=2010&g=d
+        df = web.get_data_yahoo('GOOG')
+        assert df.Volume.ix['OCT-08-2010'] == 2859200
+
+        sl = ['AAPL', 'AMZN', 'GOOG']
+        pan = web.get_data_yahoo(sl, '2012')
+        ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG]
+        assert ts[0].dayofyear == 96  # first date with AAPL close above GOOG
+
+        dfi = web.get_components_yahoo('^DJI')
+        pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-12')
+        expected = [19.02, 28.23, 25.39]
+        result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist()
+        assert result == expected
+
+        pan = web.get_data_yahoo(dfi, 'JAN-01-12', 'JAN-31-12',
+                                 adjust_price=True)
+        expected = [18.38, 27.45, 24.54]
+        result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist()
+        assert result == expected
+
+        pan = web.get_data_yahoo(dfi, '2011', ret_index=True)
+        d = [[ 1.01757469,  1.01130524,  1.02414183],
+             [ 1.00292912,  1.00770812,  1.01735194],
+             [ 1.00820152,  1.00462487,  1.01320257],
+             [ 1.08025776,  0.99845838,  1.00113165]]
+
+        expected = pd.DataFrame(d)
+        result = pan.Ret_Index.ix['01-18-11':'01-21-11'][['GE', 'INTC', 'MSFT']]
+        assert_almost_equal(result.values, expected.values)
+
+
 if __name__ == '__main__':
-    import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                    exit=False)