Skip to content

ENH: Improved Yahoo finance api functionality #2795

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 185 additions & 11 deletions pandas/io/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@


"""
import warnings

import numpy as np
import datetime as dt
Expand All @@ -13,7 +14,7 @@
from zipfile import ZipFile
from pandas.util.py3compat import StringIO, BytesIO, bytes_to_str

from pandas import DataFrame, read_csv, concat
from pandas import Panel, DataFrame, Series, read_csv, concat
from pandas.io.parsers import TextParser


Expand Down Expand Up @@ -54,7 +55,8 @@ def DataReader(name, data_source=None, start=None, end=None,
start, end = _sanitize_dates(start, end)

if(data_source == "yahoo"):
return get_data_yahoo(name=name, start=start, end=end,
return get_data_yahoo(symbols=name, start=start, end=end,
adjust_price=False, chunk=25,
retry_count=retry_count, pause=pause)
elif(data_source == "fred"):
return get_data_fred(name=name, start=start, end=end)
Expand All @@ -73,14 +75,27 @@ def _sanitize_dates(start, end):
return start, end


def _in_chunks(seq, size):
    """
    Return sequence in 'chunks' of size defined by size

    Parameters
    ----------
    seq : object supporting ``len()`` and slicing (list, tuple, str, ...)
    size : int
        Maximum number of elements per chunk.

    Returns
    -------
    generator yielding the consecutive slices ``seq[pos:pos + size]``.
    The last chunk may be shorter than ``size``; an empty ``seq`` yields
    nothing.

    Notes
    -----
    ``len(seq)`` and the ``range`` call are evaluated when this function is
    called, not when the generator is consumed, so an object without a
    length raises ``TypeError`` (and ``size == 0`` raises ``ValueError``)
    immediately.
    """
    # ``range`` instead of ``xrange``: identical chunks, but the helper
    # then runs unchanged on both Python 2 and Python 3.
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))


def get_quote_yahoo(symbols):
"""
Get current yahoo quote

Returns a DataFrame
"""
if not isinstance(symbols, list):
raise TypeError("symbols must be a list")
if isinstance(symbols, str):
sym_list = symbols
elif not isinstance(symbols, Series):
symbols = Series(symbols)
sym_list = str.join('+', symbols)
else:
sym_list = str.join('+', symbols)

# for codes see: http://www.gummy-stuff.org/Yahoo-data.htm
codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r',
'time': 't1', 'short_ratio': 's7'}
Expand All @@ -90,7 +105,7 @@ def get_quote_yahoo(symbols):
data = dict(zip(codes.keys(), [[] for i in range(len(codes))]))

urlStr = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % (
str.join('+', symbols), request)
sym_list, request)

try:
lines = urllib2.urlopen(urlStr).readlines()
Expand All @@ -117,22 +132,23 @@ def get_quote_yahoo(symbols):
return DataFrame(data, index=idx)


def get_data_yahoo(name=None, start=None, end=None, retry_count=3, pause=0):
def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
pause=0):
"""
Get historical data for the given name from yahoo.
Date format is datetime

Returns a DataFrame.
"""
start, end = _sanitize_dates(start, end)

if(name is None):
print "Need to provide a name"
if(sym is None):
warnings.warn("Need to provide a name.")
return None

start, end = _sanitize_dates(start, end)

yahoo_URL = 'http://ichart.yahoo.com/table.csv?'

url = yahoo_URL + 's=%s' % name + \
url = yahoo_URL + 's=%s' % sym + \
'&a=%s' % (start.month - 1) + \
'&b=%s' % start.day + \
'&c=%s' % start.year + \
Expand Down Expand Up @@ -162,6 +178,164 @@ def get_data_yahoo(name=None, start=None, end=None, retry_count=3, pause=0):
"return a 200 for url %s" % (pause, url))


def _adjust_prices(hist_data, price_list=('Open', 'High', 'Low', 'Close')):
    """
    Return modified DataFrame or Panel with adjusted prices based on
    'Adj Close' price. Adds 'Adj_Ratio' column.

    Parameters
    ----------
    hist_data : DataFrame or Panel
        Must contain 'Adj Close', 'Close' and every item in price_list.
        It is not modified; a copy is adjusted and returned.
    price_list : iterable of str
        Names of the price items to scale by the adjustment ratio.

    Returns
    -------
    Copy of hist_data in which each item of price_list is multiplied by
    ``'Adj Close' / 'Close'``, with that ratio stored under 'Adj_Ratio'
    and 'Adj Close' removed.
    """
    # The default is a tuple rather than a list so the shared default
    # object can never be mutated between calls.
    adj_ratio = hist_data['Adj Close'] / hist_data['Close']

    data = hist_data.copy()
    for item in price_list:
        # Read from the untouched input so that scaling 'Close' does not
        # depend on the order of items in price_list.
        data[item] = hist_data[item] * adj_ratio
    data['Adj_Ratio'] = adj_ratio
    del data['Adj Close']
    return data


def _calc_return_index(price_df):
    """
    Return a returns index from a input price df or series.

    The result is the cumulative product of ``1 + pct_change()``; its first
    row (NaN after ``pct_change``) is set to 1 so the index starts at 1.

    Parameters
    ----------
    price_df : DataFrame or Series of prices

    Returns
    -------
    Object of the same type and shape as price_df.
    """
    ret_index = price_df.pct_change().add(1).cumprod()
    # Use strictly positional access: ``.ix[0]`` falls back to label lookup
    # on integer-labelled indexes and would then target the wrong row.
    # Skip the assignment for empty input, which has no first row.
    if len(ret_index) > 0:
        ret_index.iloc[0] = 1
    return ret_index


def get_components_yahoo(idx_sym):
    """
    Returns DataFrame containing list of component information for
    index represented in idx_sym from yahoo. Includes component symbol
    (ticker), exchange, and name.

    Parameters
    ----------
    idx_sym : str
        Stock index symbol
        Examples:
        '^DJI' (Dow Jones Industrial Average)
        '^NYA' (NYSE Composite)
        '^IXIC' (NASDAQ Composite)

        See: http://finance.yahoo.com/indices for other index symbols

    Returns
    -------
    idx_df : DataFrame
        Indexed by 'ticker', with columns 'name' and 'exchange'.
    """
    # NOTE(review): relies on a module-level ``urllib`` import that is not
    # visible in this part of the file -- confirm it exists.
    stats = 'snx'
    #URL of form:
    #http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv
    url = 'http://download.finance.yahoo.com/d/quotes.csv?s={0}&f={1}' \
          '&e=.csv&h={2}'

    idx_mod = idx_sym.replace('^', '@%5E')

    idx_df = DataFrame()
    mask = [True]
    comp_idx = 1

    #LOOP across component index structure,
    #break when no new components are found
    while (True in mask):
        urlStr = url.format(idx_mod, stats, comp_idx)
        response = urllib.urlopen(urlStr)
        try:
            raw = response.read()
        finally:
            # Always release the connection, even if the read fails.
            response.close()

        # Each record is a line of three double-quoted, comma-separated
        # fields; strip the outer quotes and split on the quoted
        # separators.
        lines = raw.strip().strip('"').split('"\r\n"')
        lines = [line.strip().split('","') for line in lines]

        temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange'])
        temp_df = temp_df.drop_duplicates()
        temp_df = temp_df.set_index('ticker')
        # True for tickers that have not been collected yet
        mask = ~temp_df.index.isin(idx_df.index)

        # presumably 'h' is the offset of the first component returned and
        # each response holds up to 50 -- TODO confirm
        comp_idx = comp_idx + 50
        idx_df = idx_df.append(temp_df[mask])

    return idx_df


def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3, pause=0,
                   adjust_price=False, ret_index=False, chunksize=25, **kwargs):
    """
    Returns DataFrame/Panel of historical stock prices from symbols, over date
    range, start to end. To avoid being penalized by Yahoo! Finance servers,
    pauses between downloading 'chunks' of symbols can be specified.

    Parameters
    ----------
    symbols : string, list-like object (list, tuple, Series), or DataFrame
        Single stock symbol (ticker), list-like object of symbols or
        DataFrame with index containing stock symbols.
    start : string, (defaults to '1/1/2010')
        Starting date, timestamp. Parses many different kind of date
        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
    end : string, (defaults to today)
        Ending date, timestamp. Same format as starting date.
    retry_count : int, default 3
        Number of times to retry query request.
    pause : int, default 0
        Time, in seconds, to pause between consecutive queries of chunks. If
        single value given for symbol, represents the pause between retries.
    adjust_price : bool, default False
        If True, adjusts all prices in hist_data ('Open', 'High', 'Low', 'Close')
        based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops
        'Adj Close'.
    ret_index : bool, default False
        If True, includes a simple return index 'Ret_Index' in hist_data.
    chunksize : int, default 25
        Number of symbols to download consecutively before initiating pause.
    name : deprecated keyword
        Old spelling of ``symbols``; emits a FutureWarning.
    chunk : keyword
        Accepted as an alias of ``chunksize`` (``DataReader`` passes it).

    Any other keyword arguments are forwarded to the per-symbol download
    when several symbols are requested.

    Returns
    -------
    hist_data : DataFrame (str) or Panel (list-like object, DataFrame)
    """

    # Remove the aliases from kwargs before anything is forwarded: the
    # per-symbol downloader has no 'name' or 'chunk' parameter, so leaving
    # them in made every multi-symbol download fail and get skipped.
    if 'name' in kwargs:
        warnings.warn("Arg 'name' is deprecated, please use 'symbols' instead.",
                      FutureWarning)
        symbols = kwargs.pop('name')
    chunksize = kwargs.pop('chunk', chunksize)

    def dl_mult_symbols(symbols):
        stocks = {}
        for sym_group in _in_chunks(symbols, chunksize):
            for sym in sym_group:
                try:
                    stocks[sym] = _get_hist_yahoo(sym, start=start,
                                                  end=end,
                                                  retry_count=retry_count,
                                                  **kwargs)
                except Exception:
                    # Best effort: one bad symbol must not abort the whole
                    # download.  ``Exception`` (not a bare except) lets
                    # KeyboardInterrupt/SystemExit through.
                    warnings.warn('Error with sym: ' + str(sym) +
                                  '... skipping.')

            # Pause between chunks, as documented for ``pause``
            time.sleep(pause)

        return Panel(stocks).swapaxes('items', 'minor')

    #If a single symbol, (e.g., 'GOOG')
    if isinstance(symbols, (str, int)):
        sym = symbols
        # Forward the retry settings so they actually take effect
        hist_data = _get_hist_yahoo(sym, start=start, end=end,
                                    retry_count=retry_count, pause=pause)
    #Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT'])
    elif isinstance(symbols, DataFrame):
        hist_data = dl_mult_symbols(Series(symbols.index))
    else:  #Guess a Series
        try:
            hist_data = dl_mult_symbols(symbols)
        except TypeError:
            # e.g. an iterable without len(); materialise it first
            hist_data = dl_mult_symbols(Series(symbols))

    if(ret_index):
        hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close'])
    if(adjust_price):
        hist_data = _adjust_prices(hist_data)

    return hist_data


def get_data_fred(name=None, start=dt.datetime(2010, 1, 1),
end=dt.datetime.today()):
"""
Expand Down
92 changes: 79 additions & 13 deletions pandas/io/tests/test_yahoo.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from pandas.util.py3compat import StringIO, BytesIO
from datetime import datetime
import csv
import os
import sys
import re
import unittest
import pandas.io.data as pd
import nose
from pandas.util.testing import network
from datetime import datetime

from pandas.util.py3compat import StringIO, BytesIO

import pandas as pd
import pandas.io.data as web
from pandas.util.testing import (network, assert_frame_equal,
assert_series_equal,
assert_almost_equal)
from numpy.testing.decorators import slow

import urllib2


Expand All @@ -21,16 +23,16 @@ def test_yahoo(self):
# an exception when DataReader can't get a 200 response from
# yahoo
start = datetime(2010, 1, 1)
end = datetime(2012, 1, 24)
end = datetime(2013, 01, 27)

try:
self.assertEquals(
pd.DataReader("F", 'yahoo', start, end)['Close'][-1],
12.82)
web.DataReader("F", 'yahoo', start, end)['Close'][-1],
13.68)

self.assertRaises(
Exception,
lambda: pd.DataReader("NON EXISTENT TICKER", 'yahoo',
lambda: web.DataReader("NON EXISTENT TICKER", 'yahoo',
start, end))
except urllib2.URLError:
try:
Expand All @@ -40,7 +42,71 @@ def test_yahoo(self):
else:
raise


@slow
@network
def test_get_quote(self):
    # 'GOOG' is requested twice; the first and third rows of the quote
    # frame are expected to be equal.
    tickers = pd.Series(['GOOG', 'AAPL', 'GOOG'])
    quotes = web.get_quote_yahoo(tickers)
    first_row = quotes.ix[0]
    third_row = quotes.ix[2]
    assert_series_equal(first_row, third_row)


@slow
@network
def test_get_components(self):
    # Dow Jones
    dow = web.get_components_yahoo('^DJI')
    assert isinstance(dow, pd.DataFrame)
    assert len(dow) == 30

    # DAX
    dax = web.get_components_yahoo('^GDAXI')
    assert isinstance(dax, pd.DataFrame)
    assert len(dax) == 30
    is_adidas = dax.name.str.contains('adidas', case=False)
    assert dax[is_adidas].index == 'ADS.DE'

    # NASDAQ-100
    ndx = web.get_components_yahoo('^NDX')
    assert isinstance(ndx, pd.DataFrame)
    #assert len(ndx) == 100
    #Usual culprits, should be around for a while
    for ticker in ('AAPL', 'GOOG', 'AMZN'):
        assert ticker in ndx.index

@slow
@network
def test_get_data(self):
    #single symbol
    #http://finance.yahoo.com/q/hp?s=GOOG&a=09&b=08&c=2010&d=09&e=10&f=2010&g=d
    goog = web.get_data_yahoo('GOOG')
    assert goog.Volume.ix['OCT-08-2010'] == 2859200

    # list of symbols
    tickers = ['AAPL', 'AMZN', 'GOOG']
    pan = web.get_data_yahoo(tickers, '2012')
    closes = pan.Close
    ts = closes.GOOG.index[closes.AAPL > closes.GOOG]
    assert ts[0].dayofyear == 96

    # DataFrame of index components as the symbol source
    dow = web.get_components_yahoo('^DJI')
    picks = ['GE', 'MSFT', 'INTC']

    pan = web.get_data_yahoo(dow, 'JAN-01-12', 'JAN-31-12')
    expected = [19.02, 28.23, 25.39]
    result = pan.Close.ix['01-18-12'][picks].tolist()
    assert result == expected

    # same request with prices adjusted by 'Adj Close'
    pan = web.get_data_yahoo(dow, 'JAN-01-12', 'JAN-31-12',
                             adjust_price=True)
    expected = [18.38, 27.45, 24.54]
    result = pan.Close.ix['01-18-12'][picks].tolist()
    assert result == expected

    # return index
    pan = web.get_data_yahoo(dow, '2011', ret_index=True)
    rows = [[1.01757469, 1.01130524, 1.02414183],
            [1.00292912, 1.00770812, 1.01735194],
            [1.00820152, 1.00462487, 1.01320257],
            [1.08025776, 0.99845838, 1.00113165]]

    expected = pd.DataFrame(rows)
    window = pan.Ret_Index.ix['01-18-11':'01-21-11']
    result = window[['GE', 'INTC', 'MSFT']]
    assert_almost_equal(result.values, expected.values)


if __name__ == '__main__':
    # Run this module's tests through nose when executed as a script.
    import nose
    nose_argv = [__file__, '-vvs', '-x', '--pdb', '--pdb-failure']
    nose.runmodule(argv=nose_argv, exit=False)