Skip to content

Commit db05f9a

Browse files
nehalecky authored and wesm committed
EHN: Expand Yahoo finance features, idx components
1 parent b13a1cc commit db05f9a

File tree

1 file changed

+173
-9
lines changed

1 file changed

+173
-9
lines changed

pandas/io/data.py

+173-9
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
44
55
"""
6+
import warnings
67

78
import numpy as np
89
import datetime as dt
@@ -13,7 +14,7 @@
1314
from zipfile import ZipFile
1415
from pandas.util.py3compat import StringIO, BytesIO, bytes_to_str
1516

16-
from pandas import DataFrame, read_csv, concat
17+
from pandas import Panel, DataFrame, Series, read_csv, concat
1718
from pandas.io.parsers import TextParser
1819

1920

@@ -54,7 +55,8 @@ def DataReader(name, data_source=None, start=None, end=None,
5455
start, end = _sanitize_dates(start, end)
5556

5657
if(data_source == "yahoo"):
57-
return get_data_yahoo(name=name, start=start, end=end,
58+
return get_data_yahoo(symbols=name, start=start, end=end,
59+
adjust_price=False, chunk=25,
5860
retry_count=retry_count, pause=pause)
5961
elif(data_source == "fred"):
6062
return get_data_fred(name=name, start=start, end=end)
@@ -73,14 +75,27 @@ def _sanitize_dates(start, end):
7375
return start, end
7476

7577

78+
def _in_chunks(seq, size):
79+
"""
80+
Return sequence in 'chunks' of size defined by size
81+
"""
82+
return (seq[pos:pos + size] for pos in xrange(0, len(seq), size))
83+
84+
7685
def get_quote_yahoo(symbols):
7786
"""
7887
Get current yahoo quote
7988
8089
Returns a DataFrame
8190
"""
82-
if not isinstance(symbols, list):
83-
raise TypeError("symbols must be a list")
91+
if isinstance(symbols, str):
92+
sym_list = symbols
93+
elif not isinstance(symbols, Series):
94+
symbols = Series(symbols)
95+
sym_list = str.join('+', symbols)
96+
else:
97+
sym_list = str.join('+', symbols)
98+
8499
# for codes see: http://www.gummy-stuff.org/Yahoo-data.htm
85100
codes = {'symbol': 's', 'last': 'l1', 'change_pct': 'p2', 'PE': 'r',
86101
'time': 't1', 'short_ratio': 's7'}
@@ -90,7 +105,7 @@ def get_quote_yahoo(symbols):
90105
data = dict(zip(codes.keys(), [[] for i in range(len(codes))]))
91106

92107
urlStr = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % (
93-
str.join('+', symbols), request)
108+
sym_list, request)
94109

95110
try:
96111
lines = urllib2.urlopen(urlStr).readlines()
@@ -117,19 +132,20 @@ def get_quote_yahoo(symbols):
117132
return DataFrame(data, index=idx)
118133

119134

120-
def get_data_yahoo(name=None, start=None, end=None, retry_count=3, pause=0):
135+
def _get_hist_yahoo(name=None, start=None, end=None, retry_count=3,
136+
pause=0):
121137
"""
122138
Get historical data for the given name from yahoo.
123139
Date format is datetime
124140
125141
Returns a DataFrame.
126142
"""
127-
start, end = _sanitize_dates(start, end)
128-
129143
if(name is None):
130-
print "Need to provide a name"
144+
warnings.warn("Need to provide a name.")
131145
return None
132146

147+
start, end = _sanitize_dates(start, end)
148+
133149
yahoo_URL = 'http://ichart.yahoo.com/table.csv?'
134150

135151
url = yahoo_URL + 's=%s' % name + \
@@ -162,6 +178,154 @@ def get_data_yahoo(name=None, start=None, end=None, retry_count=3, pause=0):
162178
"return a 200 for url %s" % (pause, url))
163179

164180

181+
def _adjust_prices(hist_data, price_list=['Open', 'High', 'Low', 'Close']):
182+
"""
183+
Return modifed DataFrame or Panel with adjusted prices based on
184+
'Adj Close' price. Adds 'Adj_Ratio' column.
185+
"""
186+
adj_ratio = hist_data['Adj Close'] / hist_data['Close']
187+
188+
data = hist_data.copy()
189+
for item in price_list:
190+
data[item] = hist_data[item] * adj_ratio
191+
data['Adj_Ratio'] = adj_ratio
192+
del data['Adj Close']
193+
return data
194+
195+
196+
def _calc_return_index(price_df):
197+
"""
198+
Return a returns index from a input price df or series.
199+
"""
200+
201+
ret_index = price_df.pct_change().add(1).cumprod()
202+
ret_index.ix[0] = 1
203+
return ret_index
204+
205+
206+
def get_components_yahoo(idx_sym='^DJI'):
    """
    Returns DataFrame containing list of component information for index
    represented in idx_sym from yahoo. Includes component symbol
    (ticker), exchange, and name.

    Parameters
    ----------
    idx_sym : str
        Index symbol, default '^DJI' (Dow Jones Industrial Average)
        Examples:
        '^NYA' (NYSE Composite)
        '^IXIC' (NASDAQ Composite)

        See: http://finance.yahoo.com/indices for other index symbols

    Returns
    -------
    idx_df : DataFrame
        Indexed by 'ticker', with the remaining two response fields as
        columns; duplicated rows are dropped.
    """
    from contextlib import closing

    stats = 'snx'
    # URL of form:
    # http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv
    url = 'http://download.finance.yahoo.com/d/quotes.csv?s={0}&f={1}' \
          '&e=.csv&h={2}'

    idx_mod = idx_sym.replace('^', '@%5E')

    idx_df = DataFrame()
    mask = [True]
    # NOTE(review): the 'h' query parameter looks like a 1-based offset into
    # the component list, advanced 50 rows per request -- confirm.
    comp_idx = 1

    # Loop across the component index structure;
    # stop once a request returns no tickers that are not already collected.
    while (True in mask):
        urlStr = url.format(idx_mod, stats, comp_idx)
        # Close the HTTP handle as soon as the payload has been read.
        with closing(urllib.urlopen(urlStr)) as response:
            raw = response.read()

        # Rows are separated by CRLF, and every field is double-quoted.
        lines = raw.strip().strip('"').split('"\r\n"')
        lines = [line.strip().split('","') for line in lines]

        temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange'])
        temp_df = temp_df.drop_duplicates()
        temp_df = temp_df.set_index('ticker')
        # True for tickers that have not been seen in earlier requests.
        mask = ~temp_df.index.isin(idx_df.index)

        comp_idx = comp_idx + 50
        idx_df = idx_df.append(temp_df[mask])

    return idx_df
257+
258+
259+
def get_data_yahoo(symbols=None, start=None, end=None, adjust_price=False,
                   ret_index=False, chunk=25, pause=0, **kwargs):
    """
    Returns DataFrame/Panel of historical stock prices from symbols, over date
    range, start to end. To avoid being penalized by Yahoo! Finance servers,
    pauses between downloading 'chunks' of symbols can be specified.

    Parameters
    ----------
    symbols : string, list-like object (list, tuple, Series), DataFrame
        Single stock symbol (ticker), list-like object of symbols or
        DataFrame whose index contains the stock symbols
    start : string, (defaults to '1/1/2010')
        Starting date, timestamp. Parses many different kind of date
        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
    end : string, (defaults to today)
        Ending date, timestamp. Same format as starting date.
    adjust_price : bool, default False
        Adjust all prices in hist_data ('Open', 'High', 'Low', 'Close') via
        'Adj Close' price. Adds 'Adj_Ratio' column and drops 'Adj Close'.
    ret_index: bool, default False
        Include a simple return index 'Ret_Index' in hist_data.
    chunk : int, default 25
        Number of symbols to download consecutively before initiating pause.
    pause : int, default 0
        Time, in seconds, to sleep after each chunk of symbols.
    **kwargs: additional arguments to pass to _get_hist_yahoo

    Returns
    -------
    hist_data : DataFrame (str) or Panel (list-like object, DataFrame)
    """
    def dl_mult_symbols(symbols):
        # Download every symbol, chunk by chunk, skipping those that fail.
        stocks = {}
        for sym_group in _in_chunks(symbols, chunk):
            for sym in sym_group:
                try:
                    stocks[sym] = _get_hist_yahoo(name=sym, start=start,
                                                  end=end, **kwargs)
                except Exception:
                    # Best effort: a failing symbol is reported and skipped.
                    # ``Exception`` (not a bare except) lets KeyboardInterrupt
                    # and SystemExit propagate; %-formatting keeps the
                    # warning itself from raising on a non-string symbol.
                    warnings.warn('Error with sym: %s... skipping.' % (sym,))

            time.sleep(pause)

        return Panel(stocks).swapaxes('items', 'minor')

    # If a scalar (single symbol, e.g. 'GOOG')
    if isinstance(symbols, (str, int)):
        hist_data = _get_hist_yahoo(symbols, start=start, end=end, **kwargs)
    # Multiple symbols, taken from the index of a DataFrame
    elif isinstance(symbols, DataFrame):
        hist_data = dl_mult_symbols(Series(symbols.index))
    else:  # Guess a Series
        try:
            hist_data = dl_mult_symbols(symbols)
        except TypeError:
            # Input that cannot be chunked directly is retried as a Series.
            hist_data = dl_mult_symbols(Series(symbols))

    # The return index is computed before any price adjustment because
    # the adjustment step drops 'Adj Close'.
    if ret_index:
        hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close'])
    if adjust_price:
        hist_data = _adjust_prices(hist_data)

    return hist_data
327+
328+
165329
def get_data_fred(name=None, start=dt.datetime(2010, 1, 1),
166330
end=dt.datetime.today()):
167331
"""

0 commit comments

Comments
 (0)