Skip to content

Commit 5496613

Browse files
committed
Merge pull request #3814 from gliptak/googledata
Implement retrieval of historical finance data from Google Finance
2 parents 241db0d + 0aadb11 commit 5496613

File tree

2 files changed

+190
-0
lines changed

2 files changed

+190
-0
lines changed

pandas/io/data.py

+108
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,10 @@ def DataReader(name, data_source=None, start=None, end=None,
5858
return get_data_yahoo(symbols=name, start=start, end=end,
5959
adjust_price=False, chunk=25,
6060
retry_count=retry_count, pause=pause)
61+
elif(data_source == "google"):
62+
return get_data_google(symbols=name, start=start, end=end,
63+
adjust_price=False, chunk=25,
64+
retry_count=retry_count, pause=pause)
6165
elif(data_source == "fred"):
6266
return get_data_fred(name=name, start=start, end=end)
6367
elif(data_source == "famafrench"):
@@ -132,6 +136,9 @@ def get_quote_yahoo(symbols):
132136
return DataFrame(data, index=idx)
133137

134138

139+
def get_quote_google(symbols):
    """
    Not supported: Google Finance does not expose a quote endpoint
    comparable to the one used by get_quote_yahoo.

    Parameters
    ----------
    symbols : string or array-like object
        Ignored; present only for signature parity with get_quote_yahoo.

    Raises
    ------
    NotImplementedError
        Always raised.
    """
    raise NotImplementedError("Google Finance doesn't have this functionality")
135142
def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
136143
pause=0, **kwargs):
137144
"""
@@ -178,6 +185,41 @@ def _get_hist_yahoo(sym=None, start=None, end=None, retry_count=3,
178185
"return a 200 for url %s" % (pause, url))
179186

180187

188+
def _get_hist_google(sym=None, start=None, end=None, retry_count=3,
                     pause=0, **kwargs):
    """
    Get historical data for the given symbol from Google Finance.

    Parameters
    ----------
    sym : string
        Stock symbol (ticker). If None, a warning is issued and None
        is returned.
    start, end : datetime-like or string
        Date range; passed through _sanitize_dates for defaults/parsing.
    retry_count : int, default 3
        Number of times to retry the HTTP request.
    pause : int, default 0
        Time, in seconds, to sleep between retries.

    Returns
    -------
    DataFrame indexed by date in ascending order, or None if sym is None.

    Raises
    ------
    Exception
        If no 200 response is received after retry_count attempts.
    """
    if sym is None:
        warnings.warn("Need to provide a name.")
        return None

    start, end = _sanitize_dates(start, end)

    google_URL = 'http://www.google.com/finance/historical?'

    # e.g. www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv
    url = google_URL + urllib.urlencode({"q": sym,
                                         "startdate": start.strftime('%b %d, %Y'),
                                         "enddate": end.strftime('%b %d, %Y'),
                                         "output": "csv"})
    for _ in range(retry_count):
        resp = urllib2.urlopen(url)
        if resp.code == 200:
            lines = resp.read()
            # Google serves rows newest-first; reverse so the index ascends.
            rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
                          parse_dates=True)[::-1]

            return rs

        time.sleep(pause)

    # BUG FIX: the message previously interpolated `pause` for "%d tries";
    # the number of attempts made is retry_count.
    raise Exception("after %d tries, Google did not "
                    "return a 200 for url %s" % (retry_count, url))
181223
def _adjust_prices(hist_data, price_list=['Open', 'High', 'Low', 'Close']):
182224
"""
183225
Return modified DataFrame or Panel with adjusted prices based on
@@ -347,6 +389,72 @@ def dl_mult_symbols(symbols):
347389

348390
return hist_data
349391

392+
def get_data_google(symbols=None, start=None, end=None, retry_count=3, pause=0,
                    chunksize=25, **kwargs):
    """
    Returns DataFrame/Panel of historical stock prices from symbols, over date
    range, start to end. To avoid being penalized by Google Finance servers,
    pauses between downloading 'chunks' of symbols can be specified.

    Parameters
    ----------
    symbols : string, array-like object (list, tuple, Series), or DataFrame
        Single stock symbol (ticker), array-like object of symbols or
        DataFrame with index containing stock symbols.
    start : string, (defaults to '1/1/2010')
        Starting date, timestamp. Parses many different kind of date
        representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
    end : string, (defaults to today)
        Ending date, timestamp. Same format as starting date.
    retry_count : int, default 3
        Number of times to retry query request.
    pause : int, default 0
        Time, in seconds, to pause between consecutive queries of chunks. If
        single value given for symbol, represents the pause between retries.
    chunksize : int, default 25
        Number of symbols to download consecutively before initiating pause.

    Returns
    -------
    hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
    """

    def dl_mult_symbols(symbols):
        # Download symbols chunk-by-chunk, sleeping between chunks so we are
        # not throttled by Google's servers.
        stocks = {}
        for sym_group in _in_chunks(symbols, chunksize):
            for sym in sym_group:
                try:
                    stocks[sym] = _get_hist_google(sym, start=start,
                                                   end=end, **kwargs)
                except Exception:
                    # Best-effort: skip symbols that fail to download.
                    # Narrowed from a bare `except:` so KeyboardInterrupt
                    # and SystemExit are no longer swallowed.
                    warnings.warn('Error with sym: ' + sym + '... skipping.')

            time.sleep(pause)

        return Panel(stocks).swapaxes('items', 'minor')

    if 'name' in kwargs:
        warnings.warn("Arg 'name' is deprecated, please use 'symbols' instead.",
                      FutureWarning)
        symbols = kwargs['name']

    # If a single symbol, (e.g., 'GOOG')
    if isinstance(symbols, (str, int)):
        sym = symbols
        hist_data = _get_hist_google(sym, start=start, end=end)
    # Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT'])
    elif isinstance(symbols, DataFrame):
        try:
            hist_data = dl_mult_symbols(Series(symbols.index))
        except ValueError:
            raise
    else:  # Guess a Series
        try:
            hist_data = dl_mult_symbols(symbols)
        except TypeError:
            hist_data = dl_mult_symbols(Series(symbols))

    return hist_data
350458

351459
def get_data_fred(name=None, start=dt.datetime(2010, 1, 1),
352460
end=dt.datetime.today()):

pandas/io/tests/test_google.py

+82
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import unittest
2+
import nose
3+
from datetime import datetime
4+
5+
import pandas as pd
6+
import pandas.io.data as web
7+
from pandas.util.testing import (network, assert_series_equal)
8+
from numpy.testing.decorators import slow
9+
10+
import urllib2
11+
12+
13+
class TestGoogle(unittest.TestCase):
    """Network-dependent tests for the Google Finance reader in pandas.io.data."""

    @network
    def test_google(self):
        # Asserts that google is minimally working and that it throws
        # an exception when DataReader can't get a 200 response from
        # google.
        start = datetime(2010, 1, 1)
        # BUG FIX: was `datetime(2013, 01, 27)` — a leading-zero integer
        # literal, which is a SyntaxError on Python 3.
        end = datetime(2013, 1, 27)

        try:
            self.assertEquals(
                web.DataReader("F", 'google', start, end)['Close'][-1],
                13.68)

            self.assertRaises(
                Exception,
                lambda: web.DataReader("NON EXISTENT TICKER", 'google',
                                       start, end))
        except urllib2.URLError:
            # If google.com itself is unreachable, the failure is a network
            # problem, not a code problem: skip. Otherwise re-raise.
            try:
                urllib2.urlopen('http://www.google.com')
            except urllib2.URLError:
                raise nose.SkipTest
            else:
                raise

    @network
    def test_get_quote(self):
        # Google quotes are unimplemented; the reader must say so explicitly.
        self.assertRaises(NotImplementedError,
                          lambda: web.get_quote_google(pd.Series(['GOOG', 'AAPL', 'GOOG'])))

    @network
    def test_get_data(self):
        import numpy as np
        # Single symbol -> DataFrame.
        df = web.get_data_google('GOOG')
        print(df.Volume.ix['OCT-08-2010'])
        assert df.Volume.ix['OCT-08-2010'] == 2863473

        # Multiple symbols -> Panel.
        sl = ['AAPL', 'AMZN', 'GOOG']
        pan = web.get_data_google(sl, '2012')
        ts = pan.Close.GOOG.index[pan.Close.AAPL > pan.Close.GOOG]
        assert ts[0].dayofyear == 96

        pan = web.get_data_google(['GE', 'MSFT', 'INTC'], 'JAN-01-12', 'JAN-31-12')
        expected = [19.02, 28.23, 25.39]
        result = pan.Close.ix['01-18-12'][['GE', 'MSFT', 'INTC']].tolist()
        assert result == expected

        # Sanity checking: floating dtype with the expected shape.
        t = np.array(result)
        assert np.issubdtype(t.dtype, np.floating)
        assert t.shape == (3,)

        expected = [[18.99, 28.4, 25.18],
                    [18.58, 28.31, 25.13],
                    [19.03, 28.16, 25.52],
                    [18.81, 28.82, 25.87]]
        result = pan.Open.ix['Jan-15-12':'Jan-20-12'][['GE', 'MSFT', 'INTC']].values
        assert (result == expected).all()

        # Sanity checking
        t = np.array(pan)
        assert np.issubdtype(t.dtype, np.floating)
79+
80+
if __name__ == '__main__':
    # Run this module's tests under nose: verbose, stop on first failure,
    # and drop into pdb on errors/failures.
    nose_argv = [__file__, '-vvs', '-x', '--pdb', '--pdb-failure']
    nose.runmodule(argv=nose_argv, exit=False)

0 commit comments

Comments
 (0)