One file per datareader #59

Closed
wants to merge 12 commits into from
Changes from 6 commits
1,255 changes: 13 additions & 1,242 deletions pandas_datareader/data.py

Large diffs are not rendered by default.

Empty file.
43 changes: 43 additions & 0 deletions pandas_datareader/datareaders/famafrench.py
@@ -0,0 +1,43 @@
import tempfile
import numpy as np
from pandas.io.common import urlopen, ZipFile
from pandas.compat import lmap
from pandas import DataFrame

_FAMAFRENCH_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp'


def get_data_famafrench(name):
# path of zip files
zip_file_path = '{0}/{1}_TXT.zip'.format(_FAMAFRENCH_URL, name)

with urlopen(zip_file_path) as url:
raw = url.read()

with tempfile.TemporaryFile() as tmpf:
tmpf.write(raw)

with ZipFile(tmpf, 'r') as zf:
data = zf.open(zf.namelist()[0]).readlines()

line_lengths = np.array(lmap(len, data))
file_edges = np.where(line_lengths == 2)[0]

datasets = {}
edges = zip(file_edges + 1, file_edges[1:])
for i, (left_edge, right_edge) in enumerate(edges):
dataset = [d.split() for d in data[left_edge:right_edge]]
if len(dataset) > 10:
ncol_raw = np.array(lmap(len, dataset))
ncol = np.median(ncol_raw)
header_index = np.where(ncol_raw == ncol - 1)[0][-1]
header = dataset[header_index]
ds_header = dataset[header_index + 1:]
# to ensure the header is unique
header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header,
start=1)]
index = np.array([d[0] for d in ds_header], dtype=int)
dataset = np.array([d[1:] for d in ds_header], dtype=float)
datasets[i] = DataFrame(dataset, index, columns=header)

return datasets
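
A minimal usage sketch of the reader above, for review only (the dataset name 'F-F_Research_Data_Factors' is an illustrative Fama/French file name, not something added by this diff):

from pandas_datareader.datareaders.famafrench import get_data_famafrench

# Returns a dict keyed by the position of each table found in the zipped
# text file; values are DataFrames with enumerated, de-duplicated headers.
datasets = get_data_famafrench('F-F_Research_Data_Factors')
print(datasets[0].head())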
46 changes: 46 additions & 0 deletions pandas_datareader/datareaders/fred.py
@@ -0,0 +1,46 @@
import datetime as dt
from pandas.core.common import is_list_like
from pandas.io.common import urlopen
from pandas import concat, read_csv

from pandas_datareader.date_chunks import _sanitize_dates

_FRED_URL = "http://research.stlouisfed.org/fred2/series/"


def get_data_fred(name, start=dt.datetime(2010, 1, 1),
end=dt.datetime.today()):
"""
Get data for the given name from the St. Louis FED (FRED).
Date format is datetime

Returns a DataFrame.

If multiple names are passed for "series" then the index of the
DataFrame is the outer join of the indices of each series.
"""
start, end = _sanitize_dates(start, end)

if not is_list_like(name):
names = [name]
else:
names = name

urls = ['{0}{1}/downloaddata/{1}.csv'.format(_FRED_URL, n)
for n in names]

def fetch_data(url, name):
with urlopen(url) as resp:
data = read_csv(resp, index_col=0, parse_dates=True,
header=None, skiprows=1, names=["DATE", name],
na_values='.')
try:
return data.truncate(start, end)
except KeyError:
if data.ix[3].name[7:12] == 'Error':
raise IOError("Failed to get the data. Check that {0!r} is "
"a valid FRED series.".format(name))
raise
df = concat([fetch_data(url, n) for url, n in zip(urls, names)],
axis=1, join='outer')
return df
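
A quick sketch of the intended call pattern (the 'GDP' and 'CPIAUCSL' series names are examples, not part of this diff); passing a list returns the outer join described in the docstring:

import datetime as dt
from pandas_datareader.datareaders.fred import get_data_fred

start, end = dt.datetime(2010, 1, 1), dt.datetime(2013, 1, 27)
gdp = get_data_fred('GDP', start, end)                   # single series
both = get_data_fred(['GDP', 'CPIAUCSL'], start, end)    # outer join of indices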
34 changes: 34 additions & 0 deletions pandas_datareader/datareaders/google/__init__.py
@@ -0,0 +1,34 @@
from pandas_datareader.shared import _get_data_from
from pandas_datareader.datareaders.google.daily import _get_hist_google

def get_data_google(symbols=None, start=None, end=None, retry_count=3,
pause=0.001, chunksize=25):
"""
Returns DataFrame/Panel of historical stock prices from symbols, over date
range, start to end. To avoid being penalized by Google Finance servers,
pauses between downloading 'chunks' of symbols can be specified.

Parameters
----------
symbols : string, array-like object (list, tuple, Series), or DataFrame
Single stock symbol (ticker), array-like object of symbols or
DataFrame with index containing stock symbols.
start : string, (defaults to '1/1/2010')
Starting date, timestamp. Parses many different kinds of date
representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
end : string, (defaults to today)
Ending date, timestamp. Same format as starting date.
retry_count : int, default 3
Number of times to retry query request.
pause : float, default 0.001
Time, in seconds, to pause between consecutive queries of chunks. If
single value given for symbol, represents the pause between retries.
chunksize : int, default 25
Number of symbols to download consecutively before initiating pause.

Returns
-------
hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
"""
return _get_data_from(symbols, start, end, None, retry_count, pause,
chunksize, _get_hist_google)
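
A usage sketch assuming the module layout above (tickers and dates are illustrative):

from pandas_datareader.datareaders.google import get_data_google

# A single ticker returns a DataFrame; a list of tickers returns a Panel.
goog = get_data_google('GOOG', start='1/1/2013', end='1/1/2014')
panel = get_data_google(['AAPL', 'MSFT'], start='1/1/2013', end='1/1/2014')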
23 changes: 23 additions & 0 deletions pandas_datareader/datareaders/google/daily.py
@@ -0,0 +1,23 @@
from pandas.io.common import urlencode
from pandas_datareader.url_request import _retry_read_url
from pandas_datareader.date_chunks import _sanitize_dates

_HISTORICAL_GOOGLE_URL = 'http://www.google.com/finance/historical?'


def _get_hist_google(sym, start, end, interval, retry_count, pause):
"""
Get historical data for the given name from Google.
Date format is datetime

Returns a DataFrame.
"""
start, end = _sanitize_dates(start, end)

# www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv
url = "%s%s" % (_HISTORICAL_GOOGLE_URL,
urlencode({"q": sym,
"startdate": start.strftime('%b %d, ' '%Y'),
"enddate": end.strftime('%b %d, %Y'),
"output": "csv"}))
return _retry_read_url(url, retry_count, pause, 'Google')
10 changes: 10 additions & 0 deletions pandas_datareader/datareaders/google/quotes.py
@@ -0,0 +1,10 @@
def get_quote_google(symbols):
"""
Get current Google quote

(Should) return a DataFrame

ToDo: Not implemented
"""
msg = "Google Finance doesn't have this functionality - can't get quote for %s" % symbols
raise NotImplementedError(msg)
88 changes: 88 additions & 0 deletions pandas_datareader/datareaders/yahoo/__init__.py
@@ -0,0 +1,88 @@
from pandas_datareader.shared import _get_data_from
from pandas_datareader.datareaders.yahoo.daily import _get_hist_yahoo

def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3,
pause=0.001, adjust_price=False, ret_index=False,
chunksize=25, interval='d'):
"""
Returns DataFrame/Panel of historical stock prices from symbols, over date
range, start to end. To avoid being penalized by Yahoo! Finance servers,
pauses between downloading 'chunks' of symbols can be specified.

Parameters
----------
symbols : string, array-like object (list, tuple, Series), or DataFrame
Single stock symbol (ticker), array-like object of symbols or
DataFrame with index containing stock symbols.
start : string, (defaults to '1/1/2010')
Starting date, timestamp. Parses many different kinds of date
representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
end : string, (defaults to today)
Ending date, timestamp. Same format as starting date.
retry_count : int, default 3
Number of times to retry query request.
pause : float, default 0.001
Time, in seconds, to pause between consecutive queries of chunks. If
single value given for symbol, represents the pause between retries.
adjust_price : bool, default False
If True, adjusts all prices in hist_data ('Open', 'High', 'Low',
'Close') based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops
'Adj Close'.
ret_index : bool, default False
If True, includes a simple return index 'Ret_Index' in hist_data.
chunksize : int, default 25
Number of symbols to download consecutively before initiating pause.
interval : string, default 'd'
Time interval code, valid values are 'd' for daily, 'w' for weekly,
'm' for monthly and 'v' for dividend.

Returns
-------
hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
"""
if interval not in ['d', 'w', 'm', 'v']:
raise ValueError("Invalid interval: valid values are 'd', 'w', 'm' and 'v'")
hist_data = _get_data_from(symbols, start, end, interval, retry_count, pause,
chunksize, _get_hist_yahoo)
if ret_index:
hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close'])
if adjust_price:
hist_data = _adjust_prices(hist_data)
return hist_data

def _adjust_prices(hist_data, price_list=None):
"""
Return modified DataFrame or Panel with adjusted prices based on
'Adj Close' price. Adds 'Adj_Ratio' column.
"""
if price_list is None:
price_list = 'Open', 'High', 'Low', 'Close'
adj_ratio = hist_data['Adj Close'] / hist_data['Close']

data = hist_data.copy()
for item in price_list:
data[item] = hist_data[item] * adj_ratio
data['Adj_Ratio'] = adj_ratio
del data['Adj Close']
return data


def _calc_return_index(price_df):
"""
Return a returns index from an input price DataFrame or Series. Initial value
(typically NaN) is set to 1.
"""
df = price_df.pct_change().add(1).cumprod()
mask = df.ix[1].notnull() & df.ix[0].isnull()
df.ix[0][mask] = 1

# Check for first stock listings after starting date of index in ret_index
# If True, find first_valid_index and set previous entry to 1.
if (~mask).any():
for sym in mask.index[~mask]:
tstamp = df[sym].first_valid_index()
t_idx = df.index.get_loc(tstamp) - 1
df[sym].ix[t_idx] = 1

return df
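
To make the adjust_price/ret_index options easier to review, a small sketch of the intended call (ticker and dates are illustrative):

from pandas_datareader.datareaders.yahoo import get_data_yahoo

# Weekly data with prices rescaled by 'Adj Close' and a cumulative return index;
# the result gains 'Adj_Ratio' and 'Ret_Index' columns and drops 'Adj Close'.
ibm = get_data_yahoo('IBM', start='1/1/2012', end='1/1/2014',
                     adjust_price=True, ret_index=True, interval='w')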

82 changes: 82 additions & 0 deletions pandas_datareader/datareaders/yahoo/actions.py
@@ -0,0 +1,82 @@
import time
import csv
from pandas import to_datetime, DataFrame
from pandas.io.common import urlopen
from pandas.util.testing import _network_error_classes
from pandas.compat import StringIO, bytes_to_str
from pandas_datareader.date_chunks import _sanitize_dates

_HISTORICAL_YAHOO_ACTIONS_URL = 'http://ichart.finance.yahoo.com/x?'


def get_data_yahoo_actions(symbol, start=None, end=None, retry_count=3, pause=0.001):
"""
Returns DataFrame of historical corporate actions (dividends and stock
splits) from symbols, over date range, start to end. All dates in the
resulting DataFrame correspond with dividend and stock split ex-dates.

Parameters
----------
symbol : string, a single stock symbol (ticker).
start : string, (defaults to '1/1/2010')
Starting date, timestamp. Parses many different kinds of date
representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
end : string, (defaults to today)
Ending date, timestamp. Same format as starting date.
retry_count : int, default 3
Number of times to retry query request.
pause : float, default 0.001
Time, in seconds, of the pause between retries.
"""

start, end = _sanitize_dates(start, end)
url = (_HISTORICAL_YAHOO_ACTIONS_URL + 's=%s' % symbol + \
'&a=%s' % (start.month - 1) + \
'&b=%s' % start.day + \
'&c=%s' % start.year + \
'&d=%s' % (end.month - 1) + \
'&e=%s' % end.day + \
'&f=%s' % end.year + \
'&g=v')

for _ in range(retry_count):
time.sleep(pause)

try:
with urlopen(url) as resp:
lines = resp.read()
except _network_error_classes:
pass
else:
actions_index = []
actions_entries = []

for line in csv.reader(StringIO(bytes_to_str(lines))):
# Ignore lines that aren't dividends or splits (Yahoo
# adds a bunch of irrelevant fields.)
if len(line) != 3 or line[0] not in ('DIVIDEND', 'SPLIT'):
continue

action, date, value = line
if action == 'DIVIDEND':
actions_index.append(to_datetime(date))
actions_entries.append({
'action': action,
'value': float(value)
})
elif action == 'SPLIT' and ':' in value:
# Convert the split ratio to a fraction. For example a
# 4:1 split expressed as a fraction is 1/4 = 0.25.
denominator, numerator = value.split(':', 1)
split_fraction = float(numerator) / float(denominator)

actions_index.append(to_datetime(date))
actions_entries.append({
'action': action,
'value': split_fraction
})

return DataFrame(actions_entries, index=actions_index)

raise IOError("after %d tries, Yahoo! did not " \
"return a 200 for url %r" % (retry_count, url))
57 changes: 57 additions & 0 deletions pandas_datareader/datareaders/yahoo/components.py
@@ -0,0 +1,57 @@
from pandas import DataFrame
from pandas.io.common import urlopen

_YAHOO_COMPONENTS_URL = 'http://download.finance.yahoo.com/d/quotes.csv?'


def get_components_yahoo(idx_sym):
"""
Returns DataFrame containing list of component information for
index represented in idx_sym from yahoo. Includes component symbol
(ticker), exchange, and name.

Parameters
----------
idx_sym : str
Stock index symbol
Examples:
'^DJI' (Dow Jones Industrial Average)
'^NYA' (NYSE Composite)
'^IXIC' (NASDAQ Composite)

See: http://finance.yahoo.com/indices for other index symbols

Returns
-------
idx_df : DataFrame
"""
stats = 'snx'
# URL of form:
# http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv
url = _YAHOO_COMPONENTS_URL + 's={0}&f={1}&e=.csv&h={2}'

idx_mod = idx_sym.replace('^', '@%5E')
url_str = url.format(idx_mod, stats, 1)

idx_df = DataFrame()
mask = [True]
comp_idx = 1

# LOOP across component index structure,
# break when no new components are found
while True in mask:
url_str = url.format(idx_mod, stats, comp_idx)
with urlopen(url_str) as resp:
raw = resp.read()
lines = raw.decode('utf-8').strip().strip('"').split('"\r\n"')
lines = [line.strip().split('","') for line in lines]

temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange'])
temp_df = temp_df.drop_duplicates()
temp_df = temp_df.set_index('ticker')
mask = ~temp_df.index.isin(idx_df.index)

comp_idx = comp_idx + 50
idx_df = idx_df.append(temp_df[mask])

return idx_df
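
And a sketch of the components reader in use (the index symbol is illustrative; availability of the Yahoo! endpoint is outside this diff):

from pandas_datareader.datareaders.yahoo.components import get_components_yahoo

# DataFrame indexed by ticker with 'name' and 'exchange' columns.
dow = get_components_yahoo('^DJI')
print(dow.head())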