One file per datareader #59

Closed
wants to merge 12 commits into from
Changes from 6 commits
1,255 changes: 13 additions & 1,242 deletions pandas_datareader/data.py

Large diffs are not rendered by default.

Empty file.
43 changes: 43 additions & 0 deletions pandas_datareader/datareaders/famafrench.py
@@ -0,0 +1,43 @@
import tempfile
import numpy as np
from pandas.io.common import urlopen, ZipFile
from pandas.compat import lmap
from pandas import DataFrame

_FAMAFRENCH_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp'


def get_data_famafrench(name):
# path of zip files
zip_file_path = '{0}/{1}_TXT.zip'.format(_FAMAFRENCH_URL, name)

with urlopen(zip_file_path) as url:
raw = url.read()

with tempfile.TemporaryFile() as tmpf:
tmpf.write(raw)

with ZipFile(tmpf, 'r') as zf:
data = zf.open(zf.namelist()[0]).readlines()

line_lengths = np.array(lmap(len, data))
file_edges = np.where(line_lengths == 2)[0]

datasets = {}
edges = zip(file_edges + 1, file_edges[1:])
for i, (left_edge, right_edge) in enumerate(edges):
dataset = [d.split() for d in data[left_edge:right_edge]]
if len(dataset) > 10:
ncol_raw = np.array(lmap(len, dataset))
ncol = np.median(ncol_raw)
header_index = np.where(ncol_raw == ncol - 1)[0][-1]
header = dataset[header_index]
ds_header = dataset[header_index + 1:]
# to ensure the header is unique
header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header,
start=1)]
index = np.array([d[0] for d in ds_header], dtype=int)
dataset = np.array([d[1:] for d in ds_header], dtype=float)
datasets[i] = DataFrame(dataset, index, columns=header)

return datasets
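
A minimal usage sketch of the reader above, for review only (the dataset name 'F-F_Research_Data_Factors' is an illustrative Fama/French file name, not something added by this diff):

from pandas_datareader.datareaders.famafrench import get_data_famafrench

# Returns a dict keyed by the position of each table found in the zipped
# text file; values are DataFrames with enumerated, de-duplicated headers.
datasets = get_data_famafrench('F-F_Research_Data_Factors')
print(datasets[0].head())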
46 changes: 46 additions & 0 deletions pandas_datareader/datareaders/fred.py
@@ -0,0 +1,46 @@
import datetime as dt
from pandas.core.common import is_list_like
from pandas.io.common import urlopen
from pandas import concat, read_csv

from pandas_datareader.date_chunks import _sanitize_dates

_FRED_URL = "http://research.stlouisfed.org/fred2/series/"


def get_data_fred(name, start=dt.datetime(2010, 1, 1),
end=dt.datetime.today()):
"""
Get data for the given name from the St. Louis FED (FRED).
Date format is datetime

Returns a DataFrame.

If multiple names are passed for "series" then the index of the
DataFrame is the outer join of the indices of each series.
"""
start, end = _sanitize_dates(start, end)

if not is_list_like(name):
names = [name]
else:
names = name

urls = ['{0}{1}/downloaddata/{1}.csv'.format(_FRED_URL, n)
for n in names]

def fetch_data(url, name):
with urlopen(url) as resp:
data = read_csv(resp, index_col=0, parse_dates=True,
header=None, skiprows=1, names=["DATE", name],
na_values='.')
try:
return data.truncate(start, end)
except KeyError:
if data.ix[3].name[7:12] == 'Error':
raise IOError("Failed to get the data. Check that {0!r} is "
"a valid FRED series.".format(name))
raise
df = concat([fetch_data(url, n) for url, n in zip(urls, names)],
axis=1, join='outer')
return df
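
A quick sketch of the intended call pattern (the 'GDP' and 'CPIAUCSL' series names are examples, not part of this diff); passing a list returns the outer join described in the docstring:

import datetime as dt
from pandas_datareader.datareaders.fred import get_data_fred

start, end = dt.datetime(2010, 1, 1), dt.datetime(2013, 1, 27)
gdp = get_data_fred('GDP', start, end)                   # single series
both = get_data_fred(['GDP', 'CPIAUCSL'], start, end)    # outer join of indices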
34 changes: 34 additions & 0 deletions pandas_datareader/datareaders/google/__init__.py
@@ -0,0 +1,34 @@
from pandas_datareader.shared import _get_data_from
from pandas_datareader.datareaders.google.daily import _get_hist_google

def get_data_google(symbols=None, start=None, end=None, retry_count=3,
pause=0.001, chunksize=25):
"""
Returns DataFrame/Panel of historical stock prices from symbols, over date
range, start to end. To avoid being penalized by Google Finance servers,
pauses between downloading 'chunks' of symbols can be specified.

Parameters
----------
symbols : string, array-like object (list, tuple, Series), or DataFrame
Single stock symbol (ticker), array-like object of symbols or
DataFrame with index containing stock symbols.
start : string, (defaults to '1/1/2010')
Starting date, timestamp. Parses many different kinds of date
representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
end : string, (defaults to today)
Ending date, timestamp. Same format as starting date.
retry_count : int, default 3
Number of times to retry query request.
pause : float, default 0.001
Time, in seconds, to pause between consecutive queries of chunks. If
single value given for symbol, represents the pause between retries.
chunksize : int, default 25
Number of symbols to download consecutively before initiating pause.

Returns
-------
hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
"""
return _get_data_from(symbols, start, end, None, retry_count, pause,
chunksize, _get_hist_google)
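
A usage sketch assuming the module layout above (tickers and dates are illustrative):

from pandas_datareader.datareaders.google import get_data_google

# A single ticker returns a DataFrame; a list of tickers returns a Panel.
goog = get_data_google('GOOG', start='1/1/2013', end='1/1/2014')
panel = get_data_google(['AAPL', 'MSFT'], start='1/1/2013', end='1/1/2014')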
23 changes: 23 additions & 0 deletions pandas_datareader/datareaders/google/daily.py
@@ -0,0 +1,23 @@
from pandas.io.common import urlencode
from pandas_datareader.url_request import _retry_read_url
from pandas_datareader.date_chunks import _sanitize_dates

_HISTORICAL_GOOGLE_URL = 'http://www.google.com/finance/historical?'


def _get_hist_google(sym, start, end, interval, retry_count, pause):
"""
Get historical data for the given name from Google.
Date format is datetime

Returns a DataFrame.
"""
start, end = _sanitize_dates(start, end)

# www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv
url = "%s%s" % (_HISTORICAL_GOOGLE_URL,
urlencode({"q": sym,
"startdate": start.strftime('%b %d, ' '%Y'),
"enddate": end.strftime('%b %d, %Y'),
"output": "csv"}))
return _retry_read_url(url, retry_count, pause, 'Google')
10 changes: 10 additions & 0 deletions pandas_datareader/datareaders/google/quotes.py
@@ -0,0 +1,10 @@
def get_quote_google(symbols):
"""
Get current Google quote

(Should) return a DataFrame

ToDo: Not implemented
"""
msg = "Google Finance doesn't have this functionality - can't get quote for %s" % symbols
raise NotImplementedError(msg)
88 changes: 88 additions & 0 deletions pandas_datareader/datareaders/yahoo/__init__.py
@@ -0,0 +1,88 @@
from pandas_datareader.shared import _get_data_from
from pandas_datareader.datareaders.yahoo.daily import _get_hist_yahoo

def get_data_yahoo(symbols=None, start=None, end=None, retry_count=3,
pause=0.001, adjust_price=False, ret_index=False,
chunksize=25, interval='d'):
"""
Returns DataFrame/Panel of historical stock prices from symbols, over date
range, start to end. To avoid being penalized by Yahoo! Finance servers,
pauses between downloading 'chunks' of symbols can be specified.

Parameters
----------
symbols : string, array-like object (list, tuple, Series), or DataFrame
Single stock symbol (ticker), array-like object of symbols or
DataFrame with index containing stock symbols.
start : string, (defaults to '1/1/2010')
Starting date, timestamp. Parses many different kinds of date
representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
end : string, (defaults to today)
Ending date, timestamp. Same format as starting date.
retry_count : int, default 3
Number of times to retry query request.
pause : float, default 0.001
Time, in seconds, to pause between consecutive queries of chunks. If
single value given for symbol, represents the pause between retries.
adjust_price : bool, default False
If True, adjusts all prices in hist_data ('Open', 'High', 'Low',
'Close') based on 'Adj Close' price. Adds 'Adj_Ratio' column and drops
'Adj Close'.
ret_index : bool, default False
If True, includes a simple return index 'Ret_Index' in hist_data.
chunksize : int, default 25
Number of symbols to download consecutively before initiating pause.
interval : string, default 'd'
Time interval code, valid values are 'd' for daily, 'w' for weekly,
'm' for monthly and 'v' for dividend.

Returns
-------
hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
"""
if interval not in ['d', 'w', 'm', 'v']:
raise ValueError("Invalid interval: valid values are 'd', 'w', 'm' and 'v'")
hist_data = _get_data_from(symbols, start, end, interval, retry_count, pause,
chunksize, _get_hist_yahoo)
if ret_index:
hist_data['Ret_Index'] = _calc_return_index(hist_data['Adj Close'])
if adjust_price:
hist_data = _adjust_prices(hist_data)
return hist_data

def _adjust_prices(hist_data, price_list=None):
"""
Return modified DataFrame or Panel with adjusted prices based on
'Adj Close' price. Adds 'Adj_Ratio' column.
"""
if price_list is None:
price_list = 'Open', 'High', 'Low', 'Close'
adj_ratio = hist_data['Adj Close'] / hist_data['Close']

data = hist_data.copy()
for item in price_list:
data[item] = hist_data[item] * adj_ratio
data['Adj_Ratio'] = adj_ratio
del data['Adj Close']
return data


def _calc_return_index(price_df):
"""
Return a returns index from an input price DataFrame or Series. Initial value
(typically NaN) is set to 1.
"""
df = price_df.pct_change().add(1).cumprod()
mask = df.ix[1].notnull() & df.ix[0].isnull()
df.ix[0][mask] = 1

# Check for first stock listings after starting date of index in ret_index
# If True, find first_valid_index and set previous entry to 1.
if (~mask).any():
for sym in mask.index[~mask]:
tstamp = df[sym].first_valid_index()
t_idx = df.index.get_loc(tstamp) - 1
df[sym].ix[t_idx] = 1

return df
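
To make the adjust_price/ret_index options easier to review, a small sketch of the intended call (ticker and dates are illustrative):

from pandas_datareader.datareaders.yahoo import get_data_yahoo

# Weekly data with prices rescaled by 'Adj Close' and a cumulative return index;
# the result gains 'Adj_Ratio' and 'Ret_Index' columns and drops 'Adj Close'.
ibm = get_data_yahoo('IBM', start='1/1/2012', end='1/1/2014',
                     adjust_price=True, ret_index=True, interval='w')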

82 changes: 82 additions & 0 deletions pandas_datareader/datareaders/yahoo/actions.py
@@ -0,0 +1,82 @@
import time
import csv
from pandas import to_datetime, DataFrame
from pandas.io.common import urlopen
from pandas.util.testing import _network_error_classes
from pandas.compat import StringIO, bytes_to_str
from pandas_datareader.date_chunks import _sanitize_dates

_HISTORICAL_YAHOO_ACTIONS_URL = 'http://ichart.finance.yahoo.com/x?'


def get_data_yahoo_actions(symbol, start=None, end=None, retry_count=3, pause=0.001):
"""
Returns DataFrame of historical corporate actions (dividends and stock
splits) from symbols, over date range, start to end. All dates in the
resulting DataFrame correspond with dividend and stock split ex-dates.

Parameters
----------
symbol : string, a single stock symbol (ticker).
start : string, (defaults to '1/1/2010')
Starting date, timestamp. Parses many different kinds of date
representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
end : string, (defaults to today)
Ending date, timestamp. Same format as starting date.
retry_count : int, default 3
Number of times to retry query request.
pause : float, default 0.001
Time, in seconds, of the pause between retries.
"""

start, end = _sanitize_dates(start, end)
url = (_HISTORICAL_YAHOO_ACTIONS_URL + 's=%s' % symbol + \
'&a=%s' % (start.month - 1) + \
'&b=%s' % start.day + \
'&c=%s' % start.year + \
'&d=%s' % (end.month - 1) + \
'&e=%s' % end.day + \
'&f=%s' % end.year + \
'&g=v')

for _ in range(retry_count):
time.sleep(pause)

try:
with urlopen(url) as resp:
lines = resp.read()
except _network_error_classes:
pass
else:
actions_index = []
actions_entries = []

for line in csv.reader(StringIO(bytes_to_str(lines))):
# Ignore lines that aren't dividends or splits (Yahoo
# adds a bunch of irrelevant fields.)
if len(line) != 3 or line[0] not in ('DIVIDEND', 'SPLIT'):
continue

action, date, value = line
if action == 'DIVIDEND':
actions_index.append(to_datetime(date))
actions_entries.append({
'action': action,
'value': float(value)
})
elif action == 'SPLIT' and ':' in value:
# Convert the split ratio to a fraction. For example a
# 4:1 split expressed as a fraction is 1/4 = 0.25.
denominator, numerator = value.split(':', 1)
split_fraction = float(numerator) / float(denominator)

actions_index.append(to_datetime(date))
actions_entries.append({
'action': action,
'value': split_fraction
})

return DataFrame(actions_entries, index=actions_index)

raise IOError("after %d tries, Yahoo! did not " \
"return a 200 for url %r" % (retry_count, url))
57 changes: 57 additions & 0 deletions pandas_datareader/datareaders/yahoo/components.py
@@ -0,0 +1,57 @@
from pandas import DataFrame
from pandas.io.common import urlopen

_YAHOO_COMPONENTS_URL = 'http://download.finance.yahoo.com/d/quotes.csv?'


def get_components_yahoo(idx_sym):
"""
Returns DataFrame containing list of component information for
index represented in idx_sym from yahoo. Includes component symbol
(ticker), exchange, and name.

Parameters
----------
idx_sym : str
Stock index symbol
Examples:
'^DJI' (Dow Jones Industrial Average)
'^NYA' (NYSE Composite)
'^IXIC' (NASDAQ Composite)

See: http://finance.yahoo.com/indices for other index symbols

Returns
-------
idx_df : DataFrame
"""
stats = 'snx'
# URL of form:
# http://download.finance.yahoo.com/d/quotes.csv?s=@%5EIXIC&f=snxl1d1t1c1ohgv
url = _YAHOO_COMPONENTS_URL + 's={0}&f={1}&e=.csv&h={2}'

idx_mod = idx_sym.replace('^', '@%5E')
url_str = url.format(idx_mod, stats, 1)

idx_df = DataFrame()
mask = [True]
comp_idx = 1

# LOOP across component index structure,
# break when no new components are found
while True in mask:
url_str = url.format(idx_mod, stats, comp_idx)
with urlopen(url_str) as resp:
raw = resp.read()
lines = raw.decode('utf-8').strip().strip('"').split('"\r\n"')
lines = [line.strip().split('","') for line in lines]

temp_df = DataFrame(lines, columns=['ticker', 'name', 'exchange'])
temp_df = temp_df.drop_duplicates()
temp_df = temp_df.set_index('ticker')
mask = ~temp_df.index.isin(idx_df.index)

comp_idx = comp_idx + 50
idx_df = idx_df.append(temp_df[mask])

return idx_df
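
And a sketch of the components reader in use (the index symbol is illustrative; availability of the Yahoo! endpoint is outside this diff):

from pandas_datareader.datareaders.yahoo.components import get_components_yahoo

# DataFrame indexed by ticker with 'name' and 'exchange' columns.
dow = get_components_yahoo('^DJI')
print(dow.head())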