Skip to content

ENH: Dukascopy tick data #235

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
femtotrader opened this issue Sep 8, 2016 · 0 comments
Open

ENH: Dukascopy tick data #235

femtotrader opened this issue Sep 8, 2016 · 0 comments

Comments

@femtotrader
Copy link
Contributor

femtotrader commented Sep 8, 2016

Dukascopy (see also #153) provides tick data.

http://www.dukascopy.com/datafeed/EURUSD/2016/02/14/20h_ticks.bi5
contains ticks for 2016-02-14 (datetime.datetime(2016, 2, 14)) from 8 PM to 9 PM.
This is LZMA compressed data.

https://github.com/thalesians/pythalesians provides some Python code to download and process data.
https://github.com/thalesians/pythalesians/blob/4974a26c58fde1b4a86e6b683494a7ccd8fb6e2e/pythalesians/market/loaders/lowlevel/brokers/loaderdukascopy.py
They use the struct module https://docs.python.org/2/library/struct.html
but NumPy's fromfile http://docs.scipy.org/doc/numpy-1.10.0/reference/generated/numpy.fromfile.html
or Construct http://construct.readthedocs.org/ could also be used.

see also https://github.com/ninety47/dukascopy
http://eareview.net/tick-data/dukascopy-php-scripts
https://github.com/FX31337/FX-BT-Scripts
http://stackoverflow.com/questions/14035808/reading-data-from-dukascopy-tick-binary-file
http://stackoverflow.com/questions/30389417/not-sure-how-to-uncompress-read-result-from-binary-file

WIP:

import click

# import struct
import datetime

try:
    import lzma
except ImportError:
    # pip install backports.lzma
    from backports import lzma

import numpy as np
import pandas as pd
from collections import OrderedDict

import requests
import requests_cache

import warnings
from pandas_datareader._utils import RemoteDataError, SymbolWarning


def _init_session(session):
    """Return *session* as-is, or a fresh ``requests.Session`` when None."""
    return requests.Session() if session is None else session


# def chunks(lst, n):
#     if n < 1:
#         n = 1
#     return [lst[i:i + n] for i in range(0, len(lst), n)]


def _sanitize_dates(start, end):
    """
    Return (datetime_start, datetime_end) tuple
    if start is None - default is 2016/02/04
    if end is None - default is today
    """
    start = pd.to_datetime(start)
    end = pd.to_datetime(end)

    lst_not_a_time = [None, pd.NaT]

    if start in lst_not_a_time:
        start = datetime.datetime(2016, 2, 4)
    else:
        start = datetime.datetime(start.year, start.month, start.day,
                                  start.hour)

    if end in lst_not_a_time:
        end = start
    else:
        end = datetime.datetime(end.year, end.month, end.day, end.hour)

    return start, end


def _sanitize_symbol(symb):
    return symb.replace("/", "").upper()


def read(symbols, start, stop=None, session=None):
    """Download Dukascopy tick data for *symbols* between *start* and *stop*."""
    return _read_several_several_symbols(
        _read_one_symbol, symbols, start, stop, session
    )


def _read_several_several_symbols(fct_read, symbols, start, stop=None, session=None):
    if len(symbols) == 1:
        df = fct_read(symbols[0], start, stop, session)
        return df
    else:
        d_df_symb = OrderedDict()
        failed = []
        for symb in symbols:
            try:
                d_df_symb[symb] = fct_read(symb, start, stop, session)
            except RemoteDataError:
                msg = 'Failed to read symbol: {0!r}'
                warnings.warn(msg.format(symb), SymbolWarning)
                failed.append(symb)
        return pd.Panel(d_df_symb).swapaxes('items', 'minor')


def _read_one_symbol(symb, start, stop, session):
    """Fetch all hourly tick chunks for *symb* and concatenate them.

    Bug fix: the hourly range was built from the raw *stop* argument instead
    of the sanitized *end* returned by ``_sanitize_dates`` — this broke the
    default case where *stop* is None (``pd.date_range(start, None)`` fails)
    and ignored the hour-flooring applied to the end date.
    """
    start, end = _sanitize_dates(start, stop)

    # One .bi5 file per hour on the Dukascopy feed.
    dt_chunks = pd.date_range(start, end, freq="1H")

    frames = [_read_chunk(symb, dt_chunk, session) for dt_chunk in dt_chunks]
    return pd.concat(frames, axis=0)


def _read_chunk(symb, dt_chunk, session=None):
    """Download and decode one hourly Dukascopy ``.bi5`` tick file.

    Parameters
    ----------
    symb : str
        Symbol such as ``"EUR/USD"`` or ``"EURUSD"`` (sanitized here).
    dt_chunk : datetime-like
        Hour to fetch; used both to build the URL and as the base time for
        the per-tick millisecond offsets.
    session : requests.Session, optional
        Reused HTTP session (a new one is created when None).

    Returns
    -------
    pd.DataFrame
        Indexed by tick timestamp, with Ask/Bid prices and volumes; empty
        (but correctly shaped) when the server returns an empty file.

    Raises
    ------
    RemoteDataError
        When the HTTP status is not 200.
    """
    session = _init_session(session)
    symb = _sanitize_symbol(symb)

    base_url = "http://www.dukascopy.com/datafeed"
    endpoint = "/%s/%04d/%02d/%02d/%02dh_ticks.bi5" \
        % (symb, dt_chunk.year, dt_chunk.month, dt_chunk.day, dt_chunk.hour)
    url = base_url + endpoint
    # Example:
    # http://www.dukascopy.com/datafeed/EURUSD/2016/02/14/20h_ticks.bi5

    response = session.get(url)

    if response.status_code != 200:
        raise RemoteDataError("Can't download %r using %r" % (symb, url))

    compressed_data = response.content
    columns = ["Date", "Ask", "Bid", "AskVolume", "BidVolume"]

    if len(compressed_data) == 0:
        # Empty file: no ticks for this hour.
        return pd.DataFrame(columns=columns).set_index("Date")

    raw_data = lzma.decompress(compressed_data)
    # Each 20-byte record is big-endian: millisecond offset, ask, bid
    # (unsigned 32-bit ints) then ask/bid volumes (32-bit floats) --
    # equivalent to struct format ">LLLff".
    record_dtype = np.dtype([
        ('Date', '>u4'),
        ('Ask', '>u4'),
        ('Bid', '>u4'),
        ('AskVolume', '>f4'),
        ('BidVolume', '>f4'),
    ])
    # np.frombuffer replaces the deprecated (and removed) np.fromstring
    # for decoding binary data.
    data = np.frombuffer(raw_data, record_dtype)

    df = pd.DataFrame(data, columns=columns)
    # Prices arrive as scaled integers: JPY-quoted pairs use 3 decimal
    # digits, other pairs 5 (assumes 6-character symbols -- TODO confirm
    # for non-FX instruments).
    p_digits = 3 if symb[3:] == "JPY" else 5
    for p in ["Ask", "Bid"]:
        df[p] = df[p] / 10 ** p_digits
    # 'Date' holds milliseconds since the start of the hour.
    df["Date"] = dt_chunk + pd.to_timedelta(df["Date"], unit="ms")
    return df.set_index("Date")


@click.command()
#@click.option('--symb', default="EURUSD,USDJPY", help='Symbol.')
@click.option('--symb', default="EURUSD", help='Symbol.')
@click.option('--start', default="2016-02-15 00:00:00", help='Start.')
@click.option('--stop', default="2016-02-15 23:00:00", help='Stop.')
def main(symb, start, stop):
    """CLI entry point: fetch ticks for comma-separated symbols and print them."""
    pd.set_option("max_rows", 10)

    symbols = symb.split(",")

    # Cache HTTP responses on disk for three days to avoid re-downloading
    # identical hourly chunks.
    cached_session = requests_cache.CachedSession(
        cache_name='cache',
        backend='sqlite',
        expire_after=datetime.timedelta(days=3),
    )

    print(read(symbols, start, stop, session=cached_session))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

2 participants