zipline icon indicating copy to clipboard operation
zipline copied to clipboard

ingest futures data issue

Open marlowequart opened this issue 5 years ago • 2 comments

Environment

  • Operating System: mac osx
  • Python Version: 3.5.6
  • How did you install Zipline: conda

Description of Issue

I believe I was able to ingest my futures data successfully, but there seems to be a slight problem with the way it was ingested. I followed the process described in https://github.com/quantopian/zipline/issues/2293 to check whether the data was ingested correctly. It looks as though zipline is attaching a continuation of the volume to the bundle data when it ingests it. Does this make any sense? Is that possible? If so, how might I fix it?

Here is what happens when I plot the individual contracts using pandas: pandas_data

And here is what I get when I plot the individual contracts from the bundle: bundle_data

I really appreciate any insights!

This is my code for plotting the bundle data:

import os
import pandas as pd
from pandas import Timestamp
import matplotlib.pyplot as plt
import itertools

from zipline.data import bundles
from zipline.data.data_portal import DataPortal
from zipline.utils.calendars import get_calendar
from zipline.assets._assets import Future
from zipline.utils.run_algo import load_extensions

from zipline.data.bundles import load
from zipline.data.bundles.quandl import quandl_bundle


# Run zipline's extension loading with the default extension enabled and no
# additional extension files.
load_extensions(default=True, extensions=[], strict=True, environ=os.environ)


bundle_name = 'full_pinnacle_futures'
trading_calendar = get_calendar('CME')

bundle_data = bundles.load(bundle_name)

# The same daily bar reader is handed to both the equity and the future slot
# of the DataPortal.
# NOTE(review): presumably because the ingest script writes the futures bars
# through ``daily_bar_writer`` -- confirm.
daily_reader = bundle_data.equity_daily_bar_reader
data = DataPortal(
    asset_finder=bundle_data.asset_finder,
    trading_calendar=trading_calendar,
    first_trading_day=daily_reader.first_trading_day,
    equity_minute_reader=None,
    equity_daily_reader=daily_reader,
    future_daily_reader=daily_reader,
    adjustment_reader=bundle_data.adjustment_reader,
)


# Short aliases for the two bound methods used below.
continuous_future = bundle_data.asset_finder.create_continuous_future
history = data.get_history_window


# Date range to plot.
start_date = '2000-01-05'
end_date = '2001-06-01'


# Individual contracts to plot (alternative list kept for quick switching).
contract_list = ['CLF01', 'CLG01', 'CLH01', 'CLJ01', 'CLK01', 'CLM01']
# ~ contract_list=['ESH01', 'ESM01', 'ESU01', 'ESZ01']


# Resolve every contract symbol to its asset object.
all_contracts = list(
    map(bundle_data.asset_finder.lookup_future_symbol, contract_list)
)


start_dt = pd.Timestamp(start_date, tz='UTC', offset='C')
end_dt = pd.Timestamp(end_date, tz='UTC', offset='C')

# Positions of the two dates in the calendar's close index; their difference
# is used as the number of daily bars to request.
session_closes = trading_calendar.closes.index
end_loc = session_closes.get_loc(end_dt)
start_loc = session_closes.get_loc(start_dt)

# Arguments shared by both history requests; only the field differs.
window_kwargs = dict(
    assets=all_contracts,
    end_dt=end_dt,
    bar_count=end_loc - start_loc,
    frequency='1d',
    data_frequency='daily',
)

all_consecutive_contract_volume = history(field='volume', **window_kwargs)
all_consecutive_contract_close = history(field='close', **window_kwargs)


# Plot the per-contract volume series.
all_consecutive_contract_volume.plot(legend=False)
plt.xlabel(r'Date')
plt.ylabel('Volume')

plt.legend(loc='upper left')


plt.show()

And my code for ingesting:

import pandas as pd
from os import listdir
from tqdm import tqdm # Used for progress bar
# ~ import dateutil.relativedelta

# Locations of the input files. Change ``base_path`` to where you keep your
# data; the other two paths are relative to it.
base_path = "/Users/Marlowe/gitsite/trending/zipline/data/"
meta_path = 'futures_meta/pinnacle_meta.csv'
data_path = base_path + 'pinnacle_data_full_modified/'

# Root-symbol level metadata, loaded once at import time with the first CSV
# column as the index.
meta_file = base_path + meta_path
futures_lookup = pd.read_csv(meta_file, index_col=0)

"""
The ingest function needs to have this exact signature,
meaning these arguments passed, as shown below.
"""
def full_pinnacle_futures(environ,
                  asset_db_writer,
                  minute_bar_writer,
                  daily_bar_writer,
                  adjustment_writer,
                  calendar,
                  start_session,
                  end_session,
                  cache,
                  show_progress,
                  output_dir):
    """Ingest every contract CSV found in ``data_path`` into the bundle.

    Only ``asset_db_writer``, ``daily_bar_writer``, ``adjustment_writer``,
    ``calendar``, ``start_session`` and ``end_session`` are used; the
    remaining parameters are accepted but not read by this function.

    The function writes, in this order:

    1. the daily bars produced by ``process_futures`` (which also fills the
       contract ``metadata`` frame as a side effect while it is consumed),
    2. empty split/dividend adjustments,
    3. the contract metadata together with the root-symbol table derived
       from ``futures_lookup``.

    Raises:
        ValueError: if ``data_path`` contains no ``.csv`` files.
    """
    # Derive the contract symbols from the CSV file names only
    # ('example.csv'[:-4] == 'example'). Filtering on the extension skips
    # stray directory entries such as macOS's '.DS_Store' no matter where
    # listdir() happens to return them; blindly dropping the first entry
    # could discard a real contract because listdir() order is arbitrary.
    # Sorting keeps the sid assignment reproducible between runs.
    symbols = sorted(
        f[:-4] for f in listdir(data_path) if f.endswith('.csv')
    )

    if not symbols:
        raise ValueError("No symbols found in folder.")

    # Prepare an empty DataFrame for dividends
    divs = pd.DataFrame(columns=['sid',
                                 'amount',
                                 'ex_date',
                                 'record_date',
                                 'declared_date',
                                 'pay_date']
    )

    # Prepare an empty DataFrame for splits
    splits = pd.DataFrame(columns=['sid',
                                   'ratio',
                                   'effective_date']
    )

    # Prepare an empty DataFrame for metadata; process_futures adds one row
    # per contract while the bar writer iterates over it.
    metadata = pd.DataFrame(columns=('start_date',
                                      'end_date',
                                      'auto_close_date',
                                      'symbol',
                                      'root_symbol',
                                      'expiration_date',
                                      'notice_date',
                                      'tick_size',
                                      'exchange'
                                      )
                            )

    # Check valid trading dates, according to the selected exchange calendar
    sessions = calendar.sessions_in_range(start_session, end_session)

    # Get data for all contracts and write to Zipline. This must run before
    # the asset db write below, because consuming the generator is what
    # populates ``metadata``.
    daily_bar_writer.write(
            process_futures(symbols, sessions, metadata)
            )

    adjustment_writer.write(splits=splits, dividends=divs)

    # Prepare root level metadata: a copy of the lookup table with its index
    # exposed as 'root_symbol_id' and without the 'minor_fx_adj' column.
    root_symbols = futures_lookup.copy()
    root_symbols['root_symbol_id'] = root_symbols.index.values
    del root_symbols['minor_fx_adj']

    # Write the meta data
    asset_db_writer.write(futures=metadata, root_symbols=root_symbols)
   
def process_futures(symbols, sessions, metadata):
    """Yield ``(sid, bars)`` for each contract symbol, in order.

    For every symbol the matching CSV under ``data_path`` is read, prices are
    scaled by the root symbol's ``minor_fx_adj`` factor, the rows are aligned
    to ``sessions``, gaps are forward filled, and a metadata row is recorded
    through ``make_meta``. Security IDs are assigned sequentially, starting
    at 1. The yielded frame no longer contains the ``openinterest``,
    ``expiration_date``, ``root_symbol`` and ``symbol`` columns.
    """
    # Loop the symbols with a tqdm progress bar, numbering them from 1.
    for sid, symbol in enumerate(tqdm(symbols, desc='Loading data...'), start=1):
        # MQ 3/30/20: some contracts ending in 00 have a last trade date of
        # 12/31/99; if the (currently disabled) pre-2000 cut further down
        # were applied, such a contract could end up as an empty frame.
        csv_file = '{}/{}.csv'.format(data_path, symbol)
        df = pd.read_csv(csv_file, index_col=[0], parse_dates=[0])

        # Check for minor currency quotes: look up the price adjustment
        # factor for this contract's root symbol.
        root_rows = futures_lookup.loc[
            futures_lookup['root_symbol'] == df.iloc[0]['root_symbol']
        ]
        adjustment_factor = root_rows['minor_fx_adj'].iloc[0]

        for price_column in ('open', 'high', 'low', 'close'):
            df[price_column] *= adjustment_factor

        # Avoid potential high / low data errors in the data set: make sure
        # high is the largest and low the smallest of the bar's prices.
        df['high'] = df[['high', 'close', 'open']].max(axis=1)
        df['low'] = df[['low', 'close', 'open']].min(axis=1)

        # Synch to the official exchange calendar, keeping only the span
        # between the contract's first and last original rows.
        aligned = df.reindex(sessions.tz_localize(None))
        df = aligned[df.index[0]:df.index[-1]]

        # Forward fill missing data, then drop whatever NaN remains.
        df = df.ffill()
        df = df.dropna()

        # Cut dates before 2000, avoiding Zipline issue
        # MQ 3/6/20: Is this still an issue? I would want to use data before 2000
        # MQ 4/18/20: ignoring this for full data dump
        # ~ df = df['2000-01-01':]

        # Prepare contract metadata; the sector of the root symbol decides
        # how make_meta sets the auto close date.
        sector_rows = futures_lookup.loc[
            futures_lookup['root_symbol'] == df.iloc[0]['root_symbol']
        ]
        sector = sector_rows['sector'].iloc[0]

        make_meta(sid, metadata, df, sessions, sector)

        # Strip the non-bar columns before handing the frame on.
        for extra_column in ('openinterest', 'expiration_date',
                             'root_symbol', 'symbol'):
            del df[extra_column]

        yield sid, df
        
def make_meta(sid, metadata, df, sessions, sector):
    """Add the metadata row for one contract to ``metadata`` (in place).

    Args:
        sid: row label under which the contract is stored in ``metadata``.
        metadata: DataFrame with the columns start_date, end_date,
            auto_close_date, symbol, root_symbol, expiration_date,
            notice_date, tick_size, exchange (written positionally).
        df: the contract's bars, indexed by date, with 'symbol' and
            'root_symbol' columns; first/last index values become the
            start and end dates.
        sessions: unused.
        sector: sector name of the contract's root symbol.

    For the 'Rates' and 'Equities' sectors the auto close date and notice
    date are the day after the last trade; for every other sector they are
    30 days before the last trade. The expiration date is always the last
    trade date.
    """
    # Check first and last date.
    start_date = df.index[0]
    end_date = df.index[-1]

    first_row = df.iloc[0]
    symbol = first_row['symbol']
    root_sym = first_row['root_symbol']
    exchng = futures_lookup.loc[
        futures_lookup['root_symbol'] == root_sym
    ]['exchange'].iloc[0]
    exp_date = end_date

    tick_size = 0.0001   # Placeholder

    # 'Equitites' is the misspelling the original check used; it stays
    # accepted so metadata files carrying that spelling behave as before,
    # while the correctly spelled 'Equities' no longer falls through to the
    # commodity rule.
    if sector in ('Rates', 'Equities', 'Equitites'):
        # The auto_close date is the day after the last trade.
        auto_close_date = end_date + pd.Timedelta(days=1)
    else:
        # MQ 3/6/20: commodities use a date one month prior to expiry.
        auto_close_date = end_date - pd.Timedelta(days=30)

    # The notice date always mirrors the auto close date.
    notice_date = auto_close_date

    # Add a row to the metadata DataFrame.
    metadata.loc[sid] = (start_date, end_date, auto_close_date, symbol,
                         root_sym, exp_date, notice_date, tick_size, exchng)

marlowequart avatar May 13 '20 01:05 marlowequart

Is it possible that this issue could have to do with discontinuities in my data? It doesn't appear to be the case in the example here, but I do know that there are some dates where the volume shows up as zero contracts. Could that be messing up the ingest of the volume data?

marlowequart avatar May 14 '20 01:05 marlowequart

I figured out what my issue was: I had the open interest and volume columns switched. A case of user error, haha.

marlowequart avatar Aug 08 '20 15:08 marlowequart