zipline
zipline copied to clipboard
ingest futures data issue
Environment
- Operating System: mac osx
- Python Version: 3.5.6
- How did you install Zipline: conda
Description of Issue
I believe I was able to ingest my futures data successfully, but something seems off in the result. I followed the process described in https://github.com/quantopian/zipline/issues/2293 to check whether the data was ingested correctly. It looks like zipline is attaching a continuation of the volume to each contract in the bundle when it ingests the data. Does this make sense? Is that possible? If so, how might I fix it?
Here is what happens when I plot the individual contracts using pandas:

And here is what I get when I plot the individual contracts from the bundle:

I really appreciate any insights!
This is my code for plotting the bundle data:
import os
import pandas as pd
from pandas import Timestamp
import matplotlib.pyplot as plt
import itertools
from zipline.data import bundles
from zipline.data.data_portal import DataPortal
from zipline.utils.calendars import get_calendar
from zipline.assets._assets import Future
from zipline.utils.run_algo import load_extensions
from zipline.data.bundles import load
from zipline.data.bundles.quandl import quandl_bundle
# Load zipline's default extension file(s); presumably this is where the
# custom bundle below gets registered.
load_extensions(default=True, extensions=[], strict=True, environ=os.environ)

bundle_name = 'full_pinnacle_futures'
trading_calendar = get_calendar('CME')
bundle_data = bundles.load(bundle_name)

# The same daily bar reader is handed to the portal for both the equity and
# the futures slot.
asset_finder = bundle_data.asset_finder
daily_reader = bundle_data.equity_daily_bar_reader
data = DataPortal(
    asset_finder=asset_finder,
    trading_calendar=trading_calendar,
    first_trading_day=daily_reader.first_trading_day,
    equity_minute_reader=None,
    equity_daily_reader=daily_reader,
    future_daily_reader=daily_reader,
    adjustment_reader=bundle_data.adjustment_reader,
)

# Convenience aliases.
continuous_future = asset_finder.create_continuous_future
history = data.get_history_window

# Date range and individual contracts to inspect.
start_date = '2000-01-05'
end_date = '2001-06-01'
contract_list = ['CLF01', 'CLG01', 'CLH01', 'CLJ01', 'CLK01', 'CLM01']
# ~ contract_list=['ESH01', 'ESM01', 'ESU01', 'ESZ01']

# Resolve each contract symbol to its asset object in the bundle.
all_contracts = list(map(asset_finder.lookup_future_symbol, contract_list))

start_dt = pd.Timestamp(start_date, tz='UTC', offset='C')
end_dt = pd.Timestamp(end_date, tz='UTC', offset='C')

# Positions of both dates among the calendar closes; their difference is the
# number of daily bars requested from the portal.
session_closes = trading_calendar.closes.index
end_loc = session_closes.get_loc(end_dt)
start_loc = session_closes.get_loc(start_dt)

# Arguments shared by the volume and close history windows.
window_kwargs = dict(
    assets=all_contracts,
    end_dt=end_dt,
    bar_count=end_loc - start_loc,
    frequency='1d',
    data_frequency='daily',
)
all_consecutive_contract_volume = history(field='volume', **window_kwargs)
all_consecutive_contract_close = history(field='close', **window_kwargs)

# Plot the per-contract volume series.
all_consecutive_contract_volume.plot(legend=False)
plt.ylabel('Volume')
plt.xlabel(r'Date')
plt.legend(loc='upper left')
plt.show()
And my code for ingesting:
import pandas as pd
from os import listdir
from tqdm import tqdm # Used for progress bar
# ~ import dateutil.relativedelta
# Where the raw data lives; change base_path to match your own machine.
base_path = "/Users/Marlowe/gitsite/trending/zipline/data/"
data_path = "".join((base_path, 'pinnacle_data_full_modified/'))
meta_path = 'futures_meta/pinnacle_meta.csv'

# Root-symbol lookup table. The ingest code reads its 'root_symbol',
# 'minor_fx_adj', 'sector' and 'exchange' columns.
futures_lookup = pd.read_csv("".join((base_path, meta_path)), index_col=0)

# NOTE: The ingest function needs to have this exact signature,
# meaning these arguments passed, as shown below.
def full_pinnacle_futures(environ,
                          asset_db_writer,
                          minute_bar_writer,
                          daily_bar_writer,
                          adjustment_writer,
                          calendar,
                          start_session,
                          end_session,
                          cache,
                          show_progress,
                          output_dir):
    """Ingest every ``<symbol>.csv`` contract file under ``data_path``.

    Daily bars are written through ``daily_bar_writer``, empty split and
    dividend tables through ``adjustment_writer``, and the per-contract and
    root-symbol metadata through ``asset_db_writer``. The remaining arguments
    are accepted only because the ingest signature requires them.

    Raises:
        ValueError: if ``data_path`` contains no usable ``.csv`` file.
    """
    # One contract per '<symbol>.csv' file. Select by extension instead of
    # dropping the first directory entry: os.listdir returns names in
    # arbitrary order, so slicing off element 0 to get rid of '.DS_Store' can
    # discard a real contract and keep the junk entry. Hidden files (e.g.
    # macOS '._*' companions) are skipped as well, and the list is sorted so
    # that sids are assigned in a reproducible order.
    suffix = '.csv'
    symbols = sorted(
        name[:-len(suffix)]
        for name in listdir(data_path)
        if name.endswith(suffix) and not name.startswith('.')
    )
    if not symbols:
        raise ValueError("No symbols found in folder.")

    # Prepare an empty DataFrame for dividends
    divs = pd.DataFrame(columns=['sid',
                                 'amount',
                                 'ex_date',
                                 'record_date',
                                 'declared_date',
                                 'pay_date'])

    # Prepare an empty DataFrame for splits
    splits = pd.DataFrame(columns=['sid',
                                   'ratio',
                                   'effective_date'])

    # Prepare an empty DataFrame for metadata
    metadata = pd.DataFrame(columns=('start_date',
                                     'end_date',
                                     'auto_close_date',
                                     'symbol',
                                     'root_symbol',
                                     'expiration_date',
                                     'notice_date',
                                     'tick_size',
                                     'exchange'))

    # Check valid trading dates, according to the selected exchange calendar
    sessions = calendar.sessions_in_range(start_session, end_session)

    # process_futures is a generator that fills `metadata` as a side effect
    # while the writer consumes it, so the asset-db write further down has to
    # happen after this call.
    daily_bar_writer.write(
        process_futures(symbols, sessions, metadata)
    )

    adjustment_writer.write(splits=splits, dividends=divs)

    # Prepare root level metadata: the lookup table plus an id column, minus
    # the currency-adjustment column.
    root_symbols = futures_lookup.copy()
    root_symbols['root_symbol_id'] = root_symbols.index.values
    del root_symbols['minor_fx_adj']

    # Write the metadata
    asset_db_writer.write(futures=metadata, root_symbols=root_symbols)
def process_futures(symbols, sessions, metadata):
    """Yield ``(sid, bars)`` for each contract symbol, numbering sids from 1.

    For every symbol the matching CSV under ``data_path`` is read, prices are
    scaled by the root symbol's ``minor_fx_adj`` factor, high/low are widened
    to contain open and close, and the rows are aligned to ``sessions``. A
    metadata row is recorded through ``make_meta`` before the non-price helper
    columns are removed and the frame is yielded.
    """
    for sid, symbol in enumerate(tqdm(symbols, desc='Loading data...'), start=1):
        # MQ 3/30/20: some contracts ending in 00 had a last trade date of
        # 12/31/99; with a pre-2000 cut-off they could come back empty.
        csv_file = '{}/{}.csv'.format(data_path, symbol)
        df = pd.read_csv(csv_file, index_col=[0], parse_dates=[0])

        # Minor currency quotes: scale every price column by the factor
        # registered for this contract's root symbol.
        raw_root = df.iloc[0]['root_symbol']
        root_rows = futures_lookup.loc[futures_lookup['root_symbol'] == raw_root]
        adjustment_factor = root_rows['minor_fx_adj'].iloc[0]
        for price_col in ('open', 'high', 'low', 'close'):
            df[price_col] *= adjustment_factor

        # Guard against high/low errors in the data set: the high must be at
        # least open and close, the low at most open and close.
        df['high'] = df[['high', 'close', 'open']].max(axis=1)
        df['low'] = df[['low', 'close', 'open']].min(axis=1)

        # Synch to the official exchange calendar, keeping only the span
        # between the first and last rows of the raw file.
        first_day, last_day = df.index[0], df.index[-1]
        df = df.reindex(sessions.tz_localize(None))[first_day:last_day]

        # Forward fill gaps, then drop whatever is still NaN.
        df = df.fillna(method='ffill').dropna()

        # Cut dates before 2000, avoiding Zipline issue
        # MQ 3/6/20: Is this still an issue? I would want to use data before 2000
        # MQ 4/18/20: ignoring this for full data dump
        # ~ df = df['2000-01-01':]

        # MQ 3/6/20: the metadata needs the sector, looked up from the root
        # symbol of the cleaned frame.
        clean_root = df.iloc[0]['root_symbol']
        sector_rows = futures_lookup.loc[futures_lookup['root_symbol'] == clean_root]
        sector = sector_rows['sector'].iloc[0]
        make_meta(sid, metadata, df, sessions, sector)

        # Remove the helper columns before handing the frame over.
        for helper_col in ('openinterest', 'expiration_date', 'root_symbol', 'symbol'):
            del df[helper_col]

        yield sid, df
def make_meta(sid, metadata, df, sessions, sector):
    """Write one contract's row into ``metadata`` at index ``sid``.

    Args:
        sid: Security id used as the row label.
        metadata: DataFrame that receives the row (modified in place).
        df: The contract's bars; its first and last index values become the
            start and end dates, and its first row supplies ``symbol`` and
            ``root_symbol``.
        sessions: Unused; kept so existing callers keep working.
        sector: Sector name of the contract's root symbol.

    The exchange is looked up in the module-level ``futures_lookup`` table.
    """
    # Check first and last date.
    start_date = df.index[0]
    end_date = df.index[-1]

    first_row = df.iloc[0]
    symbol = first_row['symbol']
    root_sym = first_row['root_symbol']
    exchng = futures_lookup.loc[
        futures_lookup['root_symbol'] == root_sym
    ]['exchange'].iloc[0]

    exp_date = end_date
    tick_size = 0.0001  # Placeholder

    # MQ 3/6/20: rates and equity-index contracts auto-close the day after
    # the last trade; everything else is treated as a commodity and closed
    # out 30 days before the last trade. The original comparison only matched
    # the misspelling 'Equitites', so a lookup table spelling the sector
    # 'Equities' fell through to the commodity rule. Both spellings are
    # accepted here so existing tables keep working.
    if sector in ('Rates', 'Equities', 'Equitites'):
        auto_close_date = end_date + pd.Timedelta(days=1)
    else:
        auto_close_date = end_date - pd.Timedelta(days=30)

    # No separate notice date is available, so it mirrors the auto-close date.
    notice_date = auto_close_date

    # Add a row to the metadata DataFrame.
    metadata.loc[sid] = (
        start_date,
        end_date,
        auto_close_date,
        symbol,
        root_sym,
        exp_date,
        notice_date,
        tick_size,
        exchng,
    )
Is it possible that this issue could have to do with discontinuities in my data? It doesn't appear to be the case in the example here, but I do know that there are some dates where the volume shows up as zero contracts. Could that be messing up the ingest of the volume data?
I figured out what my issue was: I had the open interest and volume columns switched. A case of user error, haha.