featuretools
featuretools copied to clipboard
Add LagByTimedelta primitive
- Add LagByTimedelta primitive, which can be helpful for time-series problems
from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import DatetimeTimeIndex, Variable
import pandas as pd
class LagByTime(TransformPrimitive):
    """Shift an array of values by a specified timedelta.

    Description:
        Given a list of values and a corresponding datetime index, shift
        all the values by the specified timedelta. By default all gaps in
        the shifted data will contain nan. The method used to fill
        gaps in the shifted data can be specified with the 'method' parameter.

    Args:
        delta (timedelta): The timedelta by which to shift the input.
            Default is 1 second.
        method (str): Method to use for filling nan values in shifted Series.
            Possible values are
            [None, 'pad', 'ffill', 'backfill', 'bfill', 'nearest'].
            Default is None.

            None: gaps will contain nan
            `pad / ffill`: propagate last valid observation forward
                to fill gap
            `backfill / bfill`: propagate next valid observation backward
                to fill gap
            'nearest': use nearest valid observations to fill gap

    Raises:
        ValueError: If ``method`` is not one of the values listed above.

    Examples:
        >>> import pandas as pd
        >>> from datetime import datetime
        >>> lag_by_time = LagByTime()
        >>> times = pd.Series([datetime(2018, 4, 9, 10, 30, i) for i in range(5)])
        >>> data = pd.Series([1, 2, 3, 4, 5])
        >>> lag_by_time(data, times).tolist()
        [nan, 1.0, 2.0, 3.0, 4.0]

        The data can be shifted by a specified timedelta

        >>> delta = pd.to_timedelta(2, unit='h')
        >>> lag_by_time_delta = LagByTime(delta=delta)
        >>> times = pd.Series([datetime(2018, 4, 9, 10+i, 30, 0) for i in range(5)])
        >>> data = pd.Series([1, 2, 3, 4, 5])
        >>> lag_by_time_delta(data, times).tolist()
        [nan, nan, 1.0, 2.0, 3.0]

        The method used to fill the gaps can be specified

        >>> lag_by_time_method = LagByTime(method="bfill")
        >>> times = pd.Series([datetime(2018, 4, 9, 10, 30, i) for i in range(5)])
        >>> data = pd.Series([1, 2, 3, 4, 5])
        >>> lag_by_time_method(data, times).tolist()
        [1, 1, 2, 3, 4]
    """
    name = "lag_by_time"
    input_types = [Variable, DatetimeTimeIndex]

    def __init__(self, delta=pd.to_timedelta(1, unit='s'), method=None):
        # Reject unsupported fill methods up front so the error surfaces at
        # construction time rather than when the feature is calculated.
        if method not in [None, 'backfill', 'bfill', 'pad', 'ffill', 'nearest']:
            raise ValueError("Invalid method")
        self.delta = delta
        self.method = method

    def get_function(self):
        """Return the callable that computes the lagged values.

        The returned function takes the values and their timestamps and
        returns an array in which each position holds the value found at
        ``timestamp - delta``; gaps are filled according to ``self.method``.
        """
        def lag_by_time(values, times):
            # Index the values by their timestamps, then look each row up at
            # (timestamp - delta). Lookups with no matching timestamp become
            # nan unless a fill method was configured.
            df = pd.DataFrame({'values': values}).set_index(times)
            return df.reindex(df.index - self.delta,
                              method=self.method)['values'].values
        return lag_by_time

    def generate_name(self, base_feature_names):
        """Build the feature name from the inputs and the primitive's parameters.

        Args:
            base_feature_names (list[str]): Names of the input features.

        Returns:
            str: A name such as
                ``LAG_BY_TIME(value, time, delta=0 days 00:00:01, method=bfill)``.
                The ``method`` entry is only present when a fill method is set.
        """
        # Collect parameters in a list and join once so that every argument is
        # separated by ", " and there is no dangling separator at the end.
        params = [u"delta=%s" % (self.delta,)]
        if self.method is not None:
            params.append(u"method=%s" % (self.method,))
        arguments = list(base_feature_names) + params
        return u"{}({})".format(self.name.upper(), u", ".join(arguments))
I wonder whether this primitive overlaps with the one discussed in https://github.com/alteryx/featuretools/issues/1781