featuretools icon indicating copy to clipboard operation
featuretools copied to clipboard

Add LagByTimedelta primitive

Open gsheni opened this issue 1 year ago • 1 comment

  • Add LagByTimedelta primitive, which can be helpful for time-series problems
from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import DatetimeTimeIndex, Variable
import pandas as pd


class LagByTime(TransformPrimitive):
    """Shift an array of values by a specified timedelta.

    Description:
        Given a list of values and a corresponding datetime index, shift
        all the values by the specified timedelta. By default all gaps in
        the shifted data will contain nan. The method used to fill
        gaps in the shifted data can be specified with the 'method' parameter.

    Args:
        delta (timedelta): The timedelta by which to shift the input.
            Default is 1 second.

        method (str): Method to use for filling nan values in shifted Series.
            Possible values are
            [None, 'pad', 'ffill', 'backfill', 'bfill', 'nearest'].
            Default is None.
            None: gaps will contain nan
            `pad / ffill`: propagate last valid observation forward
                to fill gap
            `backfill / bfill`: propagate next valid observation backward
                to fill gap
            'nearest': use nearest valid observations to fill gap

    Raises:
        ValueError: If ``method`` is not one of the supported values.

    Examples:
        >>> import pandas as pd
        >>> from datetime import datetime
        >>> lag_by_time = LagByTime()
        >>> times = pd.Series([datetime(2018, 4, 9, 10, 30, i) for i in range(5)])
        >>> data = pd.Series([1, 2, 3, 4, 5])
        >>> lag_by_time(data, times).tolist()
        [nan, 1.0, 2.0, 3.0, 4.0]

        The data can be shifted by a specified timedelta

        >>> delta = pd.to_timedelta(2, unit='h')
        >>> lag_by_time_delta = LagByTime(delta=delta)
        >>> times = pd.Series([datetime(2018, 4, 9, 10+i, 30, 0) for i in range(5)])
        >>> data = pd.Series([1, 2, 3, 4, 5])
        >>> lag_by_time_delta(data, times).tolist()
        [nan, nan, 1.0, 2.0, 3.0]

        The method used to fill the gaps can be specified

        >>> lag_by_time_method = LagByTime(method="bfill")
        >>> times = pd.Series([datetime(2018, 4, 9, 10, 30, i) for i in range(5)])
        >>> data = pd.Series([1, 2, 3, 4, 5])
        >>> lag_by_time_method(data, times).tolist()
        [1, 1, 2, 3, 4]
    """
    name = "lag_by_time"
    input_types = [Variable, DatetimeTimeIndex]

    def __init__(self, delta=pd.to_timedelta(1, unit='s'), method=None):
        """Store the shift size and the gap-filling strategy.

        Args:
            delta (timedelta): Amount of time to shift the values by.
            method (str or None): Fill strategy forwarded to
                ``DataFrame.reindex``; ``None`` leaves gaps as nan.

        Raises:
            ValueError: If ``method`` is not a supported fill strategy.
        """
        self.delta = delta
        if method not in [None, 'backfill', 'bfill', 'pad', 'ffill', 'nearest']:
            raise ValueError("Invalid method")
        self.method = method

    def get_function(self):
        """Return the callable that computes the lagged values.

        The returned function takes ``values`` and their ``times`` and
        returns an array where each position holds the value observed
        ``delta`` before that position's time.
        """
        def lag_by_time(values, times):
            df = pd.DataFrame({'values': values}).set_index(times)
            # Look up, for every timestamp t, the value recorded at
            # t - delta. Timestamps with no matching observation become
            # nan unless a fill method is configured.
            return df.reindex(df.index - self.delta,
                              method=self.method)['values'].values
        return lag_by_time

    def generate_name(self, base_feature_names):
        """Build the feature name from the inputs and the parameters.

        Args:
            base_feature_names (list[str]): Names of the input features.

        Returns:
            str: A name of the form
                ``LAG_BY_TIME(<inputs>, delta=<delta>[, method=<method>])``.
        """
        # Collect parameters in a list and join them so every entry is
        # separated by ", " and no trailing separator is left behind.
        params = ["delta=%s" % (self.delta,)]
        if self.method is not None:
            params.append("method=%s" % (self.method,))
        return u"{}({}, {})".format(
            self.name.upper(),
            u", ".join(base_feature_names),
            u", ".join(params)
        )

gsheni avatar Jul 25 '22 18:07 gsheni

I wonder if there is overlap with this primitive: https://github.com/alteryx/featuretools/issues/1781

gsheni avatar Jul 25 '22 18:07 gsheni