featuretools icon indicating copy to clipboard operation
featuretools copied to clipboard

Add LagByPeriods primitive

Open gsheni opened this issue 3 years ago • 4 comments

  • Add Lag primitive, which can be helpful for time-series problems
from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import Variable


class LagByPeriods(TransformPrimitive):
    """Shifts an array of values by a specified number of periods.

    The shift is delegated to ``shift`` on the input (a pandas Series),
    so a positive ``periods`` moves values toward the end of the array
    and a negative ``periods`` moves them toward the start.

    Args:
        periods (int): The number of periods by which to shift the input.
            Default is 1.

        fill_value (int, float, string, bool): The value to use to fill in
            the gaps left after shifting the input. Default is None, in
            which case pandas fills the gaps with its missing-value marker
            (e.g. NaN for numeric input).

    Examples:
        >>> lag = LagByPeriods()
        >>> lag([1, 2, 3, 4, 5]).tolist()
        [nan, 1.0, 2.0, 3.0, 4.0]

        You can specify the number of periods to shift the values

        >>> lag_periods = LagByPeriods(periods=3)
        >>> lag_periods([1, 2, 3, 4, 5]).tolist()
        [nan, nan, nan, 1.0, 2.0]

        You can specify the fill value to use

        >>> lag_fill_value = LagByPeriods(fill_value=100)
        >>> lag_fill_value([1, 2, 3, 4]).tolist()
        [100, 1, 2, 3]
    """
    name = "lag"
    input_types = [Variable]

    def __init__(self, periods=1, fill_value=None):
        # Stored on the instance so the closure built in get_function
        # can read them at call time.
        self.periods = periods
        self.fill_value = fill_value

    def get_function(self):
        """Return the callable that applies the configured shift."""
        def lag(x):
            # x must expose a pandas-style ``shift`` method.
            return x.shift(periods=self.periods, fill_value=self.fill_value)
        return lag

gsheni avatar Nov 12 '21 20:11 gsheni

Unit Tests

import numpy as np
import pandas as pd

from .utils import BaseTestTransform, find_applicable_primitives, valid_dfs

from premium_primitives.lag import Lag


class TestLag(BaseTestTransform):
    """Checks the function returned by the ``Lag`` primitive."""

    primitive = Lag

    def test_regular(self):
        """Default arguments: shift by one period, gap filled with NaN."""
        shift = self.primitive().get_function()
        observed = shift(pd.Series([1, 2, 3, 4]))
        expected = pd.Series([np.nan, 1, 2, 3])
        pd.testing.assert_series_equal(observed, expected)

    def test_period(self):
        """A larger positive period leaves that many leading gaps."""
        shift = self.primitive(periods=3).get_function()
        observed = shift(pd.Series([1, 2, 3, 4]))
        expected = pd.Series([np.nan, np.nan, np.nan, 1])
        pd.testing.assert_series_equal(observed, expected)

    def test_negative_period(self):
        """A negative period shifts toward the start, leaving trailing gaps."""
        shift = self.primitive(periods=-2).get_function()
        observed = shift(pd.Series([1, 2, 3, 4]))
        expected = pd.Series([3, 4, np.nan, np.nan])
        pd.testing.assert_series_equal(observed, expected)

    def test_fill_value(self):
        """An explicit fill value replaces the gap instead of NaN."""
        shift = self.primitive(fill_value=10).get_function()
        observed = shift(pd.Series([1, 2, 3, 4]))
        expected = pd.Series([10, 1, 2, 3])
        pd.testing.assert_series_equal(observed, expected)

    def test_strings(self):
        """String input is shifted like any other values."""
        shift = self.primitive().get_function()
        observed = shift(pd.Series(['one', 'two', 'three', 'four']))
        expected = pd.Series([np.nan, 'one', 'two', 'three'])
        pd.testing.assert_series_equal(observed, expected)

    def test_datetimes(self):
        """Datetime input is shifted and the gap becomes NaT."""
        shift = self.primitive().get_function()
        days = [pd.to_datetime('2018-01-01'),
                pd.to_datetime('2018-01-02'),
                pd.to_datetime('2018-01-03'),
                pd.to_datetime('2018-01-04')]
        observed = shift(pd.Series(days))
        # Expected: NaT followed by the first three input dates.
        expected = pd.Series([pd.NaT] + days[:3])
        pd.testing.assert_series_equal(observed, expected)

    def test_starts_with_nan(self):
        """A leading NaN in the input is carried along by the shift."""
        shift = self.primitive().get_function()
        observed = shift(pd.Series([np.nan, 2, 3, 4]))
        expected = pd.Series([np.nan, np.nan, 2, 3])
        pd.testing.assert_series_equal(observed, expected)

    def test_ends_with_nan(self):
        """A trailing NaN in the input is pushed off the end by the shift."""
        shift = self.primitive().get_function()
        observed = shift(pd.Series([1, 2, 3, np.nan]))
        expected = pd.Series([np.nan, 1, 2, 3])
        pd.testing.assert_series_equal(observed, expected)

gsheni avatar Nov 12 '21 20:11 gsheni

Delayed because of problems outlined in this draft PR: https://github.com/alteryx/featuretools/pull/1788

tamargrey avatar Dec 15 '21 21:12 tamargrey

Part of this issue will also be to verify that the Featuretools NumericLag is identical (or comparable) to the numeric lag behavior for numeric columns in TimeSeriesFeaturizer._compute_delays.

gsheni avatar Jul 13 '22 21:07 gsheni

I wonder if there is overlap with this primitive: https://github.com/alteryx/featuretools/issues/2203

gsheni avatar Jul 25 '22 18:07 gsheni