featuretools
featuretools copied to clipboard
Add LagByPeriods primitive
- Add Lag primitive, which can be helpful for time-series problems
from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import Variable
class LagByPeriods(TransformPrimitive):
    """Shifts an array of values by a specified number of periods.

    The values are moved along the existing row order; positions that are
    vacated by the shift are filled with ``fill_value``.

    Args:
        periods (int): The number of periods by which to shift the input.
            Positive values shift toward later rows, negative values
            toward earlier rows. Default is 1.
        fill_value (int, float, string, bool): The value to use to fill in
            the gaps left after shifting the input. Default is None, in
            which case pandas inserts its missing-value marker (NaN, or
            NaT for datetimes).

    Examples:
        >>> lag = LagByPeriods()
        >>> lag([1, 2, 3, 4, 5]).tolist()
        [nan, 1.0, 2.0, 3.0, 4.0]

        You can specify the number of periods to shift the values

        >>> lag_periods = LagByPeriods(periods=3)
        >>> lag_periods([1, 2, 3, 4, 5]).tolist()
        [nan, nan, nan, 1.0, 2.0]

        You can specify the fill value to use

        >>> lag_fill_value = LagByPeriods(fill_value=100)
        >>> lag_fill_value([1, 2, 3, 4]).tolist()
        [100, 1, 2, 3]
    """
    name = "lag"
    input_types = [Variable]
    # Each output row is taken from a neighbouring input row, so the result
    # is only meaningful when the primitive sees every row of the entity,
    # not just the subset of instances requested for a given cutoff time.
    uses_full_entity = True

    def __init__(self, periods=1, fill_value=None):
        self.periods = periods
        self.fill_value = fill_value

    def get_function(self):
        """Return the callable that applies the configured shift to a Series."""
        def lag(x):
            # Delegates to pandas; the index is kept and only values move.
            return x.shift(periods=self.periods, fill_value=self.fill_value)
        return lag
Unit Tests
import numpy as np
import pandas as pd
from .utils import BaseTestTransform, find_applicable_primitives, valid_dfs
from premium_primitives.lag import Lag
class TestLag(BaseTestTransform):
    """Checks the function returned by ``Lag.get_function`` on small Series."""

    primitive = Lag

    def _assert_lag_output(self, values, expected, **primitive_kwargs):
        """Build the primitive, run it on ``values`` and compare to ``expected``."""
        lag_func = self.primitive(**primitive_kwargs).get_function()
        result = lag_func(pd.Series(values))
        pd.testing.assert_series_equal(result, pd.Series(expected))

    def test_regular(self):
        """Default settings shift by one row and pad the front with NaN."""
        self._assert_lag_output([1, 2, 3, 4], [np.nan, 1, 2, 3])

    def test_period(self):
        """A larger positive ``periods`` pads that many leading rows."""
        self._assert_lag_output(
            [1, 2, 3, 4],
            [np.nan, np.nan, np.nan, 1],
            periods=3,
        )

    def test_negative_period(self):
        """A negative ``periods`` shifts the other way and pads the tail."""
        self._assert_lag_output(
            [1, 2, 3, 4],
            [3, 4, np.nan, np.nan],
            periods=-2,
        )

    def test_fill_value(self):
        """``fill_value`` replaces the default missing-value padding."""
        self._assert_lag_output([1, 2, 3, 4], [10, 1, 2, 3], fill_value=10)

    def test_strings(self):
        """String input is shifted like any other values."""
        words = ['one', 'two', 'three', 'four']
        self._assert_lag_output(words, [np.nan, 'one', 'two', 'three'])

    def test_datetimes(self):
        """Datetime input is padded with NaT."""
        days = [
            pd.to_datetime('2018-01-01'),
            pd.to_datetime('2018-01-02'),
            pd.to_datetime('2018-01-03'),
            pd.to_datetime('2018-01-04'),
        ]
        self._assert_lag_output(days, [pd.NaT] + days[:3])

    def test_starts_with_nan(self):
        """A leading NaN in the input is shifted along with the data."""
        self._assert_lag_output([np.nan, 2, 3, 4], [np.nan, np.nan, 2, 3])

    def test_ends_with_nan(self):
        """A trailing NaN in the input is shifted out of the result."""
        self._assert_lag_output([1, 2, 3, np.nan], [np.nan, 1, 2, 3])
Delayed because of problems outlined in this draft PR: https://github.com/alteryx/featuretools/pull/1788
Part of this issue will also be to verify that the Featuretools NumericLag is identical (or comparable) to the numeric lag behavior for numeric columns in TimeSeriesFeaturizer._compute_delays.
I wonder if there is overlap with this primitive: https://github.com/alteryx/featuretools/issues/2203