Memory accumulates when saving many snapshots
I am working with the latest evidently (0.4.31) locally and see memory consumption that keeps growing as I save more snapshots.
After running the example below for a few minutes, there were about 2500 snapshots in my local workspace project's folder. Memory consumption had reached 1.5 GB and was still increasing, which works out to roughly 600 KB of retained memory per snapshot.
Python Example
import time
from typing import cast

import pandas as pd
from evidently.metrics import (
    ColumnDriftMetric,
    ColumnSummaryMetric,
    DatasetDriftMetric,
    DatasetMissingValuesMetric,
)
from evidently.report.report import Report
from evidently.test_preset import DataDriftTestPreset
from evidently.test_suite import TestSuite
from evidently.ui.base import Project
from evidently.ui.workspace import Workspace
from loguru import logger
from sklearn.datasets import load_iris


class EvidentlyApplication:
    def __init__(self, workspace_path: str, project_name: str):
        logger.debug("Initializing the class")
        self.workspace = Workspace(workspace_path)
        project = self.workspace.create_project(name=project_name)
        self.project_id = project.id
        logger.debug(f"Project id = {self.project_id}")
        self.project = cast(Project, self.workspace.get_project(self.project_id))

        iris = load_iris()
        self.columns = iris.feature_names
        self.train_set = pd.DataFrame(iris.data, columns=self.columns)

    def perform_analysis(self) -> None:
        logger.info("Started analysis")
        time = pd.Timestamp.now("utc")
        sample_df = self.train_set

        data_drift_report = self.create_report(sample_df, time)
        self.project.add_snapshot(data_drift_report.to_snapshot())

        data_drift_test_suite = self.create_test_suite(sample_df, time)
        self.project.add_snapshot(data_drift_test_suite.to_snapshot())
        logger.info("Finished analysis")

    def create_report(
        self, sample_df: pd.DataFrame, schedule_time: pd.Timestamp
    ) -> Report:
        metrics = [
            DatasetDriftMetric(),
            DatasetMissingValuesMetric(),
        ]
        for col_name in self.columns:
            metrics.extend(
                [
                    ColumnDriftMetric(column_name=col_name, stattest="wasserstein"),
                    ColumnSummaryMetric(column_name=col_name),
                ]
            )

        data_drift_report = Report(metrics=metrics, timestamp=schedule_time)
        data_drift_report.run(reference_data=self.train_set, current_data=sample_df)
        return data_drift_report

    def create_test_suite(
        self, sample_df: pd.DataFrame, schedule_time: pd.Timestamp
    ) -> TestSuite:
        data_drift_test_suite = TestSuite(
            tests=[DataDriftTestPreset()], timestamp=schedule_time
        )
        data_drift_test_suite.run(reference_data=self.train_set, current_data=sample_df)
        return data_drift_test_suite


if __name__ == "__main__":
    app = EvidentlyApplication(
        workspace_path="./evidently_1/", project_name="eternal-loop"
    )
    while True:
        app.perform_analysis()
        time.sleep(0.1)
After terminating the process, I loaded the workspace:
from time import time
from evidently.ui.workspace import Workspace
t = time()
ws = Workspace("./evidently_1")
print(time() - t)
51.66515111923218
Loading took more than 50 seconds, and the Python process's memory consumption reached 1.5 GB, the same level the original process ended with.
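For reference, this is roughly how I check the resident set size of the process (a sketch that assumes psutil is installed; the 1.5 GB figures above came from my system monitor, so this snippet is only illustrative):

import os

import psutil

# Print the resident set size (RSS) of the current process in MB.
process = psutil.Process(os.getpid())
print(f"RSS: {process.memory_info().rss / 1024 ** 2:.1f} MB")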
To summarize, I have two questions:
- Why is the memory not freed by evidently?
- If I understand correctly, loading the local workspace loads all the snapshots into memory. Is there a way to avoid that? (For the number of snapshot files involved, see the snippet below.)
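To give a sense of the scale, this is how I count the snapshot files on disk. I am assuming the local workspace stores one JSON file per snapshot under <workspace>/<project_id>/snapshots/; adjust the glob if the layout differs on your side.

from pathlib import Path

# Count snapshot JSON files in the local workspace.
# Assumed layout: <workspace>/<project_id>/snapshots/<snapshot_id>.json
snapshot_files = list(Path("./evidently_1").glob("*/snapshots/*.json"))
print(len(snapshot_files))  # about 2500 in my case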
Thank you in advance 🙂