
Saving many snapshots accumulates memory

jond01 opened this issue on Jul 23, 2024 · 0 comments

I am working with the latest evidently (0.4.31) locally and see memory consumption that keeps increasing as I save more snapshots.

After running the example below for a few minutes, my local workspace project folder contains about 2,500 snapshots. The process's memory consumption has reached 1.5 GB and keeps increasing.

Python Example
import time
from typing import cast

import pandas as pd
from evidently.metrics import (
    ColumnDriftMetric,
    ColumnSummaryMetric,
    DatasetDriftMetric,
    DatasetMissingValuesMetric,
)
from evidently.report.report import Report
from evidently.test_preset import DataDriftTestPreset
from evidently.test_suite import TestSuite
from evidently.ui.base import Project
from evidently.ui.workspace import Workspace
from loguru import logger
from sklearn.datasets import load_iris


class EvidentlyApplication:
    def __init__(self, workspace_path: str, project_name: str):
        logger.debug("Initializing the class")
        self.workspace = Workspace(workspace_path)
        project = self.workspace.create_project(name=project_name)
        self.project_id = project.id
        logger.debug(f"Project id = {self.project_id}")
        self.project = cast(Project, self.workspace.get_project(self.project_id))
        iris = load_iris()
        self.columns = iris.feature_names
        self.train_set = pd.DataFrame(iris.data, columns=self.columns)

    def perform_analysis(self) -> None:
        logger.info("Started analysis")
        now = pd.Timestamp.now("utc")
        sample_df = self.train_set

        data_drift_report = self.create_report(sample_df, now)
        self.project.add_snapshot(data_drift_report.to_snapshot())
        data_drift_test_suite = self.create_test_suite(sample_df, now)
        self.project.add_snapshot(data_drift_test_suite.to_snapshot())

        logger.info("Finished analysis")

    def create_report(
        self, sample_df: pd.DataFrame, schedule_time: pd.Timestamp
    ) -> Report:
        metrics = [
            DatasetDriftMetric(),
            DatasetMissingValuesMetric(),
        ]
        for col_name in self.columns:
            metrics.extend(
                [
                    ColumnDriftMetric(column_name=col_name, stattest="wasserstein"),
                    ColumnSummaryMetric(column_name=col_name),
                ]
            )

        data_drift_report = Report(metrics=metrics, timestamp=schedule_time)

        data_drift_report.run(reference_data=self.train_set, current_data=sample_df)
        return data_drift_report

    def create_test_suite(
        self, sample_df: pd.DataFrame, schedule_time: pd.Timestamp
    ) -> TestSuite:
        data_drift_test_suite = TestSuite(
            tests=[DataDriftTestPreset()], timestamp=schedule_time
        )

        data_drift_test_suite.run(reference_data=self.train_set, current_data=sample_df)
        return data_drift_test_suite


if __name__ == "__main__":
    app = EvidentlyApplication(
        workspace_path="./evidently_1/", project_name="eternal-loop"
    )

    while True:
        app.perform_analysis()
        time.sleep(0.1)
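
For reference, the memory figure can be checked from inside the process with something like the sketch below. This is only how one could instrument the loop, not part of the original measurement; it assumes psutil is installed and reuses app, logger, and time from the example above.

import os

import psutil

process = psutil.Process(os.getpid())

while True:
    app.perform_analysis()
    # Resident set size of this Python process, in megabytes.
    rss_mb = process.memory_info().rss / 1024**2
    logger.info(f"RSS after analysis: {rss_mb:.1f} MB")
    time.sleep(0.1)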

After I had terminated the process, I loaded the workspace:

from time import time
from evidently.ui.workspace import Workspace

t = time()
ws = Workspace("./evidently_1")
print(time() - t)
# output: 51.66515111923218

It took more than 50 seconds, and the Python process's memory consumption was 1.5 GB, the same figure the original process ended with.
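
To see where the load-time allocations come from, the standard-library tracemalloc module could be used around the workspace load; this is just a diagnostic idea on my side, not an evidently feature:

import tracemalloc

from evidently.ui.workspace import Workspace

tracemalloc.start()
ws = Workspace("./evidently_1")

# Report the top allocation sites after the workspace has been loaded.
mem_snapshot = tracemalloc.take_snapshot()
for stat in mem_snapshot.statistics("lineno")[:10]:
    print(stat)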

To summarize, I have two questions:

  1. Why is the memory not freed by evidently? (A quick check of whether snapshot objects stay referenced is sketched after this list.)
  2. IIUC, loading the local workspace loads all the snapshots. Is there a way to avoid that?
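
Regarding question 1, a crude way to check whether snapshot-related objects stay referenced in the running process is to scan the garbage collector's live objects. Matching by class name here is a shortcut of mine, not an evidently API:

import gc

# Count live objects whose class name matches the evidently objects created in
# the loop above, to see whether references survive after add_snapshot().
counts: dict[str, int] = {}
for obj in gc.get_objects():
    name = type(obj).__name__
    if name in {"Snapshot", "Report", "TestSuite"}:
        counts[name] = counts.get(name, 0) + 1
print(counts)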

Thank you in advance 🙂
