deepdiff icon indicating copy to clipboard operation
deepdiff copied to clipboard

High memory usage performing diffs of scipy.io.matlab.mio5_params.mat_struct

Open David-Herman opened this issue 5 years ago • 5 comments

Creating a new issue from https://github.com/seperman/deepdiff/issues/194#issuecomment-631799918.

I am using the current dev branch as of today, other than adding a wrapper on the init method. I did try to add the decorator via monkey patching, but that ended up throwing an error.

from memory_profiler import profile
from deepdiff import DeepDiff  # For Deep Difference of 2 objects
from scipy.io import loadmat
import os
import wrapt


if __name__ == '__main__':
    # Collect the .mat files in the working directory and load two of them,
    # sub-selecting a single key so the profiling stays focused.
    mat_files = [name for name in os.listdir() if name.endswith('.mat')]
    first = loadmat(mat_files[0], squeeze_me=True, struct_as_record=False)
    second = loadmat(mat_files[1], squeeze_me=True, struct_as_record=False)
    shared_key = list(first.keys())[-10]
    first = first[shared_key]
    second = second[shared_key]
    # Monkey patch and add the profile decorator
    #wrapt.wrap_function_wrapper(DeepDiff, '__init__', profile) #threw an error regarding hashing the backend for memory_profile
    # Diff the two structures; DeepDiff.__init__ carries the @profile decorator.
    DeepDiff(first,second)

(base)python example.py
Filename: \lib\site-packages\deepdiff\diff.py

Line #    Mem usage    Increment   Line Contents
================================================
    80     64.5 MiB     64.5 MiB       @profile
    81                                 def __init__(self,
    82                                              t1,
    83                                              t2,
    84                                              ignore_order=False,
    85                                              report_repetition=False,
    86                                              significant_digits=None,
    87                                              number_format_notation="f",
    88                                              exclude_paths=None,
    89                                              exclude_regex_paths=None,
    90                                              exclude_types=None,
    91                                              ignore_type_in_groups=None,
    92                                              ignore_string_type_changes=False,
    93                                              ignore_numeric_type_changes=False,
    94                                              ignore_type_subclasses=False,
    95                                              ignore_string_case=False,
    96                                              exclude_obj_callback=None,
    97                                              number_to_string_func=None,
    98                                              ignore_nan_inequality=False,
    99                                              ignore_private_variables=True,
   100                                              verbose_level=1,
   101                                              view=TEXT_VIEW,
   102                                              hasher=None,
   103                                              hashes=None,
   104                                              parameters=None,
   105                                              shared_parameters=None,
   106                                              max_passes=10000000,
   107                                              max_distances_to_keep_track_per_item=10000,
   108                                              max_diffs=None,
   109                                              cutoff_distance_for_pairs=CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT,
   110                                              log_frequency_in_sec=0,
   111                                              progress_logger=logger.info,
   112                                              cache_size=5000,
   113                                              _stats=None,
   114                                              _cache=None,
   115                                              _numpy_paths=None,
   116                                              _original_type=None,
   117                                              **kwargs):
   118     64.5 MiB      0.0 MiB           if kwargs:
   119                                         raise ValueError((
   120                                             "The following parameter(s) are not valid: %s\n"
   121                                             "The valid parameters are ignore_order, report_repetition, significant_digits, "
   122                                             "number_format_notation, exclude_paths, exclude_types, exclude_regex_paths, ignore_type_in_groups, "
   123                                             "ignore_string_type_changes, ignore_numeric_type_changes, ignore_type_subclasses, "
   124                                             "ignore_private_variables, ignore_nan_inequality, number_to_string_func, verbose_level, "
   125                                             "view, hasher, hashes, max_passes, max_distances_to_keep_track_per_item, max_diffs, "
   126                                             "cutoff_distance_for_pairs, log_frequency_in_sec, cache_size, _stats, _numpy_paths, _original_type "
   127                                             "parameters and shared_parameters.") % ', '.join(kwargs.keys()))
   128
   129     64.5 MiB      0.0 MiB           if parameters:
   130                                         self.__dict__ = deepcopy(parameters)
   131                                     else:
   132     64.5 MiB      0.0 MiB               self.ignore_order = ignore_order
   133     64.5 MiB      0.0 MiB               ignore_type_in_groups = ignore_type_in_groups or []
   134     64.5 MiB      0.0 MiB               if numbers == ignore_type_in_groups or numbers in ignore_type_in_groups:
   135                                             ignore_numeric_type_changes = True
   136     64.5 MiB      0.0 MiB               self.ignore_numeric_type_changes = ignore_numeric_type_changes
   137     64.5 MiB      0.0 MiB               if strings == ignore_type_in_groups or strings in ignore_type_in_groups:
   138                                             ignore_string_type_changes = True
   139     64.5 MiB      0.0 MiB               self.ignore_string_type_changes = ignore_string_type_changes
   140     64.5 MiB      0.0 MiB               self.ignore_type_in_groups = self.get_ignore_types_in_groups(
   141     64.5 MiB      0.0 MiB                   ignore_type_in_groups=ignore_type_in_groups,
   142     64.5 MiB      0.0 MiB                   ignore_string_type_changes=ignore_string_type_changes,
   143     64.5 MiB      0.0 MiB                   ignore_numeric_type_changes=ignore_numeric_type_changes,
   144     64.5 MiB      0.0 MiB                   ignore_type_subclasses=ignore_type_subclasses)
   145     64.5 MiB      0.0 MiB               self.report_repetition = report_repetition
   146     64.5 MiB      0.0 MiB               self.exclude_paths = convert_item_or_items_into_set_else_none(exclude_paths)
   147     64.5 MiB      0.0 MiB               self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths)
   148     64.5 MiB      0.0 MiB               self.exclude_types = set(exclude_types) if exclude_types else None
   149     64.5 MiB      0.0 MiB               self.exclude_types_tuple = tuple(exclude_types) if exclude_types else None  # we need tuple for checking isinstance
   150     64.5 MiB      0.0 MiB               self.ignore_type_subclasses = ignore_type_subclasses
   151     64.5 MiB      0.0 MiB               self.type_check_func = type_is_subclass_of_type_group if ignore_type_subclasses else type_in_type_group
   152     64.5 MiB      0.0 MiB               self.ignore_string_case = ignore_string_case
   153     64.5 MiB      0.0 MiB               self.exclude_obj_callback = exclude_obj_callback
   154     64.5 MiB      0.0 MiB               self.number_to_string = number_to_string_func or number_to_string
   155     64.5 MiB      0.0 MiB               self.ignore_private_variables = ignore_private_variables
   156     64.5 MiB      0.0 MiB               self.ignore_nan_inequality = ignore_nan_inequality
   157     64.5 MiB      0.0 MiB               self.hasher = hasher
   158
   159     64.5 MiB      0.0 MiB               self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes)
   160     64.5 MiB      0.0 MiB               self.number_format_notation = number_format_notation
   161     64.5 MiB      0.0 MiB               self.verbose_level = verbose_level
   162     64.5 MiB      0.0 MiB               self.view = view
   163                                         # Setting up the cache for dynamic programming. One dictionary per instance of root of DeepDiff running.
   164     64.5 MiB      0.0 MiB               self.max_passes = max_passes
   165     64.5 MiB      0.0 MiB               self.max_diffs = max_diffs
   166                                         # Maximum number of calculated distances between pairs to be tracked.
   167                                         # For huge lists, this number may need to be modified at the cost of more memory usage.
   168                                         # Only used when ignore_order = True.
   169     64.5 MiB      0.0 MiB               self.max_distances_to_keep_track_per_item = max_distances_to_keep_track_per_item
   170     64.5 MiB      0.0 MiB               self.cutoff_distance_for_pairs = float(cutoff_distance_for_pairs)
   171                                         # _deep_distance_buckets_exponent is only used for the number of buckets of distances when doing ignore_order=True calculations
   172                                         # The actual number of buckets will be 10 to the power of the _deep_distance_buckets_exponent
   173     64.5 MiB      0.0 MiB               self._deep_distance_buckets_exponent = len(str(self.max_passes)) + 3  # Adding some padding to it.
   174                                         # Parameters are the clean parameters to initialize DeepDiff with so we avoid all the above
   175                                         # cleaning functionalities when running DeepDiff recursively.
   176                                         # However DeepHash has its own set of parameters that are slightly different than DeepDIff.
   177                                         # DeepDiff parameters are transformed to DeepHash parameters via __get_deephash_params method.
   178     64.5 MiB      0.0 MiB               parameters = self.__dict__.copy()
   179
   180     64.5 MiB      0.0 MiB           _purge_cache = True
   181
   182     64.5 MiB      0.0 MiB           if shared_parameters:
   183                                         self.is_root = False
   184                                         self.shared_parameters = shared_parameters
   185                                         self.__dict__.update(shared_parameters)
   186                                         # We are in some pass other than root
   187                                         repeated_timer = None
   188                                     else:
   189                                         # we are at the root
   190     64.5 MiB      0.0 MiB               self.is_root = True
   191                                         # keep the cache. Only used for debugging what was in the cache.
   192     64.5 MiB      0.0 MiB               if _cache == 'keep':
   193                                             _purge_cache = False
   194                                         # Caching the DeepDiff results for dynamic programming
   195     64.5 MiB      0.0 MiB               self._cache = LFUCache(cache_size) if cache_size else DummyLFU()
   196                                         self._stats = {
   197     64.5 MiB      0.0 MiB                   PASSES_COUNT: 0,
   198     64.5 MiB      0.0 MiB                   DIFF_COUNT: 0,
   199     64.5 MiB      0.0 MiB                   LEVEL_CACHE_HIT_COUNT: 0,
   200     64.5 MiB      0.0 MiB                   DISTANCE_CACHE_HIT_COUNT: 0,
   201     64.5 MiB      0.0 MiB                   MAX_PASS_LIMIT_REACHED: False,
   202     64.5 MiB      0.0 MiB                   MAX_DIFF_LIMIT_REACHED: False,
   203                                         }
   204     64.5 MiB      0.0 MiB               self.hashes = {} if hashes is None else hashes
   205     64.5 MiB      0.0 MiB               self._numpy_paths = {} if _numpy_paths is None else _numpy_paths
   206                                         self.shared_parameters = {
   207     64.5 MiB      0.0 MiB                   'hashes': self.hashes,
   208     64.5 MiB      0.0 MiB                   '_stats': self._stats,
   209     64.5 MiB      0.0 MiB                   '_cache': self._cache,
   210     64.5 MiB      0.0 MiB                   '_numpy_paths': self._numpy_paths
   211                                         }
   212     64.5 MiB      0.0 MiB               if log_frequency_in_sec:
   213                                             # Creating a progress log reporter that runs in a separate thread every log_frequency_in_sec seconds.
   214                                             repeated_timer = RepeatedTimer(log_frequency_in_sec, _report_progress, self._stats, progress_logger)
   215                                             logging.basicConfig(level=logging.INFO)
   216                                         else:
   217     64.5 MiB      0.0 MiB                   repeated_timer = None
   218
   219     64.5 MiB      0.0 MiB           self.parameters = parameters
   220     64.5 MiB      0.0 MiB           self.deephash_parameters = self.__get_deephash_params()
   221     64.5 MiB      0.0 MiB           self.tree = TreeResult()
   222     64.5 MiB      0.0 MiB           self.t1 = t1
   223     64.5 MiB      0.0 MiB           self.t2 = t2
   224
   225     64.5 MiB      0.0 MiB           try:
   226     64.5 MiB      0.0 MiB               root = DiffLevel(t1, t2, verbose_level=self.verbose_level)
   227                                         # _original_type is only used to pass the original type of the data. Currently only used for numpy arrays.
   228                                         # The reason is that we convert the numpy array to python list and then later for distance calculations
   229                                         # we convert only the the last dimension of it into numpy arrays.
   230    840.6 MiB    776.1 MiB               self.__diff(root, parents_ids=frozenset({id(t1)}), _original_type=_original_type)
   231
   232    840.6 MiB      0.0 MiB               self.tree.remove_empty_keys()
   233    882.0 MiB     41.5 MiB               view_results = self._get_view_results(self.view)
   234    882.0 MiB      0.0 MiB               self.update(view_results)
   235                                     finally:
   236    882.0 MiB      0.0 MiB               if self.is_root:
   237    882.0 MiB      0.0 MiB                   if _purge_cache:
   238    882.0 MiB      0.0 MiB                       del self._cache
   239    882.0 MiB      0.0 MiB                   del self.shared_parameters
   240    882.0 MiB      0.0 MiB                   del self.parameters
   241    882.0 MiB      0.0 MiB               if repeated_timer:
   242                                             duration = repeated_timer.stop()
   243                                             self._stats['DURATION SEC'] = duration
   244                                             logger.info('stats {}'.format(self.get_stats()))

David-Herman avatar May 27 '20 18:05 David-Herman

which other class methods should I add the decorator, besides the init?

David-Herman avatar May 27 '20 18:05 David-Herman

Interesting. I should run some memory profiler too. Yes we should add the decorator to a bunch of other places but it has been a while since I ran the memory profiler so I don't remember if you can add the decorator to more than one place. What I'm curious about is what percentage of the memory usages is in the DeepDiff object's self.hashes vs. self.tree. There is a ton of overhead that can be freed up once the DeepDiff object is created and the user does not need the tree view and/or the deep_distance feature. But that doesn't mean during DeepDiff object creation it is going to use less memory.

seperman avatar May 27 '20 20:05 seperman

To give further background: my data structure in the above trace example is a scipy.io.matlab.mio5_params.mat_struct, which is highly nested and composed of roughly ~250 data objects, including strings, ints, floats, and primarily numpy arrays. The structure typically does not change, but it may (data streams of different compositions). Is deepdiff the right tool for efficient processing here, with the goal of using the diff as a compression utility (most data objects are the same) or of understanding the structural differences between the class instances?

David-Herman avatar Jun 01 '20 14:06 David-Herman

Hi @David-Herman Please pip install the latest DeepDiff that I just released and let me know how it goes. There has been many performance improvements. Please also checkout https://zepworks.com/deepdiff/5.0.0/optimizations.html

seperman avatar Jun 23 '20 18:06 seperman

This is the pip install at the latest commit (used hash to be explicit on what I was installing)

>pip install git+git://github.com/seperman/deepdiff.git@4662faea2cf4982e6f1974fd5abf18216ebc7aaa

and confirming via .__version__.

>>> import deepdiff
>>> deepdiff.__version__
'5.0.0'
(base) >python example.py
Filename: \Continuum\anaconda3\lib\site-packages\deepdiff\diff.py

Line #    Mem usage    Increment   Line Contents
================================================
   106     64.5 MiB     64.5 MiB       @profile
   107                                 def __init__(self,
   108                                              t1,
   109                                              t2,
   110                                              cutoff_distance_for_pairs=CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT,
   111                                              cutoff_intersection_for_pairs=CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT,
   112                                              cache_size=0,
   113                                              cache_tuning_sample_size=0,
   114                                              cache_purge_level=1,
   115                                              exclude_paths=None,
   116                                              exclude_regex_paths=None,
   117                                              exclude_types=None,
   118                                              exclude_obj_callback=None,
   119                                              get_deep_distance=False,
   120                                              hasher=None,
   121                                              hashes=None,
   122                                              ignore_order=False,
   123                                              ignore_type_in_groups=None,
   124                                              ignore_string_type_changes=False,
   125                                              ignore_numeric_type_changes=False,
   126                                              ignore_type_subclasses=False,
   127                                              ignore_string_case=False,
   128                                              ignore_nan_inequality=False,
   129                                              ignore_private_variables=True,
   130                                              log_frequency_in_sec=0,
   131                                              max_passes=10000000,
   132                                              max_diffs=None,
   133                                              number_format_notation="f",
   134                                              number_to_string_func=None,
   135                                              progress_logger=logger.info,
   136                                              report_repetition=False,
   137                                              significant_digits=None,
   138                                              truncate_datetime=None,
   139                                              verbose_level=1,
   140                                              view=TEXT_VIEW,
   141                                              _original_type=None,
   142                                              _parameters=None,
   143                                              _shared_parameters=None,
   144                                              **kwargs):
   145     64.5 MiB      0.0 MiB           super().__init__()
   146     64.5 MiB      0.0 MiB           if kwargs:
   147                                         raise ValueError((
   148                                             "The following parameter(s) are not valid: %s\n"
   149                                             "The valid parameters are ignore_order, report_repetition, significant_digits, "
   150                                             "number_format_notation, exclude_paths, exclude_types, exclude_regex_paths, ignore_type_in_groups, "
   151                                             "ignore_string_type_changes, ignore_numeric_type_changes, ignore_type_subclasses, truncate_datetime, "
   152                                             "ignore_private_variables, ignore_nan_inequality, number_to_string_func, verbose_level, "
   153                                             "view, hasher, hashes, max_passes, max_diffs, "
   154                                             "cutoff_distance_for_pairs, cutoff_intersection_for_pairs, log_frequency_in_sec, cache_size, "
   155                                             "cache_tuning_sample_size, get_deep_distance, cache_purge_level, "
   156                                             "_original_type, _parameters and _shared_parameters.") % ', '.join(kwargs.keys()))
   157
   158     64.5 MiB      0.0 MiB           if _parameters:
   159                                         self.__dict__.update(_parameters)
   160                                     else:
   161     64.5 MiB      0.0 MiB               self.ignore_order = ignore_order
   162     64.5 MiB      0.0 MiB               ignore_type_in_groups = ignore_type_in_groups or []
   163     64.5 MiB      0.0 MiB               if numbers == ignore_type_in_groups or numbers in ignore_type_in_groups:
   164                                             ignore_numeric_type_changes = True
   165     64.5 MiB      0.0 MiB               self.ignore_numeric_type_changes = ignore_numeric_type_changes
   166     64.5 MiB      0.0 MiB               if strings == ignore_type_in_groups or strings in ignore_type_in_groups:
   167                                             ignore_string_type_changes = True
   168     64.5 MiB      0.0 MiB               self.ignore_string_type_changes = ignore_string_type_changes
   169     64.5 MiB      0.0 MiB               self.ignore_type_in_groups = self.get_ignore_types_in_groups(
   170     64.5 MiB      0.0 MiB                   ignore_type_in_groups=ignore_type_in_groups,
   171     64.5 MiB      0.0 MiB                   ignore_string_type_changes=ignore_string_type_changes,
   172     64.5 MiB      0.0 MiB                   ignore_numeric_type_changes=ignore_numeric_type_changes,
   173     64.5 MiB      0.0 MiB                   ignore_type_subclasses=ignore_type_subclasses)
   174     64.5 MiB      0.0 MiB               self.report_repetition = report_repetition
   175     64.5 MiB      0.0 MiB               self.exclude_paths = convert_item_or_items_into_set_else_none(exclude_paths)
   176     64.5 MiB      0.0 MiB               self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths)
   177     64.5 MiB      0.0 MiB               self.exclude_types = set(exclude_types) if exclude_types else None
   178     64.5 MiB      0.0 MiB               self.exclude_types_tuple = tuple(exclude_types) if exclude_types else None  # we need tuple for checking isinstance
   179     64.5 MiB      0.0 MiB               self.ignore_type_subclasses = ignore_type_subclasses
   180     64.5 MiB      0.0 MiB               self.type_check_func = type_is_subclass_of_type_group if ignore_type_subclasses else type_in_type_group
   181     64.5 MiB      0.0 MiB               self.ignore_string_case = ignore_string_case
   182     64.5 MiB      0.0 MiB               self.exclude_obj_callback = exclude_obj_callback
   183     64.5 MiB      0.0 MiB               self.number_to_string = number_to_string_func or number_to_string
   184     64.5 MiB      0.0 MiB               self.ignore_private_variables = ignore_private_variables
   185     64.5 MiB      0.0 MiB               self.ignore_nan_inequality = ignore_nan_inequality
   186     64.5 MiB      0.0 MiB               self.hasher = hasher
   187     64.5 MiB      0.0 MiB               self.cache_tuning_sample_size = cache_tuning_sample_size
   188
   189     64.5 MiB      0.0 MiB               self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes)
   190     64.5 MiB      0.0 MiB               self.truncate_datetime = get_truncate_datetime(truncate_datetime)
   191     64.5 MiB      0.0 MiB               self.number_format_notation = number_format_notation
   192     64.5 MiB      0.0 MiB               if verbose_level in {0, 1, 2}:
   193     64.5 MiB      0.0 MiB                   self.verbose_level = verbose_level
   194                                         else:
   195                                             raise ValueError(VERBOSE_LEVEL_RANGE_MSG)
   196     64.5 MiB      0.0 MiB               if cache_purge_level not in {0, 1, 2}:
   197                                             raise ValueError(PURGE_LEVEL_RANGE_MSG)
   198     64.5 MiB      0.0 MiB               self.view = view
   199                                         # Setting up the cache for dynamic programming. One dictionary per instance of root of DeepDiff running.
   200     64.5 MiB      0.0 MiB               self.max_passes = max_passes
   201     64.5 MiB      0.0 MiB               self.max_diffs = max_diffs
   202     64.5 MiB      0.0 MiB               self.cutoff_distance_for_pairs = float(cutoff_distance_for_pairs)
   203     64.5 MiB      0.0 MiB               self.cutoff_intersection_for_pairs = float(cutoff_intersection_for_pairs)
   204     64.5 MiB      0.0 MiB               if self.cutoff_distance_for_pairs < 0 or self.cutoff_distance_for_pairs > 1:
   205                                             raise ValueError(CUTOFF_RANGE_ERROR_MSG)
   206                                         # _Parameters are the clean _parameters to initialize DeepDiff with so we avoid all the above
   207                                         # cleaning functionalities when running DeepDiff recursively.
   208                                         # However DeepHash has its own set of _parameters that are slightly different than DeepDIff.
   209                                         # DeepDiff _parameters are transformed to DeepHash _parameters via __get_deephash_params method.
   210     64.5 MiB      0.0 MiB               self.progress_logger = progress_logger
   211     64.5 MiB      0.0 MiB               self.cache_size = cache_size
   212     64.5 MiB      0.0 MiB               _parameters = self.__dict__.copy()
   213
   214                                     # Non-Root
   215     64.5 MiB      0.0 MiB           if _shared_parameters:
   216                                         self.is_root = False
   217                                         self._shared_parameters = _shared_parameters
   218                                         self.__dict__.update(_shared_parameters)
   219                                         # We are in some pass other than root
   220                                         progress_timer = None
   221                                     # Root
   222                                     else:
   223     64.5 MiB      0.0 MiB               self.is_root = True
   224                                         # Caching the DeepDiff results for dynamic programming
   225     64.5 MiB      0.0 MiB               self._distance_cache = LFUCache(cache_size) if cache_size else DummyLFU()
   226                                         self._stats = {
   227     64.5 MiB      0.0 MiB                   PASSES_COUNT: 0,
   228     64.5 MiB      0.0 MiB                   DIFF_COUNT: 0,
   229     64.5 MiB      0.0 MiB                   DISTANCE_CACHE_HIT_COUNT: 0,
   230     64.5 MiB      0.0 MiB                   PREVIOUS_DIFF_COUNT: 0,
   231     64.5 MiB      0.0 MiB                   PREVIOUS_DISTANCE_CACHE_HIT_COUNT: 0,
   232     64.5 MiB      0.0 MiB                   MAX_PASS_LIMIT_REACHED: False,
   233     64.5 MiB      0.0 MiB                   MAX_DIFF_LIMIT_REACHED: False,
   234     64.5 MiB      0.0 MiB                   DISTANCE_CACHE_ENABLED: bool(cache_size),
   235                                         }
   236     64.5 MiB      0.0 MiB               self.hashes = dict_() if hashes is None else hashes
   237     64.5 MiB      0.0 MiB               self._numpy_paths = dict_()  # if _numpy_paths is None else _numpy_paths
   238                                         self._shared_parameters = {
   239     64.5 MiB      0.0 MiB                   'hashes': self.hashes,
   240     64.5 MiB      0.0 MiB                   '_stats': self._stats,
   241     64.5 MiB      0.0 MiB                   '_distance_cache': self._distance_cache,
   242     64.5 MiB      0.0 MiB                   '_numpy_paths': self._numpy_paths,
   243     64.5 MiB      0.0 MiB                   _ENABLE_CACHE_EVERY_X_DIFF: self.cache_tuning_sample_size * 10,
   244                                         }
   245     64.5 MiB      0.0 MiB               if log_frequency_in_sec:
   246                                             # Creating a progress log reporter that runs in a separate thread every log_frequency_in_sec seconds.
   247                                             progress_timer = RepeatedTimer(log_frequency_in_sec, _report_progress, self._stats, progress_logger)
   248                                         else:
   249     64.5 MiB      0.0 MiB                   progress_timer = None
   250
   251     64.5 MiB      0.0 MiB           self._parameters = _parameters
   252     64.5 MiB      0.0 MiB           self.deephash_parameters = self.__get_deephash_params()
   253     64.5 MiB      0.0 MiB           self.tree = TreeResult()
   254     64.5 MiB      0.0 MiB           self.t1 = t1
   255     64.5 MiB      0.0 MiB           self.t2 = t2
   256
   257     64.5 MiB      0.0 MiB           try:
   258     64.5 MiB      0.0 MiB               root = DiffLevel(t1, t2, verbose_level=self.verbose_level)
   259                                         # _original_type is only used to pass the original type of the data. Currently only used for numpy arrays.
   260                                         # The reason is that we convert the numpy array to python list and then later for distance calculations
   261                                         # we convert only the the last dimension of it into numpy arrays.
   262    831.4 MiB    767.0 MiB               self.__diff(root, parents_ids=frozenset({id(t1)}), _original_type=_original_type)
   263
   264    831.4 MiB      0.0 MiB               if get_deep_distance and view in {TEXT_VIEW, TREE_VIEW}:
   265                                             self.tree['deep_distance'] = self._get_rough_distance()
   266
   267    831.4 MiB      0.0 MiB               self.tree.remove_empty_keys()
   268    883.2 MiB     51.8 MiB               view_results = self._get_view_results(self.view)
   269    883.2 MiB      0.0 MiB               self.update(view_results)
   270                                     finally:
   271    883.2 MiB      0.0 MiB               if self.is_root:
   272    883.2 MiB      0.0 MiB                   if cache_purge_level:
   273    883.2 MiB      0.0 MiB                       del self._distance_cache
   274    883.2 MiB      0.0 MiB                       del self.hashes
   275    883.2 MiB      0.0 MiB                   del self._shared_parameters
   276    883.2 MiB      0.0 MiB                   del self._parameters
   277    883.2 MiB      0.0 MiB                   for key in (PREVIOUS_DIFF_COUNT, PREVIOUS_DISTANCE_CACHE_HIT_COUNT,
   278    883.2 MiB      0.0 MiB                               DISTANCE_CACHE_ENABLED):
   279    883.2 MiB      0.0 MiB                       del self._stats[key]
   280    883.2 MiB      0.0 MiB                   if progress_timer:
   281                                                 duration = progress_timer.stop()
   282                                                 self._stats['DURATION SEC'] = duration
   283                                                 logger.info('stats {}'.format(self.get_stats()))
   284    883.2 MiB      0.0 MiB                   if cache_purge_level == 2:
   285                                                 self.__dict__.clear()



When I pre-flatten the data structure, I save a considerable amount of memory. Here is the code I used to flatten:

from memory_profiler import profile
from deepdiff import DeepDiff  # For Deep Difference of 2 objects
from scipy.io import loadmat
import os
import wrapt
import numpy as np

def deepflatten(instance, depth=None, types=None, ignore=None):
    """Flatten a nested mat_struct-like object into a flat dictionary.

    Leaf values (str, float, int, numpy arrays) are stored under dotted
    keys built from the attribute path, e.g. ``{'.a.b': value}``. Keys
    always start with a leading ``'.'`` because the root has no name.

    Args:
        instance: The object to flatten (e.g. a scipy mat_struct).
        depth: Unused; kept for interface compatibility.
        types: Unused; kept for interface compatibility.
        ignore: Unused; kept for interface compatibility.

    Returns:
        dict: Mapping of dotted attribute paths to leaf values.
    """
    # isinstance (rather than ``type(x) in [...]``) also catches subclasses
    # such as bool and numpy scalar types (np.float64 is a float subclass);
    # with an exact type check those would fall through to the attribute
    # walk and recurse through dir() of a scalar.
    leaf_types = (str, float, int, np.ndarray)
    flat_d = {}

    def flatten(obj, key=''):
        if isinstance(obj, leaf_types):
            flat_d[key] = obj
        else:
            # Walk public attributes only; private/dunder names are skipped.
            for attr in dir(obj):
                if not attr.startswith('_'):
                    flatten(getattr(obj, attr), key + '.' + attr)

    flatten(instance)
    return flat_d

    
    



if __name__ == '__main__':
    # Load two .mat files from the current directory and pick one matching
    # sub-structure from each so the diff (and its profiling) is focused.
    matlist = [f for f in os.listdir() if f.endswith('.mat')]
    mat1 = loadmat(matlist[0], squeeze_me=True, struct_as_record=False)
    mat2 = loadmat(matlist[1], squeeze_me=True, struct_as_record=False)
    # NOTE(review): assumes both files share this key and have >= 10 keys;
    # the -10 offset is specific to the author's data — confirm before reuse.
    key = list(mat1.keys())[-10]
    mat1 = mat1[key]
    mat2 = mat2[key]
    # monkey patch and add the profile decorator
    #wrapt.wrap_function_wrapper(DeepDiff, '__init__', profile) #threw an error regarding hashing the backend for memory_profile
    # Diff the pre-flattened structures; profiling here relies on diff.py
    # itself being decorated with @profile (see the output below).
    DeepDiff(deepflatten(mat1),deepflatten(mat2))

and the memory consumption with a flat data structure,

(base) >python example.py
Filename: Continuum\anaconda3\lib\site-packages\deepdiff\diff.py

Line #    Mem usage    Increment   Line Contents
================================================
   106     64.8 MiB     64.8 MiB       @profile
   107                                 def __init__(self,
   108                                              t1,
   109                                              t2,
   110                                              cutoff_distance_for_pairs=CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT,
   111                                              cutoff_intersection_for_pairs=CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT,
   112                                              cache_size=0,
   113                                              cache_tuning_sample_size=0,
   114                                              cache_purge_level=1,
   115                                              exclude_paths=None,
   116                                              exclude_regex_paths=None,
   117                                              exclude_types=None,
   118                                              exclude_obj_callback=None,
   119                                              get_deep_distance=False,
   120                                              hasher=None,
   121                                              hashes=None,
   122                                              ignore_order=False,
   123                                              ignore_type_in_groups=None,
   124                                              ignore_string_type_changes=False,
   125                                              ignore_numeric_type_changes=False,
   126                                              ignore_type_subclasses=False,
   127                                              ignore_string_case=False,
   128                                              ignore_nan_inequality=False,
   129                                              ignore_private_variables=True,
   130                                              log_frequency_in_sec=0,
   131                                              max_passes=10000000,
   132                                              max_diffs=None,
   133                                              number_format_notation="f",
   134                                              number_to_string_func=None,
   135                                              progress_logger=logger.info,
   136                                              report_repetition=False,
   137                                              significant_digits=None,
   138                                              truncate_datetime=None,
   139                                              verbose_level=1,
   140                                              view=TEXT_VIEW,
   141                                              _original_type=None,
   142                                              _parameters=None,
   143                                              _shared_parameters=None,
   144                                              **kwargs):
   145     64.8 MiB      0.0 MiB           super().__init__()
   146     64.8 MiB      0.0 MiB           if kwargs:
   147                                         raise ValueError((
   148                                             "The following parameter(s) are not valid: %s\n"
   149                                             "The valid parameters are ignore_order, report_repetition, significant_digits, "
   150                                             "number_format_notation, exclude_paths, exclude_types, exclude_regex_paths, ignore_type_in_groups, "
   151                                             "ignore_string_type_changes, ignore_numeric_type_changes, ignore_type_subclasses, truncate_datetime, "
   152                                             "ignore_private_variables, ignore_nan_inequality, number_to_string_func, verbose_level, "
   153                                             "view, hasher, hashes, max_passes, max_diffs, "
   154                                             "cutoff_distance_for_pairs, cutoff_intersection_for_pairs, log_frequency_in_sec, cache_size, "
   155                                             "cache_tuning_sample_size, get_deep_distance, cache_purge_level, "
   156                                             "_original_type, _parameters and _shared_parameters.") % ', '.join(kwargs.keys()))
   157
   158     64.8 MiB      0.0 MiB           if _parameters:
   159                                         self.__dict__.update(_parameters)
   160                                     else:
   161     64.8 MiB      0.0 MiB               self.ignore_order = ignore_order
   162     64.8 MiB      0.0 MiB               ignore_type_in_groups = ignore_type_in_groups or []
   163     64.8 MiB      0.0 MiB               if numbers == ignore_type_in_groups or numbers in ignore_type_in_groups:
   164                                             ignore_numeric_type_changes = True
   165     64.8 MiB      0.0 MiB               self.ignore_numeric_type_changes = ignore_numeric_type_changes
   166     64.8 MiB      0.0 MiB               if strings == ignore_type_in_groups or strings in ignore_type_in_groups:
   167                                             ignore_string_type_changes = True
   168     64.8 MiB      0.0 MiB               self.ignore_string_type_changes = ignore_string_type_changes
   169     64.8 MiB      0.0 MiB               self.ignore_type_in_groups = self.get_ignore_types_in_groups(
   170     64.8 MiB      0.0 MiB                   ignore_type_in_groups=ignore_type_in_groups,
   171     64.8 MiB      0.0 MiB                   ignore_string_type_changes=ignore_string_type_changes,
   172     64.8 MiB      0.0 MiB                   ignore_numeric_type_changes=ignore_numeric_type_changes,
   173     64.8 MiB      0.0 MiB                   ignore_type_subclasses=ignore_type_subclasses)
   174     64.8 MiB      0.0 MiB               self.report_repetition = report_repetition
   175     64.8 MiB      0.0 MiB               self.exclude_paths = convert_item_or_items_into_set_else_none(exclude_paths)
   176     64.8 MiB      0.0 MiB               self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths)
   177     64.8 MiB      0.0 MiB               self.exclude_types = set(exclude_types) if exclude_types else None
   178     64.8 MiB      0.0 MiB               self.exclude_types_tuple = tuple(exclude_types) if exclude_types else None  # we need tuple for checking isinstance
   179     64.8 MiB      0.0 MiB               self.ignore_type_subclasses = ignore_type_subclasses
   180     64.8 MiB      0.0 MiB               self.type_check_func = type_is_subclass_of_type_group if ignore_type_subclasses else type_in_type_group
   181     64.8 MiB      0.0 MiB               self.ignore_string_case = ignore_string_case
   182     64.8 MiB      0.0 MiB               self.exclude_obj_callback = exclude_obj_callback
   183     64.8 MiB      0.0 MiB               self.number_to_string = number_to_string_func or number_to_string
   184     64.8 MiB      0.0 MiB               self.ignore_private_variables = ignore_private_variables
   185     64.8 MiB      0.0 MiB               self.ignore_nan_inequality = ignore_nan_inequality
   186     64.8 MiB      0.0 MiB               self.hasher = hasher
   187     64.8 MiB      0.0 MiB               self.cache_tuning_sample_size = cache_tuning_sample_size
   188
   189     64.8 MiB      0.0 MiB               self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes)
   190     64.8 MiB      0.0 MiB               self.truncate_datetime = get_truncate_datetime(truncate_datetime)
   191     64.8 MiB      0.0 MiB               self.number_format_notation = number_format_notation
   192     64.8 MiB      0.0 MiB               if verbose_level in {0, 1, 2}:
   193     64.8 MiB      0.0 MiB                   self.verbose_level = verbose_level
   194                                         else:
   195                                             raise ValueError(VERBOSE_LEVEL_RANGE_MSG)
   196     64.8 MiB      0.0 MiB               if cache_purge_level not in {0, 1, 2}:
   197                                             raise ValueError(PURGE_LEVEL_RANGE_MSG)
   198     64.8 MiB      0.0 MiB               self.view = view
   199                                         # Setting up the cache for dynamic programming. One dictionary per instance of root of DeepDiff running.
   200     64.8 MiB      0.0 MiB               self.max_passes = max_passes
   201     64.8 MiB      0.0 MiB               self.max_diffs = max_diffs
   202     64.8 MiB      0.0 MiB               self.cutoff_distance_for_pairs = float(cutoff_distance_for_pairs)
   203     64.8 MiB      0.0 MiB               self.cutoff_intersection_for_pairs = float(cutoff_intersection_for_pairs)
   204     64.8 MiB      0.0 MiB               if self.cutoff_distance_for_pairs < 0 or self.cutoff_distance_for_pairs > 1:
   205                                             raise ValueError(CUTOFF_RANGE_ERROR_MSG)
   206                                         # _Parameters are the clean _parameters to initialize DeepDiff with so we avoid all the above
   207                                         # cleaning functionalities when running DeepDiff recursively.
   208                                         # However DeepHash has its own set of _parameters that are slightly different than DeepDIff.
   209                                         # DeepDiff _parameters are transformed to DeepHash _parameters via __get_deephash_params method.
   210     64.8 MiB      0.0 MiB               self.progress_logger = progress_logger
   211     64.8 MiB      0.0 MiB               self.cache_size = cache_size
   212     64.8 MiB      0.0 MiB               _parameters = self.__dict__.copy()
   213
   214                                     # Non-Root
   215     64.8 MiB      0.0 MiB           if _shared_parameters:
   216                                         self.is_root = False
   217                                         self._shared_parameters = _shared_parameters
   218                                         self.__dict__.update(_shared_parameters)
   219                                         # We are in some pass other than root
   220                                         progress_timer = None
   221                                     # Root
   222                                     else:
   223     64.8 MiB      0.0 MiB               self.is_root = True
   224                                         # Caching the DeepDiff results for dynamic programming
   225     64.8 MiB      0.0 MiB               self._distance_cache = LFUCache(cache_size) if cache_size else DummyLFU()
   226                                         self._stats = {
   227     64.8 MiB      0.0 MiB                   PASSES_COUNT: 0,
   228     64.8 MiB      0.0 MiB                   DIFF_COUNT: 0,
   229     64.8 MiB      0.0 MiB                   DISTANCE_CACHE_HIT_COUNT: 0,
   230     64.8 MiB      0.0 MiB                   PREVIOUS_DIFF_COUNT: 0,
   231     64.8 MiB      0.0 MiB                   PREVIOUS_DISTANCE_CACHE_HIT_COUNT: 0,
   232     64.8 MiB      0.0 MiB                   MAX_PASS_LIMIT_REACHED: False,
   233     64.8 MiB      0.0 MiB                   MAX_DIFF_LIMIT_REACHED: False,
   234     64.8 MiB      0.0 MiB                   DISTANCE_CACHE_ENABLED: bool(cache_size),
   235                                         }
   236     64.8 MiB      0.0 MiB               self.hashes = dict_() if hashes is None else hashes
   237     64.8 MiB      0.0 MiB               self._numpy_paths = dict_()  # if _numpy_paths is None else _numpy_paths
   238                                         self._shared_parameters = {
   239     64.8 MiB      0.0 MiB                   'hashes': self.hashes,
   240     64.8 MiB      0.0 MiB                   '_stats': self._stats,
   241     64.8 MiB      0.0 MiB                   '_distance_cache': self._distance_cache,
   242     64.8 MiB      0.0 MiB                   '_numpy_paths': self._numpy_paths,
   243     64.8 MiB      0.0 MiB                   _ENABLE_CACHE_EVERY_X_DIFF: self.cache_tuning_sample_size * 10,
   244                                         }
   245     64.8 MiB      0.0 MiB               if log_frequency_in_sec:
   246                                             # Creating a progress log reporter that runs in a separate thread every log_frequency_in_sec seconds.
   247                                             progress_timer = RepeatedTimer(log_frequency_in_sec, _report_progress, self._stats, progress_logger)
   248                                         else:
   249     64.8 MiB      0.0 MiB                   progress_timer = None
   250
   251     64.8 MiB      0.0 MiB           self._parameters = _parameters
   252     64.8 MiB      0.0 MiB           self.deephash_parameters = self.__get_deephash_params()
   253     64.8 MiB      0.0 MiB           self.tree = TreeResult()
   254     64.8 MiB      0.0 MiB           self.t1 = t1
   255     64.8 MiB      0.0 MiB           self.t2 = t2
   256
   257     64.8 MiB      0.0 MiB           try:
   258     64.8 MiB      0.0 MiB               root = DiffLevel(t1, t2, verbose_level=self.verbose_level)
   259                                         # _original_type is only used to pass the original type of the data. Currently only used for numpy arrays.
   260                                         # The reason is that we convert the numpy array to python list and then later for distance calculations
   261                                         # we convert only the the last dimension of it into numpy arrays.
   262    572.2 MiB    507.4 MiB               self.__diff(root, parents_ids=frozenset({id(t1)}), _original_type=_original_type)
   263
   264    572.2 MiB      0.0 MiB               if get_deep_distance and view in {TEXT_VIEW, TREE_VIEW}:
   265                                             self.tree['deep_distance'] = self._get_rough_distance()
   266
   267    572.2 MiB      0.0 MiB               self.tree.remove_empty_keys()
   268    625.0 MiB     52.8 MiB               view_results = self._get_view_results(self.view)
   269    625.0 MiB      0.0 MiB               self.update(view_results)
   270                                     finally:
   271    625.0 MiB      0.0 MiB               if self.is_root:
   272    625.0 MiB      0.0 MiB                   if cache_purge_level:
   273    625.0 MiB      0.0 MiB                       del self._distance_cache
   274    625.0 MiB      0.0 MiB                       del self.hashes
   275    625.0 MiB      0.0 MiB                   del self._shared_parameters
   276    625.0 MiB      0.0 MiB                   del self._parameters
   277    625.1 MiB      0.1 MiB                   for key in (PREVIOUS_DIFF_COUNT, PREVIOUS_DISTANCE_CACHE_HIT_COUNT,
   278    625.1 MiB      0.0 MiB                               DISTANCE_CACHE_ENABLED):
   279    625.1 MiB      0.0 MiB                       del self._stats[key]
   280    625.1 MiB      0.0 MiB                   if progress_timer:
   281                                                 duration = progress_timer.stop()
   282                                                 self._stats['DURATION SEC'] = duration
   283                                                 logger.info('stats {}'.format(self.get_stats()))
   284    625.1 MiB      0.0 MiB                   if cache_purge_level == 2:
   285                                                 self.__dict__.clear()


I am not sure whether the memory saving is due to a bug, or simply because the flattened data structure is smaller to keep in memory.

Are there any additional inputs I should use to experiment with the results? Are there sub-methods I should decorate with `@profile`?

David-Herman avatar Jun 24 '20 19:06 David-Herman