High memory usage performing diffs of scipy.io.matlab.mio5_params.mat_struct
Creating a new issue from https://github.com/seperman/deepdiff/issues/194#issuecomment-631799918. I am using the current dev branch as of today, other than adding a wrapper on the __init__ method. I did try to add the decorator via monkey patching, but that ended up throwing an error.
from memory_profiler import profile
from deepdiff import DeepDiff  # For Deep Difference of 2 objects
from scipy.io import loadmat
import os
import wrapt

if __name__ == '__main__':
    # get the data structures and sub-select to filter the profiling
    matlist = [f for f in os.listdir() if f.endswith('.mat')]
    mat1 = loadmat(matlist[0], squeeze_me=True, struct_as_record=False)
    mat2 = loadmat(matlist[1], squeeze_me=True, struct_as_record=False)
    key = list(mat1.keys())[-10]
    mat1 = mat1[key]
    mat2 = mat2[key]
    # monkey patch and add the profile decorator
    # wrapt.wrap_function_wrapper(DeepDiff, '__init__', profile)  # threw an error regarding hashing the backend for memory_profiler
    # now call the profile-decorated DeepDiff
    DeepDiff(mat1, mat2)
(base) >python example.py
Filename: \lib\site-packages\deepdiff\diff.py
Line # Mem usage Increment Line Contents
================================================
80 64.5 MiB 64.5 MiB @profile
81 def __init__(self,
82 t1,
83 t2,
84 ignore_order=False,
85 report_repetition=False,
86 significant_digits=None,
87 number_format_notation="f",
88 exclude_paths=None,
89 exclude_regex_paths=None,
90 exclude_types=None,
91 ignore_type_in_groups=None,
92 ignore_string_type_changes=False,
93 ignore_numeric_type_changes=False,
94 ignore_type_subclasses=False,
95 ignore_string_case=False,
96 exclude_obj_callback=None,
97 number_to_string_func=None,
98 ignore_nan_inequality=False,
99 ignore_private_variables=True,
100 verbose_level=1,
101 view=TEXT_VIEW,
102 hasher=None,
103 hashes=None,
104 parameters=None,
105 shared_parameters=None,
106 max_passes=10000000,
107 max_distances_to_keep_track_per_item=10000,
108 max_diffs=None,
109 cutoff_distance_for_pairs=CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT,
110 log_frequency_in_sec=0,
111 progress_logger=logger.info,
112 cache_size=5000,
113 _stats=None,
114 _cache=None,
115 _numpy_paths=None,
116 _original_type=None,
117 **kwargs):
118 64.5 MiB 0.0 MiB if kwargs:
119 raise ValueError((
120 "The following parameter(s) are not valid: %s\n"
121 "The valid parameters are ignore_order, report_repetition, significant_digits, "
122 "number_format_notation, exclude_paths, exclude_types, exclude_regex_paths, ignore_type_in_groups, "
123 "ignore_string_type_changes, ignore_numeric_type_changes, ignore_type_subclasses, "
124 "ignore_private_variables, ignore_nan_inequality, number_to_string_func, verbose_level, "
125 "view, hasher, hashes, max_passes, max_distances_to_keep_track_per_item, max_diffs, "
126 "cutoff_distance_for_pairs, log_frequency_in_sec, cache_size, _stats, _numpy_paths, _original_type "
127 "parameters and shared_parameters.") % ', '.join(kwargs.keys()))
128
129 64.5 MiB 0.0 MiB if parameters:
130 self.__dict__ = deepcopy(parameters)
131 else:
132 64.5 MiB 0.0 MiB self.ignore_order = ignore_order
133 64.5 MiB 0.0 MiB ignore_type_in_groups = ignore_type_in_groups or []
134 64.5 MiB 0.0 MiB if numbers == ignore_type_in_groups or numbers in ignore_type_in_groups:
135 ignore_numeric_type_changes = True
136 64.5 MiB 0.0 MiB self.ignore_numeric_type_changes = ignore_numeric_type_changes
137 64.5 MiB 0.0 MiB if strings == ignore_type_in_groups or strings in ignore_type_in_groups:
138 ignore_string_type_changes = True
139 64.5 MiB 0.0 MiB self.ignore_string_type_changes = ignore_string_type_changes
140 64.5 MiB 0.0 MiB self.ignore_type_in_groups = self.get_ignore_types_in_groups(
141 64.5 MiB 0.0 MiB ignore_type_in_groups=ignore_type_in_groups,
142 64.5 MiB 0.0 MiB ignore_string_type_changes=ignore_string_type_changes,
143 64.5 MiB 0.0 MiB ignore_numeric_type_changes=ignore_numeric_type_changes,
144 64.5 MiB 0.0 MiB ignore_type_subclasses=ignore_type_subclasses)
145 64.5 MiB 0.0 MiB self.report_repetition = report_repetition
146 64.5 MiB 0.0 MiB self.exclude_paths = convert_item_or_items_into_set_else_none(exclude_paths)
147 64.5 MiB 0.0 MiB self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths)
148 64.5 MiB 0.0 MiB self.exclude_types = set(exclude_types) if exclude_types else None
149 64.5 MiB 0.0 MiB self.exclude_types_tuple = tuple(exclude_types) if exclude_types else None # we need tuple for checking isinstance
150 64.5 MiB 0.0 MiB self.ignore_type_subclasses = ignore_type_subclasses
151 64.5 MiB 0.0 MiB self.type_check_func = type_is_subclass_of_type_group if ignore_type_subclasses else type_in_type_group
152 64.5 MiB 0.0 MiB self.ignore_string_case = ignore_string_case
153 64.5 MiB 0.0 MiB self.exclude_obj_callback = exclude_obj_callback
154 64.5 MiB 0.0 MiB self.number_to_string = number_to_string_func or number_to_string
155 64.5 MiB 0.0 MiB self.ignore_private_variables = ignore_private_variables
156 64.5 MiB 0.0 MiB self.ignore_nan_inequality = ignore_nan_inequality
157 64.5 MiB 0.0 MiB self.hasher = hasher
158
159 64.5 MiB 0.0 MiB self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes)
160 64.5 MiB 0.0 MiB self.number_format_notation = number_format_notation
161 64.5 MiB 0.0 MiB self.verbose_level = verbose_level
162 64.5 MiB 0.0 MiB self.view = view
163 # Setting up the cache for dynamic programming. One dictionary per instance of root of DeepDiff running.
164 64.5 MiB 0.0 MiB self.max_passes = max_passes
165 64.5 MiB 0.0 MiB self.max_diffs = max_diffs
166 # Maximum number of calculated distances between pairs to be tracked.
167 # For huge lists, this number may need to be modified at the cost of more memory usage.
168 # Only used when ignore_order = True.
169 64.5 MiB 0.0 MiB self.max_distances_to_keep_track_per_item = max_distances_to_keep_track_per_item
170 64.5 MiB 0.0 MiB self.cutoff_distance_for_pairs = float(cutoff_distance_for_pairs)
171 # _deep_distance_buckets_exponent is only used for the number of buckets of distances when doing ignore_order=True calculations
172 # The actual number of buckets will be 10 to the power of the _deep_distance_buckets_exponent
173 64.5 MiB 0.0 MiB self._deep_distance_buckets_exponent = len(str(self.max_passes)) + 3 # Adding some padding to it.
174 # Parameters are the clean parameters to initialize DeepDiff with so we avoid all the above
175 # cleaning functionalities when running DeepDiff recursively.
176 # However DeepHash has its own set of parameters that are slightly different than DeepDIff.
177 # DeepDiff parameters are transformed to DeepHash parameters via __get_deephash_params method.
178 64.5 MiB 0.0 MiB parameters = self.__dict__.copy()
179
180 64.5 MiB 0.0 MiB _purge_cache = True
181
182 64.5 MiB 0.0 MiB if shared_parameters:
183 self.is_root = False
184 self.shared_parameters = shared_parameters
185 self.__dict__.update(shared_parameters)
186 # We are in some pass other than root
187 repeated_timer = None
188 else:
189 # we are at the root
190 64.5 MiB 0.0 MiB self.is_root = True
191 # keep the cache. Only used for debugging what was in the cache.
192 64.5 MiB 0.0 MiB if _cache == 'keep':
193 _purge_cache = False
194 # Caching the DeepDiff results for dynamic programming
195 64.5 MiB 0.0 MiB self._cache = LFUCache(cache_size) if cache_size else DummyLFU()
196 self._stats = {
197 64.5 MiB 0.0 MiB PASSES_COUNT: 0,
198 64.5 MiB 0.0 MiB DIFF_COUNT: 0,
199 64.5 MiB 0.0 MiB LEVEL_CACHE_HIT_COUNT: 0,
200 64.5 MiB 0.0 MiB DISTANCE_CACHE_HIT_COUNT: 0,
201 64.5 MiB 0.0 MiB MAX_PASS_LIMIT_REACHED: False,
202 64.5 MiB 0.0 MiB MAX_DIFF_LIMIT_REACHED: False,
203 }
204 64.5 MiB 0.0 MiB self.hashes = {} if hashes is None else hashes
205 64.5 MiB 0.0 MiB self._numpy_paths = {} if _numpy_paths is None else _numpy_paths
206 self.shared_parameters = {
207 64.5 MiB 0.0 MiB 'hashes': self.hashes,
208 64.5 MiB 0.0 MiB '_stats': self._stats,
209 64.5 MiB 0.0 MiB '_cache': self._cache,
210 64.5 MiB 0.0 MiB '_numpy_paths': self._numpy_paths
211 }
212 64.5 MiB 0.0 MiB if log_frequency_in_sec:
213 # Creating a progress log reporter that runs in a separate thread every log_frequency_in_sec seconds.
214 repeated_timer = RepeatedTimer(log_frequency_in_sec, _report_progress, self._stats, progress_logger)
215 logging.basicConfig(level=logging.INFO)
216 else:
217 64.5 MiB 0.0 MiB repeated_timer = None
218
219 64.5 MiB 0.0 MiB self.parameters = parameters
220 64.5 MiB 0.0 MiB self.deephash_parameters = self.__get_deephash_params()
221 64.5 MiB 0.0 MiB self.tree = TreeResult()
222 64.5 MiB 0.0 MiB self.t1 = t1
223 64.5 MiB 0.0 MiB self.t2 = t2
224
225 64.5 MiB 0.0 MiB try:
226 64.5 MiB 0.0 MiB root = DiffLevel(t1, t2, verbose_level=self.verbose_level)
227 # _original_type is only used to pass the original type of the data. Currently only used for numpy arrays.
228 # The reason is that we convert the numpy array to python list and then later for distance calculations
229 # we convert only the the last dimension of it into numpy arrays.
230 840.6 MiB 776.1 MiB self.__diff(root, parents_ids=frozenset({id(t1)}), _original_type=_original_type)
231
232 840.6 MiB 0.0 MiB self.tree.remove_empty_keys()
233 882.0 MiB 41.5 MiB view_results = self._get_view_results(self.view)
234 882.0 MiB 0.0 MiB self.update(view_results)
235 finally:
236 882.0 MiB 0.0 MiB if self.is_root:
237 882.0 MiB 0.0 MiB if _purge_cache:
238 882.0 MiB 0.0 MiB del self._cache
239 882.0 MiB 0.0 MiB del self.shared_parameters
240 882.0 MiB 0.0 MiB del self.parameters
241 882.0 MiB 0.0 MiB if repeated_timer:
242 duration = repeated_timer.stop()
243 self._stats['DURATION SEC'] = duration
244 logger.info('stats {}'.format(self.get_stats()))
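(Aside: since the wrapt wrapper failed above, rebinding the method directly is a possible alternative. A minimal, untested sketch, not something from this thread:)

from memory_profiler import profile
from deepdiff import DeepDiff

# Hypothetical alternative to wrapt.wrap_function_wrapper: rebind the
# unbound method so memory_profiler wraps a plain function object.
# Whether this avoids the hashing error seen with wrapt is untested.
DeepDiff.__init__ = profile(DeepDiff.__init__)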
Which other class methods should I add the decorator to, besides __init__?
Interesting. I should run a memory profiler too. Yes, we should add the decorator to a bunch of other places, but it has been a while since I last ran the memory profiler, so I don't remember whether you can add the decorator to more than one place. What I'm curious about is what percentage of the memory usage is in the DeepDiff object's self.hashes vs. self.tree. There is a ton of overhead that can be freed once the DeepDiff object is created, if the user does not need the tree view and/or the deep_distance feature. But that doesn't mean it will use less memory during DeepDiff object creation.
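To put numbers on the hashes-vs-tree question, one rough approach (my sketch, not from this thread) is to keep the caches alive with DeepDiff 5's cache_purge_level=0 and size the two attributes with pympler; note that asizeof may undercount numpy buffers:

from pympler import asizeof  # third party: pip install pympler
from deepdiff import DeepDiff

# mat1/mat2 as loaded in the loadmat snippet above
diff = DeepDiff(mat1, mat2, view='tree', cache_purge_level=0)  # keep self.hashes alive
print('self.hashes: %.1f MiB' % (asizeof.asizeof(diff.hashes) / 2**20))
print('self.tree:   %.1f MiB' % (asizeof.asizeof(diff.tree) / 2**20))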
To give further background: the data structure in the trace above is a highly nested scipy.io.matlab.mio5_params.mat_struct composed of roughly 250 data objects, including strings, ints, floats, and primarily numpy arrays. The structure typically does not change, but it may (data streams of different compositions). Is deepdiff the right tool for processing this efficiently, with the goal of using the diff as a compression utility (most data objects are the same) or for understanding structural differences between the class instances?
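For the compression angle specifically, DeepDiff 5 ships a Delta object that serializes a diff and re-applies it to rebuild t2 from t1. A minimal sketch on plain dicts (whether it round-trips arbitrary mat_struct objects is untested here):

from deepdiff import DeepDiff, Delta

t1 = {'a': 1, 'b': [1, 2, 3]}
t2 = {'a': 1, 'b': [1, 2, 4]}

delta = Delta(DeepDiff(t1, t2))  # store/transmit this instead of all of t2
assert t1 + delta == t2          # t2 reconstructed from t1 plus the delta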
Hi @David-Herman, please pip install the latest DeepDiff that I just released and let me know how it goes. There have been many performance improvements. Please also check out https://zepworks.com/deepdiff/5.0.0/optimizations.html
This is the pip install at the latest commit (I used the hash to be explicit about what I was installing):
>pip install git+git://github.com/seperman/deepdiff.git@4662faea2cf4982e6f1974fd5abf18216ebc7aaa
and confirmed via .__version__:
>>> import deepdiff
>>> deepdiff.__version__
'5.0.0'
(base) >python example.py
Filename: \Continuum\anaconda3\lib\site-packages\deepdiff\diff.py
Line # Mem usage Increment Line Contents
================================================
106 64.5 MiB 64.5 MiB @profile
107 def __init__(self,
108 t1,
109 t2,
110 cutoff_distance_for_pairs=CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT,
111 cutoff_intersection_for_pairs=CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT,
112 cache_size=0,
113 cache_tuning_sample_size=0,
114 cache_purge_level=1,
115 exclude_paths=None,
116 exclude_regex_paths=None,
117 exclude_types=None,
118 exclude_obj_callback=None,
119 get_deep_distance=False,
120 hasher=None,
121 hashes=None,
122 ignore_order=False,
123 ignore_type_in_groups=None,
124 ignore_string_type_changes=False,
125 ignore_numeric_type_changes=False,
126 ignore_type_subclasses=False,
127 ignore_string_case=False,
128 ignore_nan_inequality=False,
129 ignore_private_variables=True,
130 log_frequency_in_sec=0,
131 max_passes=10000000,
132 max_diffs=None,
133 number_format_notation="f",
134 number_to_string_func=None,
135 progress_logger=logger.info,
136 report_repetition=False,
137 significant_digits=None,
138 truncate_datetime=None,
139 verbose_level=1,
140 view=TEXT_VIEW,
141 _original_type=None,
142 _parameters=None,
143 _shared_parameters=None,
144 **kwargs):
145 64.5 MiB 0.0 MiB super().__init__()
146 64.5 MiB 0.0 MiB if kwargs:
147 raise ValueError((
148 "The following parameter(s) are not valid: %s\n"
149 "The valid parameters are ignore_order, report_repetition, significant_digits, "
150 "number_format_notation, exclude_paths, exclude_types, exclude_regex_paths, ignore_type_in_groups, "
151 "ignore_string_type_changes, ignore_numeric_type_changes, ignore_type_subclasses, truncate_datetime, "
152 "ignore_private_variables, ignore_nan_inequality, number_to_string_func, verbose_level, "
153 "view, hasher, hashes, max_passes, max_diffs, "
154 "cutoff_distance_for_pairs, cutoff_intersection_for_pairs, log_frequency_in_sec, cache_size, "
155 "cache_tuning_sample_size, get_deep_distance, cache_purge_level, "
156 "_original_type, _parameters and _shared_parameters.") % ', '.join(kwargs.keys()))
157
158 64.5 MiB 0.0 MiB if _parameters:
159 self.__dict__.update(_parameters)
160 else:
161 64.5 MiB 0.0 MiB self.ignore_order = ignore_order
162 64.5 MiB 0.0 MiB ignore_type_in_groups = ignore_type_in_groups or []
163 64.5 MiB 0.0 MiB if numbers == ignore_type_in_groups or numbers in ignore_type_in_groups:
164 ignore_numeric_type_changes = True
165 64.5 MiB 0.0 MiB self.ignore_numeric_type_changes = ignore_numeric_type_changes
166 64.5 MiB 0.0 MiB if strings == ignore_type_in_groups or strings in ignore_type_in_groups:
167 ignore_string_type_changes = True
168 64.5 MiB 0.0 MiB self.ignore_string_type_changes = ignore_string_type_changes
169 64.5 MiB 0.0 MiB self.ignore_type_in_groups = self.get_ignore_types_in_groups(
170 64.5 MiB 0.0 MiB ignore_type_in_groups=ignore_type_in_groups,
171 64.5 MiB 0.0 MiB ignore_string_type_changes=ignore_string_type_changes,
172 64.5 MiB 0.0 MiB ignore_numeric_type_changes=ignore_numeric_type_changes,
173 64.5 MiB 0.0 MiB ignore_type_subclasses=ignore_type_subclasses)
174 64.5 MiB 0.0 MiB self.report_repetition = report_repetition
175 64.5 MiB 0.0 MiB self.exclude_paths = convert_item_or_items_into_set_else_none(exclude_paths)
176 64.5 MiB 0.0 MiB self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths)
177 64.5 MiB 0.0 MiB self.exclude_types = set(exclude_types) if exclude_types else None
178 64.5 MiB 0.0 MiB self.exclude_types_tuple = tuple(exclude_types) if exclude_types else None # we need tuple for checking isinstance
179 64.5 MiB 0.0 MiB self.ignore_type_subclasses = ignore_type_subclasses
180 64.5 MiB 0.0 MiB self.type_check_func = type_is_subclass_of_type_group if ignore_type_subclasses else type_in_type_group
181 64.5 MiB 0.0 MiB self.ignore_string_case = ignore_string_case
182 64.5 MiB 0.0 MiB self.exclude_obj_callback = exclude_obj_callback
183 64.5 MiB 0.0 MiB self.number_to_string = number_to_string_func or number_to_string
184 64.5 MiB 0.0 MiB self.ignore_private_variables = ignore_private_variables
185 64.5 MiB 0.0 MiB self.ignore_nan_inequality = ignore_nan_inequality
186 64.5 MiB 0.0 MiB self.hasher = hasher
187 64.5 MiB 0.0 MiB self.cache_tuning_sample_size = cache_tuning_sample_size
188
189 64.5 MiB 0.0 MiB self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes)
190 64.5 MiB 0.0 MiB self.truncate_datetime = get_truncate_datetime(truncate_datetime)
191 64.5 MiB 0.0 MiB self.number_format_notation = number_format_notation
192 64.5 MiB 0.0 MiB if verbose_level in {0, 1, 2}:
193 64.5 MiB 0.0 MiB self.verbose_level = verbose_level
194 else:
195 raise ValueError(VERBOSE_LEVEL_RANGE_MSG)
196 64.5 MiB 0.0 MiB if cache_purge_level not in {0, 1, 2}:
197 raise ValueError(PURGE_LEVEL_RANGE_MSG)
198 64.5 MiB 0.0 MiB self.view = view
199 # Setting up the cache for dynamic programming. One dictionary per instance of root of DeepDiff running.
200 64.5 MiB 0.0 MiB self.max_passes = max_passes
201 64.5 MiB 0.0 MiB self.max_diffs = max_diffs
202 64.5 MiB 0.0 MiB self.cutoff_distance_for_pairs = float(cutoff_distance_for_pairs)
203 64.5 MiB 0.0 MiB self.cutoff_intersection_for_pairs = float(cutoff_intersection_for_pairs)
204 64.5 MiB 0.0 MiB if self.cutoff_distance_for_pairs < 0 or self.cutoff_distance_for_pairs > 1:
205 raise ValueError(CUTOFF_RANGE_ERROR_MSG)
206 # _Parameters are the clean _parameters to initialize DeepDiff with so we avoid all the above
207 # cleaning functionalities when running DeepDiff recursively.
208 # However DeepHash has its own set of _parameters that are slightly different than DeepDIff.
209 # DeepDiff _parameters are transformed to DeepHash _parameters via __get_deephash_params method.
210 64.5 MiB 0.0 MiB self.progress_logger = progress_logger
211 64.5 MiB 0.0 MiB self.cache_size = cache_size
212 64.5 MiB 0.0 MiB _parameters = self.__dict__.copy()
213
214 # Non-Root
215 64.5 MiB 0.0 MiB if _shared_parameters:
216 self.is_root = False
217 self._shared_parameters = _shared_parameters
218 self.__dict__.update(_shared_parameters)
219 # We are in some pass other than root
220 progress_timer = None
221 # Root
222 else:
223 64.5 MiB 0.0 MiB self.is_root = True
224 # Caching the DeepDiff results for dynamic programming
225 64.5 MiB 0.0 MiB self._distance_cache = LFUCache(cache_size) if cache_size else DummyLFU()
226 self._stats = {
227 64.5 MiB 0.0 MiB PASSES_COUNT: 0,
228 64.5 MiB 0.0 MiB DIFF_COUNT: 0,
229 64.5 MiB 0.0 MiB DISTANCE_CACHE_HIT_COUNT: 0,
230 64.5 MiB 0.0 MiB PREVIOUS_DIFF_COUNT: 0,
231 64.5 MiB 0.0 MiB PREVIOUS_DISTANCE_CACHE_HIT_COUNT: 0,
232 64.5 MiB 0.0 MiB MAX_PASS_LIMIT_REACHED: False,
233 64.5 MiB 0.0 MiB MAX_DIFF_LIMIT_REACHED: False,
234 64.5 MiB 0.0 MiB DISTANCE_CACHE_ENABLED: bool(cache_size),
235 }
236 64.5 MiB 0.0 MiB self.hashes = dict_() if hashes is None else hashes
237 64.5 MiB 0.0 MiB self._numpy_paths = dict_() # if _numpy_paths is None else _numpy_paths
238 self._shared_parameters = {
239 64.5 MiB 0.0 MiB 'hashes': self.hashes,
240 64.5 MiB 0.0 MiB '_stats': self._stats,
241 64.5 MiB 0.0 MiB '_distance_cache': self._distance_cache,
242 64.5 MiB 0.0 MiB '_numpy_paths': self._numpy_paths,
243 64.5 MiB 0.0 MiB _ENABLE_CACHE_EVERY_X_DIFF: self.cache_tuning_sample_size * 10,
244 }
245 64.5 MiB 0.0 MiB if log_frequency_in_sec:
246 # Creating a progress log reporter that runs in a separate thread every log_frequency_in_sec seconds.
247 progress_timer = RepeatedTimer(log_frequency_in_sec, _report_progress, self._stats, progress_logger)
248 else:
249 64.5 MiB 0.0 MiB progress_timer = None
250
251 64.5 MiB 0.0 MiB self._parameters = _parameters
252 64.5 MiB 0.0 MiB self.deephash_parameters = self.__get_deephash_params()
253 64.5 MiB 0.0 MiB self.tree = TreeResult()
254 64.5 MiB 0.0 MiB self.t1 = t1
255 64.5 MiB 0.0 MiB self.t2 = t2
256
257 64.5 MiB 0.0 MiB try:
258 64.5 MiB 0.0 MiB root = DiffLevel(t1, t2, verbose_level=self.verbose_level)
259 # _original_type is only used to pass the original type of the data. Currently only used for numpy arrays.
260 # The reason is that we convert the numpy array to python list and then later for distance calculations
261 # we convert only the the last dimension of it into numpy arrays.
262 831.4 MiB 767.0 MiB self.__diff(root, parents_ids=frozenset({id(t1)}), _original_type=_original_type)
263
264 831.4 MiB 0.0 MiB if get_deep_distance and view in {TEXT_VIEW, TREE_VIEW}:
265 self.tree['deep_distance'] = self._get_rough_distance()
266
267 831.4 MiB 0.0 MiB self.tree.remove_empty_keys()
268 883.2 MiB 51.8 MiB view_results = self._get_view_results(self.view)
269 883.2 MiB 0.0 MiB self.update(view_results)
270 finally:
271 883.2 MiB 0.0 MiB if self.is_root:
272 883.2 MiB 0.0 MiB if cache_purge_level:
273 883.2 MiB 0.0 MiB del self._distance_cache
274 883.2 MiB 0.0 MiB del self.hashes
275 883.2 MiB 0.0 MiB del self._shared_parameters
276 883.2 MiB 0.0 MiB del self._parameters
277 883.2 MiB 0.0 MiB for key in (PREVIOUS_DIFF_COUNT, PREVIOUS_DISTANCE_CACHE_HIT_COUNT,
278 883.2 MiB 0.0 MiB DISTANCE_CACHE_ENABLED):
279 883.2 MiB 0.0 MiB del self._stats[key]
280 883.2 MiB 0.0 MiB if progress_timer:
281 duration = progress_timer.stop()
282 self._stats['DURATION SEC'] = duration
283 logger.info('stats {}'.format(self.get_stats()))
284 883.2 MiB 0.0 MiB if cache_purge_level == 2:
285 self.__dict__.clear()
When I pre-flatten the data structure, I save a considerable amount of memory. Here is the code I used to flatten:
from memory_profiler import profile
from deepdiff import DeepDiff  # For Deep Difference of 2 objects
from scipy.io import loadmat
import os
import wrapt
import numpy as np

def deepflatten(instance, depth=None, types=None, ignore=None):
    '''
    Flatten a mat_struct object into a flat dictionary.
    '''
    flat_d = {}

    def flatten(instance, key=''):
        if type(instance) in [str, float, int, np.ndarray]:
            flat_d[key] = instance
        else:
            for a in dir(instance):
                if not a.startswith('_'):
                    # print(a, type(getattr(instance, a)))
                    flatten(getattr(instance, a), key + '.' + a)

    flatten(instance)
    return flat_d

if __name__ == '__main__':
    # get the data structures and sub-select to filter the profiling
    matlist = [f for f in os.listdir() if f.endswith('.mat')]
    mat1 = loadmat(matlist[0], squeeze_me=True, struct_as_record=False)
    mat2 = loadmat(matlist[1], squeeze_me=True, struct_as_record=False)
    key = list(mat1.keys())[-10]
    mat1 = mat1[key]
    mat2 = mat2[key]
    # monkey patch and add the profile decorator
    # wrapt.wrap_function_wrapper(DeepDiff, '__init__', profile)  # threw an error regarding hashing the backend for memory_profiler
    # now call the profile-decorated DeepDiff
    DeepDiff(deepflatten(mat1), deepflatten(mat2))
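(For illustration, not from the original thread: a small hypothetical nested object and the dotted keys deepflatten produces for it.)

import numpy as np

class Node:
    '''Hypothetical stand-in for scipy's mat_struct.'''
    pass

root = Node()
root.a = Node()
root.a.x = np.array([1, 2, 3])
root.b = 'hello'

print(deepflatten(root))  # deepflatten as defined above
# {'.a.x': array([1, 2, 3]), '.b': 'hello'}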
And here is the memory consumption with the flat data structure:
(base) >python example.py
Filename: Continuum\anaconda3\lib\site-packages\deepdiff\diff.py
Line # Mem usage Increment Line Contents
================================================
106 64.8 MiB 64.8 MiB @profile
107 def __init__(self,
108 t1,
109 t2,
110 cutoff_distance_for_pairs=CUTOFF_DISTANCE_FOR_PAIRS_DEFAULT,
111 cutoff_intersection_for_pairs=CUTOFF_INTERSECTION_FOR_PAIRS_DEFAULT,
112 cache_size=0,
113 cache_tuning_sample_size=0,
114 cache_purge_level=1,
115 exclude_paths=None,
116 exclude_regex_paths=None,
117 exclude_types=None,
118 exclude_obj_callback=None,
119 get_deep_distance=False,
120 hasher=None,
121 hashes=None,
122 ignore_order=False,
123 ignore_type_in_groups=None,
124 ignore_string_type_changes=False,
125 ignore_numeric_type_changes=False,
126 ignore_type_subclasses=False,
127 ignore_string_case=False,
128 ignore_nan_inequality=False,
129 ignore_private_variables=True,
130 log_frequency_in_sec=0,
131 max_passes=10000000,
132 max_diffs=None,
133 number_format_notation="f",
134 number_to_string_func=None,
135 progress_logger=logger.info,
136 report_repetition=False,
137 significant_digits=None,
138 truncate_datetime=None,
139 verbose_level=1,
140 view=TEXT_VIEW,
141 _original_type=None,
142 _parameters=None,
143 _shared_parameters=None,
144 **kwargs):
145 64.8 MiB 0.0 MiB super().__init__()
146 64.8 MiB 0.0 MiB if kwargs:
147 raise ValueError((
148 "The following parameter(s) are not valid: %s\n"
149 "The valid parameters are ignore_order, report_repetition, significant_digits, "
150 "number_format_notation, exclude_paths, exclude_types, exclude_regex_paths, ignore_type_in_groups, "
151 "ignore_string_type_changes, ignore_numeric_type_changes, ignore_type_subclasses, truncate_datetime, "
152 "ignore_private_variables, ignore_nan_inequality, number_to_string_func, verbose_level, "
153 "view, hasher, hashes, max_passes, max_diffs, "
154 "cutoff_distance_for_pairs, cutoff_intersection_for_pairs, log_frequency_in_sec, cache_size, "
155 "cache_tuning_sample_size, get_deep_distance, cache_purge_level, "
156 "_original_type, _parameters and _shared_parameters.") % ', '.join(kwargs.keys()))
157
158 64.8 MiB 0.0 MiB if _parameters:
159 self.__dict__.update(_parameters)
160 else:
161 64.8 MiB 0.0 MiB self.ignore_order = ignore_order
162 64.8 MiB 0.0 MiB ignore_type_in_groups = ignore_type_in_groups or []
163 64.8 MiB 0.0 MiB if numbers == ignore_type_in_groups or numbers in ignore_type_in_groups:
164 ignore_numeric_type_changes = True
165 64.8 MiB 0.0 MiB self.ignore_numeric_type_changes = ignore_numeric_type_changes
166 64.8 MiB 0.0 MiB if strings == ignore_type_in_groups or strings in ignore_type_in_groups:
167 ignore_string_type_changes = True
168 64.8 MiB 0.0 MiB self.ignore_string_type_changes = ignore_string_type_changes
169 64.8 MiB 0.0 MiB self.ignore_type_in_groups = self.get_ignore_types_in_groups(
170 64.8 MiB 0.0 MiB ignore_type_in_groups=ignore_type_in_groups,
171 64.8 MiB 0.0 MiB ignore_string_type_changes=ignore_string_type_changes,
172 64.8 MiB 0.0 MiB ignore_numeric_type_changes=ignore_numeric_type_changes,
173 64.8 MiB 0.0 MiB ignore_type_subclasses=ignore_type_subclasses)
174 64.8 MiB 0.0 MiB self.report_repetition = report_repetition
175 64.8 MiB 0.0 MiB self.exclude_paths = convert_item_or_items_into_set_else_none(exclude_paths)
176 64.8 MiB 0.0 MiB self.exclude_regex_paths = convert_item_or_items_into_compiled_regexes_else_none(exclude_regex_paths)
177 64.8 MiB 0.0 MiB self.exclude_types = set(exclude_types) if exclude_types else None
178 64.8 MiB 0.0 MiB self.exclude_types_tuple = tuple(exclude_types) if exclude_types else None # we need tuple for checking isinstance
179 64.8 MiB 0.0 MiB self.ignore_type_subclasses = ignore_type_subclasses
180 64.8 MiB 0.0 MiB self.type_check_func = type_is_subclass_of_type_group if ignore_type_subclasses else type_in_type_group
181 64.8 MiB 0.0 MiB self.ignore_string_case = ignore_string_case
182 64.8 MiB 0.0 MiB self.exclude_obj_callback = exclude_obj_callback
183 64.8 MiB 0.0 MiB self.number_to_string = number_to_string_func or number_to_string
184 64.8 MiB 0.0 MiB self.ignore_private_variables = ignore_private_variables
185 64.8 MiB 0.0 MiB self.ignore_nan_inequality = ignore_nan_inequality
186 64.8 MiB 0.0 MiB self.hasher = hasher
187 64.8 MiB 0.0 MiB self.cache_tuning_sample_size = cache_tuning_sample_size
188
189 64.8 MiB 0.0 MiB self.significant_digits = self.get_significant_digits(significant_digits, ignore_numeric_type_changes)
190 64.8 MiB 0.0 MiB self.truncate_datetime = get_truncate_datetime(truncate_datetime)
191 64.8 MiB 0.0 MiB self.number_format_notation = number_format_notation
192 64.8 MiB 0.0 MiB if verbose_level in {0, 1, 2}:
193 64.8 MiB 0.0 MiB self.verbose_level = verbose_level
194 else:
195 raise ValueError(VERBOSE_LEVEL_RANGE_MSG)
196 64.8 MiB 0.0 MiB if cache_purge_level not in {0, 1, 2}:
197 raise ValueError(PURGE_LEVEL_RANGE_MSG)
198 64.8 MiB 0.0 MiB self.view = view
199 # Setting up the cache for dynamic programming. One dictionary per instance of root of DeepDiff running.
200 64.8 MiB 0.0 MiB self.max_passes = max_passes
201 64.8 MiB 0.0 MiB self.max_diffs = max_diffs
202 64.8 MiB 0.0 MiB self.cutoff_distance_for_pairs = float(cutoff_distance_for_pairs)
203 64.8 MiB 0.0 MiB self.cutoff_intersection_for_pairs = float(cutoff_intersection_for_pairs)
204 64.8 MiB 0.0 MiB if self.cutoff_distance_for_pairs < 0 or self.cutoff_distance_for_pairs > 1:
205 raise ValueError(CUTOFF_RANGE_ERROR_MSG)
206 # _Parameters are the clean _parameters to initialize DeepDiff with so we avoid all the above
207 # cleaning functionalities when running DeepDiff recursively.
208 # However DeepHash has its own set of _parameters that are slightly different than DeepDIff.
209 # DeepDiff _parameters are transformed to DeepHash _parameters via __get_deephash_params method.
210 64.8 MiB 0.0 MiB self.progress_logger = progress_logger
211 64.8 MiB 0.0 MiB self.cache_size = cache_size
212 64.8 MiB 0.0 MiB _parameters = self.__dict__.copy()
213
214 # Non-Root
215 64.8 MiB 0.0 MiB if _shared_parameters:
216 self.is_root = False
217 self._shared_parameters = _shared_parameters
218 self.__dict__.update(_shared_parameters)
219 # We are in some pass other than root
220 progress_timer = None
221 # Root
222 else:
223 64.8 MiB 0.0 MiB self.is_root = True
224 # Caching the DeepDiff results for dynamic programming
225 64.8 MiB 0.0 MiB self._distance_cache = LFUCache(cache_size) if cache_size else DummyLFU()
226 self._stats = {
227 64.8 MiB 0.0 MiB PASSES_COUNT: 0,
228 64.8 MiB 0.0 MiB DIFF_COUNT: 0,
229 64.8 MiB 0.0 MiB DISTANCE_CACHE_HIT_COUNT: 0,
230 64.8 MiB 0.0 MiB PREVIOUS_DIFF_COUNT: 0,
231 64.8 MiB 0.0 MiB PREVIOUS_DISTANCE_CACHE_HIT_COUNT: 0,
232 64.8 MiB 0.0 MiB MAX_PASS_LIMIT_REACHED: False,
233 64.8 MiB 0.0 MiB MAX_DIFF_LIMIT_REACHED: False,
234 64.8 MiB 0.0 MiB DISTANCE_CACHE_ENABLED: bool(cache_size),
235 }
236 64.8 MiB 0.0 MiB self.hashes = dict_() if hashes is None else hashes
237 64.8 MiB 0.0 MiB self._numpy_paths = dict_() # if _numpy_paths is None else _numpy_paths
238 self._shared_parameters = {
239 64.8 MiB 0.0 MiB 'hashes': self.hashes,
240 64.8 MiB 0.0 MiB '_stats': self._stats,
241 64.8 MiB 0.0 MiB '_distance_cache': self._distance_cache,
242 64.8 MiB 0.0 MiB '_numpy_paths': self._numpy_paths,
243 64.8 MiB 0.0 MiB _ENABLE_CACHE_EVERY_X_DIFF: self.cache_tuning_sample_size * 10,
244 }
245 64.8 MiB 0.0 MiB if log_frequency_in_sec:
246 # Creating a progress log reporter that runs in a separate thread every log_frequency_in_sec seconds.
247 progress_timer = RepeatedTimer(log_frequency_in_sec, _report_progress, self._stats, progress_logger)
248 else:
249 64.8 MiB 0.0 MiB progress_timer = None
250
251 64.8 MiB 0.0 MiB self._parameters = _parameters
252 64.8 MiB 0.0 MiB self.deephash_parameters = self.__get_deephash_params()
253 64.8 MiB 0.0 MiB self.tree = TreeResult()
254 64.8 MiB 0.0 MiB self.t1 = t1
255 64.8 MiB 0.0 MiB self.t2 = t2
256
257 64.8 MiB 0.0 MiB try:
258 64.8 MiB 0.0 MiB root = DiffLevel(t1, t2, verbose_level=self.verbose_level)
259 # _original_type is only used to pass the original type of the data. Currently only used for numpy arrays.
260 # The reason is that we convert the numpy array to python list and then later for distance calculations
261 # we convert only the the last dimension of it into numpy arrays.
262 572.2 MiB 507.4 MiB self.__diff(root, parents_ids=frozenset({id(t1)}), _original_type=_original_type)
263
264 572.2 MiB 0.0 MiB if get_deep_distance and view in {TEXT_VIEW, TREE_VIEW}:
265 self.tree['deep_distance'] = self._get_rough_distance()
266
267 572.2 MiB 0.0 MiB self.tree.remove_empty_keys()
268 625.0 MiB 52.8 MiB view_results = self._get_view_results(self.view)
269 625.0 MiB 0.0 MiB self.update(view_results)
270 finally:
271 625.0 MiB 0.0 MiB if self.is_root:
272 625.0 MiB 0.0 MiB if cache_purge_level:
273 625.0 MiB 0.0 MiB del self._distance_cache
274 625.0 MiB 0.0 MiB del self.hashes
275 625.0 MiB 0.0 MiB del self._shared_parameters
276 625.0 MiB 0.0 MiB del self._parameters
277 625.1 MiB 0.1 MiB for key in (PREVIOUS_DIFF_COUNT, PREVIOUS_DISTANCE_CACHE_HIT_COUNT,
278 625.1 MiB 0.0 MiB DISTANCE_CACHE_ENABLED):
279 625.1 MiB 0.0 MiB del self._stats[key]
280 625.1 MiB 0.0 MiB if progress_timer:
281 duration = progress_timer.stop()
282 self._stats['DURATION SEC'] = duration
283 logger.info('stats {}'.format(self.get_stats()))
284 625.1 MiB 0.0 MiB if cache_purge_level == 2:
285 self.__dict__.clear()
I am not sure whether the memory savings comes from a bug or simply from the flattened data structure being smaller to keep in memory.
Are there any additional inputs I should use to play with the results? Are there sub-methods I should `@profile`?
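From the traces above, essentially all of the growth happens inside self.__diff (roughly 500-770 MiB), with another ~50 MiB in _get_view_results, so those two look like the natural next targets. A sketch of decorating them from outside the class (my suggestion, untested; __diff is private, so it has to be reached through its name-mangled form):

from memory_profiler import profile
from deepdiff import DeepDiff

# Both method names appear in the traces above. __diff is name-mangled
# to _DeepDiff__diff when accessed from outside the class. Note that
# __diff is recursive, so the per-call profiler output may be very large.
DeepDiff._DeepDiff__diff = profile(DeepDiff._DeepDiff__diff)
DeepDiff._get_view_results = profile(DeepDiff._get_view_results)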