spyglass
spyglass copied to clipboard
Merge duplicate funcs
Process
While writing pytests, I noticed quite a few funcs that were copies of methods shared across tables, both across different versions of a pipeline and across different pipelines. I put together the following script to map out where funcs are duplicated, ignoring funcs shared via inheritance, and funcs with contextual definitions like insert_default
Script
import importlib
import inspect
import pkgutil
from collections import defaultdict
import datajoint as dj
import ndx_franklab_novela
from hdmf.data_utils import GenericDataChunkIterator
from probeinterface import Probe
from spikeinterface.core import BaseRecordingSegment, BaseSorting
from tqdm import tqdm
import spyglass
from spyglass.utils import SpyglassMixin, _Merge
from spyglass.utils.dj_graph import TableChain
ignored_functions = set(
dir(dj.Computed)
+ dir(SpyglassMixin)
+ dir(TableChain)
+ dir(_Merge)
+ dir(ndx_franklab_novela)
+ dir(GenericDataChunkIterator)
+ dir(BaseSorting)
+ dir(BaseRecordingSegment)
+ dir(Probe)
+ [
"insert_default",
"fetch1_dataframe",
"fetch_dataframe",
"_no_transaction_make",
"nwb_object" "log",
"cleanup",
"nightly_cleanup",
"increment_access",
"_to_dict",
"to_dict",
"_from_dict",
]
)
ignored_classes = set(
[
"SpyglassMixin",
"SpyglassMixinPart",
"Merge",
"_Merge",
"TableChain",
"RestrGraph",
"AbstractGraph",
]
)
all_ignored = ignored_functions | ignored_classes
def load_funcs(package_name=spyglass.__name__):
function_paths = defaultdict(list)
for importer, module_name, _ in tqdm(
pkgutil.walk_packages(spyglass.__path__),
desc="Loading cache",
total=127,
):
module = importer.find_module(module_name).load_module(module_name)
# Iterate through all objects (functions and classes) in the module
for name, obj in inspect.getmembers(module):
if (
getattr(obj, "__module__", "") != module_name
or name in all_ignored
):
continue
if inspect.isfunction(obj):
function_paths[name].append(module_name)
elif inspect.isclass(obj):
for name, obj in inspect.getmembers(obj):
if (
inspect.isfunction(obj)
and name not in ignored_functions
):
function_paths[name].append(module_name)
return {k: v for k, v in function_paths.items() if len(v) > 1}
dupe_funcs = load_funcs()
sorted_funcs = sorted(dupe_funcs.keys())
for func in sorted_funcs:
print("-", func)
for module in dupe_funcs[func]:
print(" -", module)
Results
- _check_artifact_thresholds
- lfp.v1.lfp_artifact_difference_detection
- spikesorting.v0.spikesorting_artifact
- spikesorting.v1.artifact
- _compute_artifact_chunk
- spikesorting.v0.spikesorting_artifact
- spikesorting.v1.artifact
- _compute_metric
- spikesorting.v0.spikesorting_curation
- spikesorting.v1.metric_curation
- _convert_algorithm_params
- decoding.v0.dj_decoder_conversion
- decoding.v1.dj_decoder_conversion
- _convert_dict_to_class
- decoding.v0.dj_decoder_conversion
- decoding.v1.dj_decoder_conversion
- decoding.v1.temp
- _convert_env_dict
- decoding.v0.dj_decoder_conversion
- decoding.v1.dj_decoder_conversion
- decoding.v1.temp
- _convert_environment_to_dict
- decoding.v0.dj_decoder_conversion
- decoding.v1.dj_decoder_conversion
- _convert_transitions_to_dict
- decoding.v0.dj_decoder_conversion
- decoding.v1.dj_decoder_conversion
- _get_artifact_times
- spikesorting.v0.spikesorting_artifact
- spikesorting.v1.artifact
- _get_interval_range
- decoding.v1.clusterless
- decoding.v1.sorted_spikes
- _get_peak_amplitude
- decoding.v0.clusterless
- decoding.v1.waveform_features
- _get_recording_timestamps
- spikesorting.v0.spikesorting_recording
- spikesorting.v1.recording
- _get_sort_interval_valid_times
- spikesorting.v0.spikesorting_recording
- spikesorting.v1.recording
- _init_artifact_worker
- spikesorting.v0.spikesorting_artifact
- spikesorting.v1.artifact
- _reformat_metrics
- spikesorting.v0.curation_figurl
- spikesorting.v1.figurl_curation
- calculate_position_info
- common.common_position
- position.v1.position_trodes_position
- convert_classes_to_dict
- decoding.v0.dj_decoder_conversion
- decoding.v1.dj_decoder_conversion
- convert_to_pixels
- common.common_position
- position.v1.dlc_utils
- position.v1.position_trodes_position
- create_figurl
- mua.v1.mua
- ripple.v1.ripple
- create_group
- decoding.v1.clusterless
- decoding.v1.core
- spikesorting.analysis.v1.group
- discretize_and_trim
- decoding.v0.visualization_1D_view
- decoding.v0.visualization_2D_view
- fetch_environments
- decoding.v1.clusterless
- decoding.v1.sorted_spikes
- fetch_linear_position_info
- decoding.v1.clusterless
- decoding.v1.sorted_spikes
- fetch_model
- decoding.v1.clusterless
- decoding.v1.sorted_spikes
- fetch_position_info
- decoding.v1.clusterless
- decoding.v1.sorted_spikes
- fetch_results
- decoding.v1.clusterless
- decoding.v1.sorted_spikes
- fetch_spike_data
- decoding.v1.clusterless
- decoding.v1.sorted_spikes
- spikesorting.analysis.v1.group
- fill_nan
- common.common_position
- position.v1.dlc_utils
- position.v1.position_trodes_position
- generate_pos_components
- common.common_position
- position.v1.position_trodes_position
- get_Kay_ripple_consensus_trace
- common.common_ripple
- ripple.v1.ripple
- get_ahead_behind_distance
- decoding.v1.clusterless
- decoding.v1.sorted_spikes
- get_data_for_multiple_epochs
- decoding.v0.clusterless
- decoding.v0.sorted_spikes
- get_decoding_data_for_epoch
- decoding.v0.clusterless
- decoding.v0.sorted_spikes
- get_networkx_track_graph
- common.common_position
- linearization.v0.main
- linearization.v1.main
- get_ripple_lfps_and_position_info
- common.common_ripple
- ripple.v1.ripple
- get_time_bins_from_interval
- decoding.v0.clusterless
- decoding.v0.sorted_spikes
- interpolate_to_new_time
- common.common_ripple
- ripple.v1.ripple
- log
- common.common_nwbfile
- common.common_nwbfile
- make_default_decoding_parameters_cpu
- decoding.v0.clusterless
- decoding.v0.sorted_spikes
- make_default_decoding_parameters_gpu
- decoding.v0.clusterless
- decoding.v0.sorted_spikes
- make_video
- common.common_position
- position.v1.dlc_utils
- position.v1.position_trodes_position
- nwb_object
- common.common_ephys
- common.common_ephys
- plot_ripple
- common.common_ripple
- ripple.v1.ripple
- plot_ripple_consensus_trace
- common.common_ripple
- ripple.v1.ripple
- plot_track_graph
- common.common_position
- linearization.v0.main
- linearization.v1.main
- plot_track_graph_as_1D
- common.common_position
- linearization.v0.main
- linearization.v1.main
- populate_from_lock_file
- lock.file_lock
- lock.file_lock
- restore_classes
- decoding.v0.dj_decoder_conversion
- decoding.v1.dj_decoder_conversion
- set_lfp_band_electrodes
- common.common_ephys
- lfp.analysis.v1.lfp_band
- set_lfp_electrodes
- common.common_ephys
- common.common_ripple
- ripple.v1.ripple
Results
There some false positives...
- Some I'm not catching within inheritance
- Some coincidental overlap in name for different purposes (e.g.,
create_group
)
But there are also many repetitions across versions of pipelines,
- Exact copies (e.g.,
fill_nan
) - Modified copies that could accept an arg to determine if the new functionality should be used (e.g.,
make_video
) - Copies that refer back to the previous version (e.g.,
generate_pos_components
) - Copies that perform similar operations with different foreign key references
Proposed
- [ ] (1) Exact copies should (3) refer to the previous version, or, in the event of import issues, be migrated to a subpackage-level utilities script
- [ ] The original version of (2) modified copies should be replaced with the updated version, including an arg to determine use of new features
- [ ] The original version of (4) items with different references should be edited to extract the core functionality to a private method later versions can use