fiftyone
fiftyone copied to clipboard
[BUG] ConnectionResetError while processing big dataset
Describe the problem
I am trying to remove near-duplicates from big dataset (~91K images). After few hours of processing I got ConnectionResetError error.
Code to reproduce issue
Code I use:
def gen_approx_duplicate_groups_view(dataset, index):
"""This function is used to generate the approximate duplicate groups view.
From: https://github.com/jacobmarks/image-deduplication-plugin/blob/main/approx_dups.py
"""
dup_ids = index.duplicates_view().values("id")
view = dataset.select(dup_ids)
for rep_id, dups in index.neighbors_map.items():
ids = [rep_id] + [d[0] for d in dups]
subview = view.select(ids)
for sample in subview:
sample["approx_dup_group_id"] = rep_id
sample.save()
approx_dup_groups_view = view.group_by("approx_dup_group_id")
dataset.save_view("approx_dup_groups_view", approx_dup_groups_view, overwrite=True)
def remove_approx_duplicates(dataset: fo.Dataset, fraction: float) -> fo.Dataset:
similarity = fob.compute_similarity(dataset)
similarity.find_duplicates(fraction=fraction)
logging.debug("Preparing duplicates view")
approx_dup_view = similarity.duplicates_view()
dataset.save_view("approx_dup_view", approx_dup_view, overwrite=True)
approx_dup_view = dataset.load_saved_view("approx_dup_view")
logging.debug("Grouping duplicates view")
gen_approx_duplicate_groups_view(dataset, similarity)
approx_duplicates_list = []
logging.debug("Adding near-duplicates to list.")
for group_id in approx_dup_view.distinct("approx_dup_group_id"):
group_view = approx_dup_view.match(F("approx_dup_group_id") == group_id)
group_view = group_view.sort_by("filepath")
approx_duplicates_list.append(group_view.values("id")[1:])
approx_duplicates_list = list(set([item for sublist in approx_duplicates_list for item in sublist]))
logging.info(f"Near-duplicates found. Removing {len(approx_duplicates_list)} of samples...")
dataset.delete_samples(approx_duplicates_list)
logging.info("Done")
return dataset
System information
- OS Platform and Distribution: Linux Ubuntu 22.04
- Python version : 3.10.12
- FiftyOne version : FiftyOne v0.23.7, Voxel51, Inc.
- FiftyOne installed from : pip
Other info/logs
{"t":{"$date":"2024-04-23T05:21:33.443Z"},"s":"I", "c":"CONTROL", "id":20697, "ctx":"-","msg":"Renamed existing log file","attr":{"oldLogPath":"/home/marianna.parzych/.fiftyone/var/lib/mongo/log/mong
o.log","newLogPath":"/home/marianna.parzych/.fiftyone/var/lib/mongo/log/mongo.log.2024-04-23T05-21-33"}}
Subprocess ['/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/fiftyone/db/bin/mongod', '--dbpath', '/home/marianna.parzych/.fiftyone/var/lib
/mongo', '--logpath', '/home/marianna.parzych/.fiftyone/var/lib/mongo/log/mongo.log', '--port', '0', '--nounixsocket'] exited with error -9:
Traceback (most recent call last):
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/pool.py", line 968, in command
return command(
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/network.py", line 182, in command
reply = receive_message(conn, request_id)
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/network.py", line 257, in receive_message
length, _, response_to, op_code = _UNPACK_HEADER(_receive_data_on_socket(conn, 16, deadline))
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/network.py", line 340, in _receive_data_on_socket
chunk_length = conn.conn.recv_into(mv[bytes_read:])
ConnectionResetError: [Errno 104] Connection reset by peer
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/unstructured_sg/scripts/sample_from_SEC_data_name-based_clusters.py", line 309, in <module>
model_name: str,
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/click/core.py", line 1157, in __call__
return self.main(*args, **kwargs)
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/click/core.py", line 1078, in main
rv = self.invoke(ctx)
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/click/core.py", line 1434, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/click/core.py", line 783, in invoke
return __callback(*args, **kwargs)
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/unstructured_sg/scripts/sample_from_SEC_data_name-based_clusters.py", line 294, in main
type=list[str],
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/unstructured_sg/scripts/sample_from_SEC_data_name-based_clusters.py", line 153, in deduplicate_image_dataset
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/unstructured_sg/scripts/sample_from_SEC_data_name-based_clusters.py", line 133, in remove_approx_duplicates
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/fiftyone/core/collections.py", line 6514, in sort_by
return self._add_view_stage(
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/fiftyone/core/view.py", line 1660, in _add_view_stage
stage.validate(self)
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/fiftyone/core/stages.py", line 6970, in validate
sample_collection.create_index(index_spec)
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/fiftyone/core/collections.py", line 8970, in create_index
index_info = self.get_index_information()
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/fiftyone/core/collections.py", line 8903, in get_index_information
sample_info = self._dataset._sample_collection.index_information()
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/collection.py", line 2347, in index_information
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/unstructured_sg/scripts/sample_from_SEC_data_name-based_clusters.py", line 294, in main
type=list[str],
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/unstructured_sg/scripts/sample_from_SEC_data_name-based_clusters.py", line 153, in deduplicate_image_dataset
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/unstructured_sg/scripts/sample_from_SEC_data_name-based_clusters.py", line 133, in remove_approx_duplicates
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/fiftyone/core/collections.py", line 6514, in sort_by
return self._add_view_stage(
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/fiftyone/core/view.py", line 1660, in _add_view_stage
stage.validate(self)
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/fiftyone/core/stages.py", line 6970, in validate
sample_collection.create_index(index_spec)
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/fiftyone/core/collections.py", line 8970, in create_index
index_info = self.get_index_information()
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/fiftyone/core/collections.py", line 8903, in get_index_information
sample_info = self._dataset._sample_collection.index_information()
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/collection.py", line 2347, in index_information
cursor = self.list_indexes(session=session, comment=comment)
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/collection.py", line 2310, in list_indexes
return self.__database.client._retryable_read(_cmd, read_pref, s)
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/mongo_client.py", line 1492, in _retryable_read
return self._retry_internal(
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/_csot.py", line 107, in csot_wrapper
return func(self, *args, **kwargs)
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/mongo_client.py", line 1462, in _retry_internal
).run()
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/mongo_client.py", line 2320, in run
self._check_last_error()
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/mongo_client.py", line 2392, in _check_last_error
raise self._last_error
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/mongo_client.py", line 2315, in run
return self._read() if self._is_read else self._write()
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/mongo_client.py", line 2445, in _read
return self._func(self._session, self._server, conn, read_pref) # type: ignore
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/collection.py", line 2289, in _cmd
cursor = self._command(conn, cmd, read_preference, codec_options, session=session)[
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/collection.py", line 308, in _command
return conn.command(
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/helpers.py", line 322, in inner
return func(*args, **kwargs)
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/pool.py", line 996, in command
self._raise_connection_failure(error)
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/pool.py", line 1171, in _raise_connection_failure
_raise_connection_failure(self.address, error, timeout_details=details)
File "/mnt/ml-team/homes/marianna.parzych/Unstructured/od-modelling-super-gradients/venv/lib/python3.10/site-packages/pymongo/pool.py", line 411, in _raise_connection_failure
raise AutoReconnect(msg) from error
pymongo.errors.AutoReconnect: localhost:37343: [Errno 104] Connection reset by peer (configured timeouts: connectTimeoutMS: 20000.0ms)
Skipping automatic non-persistent dataset cleanup. This action requires read access of the 'admin' database
[2024-04-23 12:54:42] WARNING - database.py - Skipping automatic non-persistent dataset cleanup. This action requires read access of the 'admin' database
Willingness to contribute
The FiftyOne Community encourages bug fix contributions. Would you or another member of your organization be willing to contribute a fix for this bug to the FiftyOne codebase?
- [ ] Yes. I can contribute a fix for this bug independently
- [ ] Yes. I would be willing to contribute a fix for this bug with guidance from the FiftyOne community
- [x] No. I cannot contribute a bug fix at this time