streaming
                                
                                
                                
                                    streaming copied to clipboard
                            
                            
                            
                        Clean up garbage files
We really like the streaming library. The only issue is that it leaves garbage around the file system after every run, and therefore we adopted this ugly hack:
# Total number of attempts made to construct the dataset (the first try
# counts as one), not the number of extra retries.
MAX_RETRIES = 2

# Locations in which the cleanup helper is permitted to fall back to
# `sudo rm -rf`; anything outside them is refused.
SUDO_RM_ALLOWED_DIR_PREFIXES = [
    "/tmp",
    "/dev/shm",
    "/data",
    "/scratch",
    "/mnt/localssd",
    "/mnt/nvme",
]
def create_streaming_dataset_with_local_cleanup(
    streams: List[Stream],
    batch_size: int,
    replication: int,
    shuffle: bool,
) -> "StreamingDataset":
    """Build a ``StreamingDataset``, wiping leftover local state and retrying on failure.

    Construction is attempted up to ``MAX_RETRIES`` times. After every failed
    attempt except the last, the following are deleted before retrying:

    * each stream's ``local`` path,
    * ``/tmp/streaming``,
    * every entry in ``/dev/shm`` whose name starts with ``00``.

    Deletion is first tried with the current user's permissions; if that
    fails, ``sudo rm -rf`` is used, but only for paths located inside one of
    ``SUDO_RM_ALLOWED_DIR_PREFIXES``.

    Args:
        streams: Streams handed to ``StreamingDataset``; their ``local``
            paths are removed during cleanup.
        batch_size: Forwarded to ``StreamingDataset``.
        replication: Forwarded to ``StreamingDataset``.
        shuffle: Forwarded to ``StreamingDataset``.

    Returns:
        The successfully constructed ``StreamingDataset``.

    Raises:
        Exception: Whatever ``StreamingDataset`` raised on the final attempt.
        AssertionError: If a path that needs the sudo fallback lies outside
            ``SUDO_RM_ALLOWED_DIR_PREFIXES``.
        ValueError: If ``MAX_RETRIES`` is smaller than 1.
    """
    # Function-scope import: only the sudo fallback needs it.
    import subprocess

    def is_sudo_rm_allowed(normalized_path: str) -> bool:
        """Return True if the path is an allowed root or lies beneath one.

        Matching on whole path components (instead of a bare ``startswith``)
        keeps look-alikes such as ``/tmpfoo`` out of the allowlist.
        """
        for root in SUDO_RM_ALLOWED_DIR_PREFIXES:
            if normalized_path == root:
                return True
            if normalized_path.startswith(root.rstrip(os.sep) + os.sep):
                return True
        return False

    def delete_dir_sudo_if_no_perms(path: str):
        """Delete a file or directory tree, escalating to sudo if needed."""
        # `lexists` (not `exists`) so that dangling symlinks are cleaned too.
        if not path or not os.path.lexists(path):
            return

        try:
            if os.path.isdir(path) and not os.path.islink(path):
                shutil.rmtree(path)
            else:
                # Plain files and symlinks cannot go through rmtree.
                os.remove(path)
            return
        except FileNotFoundError:
            # Something else removed it in the meantime; nothing left to do.
            return
        except OSError as err:
            logger.warning(f"Failed to delete directory {path}, retrying with sudo... {err}")

        # Normalize before checking so that ".." segments cannot be used to
        # escape an allowed root, then delete exactly the path that was checked.
        target = os.path.abspath(path)
        if not is_sudo_rm_allowed(target):
            # Raised explicitly (rather than via `assert`) so that the guard
            # in front of `sudo rm -rf` is still enforced under `python -O`.
            raise AssertionError(f"Directory {path} is not a valid directory to sudo delete")

        try:
            # Argument list + "--": no shell parsing, so spaces, quotes or a
            # leading "-" in the path cannot change what gets executed.
            result = subprocess.run(["sudo", "rm", "-rf", "--", target], check=False)
        except OSError as err:
            # e.g. sudo is not installed; cleanup stays best-effort.
            logger.warning(f"Could not run sudo rm -rf on {target}: {err}")
            return
        if result.returncode != 0:
            logger.warning(f"sudo rm -rf {target} exited with status {result.returncode}")

    for attempt in range(MAX_RETRIES):
        try:
            return StreamingDataset(
                streams=streams,
                predownload=None,
                shuffle=shuffle,
                batching_method="per_stream",
                batch_size=batch_size,
                replication=replication,
                download_timeout=300,
            )
        except Exception as e:
            if attempt == MAX_RETRIES - 1:
                # Bare `raise` keeps the original traceback intact.
                raise
            logger.warning(f"Failed to create streaming dataset, cleaning up local env and retrying... {e}")
            for stream in streams:
                delete_dir_sudo_if_no_perms(stream.local)
            delete_dir_sudo_if_no_perms("/tmp/streaming")
            # /dev/shm may not exist on every platform; a failing listdir
            # here would otherwise mask the error being handled.
            if os.path.isdir("/dev/shm"):
                for entry in os.listdir("/dev/shm"):
                    if entry.startswith("00"):
                        delete_dir_sudo_if_no_perms(f"/dev/shm/{entry}")

    # Only reachable when the loop body never ran.
    raise ValueError(f"MAX_RETRIES must be at least 1, got {MAX_RETRIES}")
It would be great if the library cleaned up these files itself, so that this workaround is no longer needed.