pyiron_base
pyiron_base copied to clipboard
loading jobs very slow
For pyiron_base versions 0.7.4 - 0.7.8, loading jobs was very slow compared to previous versions.
An example job is in the compressed file linked below (I can't think of a better way to share it)! Note that this will only be noticed in relatively large jobs (> ~100s of MB)! https://datashare.mpcdf.mpg.de/s/oi2i125mevjj2K7
- Changing the tar file name fixed this issue
- This was also fixed in pyiron_base 0.7.9 probably related to #1343
I released a new pyiron_atomistics
version https://github.com/pyiron/pyiron_atomistics/releases/tag/pyiron_atomistics-0.4.16 - @niklassiemer Can you update the installation on the cluster, once it is available on conda?
A variant of this still occurs when accessing values from job.output. For some reason this also tries to access the files first before looking in the correct place.
I haven't looked deeply, but here's a stack trace of job.get_structure() that also seems to run into this.
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
Cell In[42], line 2
1 for j in pr['bulk/reference/model'].iter_jobs(status='finished', hamilton='Vasp'):
----> 2 print(j.name, j['user/identifier'], j.get_structure())
File ~/software/pyiron_base/pyiron_base/utils/deprecate.py:171, in Deprecator.__deprecate_argument.<locals>.decorated(*args, **kwargs)
161 if kw in self.arguments:
162 warnings.warn(
163 message_format.format(
164 "{}.{}({}={})".format(
(...)
169 stacklevel=2,
170 )
--> 171 return function(*args, **kwargs)
File ~/software/pyiron_atomistics/pyiron_atomistics/atomistics/structure/has_structure.py:109, in HasStructure.get_structure(self, frame, wrap_atoms, iteration_step)
105 except NotImplementedError:
106 raise KeyError(
107 f"argument frame {frame} is not an integer and _translate_frame() not implemented!"
108 ) from None
--> 109 num_structures = self.number_of_structures
110 if frame < 0:
111 frame += num_structures
File ~/software/pyiron_atomistics/pyiron_atomistics/atomistics/structure/has_structure.py:143, in HasStructure.number_of_structures(self)
138 @property
139 def number_of_structures(self):
140 """
141 `int`: maximum `iteration_step` + 1 that can be passed to :meth:`.get_structure()`.
142 """
--> 143 return self._number_of_structures()
File ~/software/pyiron_atomistics/pyiron_atomistics/atomistics/job/atomistic.py:770, in AtomisticGenericJob._number_of_structures(self)
769 def _number_of_structures(self):
--> 770 return self.output.positions.shape[0]
File ~/software/pyiron_atomistics/pyiron_atomistics/atomistics/job/interactive.py:492, in GenericInteractiveOutput.positions(self)
490 @property
491 def positions(self):
--> 492 return self._lst_from_property("positions")
File ~/software/pyiron_atomistics/pyiron_atomistics/atomistics/job/interactive.py:458, in GenericInteractiveOutput._lst_from_property(self, key)
444 """
445 Fetch latest values for the given property.
446
(...)
455 :class:`numpy.ndarray`: collected values from all previous steps
456 """
457 cached = np.array(self._lst_from_cache(key))
--> 458 fetched = self._key_from_hdf(key)
459 if fetched is None or len(fetched) == 0:
460 return cached
File ~/software/pyiron_atomistics/pyiron_atomistics/atomistics/job/interactive.py:413, in GenericInteractiveOutput._key_from_hdf(self, key)
401 def _key_from_hdf(self, key):
402 """
403 Get all entries from the HDF5 file for a specific key - stored under 'output/interactive/<key>'. If not found
404 there the key is looked up in the regular HDF storage location 'output/<key>' via the property on our super
(...)
411
412 """
--> 413 fetched = self._job["output/interactive/" + key]
414 if fetched is None or len(fetched) == 0:
415 fetched = getattr(super(), key)
File ~/software/pyiron_base/pyiron_base/jobs/job/core.py:956, in JobCore.__getitem__(self, item)
951 except (ValueError, IndexError, KeyError):
952 # either group does not contain a data container or it is does, but it does not have the path we're
953 # looking for
954 pass
--> 956 if item in self.files.list():
957 warnings.warn(
958 "Using __getitem__ on a job to access files in deprecated: use job.files instead!",
959 category=DeprecationWarning,
960 )
961 return _job_read_file(self, item)
File ~/software/pyiron_base/pyiron_base/jobs/job/extension/files.py:64, in FileBrowser.list(self)
60 def list(self) -> List[str]:
61 """
62 List all files in the working directory of the job.
63 """
---> 64 return _working_directory_list_files(working_directory=self._working_directory)
File ~/software/pyiron_base/pyiron_base/jobs/job/util.py:375, in _working_directory_list_files(working_directory)
372 with tarfile.open(compressed_job_name, "r") as tar:
373 job_archive_name = os.path.basename(compressed_job_name)
374 compressed_files_lst = [
--> 375 member.name for member in tar.getmembers() if member.isfile()
376 ]
377 uncompressed_files_lst.remove(job_archive_name)
378 return uncompressed_files_lst + compressed_files_lst
File /cmmc/ptmp/pyironhb/mambaforge/envs/pyiron_latest/lib/python3.10/tarfile.py:1994, in TarFile.getmembers(self)
1992 self._check()
1993 if not self._loaded: # if we want to obtain a list of
-> 1994 self._load() # all members, we first have to
1995 # scan the whole archive.
1996 return self.members
File /cmmc/ptmp/pyironhb/mambaforge/envs/pyiron_latest/lib/python3.10/tarfile.py:2689, in TarFile._load(self)
2685 """Read through the entire archive file and look for readable
2686 members.
2687 """
2688 while True:
-> 2689 tarinfo = self.next()
2690 if tarinfo is None:
2691 break
File /cmmc/ptmp/pyironhb/mambaforge/envs/pyiron_latest/lib/python3.10/tarfile.py:2594, in TarFile.next(self)
2592 # Advance the file pointer.
2593 if self.offset != self.fileobj.tell():
-> 2594 self.fileobj.seek(self.offset - 1)
2595 if not self.fileobj.read(1):
2596 raise ReadError("unexpected end of data")
File /cmmc/ptmp/pyironhb/mambaforge/envs/pyiron_latest/lib/python3.10/bz2.py:261, in BZ2File.seek(self, offset, whence)
246 """Change the file position.
247
248 The new position is specified by offset, relative to the
(...)
258 this operation may be extremely slow.
259 """
260 self._check_can_seek()
--> 261 return self._buffer.seek(offset, whence)
File /cmmc/ptmp/pyironhb/mambaforge/envs/pyiron_latest/lib/python3.10/_compression.py:153, in DecompressReader.seek(self, offset, whence)
151 # Read and discard data until we reach the desired position.
152 while offset > 0:
--> 153 data = self.read(min(io.DEFAULT_BUFFER_SIZE, offset))
154 if not data:
155 break
File /cmmc/ptmp/pyironhb/mambaforge/envs/pyiron_latest/lib/python3.10/_compression.py:103, in DecompressReader.read(self, size)
101 else:
102 rawblock = b""
--> 103 data = self._decompressor.decompress(rawblock, size)
104 if data:
105 break
KeyboardInterrupt:
File ~/software/pyiron_atomistics/pyiron_atomistics/atomistics/job/interactive.py:413, in GenericInteractiveOutput._key_from_hdf(self, key)
401 def _key_from_hdf(self, key):
402 """
403 Get all entries from the HDF5 file for a specific key - stored under 'output/interactive/<key>'. If not found
404 there the key is looked up in the regular HDF storage location 'output/<key>' via the property on our super
(...)
411
412 """
--> 413 fetched = self._job["output/interactive/" + key]
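This checks whether output/interactive exists, but going through job[...] also falls back to listing the job's files (see JobCore.__getitem__ in the trace above), so it should be changed to use: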
fetched = self._job.project_hdf["output/interactive/" + key]
Good spot, will add it next week.