pyiron_base icon indicating copy to clipboard operation
pyiron_base copied to clipboard

loading jobs very slow

Open ahmedabdelkawy opened this issue 1 year ago • 5 comments

For pyiron_base versions 0.7.4 - 0.7.8, loading jobs is very slow compared to previous versions.

An example job is in the compressed file linked below (I can't think of a better way to share it)! Note that this will only be noticed in relatively large jobs (> ~100s of MB)! https://datashare.mpcdf.mpg.de/s/oi2i125mevjj2K7

ahmedabdelkawy avatar Feb 20 '24 15:02 ahmedabdelkawy

  • Changing the tar file name fixed this issue
  • This was also fixed in pyiron_base 0.7.9, probably related to #1343

ahmedabdelkawy avatar Feb 20 '24 15:02 ahmedabdelkawy

I released a new pyiron_atomistics version: https://github.com/pyiron/pyiron_atomistics/releases/tag/pyiron_atomistics-0.4.16 - @niklassiemer, can you update the installation on the cluster once it is available on conda?

jan-janssen avatar Feb 20 '24 15:02 jan-janssen

A variant of this still occurs when accessing values from job.output. For some reason this also tries to access the files first before looking at the correct place.

I haven't looked deeply, but here's a stack trace of job.get_structure() that also seems to run into this.

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[42], line 2
      1 for j in pr['bulk/reference/model'].iter_jobs(status='finished', hamilton='Vasp'):
----> 2     print(j.name, j['user/identifier'], j.get_structure())

File ~/software/pyiron_base/pyiron_base/utils/deprecate.py:171, in Deprecator.__deprecate_argument.<locals>.decorated(*args, **kwargs)
    161     if kw in self.arguments:
    162         warnings.warn(
    163             message_format.format(
    164                 "{}.{}({}={})".format(
   (...)
    169             stacklevel=2,
    170         )
--> 171 return function(*args, **kwargs)

File ~/software/pyiron_atomistics/pyiron_atomistics/atomistics/structure/has_structure.py:109, in HasStructure.get_structure(self, frame, wrap_atoms, iteration_step)
    105     except NotImplementedError:
    106         raise KeyError(
    107             f"argument frame {frame} is not an integer and _translate_frame() not implemented!"
    108         ) from None
--> 109 num_structures = self.number_of_structures
    110 if frame < 0:
    111     frame += num_structures

File ~/software/pyiron_atomistics/pyiron_atomistics/atomistics/structure/has_structure.py:143, in HasStructure.number_of_structures(self)
    138 @property
    139 def number_of_structures(self):
    140     """
    141     `int`: maximum `iteration_step` + 1 that can be passed to :meth:`.get_structure()`.
    142     """
--> 143     return self._number_of_structures()

File ~/software/pyiron_atomistics/pyiron_atomistics/atomistics/job/atomistic.py:770, in AtomisticGenericJob._number_of_structures(self)
    769 def _number_of_structures(self):
--> 770     return self.output.positions.shape[0]

File ~/software/pyiron_atomistics/pyiron_atomistics/atomistics/job/interactive.py:492, in GenericInteractiveOutput.positions(self)
    490 @property
    491 def positions(self):
--> 492     return self._lst_from_property("positions")

File ~/software/pyiron_atomistics/pyiron_atomistics/atomistics/job/interactive.py:458, in GenericInteractiveOutput._lst_from_property(self, key)
    444 """
    445 Fetch latest values for the given property.
    446 
   (...)
    455     :class:`numpy.ndarray`: collected values from all previous steps
    456 """
    457 cached = np.array(self._lst_from_cache(key))
--> 458 fetched = self._key_from_hdf(key)
    459 if fetched is None or len(fetched) == 0:
    460     return cached

File ~/software/pyiron_atomistics/pyiron_atomistics/atomistics/job/interactive.py:413, in GenericInteractiveOutput._key_from_hdf(self, key)
    401 def _key_from_hdf(self, key):
    402     """
    403     Get all entries from the HDF5 file for a specific key - stored under 'output/interactive/<key>'.  If not found
    404     there the key is looked up in the regular HDF storage location 'output/<key>' via the property on our super
   (...)
    411 
    412     """
--> 413     fetched = self._job["output/interactive/" + key]
    414     if fetched is None or len(fetched) == 0:
    415         fetched = getattr(super(), key)

File ~/software/pyiron_base/pyiron_base/jobs/job/core.py:956, in JobCore.__getitem__(self, item)
    951     except (ValueError, IndexError, KeyError):
    952         # either group does not contain a data container or it is does, but it does not have the path we're
    953         # looking for
    954         pass
--> 956 if item in self.files.list():
    957     warnings.warn(
    958         "Using __getitem__ on a job to access files in deprecated: use job.files instead!",
    959         category=DeprecationWarning,
    960     )
    961     return _job_read_file(self, item)

File ~/software/pyiron_base/pyiron_base/jobs/job/extension/files.py:64, in FileBrowser.list(self)
     60 def list(self) -> List[str]:
     61     """
     62     List all files in the working directory of the job.
     63     """
---> 64     return _working_directory_list_files(working_directory=self._working_directory)

File ~/software/pyiron_base/pyiron_base/jobs/job/util.py:375, in _working_directory_list_files(working_directory)
    372 with tarfile.open(compressed_job_name, "r") as tar:
    373     job_archive_name = os.path.basename(compressed_job_name)
    374     compressed_files_lst = [
--> 375         member.name for member in tar.getmembers() if member.isfile()
    376     ]
    377     uncompressed_files_lst.remove(job_archive_name)
    378     return uncompressed_files_lst + compressed_files_lst

File /cmmc/ptmp/pyironhb/mambaforge/envs/pyiron_latest/lib/python3.10/tarfile.py:1994, in TarFile.getmembers(self)
   1992 self._check()
   1993 if not self._loaded:    # if we want to obtain a list of
-> 1994     self._load()        # all members, we first have to
   1995                         # scan the whole archive.
   1996 return self.members

File /cmmc/ptmp/pyironhb/mambaforge/envs/pyiron_latest/lib/python3.10/tarfile.py:2689, in TarFile._load(self)
   2685 """Read through the entire archive file and look for readable
   2686    members.
   2687 """
   2688 while True:
-> 2689     tarinfo = self.next()
   2690     if tarinfo is None:
   2691         break

File /cmmc/ptmp/pyironhb/mambaforge/envs/pyiron_latest/lib/python3.10/tarfile.py:2594, in TarFile.next(self)
   2592 # Advance the file pointer.
   2593 if self.offset != self.fileobj.tell():
-> 2594     self.fileobj.seek(self.offset - 1)
   2595     if not self.fileobj.read(1):
   2596         raise ReadError("unexpected end of data")

File /cmmc/ptmp/pyironhb/mambaforge/envs/pyiron_latest/lib/python3.10/bz2.py:261, in BZ2File.seek(self, offset, whence)
    246 """Change the file position.
    247 
    248 The new position is specified by offset, relative to the
   (...)
    258 this operation may be extremely slow.
    259 """
    260 self._check_can_seek()
--> 261 return self._buffer.seek(offset, whence)

File /cmmc/ptmp/pyironhb/mambaforge/envs/pyiron_latest/lib/python3.10/_compression.py:153, in DecompressReader.seek(self, offset, whence)
    151 # Read and discard data until we reach the desired position.
    152 while offset > 0:
--> 153     data = self.read(min(io.DEFAULT_BUFFER_SIZE, offset))
    154     if not data:
    155         break

File /cmmc/ptmp/pyironhb/mambaforge/envs/pyiron_latest/lib/python3.10/_compression.py:103, in DecompressReader.read(self, size)
    101     else:
    102         rawblock = b""
--> 103     data = self._decompressor.decompress(rawblock, size)
    104 if data:
    105     break

KeyboardInterrupt: 

pmrv avatar Feb 23 '24 21:02 pmrv

File ~/software/pyiron_atomistics/pyiron_atomistics/atomistics/job/interactive.py:413, in GenericInteractiveOutput._key_from_hdf(self, key)
    401 def _key_from_hdf(self, key):
    402     """
    403     Get all entries from the HDF5 file for a specific key - stored under 'output/interactive/<key>'.  If not found
    404     there the key is looked up in the regular HDF storage location 'output/<key>' via the property on our super
   (...)
    411 
    412     """
--> 413     fetched = self._job["output/interactive/" + key]

This checks if output/interactive exists, so it should be changed to use:

   fetched = self._job.project_hdf["output/interactive/" + key]

jan-janssen avatar Feb 23 '24 21:02 jan-janssen

Good spot, will add it next week.

pmrv avatar Feb 23 '24 22:02 pmrv