polyfile
polyfile copied to clipboard
Example files that cause an unexpected (possibly infinite) delay while iterating over matches
On Windows with Python 3.11 and polyfile 0.5.2, processing the following files with the code demonstrated in the README seems to take forever:
- https://github.com/ahupp/python-magic/files/9231524/memblock.txt (1,399 bytes)
- https://github.com/ggerganov/whisper.cpp/blob/master/bindings/java/src/test/java/io/github/ggerganov/whispercpp/WhisperCppTest.java (4,178 bytes)
python-magic, using a years-old libmagic v4, reports an error mentioning regex and memory.
code
import polyfile
def from_file(file):
    """Print each MIME type and match description polyfile reports for *file*.

    The file is opened in binary mode, read fully into memory, and passed to
    polyfile's default ``MagicMatcher`` instance. Matches are printed as they
    are produced by the iterator returned from ``match()``.
    """
    # Echo the path first so the output shows which file is being processed.
    print(file)
    with open(file, "rb") as f:
        # the default instance automatically loads all file definitions
        for match in polyfile.magic.MagicMatcher.DEFAULT_INSTANCE.match(f.read()):
            for mimetype in match.mimetypes:
                # flush=True forces the line out immediately; presumably so that
                # output already produced stays visible if a later match stalls.
                print(f"Matched MIME: {mimetype}", flush=True)
            print(f"Match string: {match!s}", flush=True)
# Run the reproduction on each input in turn. Per the output captured in this
# report, the first call completes, the second prints one match and then
# stalls, and the third was not reached before the run was interrupted.
from_file("test3.py")
from_file("memblock.txt")
from_file("whisper.cpp/bindings/java/src/test/java/io/github/ggerganov/whispercpp/WhisperCppTest.java")
Output, including the stack trace produced by pressing Ctrl+C after waiting ~5 minutes:
09/05/2023 01:45:27 C:\Users\WDAGUtilityAccount\Desktop> python.exe .\test3.py
test3.py
Matched MIME: text/plain
Match string: ascii text
memblock.txt
Matched MIME: text/x-c
Match string: C source text
Traceback (most recent call last):
File "C:\Users\WDAGUtilityAccount\Desktop\test3.py", line 13, in <module>
from_file("memblock.txt")
File "C:\Users\WDAGUtilityAccount\Desktop\test3.py", line 7, in from_file
for match in polyfile.magic.MagicMatcher.DEFAULT_INSTANCE.match(f.read()):
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 2742, in match
if m and (not to_match.only_match_mime or any(t is not None for t in m.mimetypes)):
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 2513, in __bool__
return any(m for m in self.mimetypes) or any(e for e in self.extensions) or bool(self.message())
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 2513, in <genexpr>
return any(m for m in self.mimetypes) or any(e for e in self.extensions) or bool(self.message())
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\iterators.py", line 44, in __iter__
yield self[i]
~~~~^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\iterators.py", line 30, in __getitem__
self._items.append(next(self._source_iter))
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\iterators.py", line 54, in unique
for t in iterator:
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 2493, in <genexpr>
return LazyIterableSet((
^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 2543, in __iter__
yield self[i]
~~~~^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 2527, in __getitem__
result = next(self._result_iter)
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 928, in _match
yield from child._match(context=context, parent_match=m)
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 917, in _match
m = self.test(context.data, absolute_offset, parent_match)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 2103, in test
match = self.data_type.match(data[absolute_offset:], self.constant)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\WDAGUtilityAccount\scoop\apps\python\current\Lib\site-packages\polyfile\magic.py", line 1767, in match
m = expected.search(data[:self.length])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt
09/05/2023 01:50:32 C:\Users\WDAGUtilityAccount\Desktop>