yt-dlp
yt-dlp copied to clipboard
Unique names for dumped files with `--write-pages`
DO NOT REMOVE OR SKIP THE ISSUE TEMPLATE
- [X] I understand that I will be blocked if I intentionally remove or skip any mandatory* field
Checklist
- [X] I'm requesting a feature unrelated to a specific site
- [X] I've looked through the README
- [X] I've verified that I have updated yt-dlp to nightly or master (update instructions)
- [X] I've searched known issues and the bugtracker for similar issues including closed ones. DO NOT post duplicates
- [X] I've read the guidelines for opening an issue
Provide a description that is worded well enough to be understood
Thanks for the great program.
While debugging https://github.com/yt-dlp/yt-dlp/issues/9358, I found that the --write-pages
option dumps different responses to the same file.
The cause is that the dump file name is specified with only video_id
and url
in _request_dump_filename
in extractor/common.py
.
YouTube comment API is usually same URL, thus these responses will be stored in the same file: None_https_-_www.youtube.com_youtubei_v1_nextkey=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8_prettyPrint=false.dump
.
If the file already exists, it will be overwritten regardless of --no-overwrites
option.
This is inconvenient for debugging.
I suggest giving a unique name to every dump file.
Personally, I applied the following patch myself when debugging.
# This patch is in the public domain (CC0).
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1006,7 +1006,7 @@ def __check_blocked(self, content):
expected=True)
def _request_dump_filename(self, url, video_id):
- basen = f'{video_id}_{url}'
+ basen = f'{video_id}_{str(time.time())}_{url}'
trim_length = self.get_param('trim_file_name') or 240
if len(basen) > trim_length:
h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
Thanks.
Provide verbose output that clearly demonstrates the problem
- [ ] Run your yt-dlp command with -vU flag added (
yt-dlp -vU <your command line>
) - [ ] If using API, add
'verbose': True
to YoutubeDL
params instead - [ ] Copy the WHOLE output (starting with
[debug] Command-line config
) and insert it below
Complete Verbose Output
No response
the patch would break --load-pages
Adding timestamp will break --load-pages
. We'd need some way to encode the POSTed data into the name?
Encoding all the posted data would make the filename really long, but a hash of the post data should be unique enough, without making the filename too long.
In my environment, the following patch works correctly.
# This patch is in the public domain.
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1005,8 +1005,9 @@ def __check_blocked(self, content):
'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
expected=True)
- def _request_dump_filename(self, url, video_id):
- basen = f'{video_id}_{url}'
+ def _request_dump_filename(self, url, data, video_id):
+ hash = hashlib.sha256(str(data).encode('utf-8')).hexdigest()
+ basen = f'{video_id}_{hash}_{url}'
trim_length = self.get_param('trim_file_name') or 240
if len(basen) > trim_length:
h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
@@ -1037,7 +1038,7 @@ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errno
dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
if self.get_param('write_pages'):
- filename = self._request_dump_filename(urlh.url, video_id)
+ filename = self._request_dump_filename(urlh.url, urlh._requests_response.request.__dict__['body'], video_id)
self.to_screen(f'Saving request to {filename}')
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
@@ -1098,7 +1099,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote,
impersonate=None, require_impersonation=False):
if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query)
- filename = self._request_dump_filename(url_or_request.url, video_id)
+ filename = self._request_dump_filename(url_or_request.url, url_or_request.data, video_id)
self.to_screen(f'Loading request from {filename}')
try:
with open(filename, 'rb') as dumpf:
- filename = self._request_dump_filename(urlh.url, video_id)
+ filename = self._request_dump_filename(urlh.url, urlh._requests_response.request.__dict__['body'], video_id)
_requests_response
is not a public attribute and it would only work if the requests
handler was used
Okay, how about this?
# This patch is in the public domain (CC0).
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -957,7 +957,7 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
if urlh is False:
assert not fatal
return False
- content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+ content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data)
return (content, urlh)
@staticmethod
@@ -1005,8 +1005,9 @@ def __check_blocked(self, content):
'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
expected=True)
- def _request_dump_filename(self, url, video_id):
- basen = f'{video_id}_{url}'
+ def _request_dump_filename(self, url, data, video_id):
+ hash = hashlib.sha256(str(data).encode('utf-8')).hexdigest()
+ basen = f'{video_id}_{hash}_{url}'
trim_length = self.get_param('trim_file_name') or 240
if len(basen) > trim_length:
h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
@@ -1028,7 +1029,7 @@ def __decode_webpage(self, webpage_bytes, encoding, headers):
except LookupError:
return webpage_bytes.decode('utf-8', 'replace')
- def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None, data=None):
webpage_bytes = urlh.read()
if prefix is not None:
webpage_bytes = prefix + webpage_bytes
@@ -1037,7 +1038,7 @@ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errno
dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
if self.get_param('write_pages'):
- filename = self._request_dump_filename(urlh.url, video_id)
+ filename = self._request_dump_filename(urlh.url, data, video_id)
self.to_screen(f'Saving request to {filename}')
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
@@ -1098,7 +1099,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote,
impersonate=None, require_impersonation=False):
if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query)
- filename = self._request_dump_filename(url_or_request.url, video_id)
+ filename = self._request_dump_filename(url_or_request.url, url_or_request.data, video_id)
self.to_screen(f'Loading request from {filename}')
try:
with open(filename, 'rb') as dumpf:
@minamotorin I think that basically works. I would suggest a few changes though:
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index bebbc6b43..be9f07c38 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -957,7 +957,8 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
if urlh is False:
assert not fatal
return False
- content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+ content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
+ encoding=encoding, data=data)
return (content, urlh)
@staticmethod
@@ -1005,8 +1006,10 @@ def __check_blocked(self, content):
'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
expected=True)
- def _request_dump_filename(self, url, video_id):
- basen = f'{video_id}_{url}'
+ def _request_dump_filename(self, url, video_id, data=None):
+ if data is not None:
+ data = hashlib.md5(data).hexdigest()
+ basen = join_nonempty(video_id, data, url, delim='_')
trim_length = self.get_param('trim_file_name') or 240
if len(basen) > trim_length:
h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
@@ -1028,16 +1031,19 @@ def __decode_webpage(self, webpage_bytes, encoding, headers):
except LookupError:
return webpage_bytes.decode('utf-8', 'replace')
- def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+ def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
+ prefix=None, encoding=None, data=None):
webpage_bytes = urlh.read()
if prefix is not None:
webpage_bytes = prefix + webpage_bytes
+ if isinstance(url_or_request, Request):
+ data = data if data is not None else url_or_request.data
if self.get_param('dump_intermediate_pages', False):
self.to_screen('Dumping request to ' + urlh.url)
dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
if self.get_param('write_pages'):
- filename = self._request_dump_filename(urlh.url, video_id)
+ filename = self._request_dump_filename(urlh.url, video_id, data)
self.to_screen(f'Saving request to {filename}')
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
@@ -1098,7 +1104,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote,
impersonate=None, require_impersonation=False):
if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query)
- filename = self._request_dump_filename(url_or_request.url, video_id)
+ filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
self.to_screen(f'Loading request from {filename}')
try:
with open(filename, 'rb') as dumpf:
@bashonly Thanks for the suggestion. I created a PR. https://github.com/yt-dlp/yt-dlp/pull/9879