YouTube-operational-API
YouTube-operational-API copied to clipboard
Extend `minimizeCURL.py` to minimize Protobuf
Let us try to proceed by hand first for a web-scraping usage:
https://www.youtube.com/playlist?list=UUWeg2Pkate69NFdBeuRFTAw (with a private Firefox window)
minimizeCURL curl.sh 'XrruklOv8X0'
Note that minimizing on the field name `playlistVideoRenderer` makes us remove the pagination-related data in the following.
curl https://www.youtube.com/youtubei/v1/browse -H 'Content-Type: application/json' --data-raw '{"context": {"client": {"clientName": "WEB", "clientVersion": "2.20240313.05.00"}}, "continuation": "4qmFsgLmARIaVkxVVVdlZzJQa2F0ZTY5TkZkQmV1UkZUQXcarAFDQUY2ZlZCVU9rTkhVV2xGUkU1RVQwUlpNRTlWVlhoT1ZGVXlUMFJyTVZFd1VXOUJWV3BvYkdaTFpEaFFkVVZCTVVGQ1YycG5hVkV5YUc5V2JGcFhXa2Q0WVdWcmNGSlpWRXBIVFVad1ZWZFVWbFZoTVhCeVZWY3hWMDFXVm5KWGJGWlNWMGRPVkZKRlJuQk5WRTAxWlZoYVEyRkZVblpOYlZFeVYwVkdia2xumgIYVVVXZWcyUGthdGU2OU5GZEJldVJGVEF3"}'
Python script using hardcoded continuation base64 encoded string:
import requests
url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
'Content-Type': 'application/json'
}
data = {
'context': {
'client': {
'clientName': 'WEB',
'clientVersion': '2.20240313.05.00'
}
},
'continuation': '4qmFsgLmARIaVkxVVVdlZzJQa2F0ZTY5TkZkQmV1UkZUQXcarAFDQUY2ZlZCVU9rTkhVV2xGUkU1RVQwUlpNRTlWVlhoT1ZGVXlUMFJyTVZFd1VXOUJWV3BvYkdaTFpEaFFkVVZCTVVGQ1YycG5hVkV5YUc5V2JGcFhXa2Q0WVdWcmNGSlpWRXBIVFVad1ZWZFVWbFZoTVhCeVZWY3hWMDFXVm5KWGJGWlNWMGRPVkZKRlJuQk5WRTAxWlZoYVEyRkZVblpOYlZFeVYwVkdia2xumgIYVVVXZWcyUGthdGU2OU5GZEJldVJGVEF3'
}
data = requests.post(url, headers = headers, json = data).json()
print('XrruklOv8X0' in str(data))
Python script first level decoded Protobuf:
import requests
import blackboxprotobuf
import base64
def getBase64Protobuf(message, typedef):
    """Encode `message` according to `typedef` with `blackboxprotobuf` and return the result as a Base64 ASCII string."""
    protobufBytes = blackboxprotobuf.encode_message(message, typedef)
    base64Bytes = base64.b64encode(protobufBytes)
    return base64Bytes.decode('ascii')
message = {
'80226972': {
'2': 'VLUUWeg2Pkate69NFdBeuRFTAw',
'3': 'CAF6fVBUOkNHUWlFRE5ET0RZME9VVXhOVFUyT0RrMVEwUW9BVWpobGZLZDhQdUVBMUFCV2pnaVEyaG9WbFpXWkd4YWVrcFJZVEpHTUZwVVdUVlVhMXByVVcxV01WVnJXbFZSV0dOVFJFRnBNVE01ZVhaQ2FFUnZNbVEyV0VGbkln',
'35': 'UUWeg2Pkate69NFdBeuRFTAw'
}
}
typedef = {
'80226972': {
'type': 'message',
'message_typedef': {
'2': {
'type': 'string'
},
'3': {
'type': 'string'
},
'35': {
'type': 'string'
}
},
'field_order': [
'2',
'3',
'35'
]
}
}
continuation = getBase64Protobuf(message, typedef)
url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
'Content-Type': 'application/json'
}
data = {
'context': {
'client': {
'clientName': 'WEB',
'clientVersion': '2.20240313.05.00'
}
},
'continuation': continuation
}
data = requests.post(url, headers = headers, json = data).json()
print('XrruklOv8X0' in str(data))
Just modifying the message values seems to show that only fields 2 and 3 are actually needed, so the typedef can be simplified accordingly.
So get:
Python script first level decoded Protobuf simplified:
import requests
import blackboxprotobuf
import base64
def getBase64Protobuf(message, typedef):
    """Encode `message` according to `typedef` with `blackboxprotobuf` and return the result as a Base64 ASCII string."""
    protobufBytes = blackboxprotobuf.encode_message(message, typedef)
    base64Bytes = base64.b64encode(protobufBytes)
    return base64Bytes.decode('ascii')
message = {
'80226972': {
'2': 'VLUUWeg2Pkate69NFdBeuRFTAw',
'3': 'CAF6fVBUOkNHUWlFRE5ET0RZME9VVXhOVFUyT0RrMVEwUW9BVWpobGZLZDhQdUVBMUFCV2pnaVEyaG9WbFpXWkd4YWVrcFJZVEpHTUZwVVdUVlVhMXByVVcxV01WVnJXbFZSV0dOVFJFRnBNVE01ZVhaQ2FFUnZNbVEyV0VGbkln',
}
}
typedef = {
'80226972': {
'type': 'message',
'message_typedef': {
'2': {
'type': 'string'
},
'3': {
'type': 'string'
},
},
'field_order': [
'2',
'3',
]
}
}
continuation = getBase64Protobuf(message, typedef)
url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
'Content-Type': 'application/json'
}
data = {
'context': {
'client': {
'clientName': 'WEB',
'clientVersion': '2.20240313.05.00'
}
},
'continuation': continuation
}
data = requests.post(url, headers = headers, json = data).json()
print('XrruklOv8X0' in str(data))
But in my case field 3 is the pagination Protobuf, so I have to proceed recursively:
Python script second level decoded Protobuf:
import requests
import blackboxprotobuf
import base64
def getBase64Protobuf(message, typedef):
    """Encode `message` according to `typedef` with `blackboxprotobuf` and return the result as a Base64 ASCII string."""
    protobufBytes = blackboxprotobuf.encode_message(message, typedef)
    base64Bytes = base64.b64encode(protobufBytes)
    return base64Bytes.decode('ascii')
message = {
'1': 1,
'15': 'PT:CGQiEDNDODY0OUUxNTU2ODk1Q0QoAUjhlfKd8PuEA1ABWjgiQ2hoVlZWZGxaekpRYTJGMFpUWTVUa1prUW1WMVVrWlVRWGNTREFpMTM5eXZCaERvMmQ2WEFnIg'
}
typedef = {
'1': {
'type': 'int'
},
'15': {
'type': 'string'
}
}
three = getBase64Protobuf(message, typedef)
message = {
'80226972': {
'2': 'VLUUWeg2Pkate69NFdBeuRFTAw',
'3': three,
}
}
typedef = {
'80226972': {
'type': 'message',
'message_typedef': {
'2': {
'type': 'string'
},
'3': {
'type': 'string'
},
},
'field_order': [
'2',
'3',
]
}
}
continuation = getBase64Protobuf(message, typedef)
url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
'Content-Type': 'application/json'
}
data = {
'context': {
'client': {
'clientName': 'WEB',
'clientVersion': '2.20240313.05.00'
}
},
'continuation': continuation
}
data = requests.post(url, headers = headers, json = data).json()
print('XrruklOv8X0' in str(data))
1 is useless, hence get:
Python script second level decoded Protobuf simplified:
import requests
import blackboxprotobuf
import base64
def getBase64Protobuf(message, typedef):
    """Encode `message` according to `typedef` with `blackboxprotobuf` and return the result as a Base64 ASCII string."""
    protobufBytes = blackboxprotobuf.encode_message(message, typedef)
    base64Bytes = base64.b64encode(protobufBytes)
    return base64Bytes.decode('ascii')
message = {
'15': 'PT:CGQiEDNDODY0OUUxNTU2ODk1Q0QoAUjhlfKd8PuEA1ABWjgiQ2hoVlZWZGxaekpRYTJGMFpUWTVUa1prUW1WMVVrWlVRWGNTREFpMTM5eXZCaERvMmQ2WEFnIg'
}
typedef = {
'15': {
'type': 'string'
}
}
three = getBase64Protobuf(message, typedef)
message = {
'80226972': {
'2': 'VLUUWeg2Pkate69NFdBeuRFTAw',
'3': three,
}
}
typedef = {
'80226972': {
'type': 'message',
'message_typedef': {
'2': {
'type': 'string'
},
'3': {
'type': 'string'
},
},
'field_order': [
'2',
'3',
]
}
}
continuation = getBase64Protobuf(message, typedef)
url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
'Content-Type': 'application/json'
}
data = {
'context': {
'client': {
'clientName': 'WEB',
'clientVersion': '2.20240313.05.00'
}
},
'continuation': continuation
}
data = requests.post(url, headers = headers, json = data).json()
print('XrruklOv8X0' in str(data))
15 is another Protobuf, hence use:
Python script third level decoded Protobuf:
import requests
import blackboxprotobuf
import base64
def getBase64Protobuf(message, typedef):
    """Encode `message` according to `typedef` with `blackboxprotobuf` and return the result as a Base64 ASCII string."""
    protobufBytes = blackboxprotobuf.encode_message(message, typedef)
    base64Bytes = base64.b64encode(protobufBytes)
    return base64Bytes.decode('ascii')
message = {
'1': 100,
'4': '3C8649E1556895CD',
'5': 1,
'9': 1710698421586657,
'10': 1,
'11': '"ChhVVVdlZzJQa2F0ZTY5TkZkQmV1UkZUQXcSDAi139yvBhDo2d6XAg"'
}
typedef = {
'1': {
'type': 'int'
},
'4': {
'type': 'string'
},
'5': {
'type': 'int'
},
'9': {
'type': 'int'
},
'10': {
'type': 'int'
},
'11': {
'type': 'string'
}
}
fifteen = getBase64Protobuf(message, typedef)
message = {
'15': f'PT:{fifteen}'
}
typedef = {
'15': {
'type': 'string'
}
}
three = getBase64Protobuf(message, typedef)
message = {
'80226972': {
'2': 'VLUUWeg2Pkate69NFdBeuRFTAw',
'3': three,
}
}
typedef = {
'80226972': {
'type': 'message',
'message_typedef': {
'2': {
'type': 'string'
},
'3': {
'type': 'string'
},
},
'field_order': [
'2',
'3',
]
}
}
continuation = getBase64Protobuf(message, typedef)
url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
'Content-Type': 'application/json'
}
data = {
'context': {
'client': {
'clientName': 'WEB',
'clientVersion': '2.20240313.05.00'
}
},
'continuation': continuation
}
data = requests.post(url, headers = headers, json = data).json()
print('XrruklOv8X0' in str(data))
After simplifying get:
Python script third level decoded Protobuf simplified:
import requests
import blackboxprotobuf
import base64
def getBase64Protobuf(message, typedef):
    """Encode `message` according to `typedef` with `blackboxprotobuf` and return the result as a Base64 ASCII string."""
    protobufBytes = blackboxprotobuf.encode_message(message, typedef)
    base64Bytes = base64.b64encode(protobufBytes)
    return base64Bytes.decode('ascii')
message = {
'1': 100,
}
typedef = {
'1': {
'type': 'int'
},
}
fifteen = getBase64Protobuf(message, typedef)
message = {
'15': f'PT:{fifteen}'
}
typedef = {
'15': {
'type': 'string'
}
}
three = getBase64Protobuf(message, typedef)
message = {
'80226972': {
'2': 'VLUUWeg2Pkate69NFdBeuRFTAw',
'3': three,
}
}
typedef = {
'80226972': {
'type': 'message',
'message_typedef': {
'2': {
'type': 'string'
},
'3': {
'type': 'string'
},
},
'field_order': [
'2',
'3',
]
}
}
continuation = getBase64Protobuf(message, typedef)
url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
'Content-Type': 'application/json'
}
data = {
'context': {
'client': {
'clientName': 'WEB',
'clientVersion': '2.20240313.05.00'
}
},
'continuation': continuation
}
data = requests.post(url, headers = headers, json = data).json()
print('XrruklOv8X0' in str(data))
Let us proceed by hand for a YouTube Data API v3 usage now:
https://www.youtube.com/playlist?list=UUWeg2Pkate69NFdBeuRFTAw
Looking for entry 75: rntZOALPknU.
Retrieve first YouTube Data API v3 playlistItems nextPageToken:
import requests
CHANNEL_ID = 'UCWeg2Pkate69NFdBeuRFTAw'
PLAYLIST_ID = 'UU' + CHANNEL_ID[2:]
URL = 'https://yt.lemnoslife.com/noKey/playlistItems'
params = {
'part': ','.join(['snippet']),
'playlistId': PLAYLIST_ID,
'maxResults' : 50,
'fields': 'nextPageToken',
}
response = requests.get(URL, params = params).json()
print(response['nextPageToken'])
Check if rntZOALPknU is part of the second page:
import requests
CHANNEL_ID = 'UCWeg2Pkate69NFdBeuRFTAw'
PLAYLIST_ID = 'UU' + CHANNEL_ID[2:]
URL = 'https://yt.lemnoslife.com/noKey/playlistItems'
params = {
'part': ','.join(['snippet']),
'maxResults' : 50,
'fields': 'items/snippet/resourceId/videoId',
'pageToken': 'EAAafVBUOkNESWlFRVEyTjBSRk1EazFRMEkxT0RWRU1UZ29BVWpobGZLZDhQdUVBMUFCV2pnaVEyaG9WbFpXWkd4YWVrcFJZVEpHTUZwVVdUVlVhMXByVVcxV01WVnJXbFZSV0dOVFJFRnBNVE01ZVhaQ2FFUnZNbVEyV0VGbkln'
}
response = requests.get(URL, params = params).json()
print('rntZOALPknU' in str(response))
Note that providing `pageToken` does not enable us to remove any of the fields `part`, `maxResults` and `fields`.
Expanding Protobuf:
Python script first level decoded Protobuf:
import requests
import blackboxprotobuf
import base64
def getBase64Protobuf(message, typedef):
    """Encode `message` according to `typedef` with `blackboxprotobuf` and return the result as a Base64 ASCII string."""
    protobufBytes = blackboxprotobuf.encode_message(message, typedef)
    base64Bytes = base64.b64encode(protobufBytes)
    return base64Bytes.decode('ascii')
message = {
'2': 0,
'3': 'PT:CDIiEEQ2N0RFMDk1Q0I1ODVEMTgoAUjhlfKd8PuEA1ABWjgiQ2hoVlZWZGxaekpRYTJGMFpUWTVUa1prUW1WMVVrWlVRWGNTREFpMTM5eXZCaERvMmQ2WEFnIg'
}
typedef = {
'2': {
'type': 'int'
},
'3': {
'type': 'string'
}
}
pageToken = getBase64Protobuf(message, typedef)
CHANNEL_ID = 'UCWeg2Pkate69NFdBeuRFTAw'
PLAYLIST_ID = 'UU' + CHANNEL_ID[2:]
URL = 'https://yt.lemnoslife.com/noKey/playlistItems'
params = {
'part': ','.join(['snippet']),
'playlistId': PLAYLIST_ID,
'maxResults' : 50,
'fields': 'items/snippet/resourceId/videoId',
'pageToken': pageToken
}
response = requests.get(URL, params = params).json()
print('rntZOALPknU' in str(response))
I verified that none of these Protobuf fields can be removed; now let us expand field 3:
Python script second level decoded Protobuf:
import requests
import blackboxprotobuf
import base64
def getBase64Protobuf(message, typedef):
    """Encode `message` according to `typedef` with `blackboxprotobuf` and return the result as a Base64 ASCII string."""
    protobufBytes = blackboxprotobuf.encode_message(message, typedef)
    base64Bytes = base64.b64encode(protobufBytes)
    return base64Bytes.decode('ascii')
message = {
'1': 50,
'4': 'D67DE095CB585D18',
'5': 1,
'9': 1710698421586657,
'10': 1,
'11': '"ChhVVVdlZzJQa2F0ZTY5TkZkQmV1UkZUQXcSDAi139yvBhDo2d6XAg"'
}
typedef = {
'1': {
'type': 'int'
},
'4': {
'type': 'string'
},
'5': {
'type': 'int'
},
'9': {
'type': 'int'
},
'10': {
'type': 'int'
},
'11': {
'type': 'string'
}
}
three = getBase64Protobuf(message, typedef)
message = {
'2': 0,
'3': f'PT:{three}'
}
typedef = {
'2': {
'type': 'int'
},
'3': {
'type': 'string'
}
}
pageToken = getBase64Protobuf(message, typedef)
CHANNEL_ID = 'UCWeg2Pkate69NFdBeuRFTAw'
PLAYLIST_ID = 'UU' + CHANNEL_ID[2:]
URL = 'https://yt.lemnoslife.com/noKey/playlistItems'
params = {
'part': ','.join(['snippet']),
'playlistId': PLAYLIST_ID,
'maxResults' : 50,
'fields': 'items/snippet/resourceId/videoId',
'pageToken': pageToken
}
response = requests.get(URL, params = params).json()
print('rntZOALPknU' in str(response))
Now let us minimize:
Python script second level decoded Protobuf simplified:
import requests
import blackboxprotobuf
import base64
def getBase64Protobuf(message, typedef):
    """Encode `message` according to `typedef` with `blackboxprotobuf` and return the result as a Base64 ASCII string."""
    protobufBytes = blackboxprotobuf.encode_message(message, typedef)
    base64Bytes = base64.b64encode(protobufBytes)
    return base64Bytes.decode('ascii')
message = {
'1': 50,
}
typedef = {
'1': {
'type': 'int'
},
}
three = getBase64Protobuf(message, typedef)
message = {
'2': 0,
'3': f'PT:{three}'
}
typedef = {
'2': {
'type': 'int'
},
'3': {
'type': 'string'
}
}
pageToken = getBase64Protobuf(message, typedef)
CHANNEL_ID = 'UCWeg2Pkate69NFdBeuRFTAw'
PLAYLIST_ID = 'UU' + CHANNEL_ID[2:]
URL = 'https://yt.lemnoslife.com/noKey/playlistItems'
params = {
'part': ','.join(['snippet']),
'playlistId': PLAYLIST_ID,
'maxResults' : 50,
'fields': 'items/snippet/resourceId/videoId',
'pageToken': pageToken
}
response = requests.get(URL, params = params).json()
print('rntZOALPknU' in str(response))
Can we avoid the multiple `getBase64Protobuf` calls? While concatenating the levels seems more difficult, a simpler usage would be nice, as this script is going to be needed.
If you get a Base64 decoding error, do not pay attention to it and Protobuf decode anyway thanks to `base64.urlsafe_b64decode`. The deep reason for this should be investigated. Specifying `altchars = '-/'` solves the issue; `-_` should probably be used instead, as indicated in the Stack Overflow answer 1638761. `urlsafe_b64decode` is maybe more appropriate than using `altchars`, as it seems equivalent to specifying `altchars = '-_'`.
In fact:
0ofMyAOUAhpeQ2lrcUp3b1lWVU4wTlZWVFdYQjZlazFEV1docmFYSldVVWRJZDB0UkVnczVNWEV6Ym5Ob1JWVjRjeG9UNnFqZHVRRU5DZ3M1TVhFemJuTm9SVlY0Y3lBQ01BQSUzRCjluNHtlIiGAzAAQAJKbggAGAAgAEoKCAEQABgAIAAwAFDJxOSKl4iGA1gDeACiAQCqAQwQABoAIgAqBAgAEACwAQDAAQDIAcnE5IqXiIYD4gEMCO39grIGEI7Y058D6AEA8AEA-AEAiAIAkAIAmgIMCIP-grIGEL2N15ACULCX_u2UiIYDWOPLlOyUiIYDggEECAQYAYgBAJoBAggAoAGLm_eZl4iGA7oBAggK0AH4_YKyBg==
requires -_ altchars.
Unclear how to minimize the following, as even with urlsafe_b64decode, I get for blackboxprotobuf.decode_message:
blackboxprotobuf.lib.exceptions.DecoderException: Encountered error decoding field 6: GROUP wire types not supported:
Traceback (most recent call last):
File "<tmp 7>", line 6, in <module>
message, typedef = blackboxprotobuf.decode_message(data)
File "/home/benjamin/.local/lib/python3.10/site-packages/blackboxprotobuf/lib/api.py", line 86, in decode_message
value, typedef, _, _ = blackboxprotobuf.lib.types.length_delim.decode_message(
File "/home/benjamin/.local/lib/python3.10/site-packages/blackboxprotobuf/lib/types/length_delim.py", line 320, in decode_message
grouped_fields, field_order, pos = _group_by_number(buf, pos, end, path)
File "/home/benjamin/.local/lib/python3.10/site-packages/blackboxprotobuf/lib/types/length_delim.py", line 439, in _group_by_number
raise DecoderException("GROUP wire types not supported", path=field_path)
blackboxprotobuf.lib.exceptions.DecoderException: Encountered error decoding field 6: GROUP wire types not supported
Maybe it is a blackboxprotobuf issue.
Unclear array of bytes:
b'\x01\xf8\xea\xd2\xf3M\xcdA\xde\xc25cH^\x823\x8ad\x85\\\xfc\x10\x9b\x08\xa1\x06
\x8b\xeb\xb9\nE\xb7\x94\xca\xd0NA\x90epa\x18\xce\x00@\xbc\tx\xd0dY\xc6\x1fPT\xd7
\x0c\x96\xe4\xdf\x0f\x04\xe2\xa4\xe4\xb2j\x8e\xe0)\x97v\xa1\x83\x15\xdd\xd1\xc8\
x07@_\xd8\xdb$\x8aNL\x0e<\xc9\xb4\xa2\xa7\xd8\x88\xbfO\xe0\nS_\xf2\xd6\n\\\x99U\
xe78+\x913>I\x13\xe9\xf7\xcf\x07\xf2\x9bI`\x82+FW\xec\xae\xccPU\x0f\xf5%\xe0\xf4
m\xd7o h\x9b\x87Y\x19;\xd4\xc5\xce\xca\xc0@\x9c\xac-\x11a\x16\xc1<\xe9\xf2c\x8c%
O\x89\xaa\x07\xd2\xc5{\x0ct\xf0\x0c\xbe\x94\r%9\xab\x89\x03\xe1\xf1\x08\xc1E\xe3
\x8d\xd2Y\x13r\xe5U\x87\xa5\x93\xff\xc3\x14P\x81\xb1\x05q\xeb\xa6\x1b\xd5\xd9\x0
4(\xd2\xcbT\xcd\x02>J\r\xef9t$)k\xee\xb8\x0e\x9bl\x81\xce\xd6\x13=5\x9f\xb9\xbb\
xbd%\xc9\x93=\x11\x94\xcc\x1cN\x04\x13\xa2&\xa7\xca\xf7\x03o\x0f\x7fZ6\'\x94N[\r
\x13\xde\xbf\xaa\x08_t\xec\xe5\xf2Tv\x88\x87\xdf\x80\x00|\x02\r\x11\xe8\xd5\x03+
\xce\xf2[\xb7G\xdc\xc5:\x8a\x04+\xca\x9d\xf9C\xa8\xc9\x92u\xbcv7\xfc\x92T\xd1\x1
dx\xba\\"\xc8\x04e\xf8\xb6u\xdd\xff+0\xc0\x92f\xc0y\xcc)\x9fk\xe5|8\x0c\x96\xf7\
xd7\xd2\xf1k\x0c\xc9\x80\x03\xb9x(\xf8\xbe\xbf\n[\x15\x01%\xd6.X\xa4\xb8\xd8\xe0
/\x8f\xda\xa7\xa2{O)6Nw5\xb7a\x17\xa3\xee0\x8c\xfe\xf1\xf9\xd1\xf2b\xfa6\xdf\xe7
\xe0\xa2\x89\x1c\x8f\'\xe8\xeb\x98\xc9o\xc5\xc8z\x055\x13\xcb\xac\xfa\xd4\xe1\xf
a#\xc7\xeb}f4\x96:\xca\x03\xf3\xd9\x025\xb4D\x18\xc2c\xeaB\\\xd3\xddc\xed\x9e\x8
b\x9c2>\xff\xda\xc0\xe5Q\xa5;\x1bw>)i\x93\xc9O\xba{\xcb\x83\xc2W\xcd1\xb7\xbb\xa
1\xe5"\\\xf2\x15\xd0AdI#\x9b\xb8y\xaf\xc0\x85\xf6\xc3\xe4\xff5\xffT=w{R\xc7u\xe3
f^n\xe0\xdcjY\xa6W\xc9\x01\x7fG\xacs\x05$oor\xf5\xaf'
does not seem to help, and neither does CyberChef.
Unclear purpose Python script:
import requests
import blackboxprotobuf
import base64
def getBase64Protobuf(message, typedef):
    """
    Encode `message` according to `typedef` with `blackboxprotobuf` and return the result as a URL-safe (`-_` alphabet) Base64 ASCII string.
    """
    data = blackboxprotobuf.encode_message(message, typedef)
    # `base64.b64encode` returns `bytes`. Decode to `str` so that the value can be used as a JSON body value
    # (the standard `json` encoder rejects `bytes`) and so that this helper returns the same type as the
    # other `getBase64Protobuf` definitions.
    return base64.b64encode(data, altchars = b'-_').decode('ascii')
message = {
'110': {
'3': {
'15': {
'1': {
'1': 'Afjq0vNNzUHewjVjSF6CM4pkhVz8EJsIoQaL67kKRbeUytBOQZBlcGEYzgBAvAl40GRZxh9QVNcMluTfDwTipOSyao7gKZd2oYMV3dHIB0Bf2Nskik5MDjzJtKKn2Ii_T-AKU1_y1gpcmVXnOCuRMz5JE-n3zwfym0lggitGV-yuzFBVD_Ul4PRt128gaJuHWRk71MXOysBAnKwtEWEWwTzp8mOMJU-JqgfSxXsMdPAMvpQNJTmriQPh8QjBReON0lkTcuVVh6WT_8MUUIGxBXHrphvV2QQo0stUzQI-Sg3vOXQkKWvuuA6bbIHO1hM9NZ-5u70lyZM9EZTMHE4EE6Imp8r3A28Pf1o2J5ROWw0T3r-qCF907OXyVHaIh9-AAHwCDRHo1QMrzvJbt0fcxTqKBCvKnflDqMmSdbx2N_ySVNEdeLpcIsgEZfi2dd3_KzDAkmbAecwpn2vlfDgMlvfX0vFrDMmAA7l4KPi-vwpbFQEl1i5YpLjY4C-P2qeie08pNk53NbdhF6PuMIz-8fnR8mL6Nt_n4KKJHI8n6OuYyW_FyHoFNRPLrPrU4fojx-t9ZjSWOsoD89kCNbREGMJj6kJc091j7Z6LnDI-_9rA5VGlOxt3Pilpk8lPunvLg8JXzTG3u6HlIlzyFdBBZEkjm7h5r8CF9sPk_zX_VD13e1LHdeNmXm7g3GpZplfJAX9HrHMFJG9vcvWv',
'2': '661ecf48-0000-2e08-9784-d4f547fd991c'
},
'3': 1
}
}
}
}
typedef = {
'110': {
'type': 'message',
'message_typedef': {
'3': {
'type': 'message',
'message_typedef': {
'15': {
'type': 'message',
'message_typedef': {
'1': {
'type': 'message',
'message_typedef': {
'1': {
'type': 'string'
},
'2': {
'type': 'string'
}
},
'field_order': [
'1',
'2'
]
},
'3': {
'type': 'int'
}
},
'field_order': [
'1',
'3'
]
}
},
'field_order': [
'15'
]
}
},
'field_order': [
'3'
]
}
}
three = getBase64Protobuf(message, typedef)
message = {
'80226972': {
'2': 'UCWeg2Pkate69NFdBeuRFTAw',
'3': three
}
}
typedef = {
'80226972': {
'type': 'message',
'message_typedef': {
'2': {
'type': 'string'
},
'3': {
'type': 'string'
}
},
'field_order': [
'2',
'3'
]
}
}
continuation = getBase64Protobuf(message, typedef)
url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
'Content-Type': 'application/json'
}
data = {
'context': {
'client': {
'clientName': 'WEB',
'clientVersion': '2.20240313.05.00'
}
},
'continuation': continuation
}
data = requests.post(url, headers = headers, json = data).json()
print('Ce film était une très mauvaise idée' in str(data))
Note that .decode('ascii') in base64.b64encode(data).decode('ascii') does not seem necessary but removing it is debatable in my opinion, I prefer strings over arrays of bytes.
Even with considering double base64 encoding I am unable to make progress.
This could be integrated into `minimizeCURL.py` by automatically detecting the fields that are Protobuf encoded.
Maybe first having an explicit request generator not relying on Base64 black box would be a good start.
minimizeCURL curl.sh 'ADHD Test'
...
curl https://www.youtube.com/youtubei/v1/search -H 'Content-Type: application/json' --data-raw '{"context": {"client": {"clientName": "WEB", "clientVersion": "2.20240909.02.00"}}, "continuation": "EugNEgR0ZXN0GtwNRXBzS2tnR1hDanFWQVJLT0FTaGhJSGx2ZFhSMVltVmZjMmh2Y25SelgyVnNhV2RwWW14bElEcDBlWEJsT25JZ0tHNGdlVzkxZEhWaVpWOW1iR0ZuWDJoaGMxOXdjbVZ0YVdWeVpWOTJhV1JsYjE5dFpYUmhaR0YwWVQweElEcDBlWEJsT25JcElDaHVJSGx2ZFhSMVltVmZabXhoWjE5b1lYTmZiR2wyWlY5emRISmxZVzFmYldWMFlXUmhkR0U5TVNBNmRIbHdaVHB5S1NrNEFXQWFTdndJQ2lZS0JIUmxjM1RxQVE4S0RWb0xDZ2NJaGdFU0FCZ0xHR0h5QVFVS0EwRnNiTmdDQWJnRFlRcTFBZW9CRHdvTldnc0tCd2lHQVJJQUdBc1lHdklCQ0FvR1UyaHZjblJ6d2dLT0FTaGhJSGx2ZFhSMVltVmZjMmh2Y25SelgyVnNhV2RwWW14bElEcDBlWEJsT25JZ0tHNGdlVzkxZEhWaVpWOW1iR0ZuWDJoaGMxOXdjbVZ0YVdWeVpWOTJhV1JsYjE5dFpYUmhaR0YwWVQweElEcDBlWEJsT25JcElDaHVJSGx2ZFhSMVltVmZabXhoWjE5b1lYTmZiR2wyWlY5emRISmxZVzFmYldWMFlXUmhkR0U5TVNBNmRIbHdaVHB5S1NuNEFnRzRBeG9LVE9vQkR3b05XZ3NLQndpR0FSSUFHQXNZU1BJQkNBb0dWbWxrWlc5endnSWFlVzkxZEhWaVpWOTJhV1JsYjE5d1lXZGxJRHAwZVhCbE9uTG9BZ0dvQXdHNEEwakFBd0hJQXdIUUF3RUtLT29CRHdvTldnc0tCd2lHQVJJQUdBc1lIUElCQ3dvSlZXNTNZWFJqYUdWa3lnSUNDQUc0QXh3S0p1b0JEd29OV2dzS0J3aUdBUklBR0FzWVN2SUJDUW9IVjJGMFkyaGxaTW9DQWhnQ3VBTktDaTFnQi1vQkR3b05XZ3NLQndpR0FSSUFHQXNZQl9JQkV3b1JVbVZqWlc1MGJIa2dkWEJzYjJGa1pXUzRBd2NLUy1vQkR3b05XZ3NLQndpR0FSSUFHQXNZQlBJQkJnb0VUR2wyWmNJQ0ozbHZkWFIxWW1WZmJHbDJaVjlpY205aFpHTmhjM1JmYzNSaGRIVnpQVEFnT25SNWNHVTZjdWdDQWJnREJBb19DaEIyYjJ4MmJ5QmpjbUZ6YUNCMFpYTjA2Z0VhQ2hoYUZnb0hDSVlCRWdBWUN4SUxkbTlzZG05ZlkzSmhjMmp5QVEwS0MxWnZiSFp2SUdOeVlYTm9DaTBLQ25SbGMzUWdiWFZ6YVdQcUFSUUtFbG9RQ2djSWhnRVNBQmdMRWdWdGRYTnBZX0lCQndvRlRYVnphV01LT1FvT1kyRnlJR055WVhOb0lIUmxjM1RxQVJnS0Zsb1VDZ2NJaGdFU0FCZ0xFZ2xqWVhKZlkzSmhjMmp5QVFzS0NVTmhjaUJqY21GemFBb2tDZ2RwY1NCMFpYTjA2Z0VSQ2c5YURRb0hDSVlCRWdBWUN4SUNhWEh5QVFRS0FrbHhDaTBLQ25SbGMzUWdZWFZrYVdfcUFSUUtFbG9RQ2djSWhnRVNBQmdMRWdWaGRXUnBiX0lCQndvRlFYVmthVzhLUHdvUWNHVnljMjl1WVd4cGRIa2dkR1Z6ZE9vQkdnb1lXaFlLQndpR0FSSUFHQXNTQzNCbGNuTnZibUZzYVhSNThnRU5DZ3RRWlhKemIyNWhiR2wwZVFvb0NnVjBaWE4wYi1vQkZBb1N
XaEFLQndpR0FSSUFHQXNTQlhSbGMzUnY4Z0VIQ2dWVVpYTjBid29rQ2dkMFpYTjBJRzFsNmdFUkNnOWFEUW9IQ0lZQkVnQVlDeElDYldYeUFRUUtBazFsQ2pRS0NYUmxjM1FnZEdWemRPb0JHQW9XV2hRS0J3aUdBUklBR0FzU0NYUmxjM1JmZEdWemRQSUJDd29KVkdWemRDQjBaWE4wQ2ljS0NHVjVaU0IwWlhOMDZnRVNDaEJhRGdvSENJWUJFZ0FZQ3hJRFpYbGw4Z0VGQ2dORmVXVUtQQW9QYldWdWRHRnNJR0ZuWlNCMFpYTjA2Z0VaQ2hkYUZRb0hDSVlCRWdBWUN4SUtiV1Z1ZEdGc1gyRm5aZklCREFvS1RXVnVkR0ZzSUdGblpRb19DaEJqYjJ4dmNpQmliR2x1WkNCMFpYTjA2Z0VhQ2hoYUZnb0hDSVlCRWdBWUN4SUxZMjlzYjNKZllteHBibVR5QVEwS0MwTnZiRzl5SUdKc2FXNWtHQXRhRFFvTENBUXFCd2lHQVJJQUdBdDRBQSUzRCUzRJABARiB4OgYIgtzZWFyY2gtcGFnZQ%3D%3D"}'
Python script with hardcoded continuation:
import requests
import json
URL = 'https://www.youtube.com/youtubei/v1/search'
HEADERS = {
'Content-Type': 'application/json',
}
data = {
"context": {
"client": {
"clientName": "WEB",
"clientVersion": "2.20240909.02.00",
}
},
"continuation": "EugNEgR0ZXN0GtwNRXBzS2tnR1hDanFWQVJLT0FTaGhJSGx2ZFhSMVltVmZjMmh2Y25SelgyVnNhV2RwWW14bElEcDBlWEJsT25JZ0tHNGdlVzkxZEhWaVpWOW1iR0ZuWDJoaGMxOXdjbVZ0YVdWeVpWOTJhV1JsYjE5dFpYUmhaR0YwWVQweElEcDBlWEJsT25JcElDaHVJSGx2ZFhSMVltVmZabXhoWjE5b1lYTmZiR2wyWlY5emRISmxZVzFmYldWMFlXUmhkR0U5TVNBNmRIbHdaVHB5S1NrNEFXQWFTdndJQ2lZS0JIUmxjM1RxQVE4S0RWb0xDZ2NJaGdFU0FCZ0xHR0h5QVFVS0EwRnNiTmdDQWJnRFlRcTFBZW9CRHdvTldnc0tCd2lHQVJJQUdBc1lHdklCQ0FvR1UyaHZjblJ6d2dLT0FTaGhJSGx2ZFhSMVltVmZjMmh2Y25SelgyVnNhV2RwWW14bElEcDBlWEJsT25JZ0tHNGdlVzkxZEhWaVpWOW1iR0ZuWDJoaGMxOXdjbVZ0YVdWeVpWOTJhV1JsYjE5dFpYUmhaR0YwWVQweElEcDBlWEJsT25JcElDaHVJSGx2ZFhSMVltVmZabXhoWjE5b1lYTmZiR2wyWlY5emRISmxZVzFmYldWMFlXUmhkR0U5TVNBNmRIbHdaVHB5S1NuNEFnRzRBeG9LVE9vQkR3b05XZ3NLQndpR0FSSUFHQXNZU1BJQkNBb0dWbWxrWlc5endnSWFlVzkxZEhWaVpWOTJhV1JsYjE5d1lXZGxJRHAwZVhCbE9uTG9BZ0dvQXdHNEEwakFBd0hJQXdIUUF3RUtLT29CRHdvTldnc0tCd2lHQVJJQUdBc1lIUElCQ3dvSlZXNTNZWFJqYUdWa3lnSUNDQUc0QXh3S0p1b0JEd29OV2dzS0J3aUdBUklBR0FzWVN2SUJDUW9IVjJGMFkyaGxaTW9DQWhnQ3VBTktDaTFnQi1vQkR3b05XZ3NLQndpR0FSSUFHQXNZQl9JQkV3b1JVbVZqWlc1MGJIa2dkWEJzYjJGa1pXUzRBd2NLUy1vQkR3b05XZ3NLQndpR0FSSUFHQXNZQlBJQkJnb0VUR2wyWmNJQ0ozbHZkWFIxWW1WZmJHbDJaVjlpY205aFpHTmhjM1JmYzNSaGRIVnpQVEFnT25SNWNHVTZjdWdDQWJnREJBb19DaEIyYjJ4MmJ5QmpjbUZ6YUNCMFpYTjA2Z0VhQ2hoYUZnb0hDSVlCRWdBWUN4SUxkbTlzZG05ZlkzSmhjMmp5QVEwS0MxWnZiSFp2SUdOeVlYTm9DaTBLQ25SbGMzUWdiWFZ6YVdQcUFSUUtFbG9RQ2djSWhnRVNBQmdMRWdWdGRYTnBZX0lCQndvRlRYVnphV01LT1FvT1kyRnlJR055WVhOb0lIUmxjM1RxQVJnS0Zsb1VDZ2NJaGdFU0FCZ0xFZ2xqWVhKZlkzSmhjMmp5QVFzS0NVTmhjaUJqY21GemFBb2tDZ2RwY1NCMFpYTjA2Z0VSQ2c5YURRb0hDSVlCRWdBWUN4SUNhWEh5QVFRS0FrbHhDaTBLQ25SbGMzUWdZWFZrYVdfcUFSUUtFbG9RQ2djSWhnRVNBQmdMRWdWaGRXUnBiX0lCQndvRlFYVmthVzhLUHdvUWNHVnljMjl1WVd4cGRIa2dkR1Z6ZE9vQkdnb1lXaFlLQndpR0FSSUFHQXNTQzNCbGNuTnZibUZzYVhSNThnRU5DZ3RRWlhKemIyNWhiR2wwZVFvb0NnVjBaWE4wYi1vQkZBb1NXaEFLQndpR0FSSUFHQXNTQlhSbGMzUnY4Z0VIQ2dWVVpYTjBid29rQ2dkMFpYTjBJRzFsNmdFUkNnOWFEUW9IQ0lZQkVnQVlDeElDYldYeUFRUUtBazFsQ2pRS0NYUmxjM1FnZEdWemRPb0JHQW9XV2hRS0J3aUdBUklBR0FzU0NYUmxjM1J
mZEdWemRQSUJDd29KVkdWemRDQjBaWE4wQ2ljS0NHVjVaU0IwWlhOMDZnRVNDaEJhRGdvSENJWUJFZ0FZQ3hJRFpYbGw4Z0VGQ2dORmVXVUtQQW9QYldWdWRHRnNJR0ZuWlNCMFpYTjA2Z0VaQ2hkYUZRb0hDSVlCRWdBWUN4SUtiV1Z1ZEdGc1gyRm5aZklCREFvS1RXVnVkR0ZzSUdGblpRb19DaEJqYjJ4dmNpQmliR2x1WkNCMFpYTjA2Z0VhQ2hoYUZnb0hDSVlCRWdBWUN4SUxZMjlzYjNKZllteHBibVR5QVEwS0MwTnZiRzl5SUdKc2FXNWtHQXRhRFFvTENBUXFCd2lHQVJJQUdBdDRBQSUzRCUzRJABARiB4OgYIgtzZWFyY2gtcGFnZQ%3D%3D",
}
##
response = requests.post(URL, headers = HEADERS, json = data)
#print(response.text)
data = response.json()
dataStr = json.dumps(data, indent = 4)
#print(dataStr)
print('ADHD Test' in dataStr)
import base64
import blackboxprotobuf
import urllib.parse as ul
import copy
import binascii
def getBase64Protobuf(message, typedef):
    """Encode `message` according to `typedef` with `blackboxprotobuf` and return the result as a Base64 ASCII string."""
    protobufBytes = blackboxprotobuf.encode_message(message, typedef)
    base64Bytes = base64.b64encode(protobufBytes)
    return base64Bytes.decode('ascii')
def isRequestStillFine(httpMethod, url, params, headers, data, needle):
    """
    Send `data` as the JSON body of a request and tell whether the response still contains `needle`.

    `httpMethod` is called as `httpMethod(url, params = ..., headers = ..., json = ...)` and its result must
    provide a `.json()` method (e.g. `requests.post`).

    Returns `True` if `needle` is found in the pretty-printed JSON response, `False` otherwise.
    """
    response = httpMethod(url, params = params, headers = headers, json = data).json()
    responseStr = json.dumps(response, indent = 4)
    #print(responseStr)
    # Actually use `needle`: the previous body ignored it and called a helper that is not defined in the
    # visible code. This is the same check as the `needle in dataStr` ones of the standalone scripts.
    return needle in responseStr
def isRequestStillFineExplicit(httpMethod, url, params, headers, data, dataPath, message, typedef, needle):
    """
    Encode `message` with `typedef`, store the Base64 result in `data` at `dataPath` (in place) and return
    what `isRequestStillFine` answers for the resulting request.
    """
    encodedMessage = getBase64Protobuf(message, typedef)
    setDataFromPath(data, dataPath, encodedMessage)
    return isRequestStillFine(httpMethod, url, params, headers, data, needle)
# Will need to proceed recursively
def minimizeProtobuf(httpMethod, url, params, headers, data, dataPath, needle, messages = None, typedefs = None):
    """
    Greedily strip unneeded fields from the Base64 encoded Protobuf stored in `data` at `dataPath`.

    The value is URL-unquoted, Base64 decoded (accepting the `-_` alphabet) and decoded with
    `blackboxprotobuf`. Fields are then removed one at a time, a removal being kept whenever
    `isRequestStillFineExplicit` still succeeds. Note that `data` is modified in place by these trials.

    `messages` and `typedefs` are optional lists that are extended in place with the minimized message and
    its typedef (and with whatever the recursive calls return).

    Returns the `(messages, typedefs)` tuple.
    """
    # Fresh accumulators for each call: mutable default arguments are shared between calls, which made the
    # recursive `messages += messagesRecursive` below extend the list with itself.
    if messages is None:
        messages = []
    if typedefs is None:
        typedefs = []
    print(dataPath)
    print(json.dumps(data, indent = 4))
    rawProtobuf = base64.b64decode(ul.unquote_plus(getDataFromPath(data, dataPath)), altchars = '-_')
    message, typedef = blackboxprotobuf.decode_message(rawProtobuf)
    #print(json.dumps(message, indent = 4))
    print(json.dumps(typedef, indent = 4))

    # Based on [YouTube-operational-API/blob/11566147f4d54b8d8d8481709fd5bf6b1329f4de/tools/minimizeCURL.py](https://github.com/Benjamin-Loison/YouTube-operational-API/blob/11566147f4d54b8d8d8481709fd5bf6b1329f4de/tools/minimizeCURL.py) `isJson`.
    def getPaths(d):
        """Yield the `/key/subkey/...` path of every entry of the nested dictionary `d`, parents before children."""
        if isinstance(d, dict):
            for key, value in d.items():
                yield f'/{key}'
                yield from (f'/{key}{p}' for p in getPaths(value))

    # If a single unknown entry is necessary, then this algorithm seems to most efficiently go from parents to children if necessary to remove other entries. Hence, it seems to proceed in a linear number of HTTPS requests and not a quadratic one.
    # Try until no more change to remove unnecessary entries. If assume a logical behavior as just mentioned, would not a single loop iteration be enough? Not with current design, see (1).
    while True:
        changedSomething = False
        # Note that the path goes from parents to children if necessary which is quite a wanted behavior to quickly remove useless chunks.
        paths = getPaths(message)
        # For all entries, copy current `message` and try to remove an entry.
        for path in paths:
            # Copy current `message`.
            messageCopy = copy.deepcopy(message)
            # Remove an entry.
            # Pay attention that path parts are strings, so this lookup only works with string keys.
            parent = messageCopy
            pathParts = path[1:].split('/')
            for pathPart in pathParts[:-1]:
                parent = parent[pathPart]
            lastPathPart = pathParts[-1]
            del parent[lastPathPart]
            # Test if the removed entry was necessary.
            # (1) If it was unnecessary, then reconsider paths excluding possible children paths of this unnecessary entry, ensuring optimized complexity it seems.
            if isRequestStillFineExplicit(httpMethod, url, params, headers, data, dataPath, messageCopy, typedef, needle):
                print(len(json.dumps(data)), 'still fine')
                changedSomething = True
                message = messageCopy
                break
            # If it was necessary, we consider possible children paths of this necessary entry and other paths.
        # If a loop iteration considering all paths, does not change anything, then the request cannot be minimized further.
        if not changedSomething:
            break
    # Maybe minimize `typedef` once have minimized `message`. Especially as `field_order` can be removed if only know that do not need other entries.
    # However, can postpone implementing such minimization, as minimizing `typedef` once have minimized `message` is quick.
    messages += [message]
    typedefs += [typedef]
    paths = getPaths(message)
    for path in paths:
        leaf = getDataFromPath(message, path)
        # To avoid intermediary nodes.
        if isinstance(leaf, str):
            try:
                base64.b64decode(ul.unquote_plus(leaf))
                print(path)
                # Name the placeholder after the field of this very leaf. The previous code reused the stale
                # `pathPart` variable of the removal loop above; presumably the leaf field was meant.
                fieldName = path[1:].split('/')[-1]
                setDataFromPath(message, path, f'_{fieldName}')
                # Use this call's own arguments instead of module-level globals.
                # NOTE(review): `leaf` is a string whereas `minimizeProtobuf` looks `dataPath` up in a
                # dictionary, so this recursive step cannot work as is; it needs a design handling both the
                # path inside `data` and the path inside `message`.
                messagesRecursive, typedefsRecursive = minimizeProtobuf(httpMethod, url, params, headers, leaf, dataPath + path, needle)
                messages += messagesRecursive
                typedefs += typedefsRecursive
            except binascii.Error:
                # Best effort: the leaf is not Base64, hence not a nested Protobuf.
                pass
    return messages, typedefs
def getDataFromPath(data, path):
    """Return the entry of the nested container `data` designated by the `/key/subkey/...` string `path`."""
    node = data
    for key in path[1:].split('/'):
        node = node[key]
    return node
def setDataFromPath(data, path, value):
    """Store `value`, in place, in the nested container `data` at the `/key/subkey/...` string `path`."""
    *parentKeys, lastKey = path[1:].split('/')
    container = data
    for key in parentKeys:
        container = container[key]
    container[lastKey] = value
# Entry point: minimize the Protobuf stored at `DATA_PATH` in the request body and print the resulting messages.
HTTP_METHOD = requests.post
DATA_PATH = '/continuation'
# NOTE(review): `PARAMS`, `DATA` and `NEEDLE` are not defined in the code shown before this point
# (only `URL`, `HEADERS` and a lowercase `data`, rebound to the response JSON, are) - confirm where they come from.
messages, typedefs = minimizeProtobuf(HTTP_METHOD, URL, PARAMS, HEADERS, DATA, DATA_PATH, NEEDLE)
print(json.dumps(messages, indent = 4))
#print(json.dumps(typedef, indent = 4))
Similar to:
https://github.com/Benjamin-Loison/YouTube-operational-API/blob/0e4168e9ed307cc760f1fe4d69aa8143a4a66ba1/tools/minimizeCURL.py#L151-L228 isJson part.
Commenting this code seems to make sense.
git blame may help:
commit bb31362388beba3aa97b25d40dcf924e2c2bc931
Author: Benjamin Loison <[email protected]>
Date: Wed Jun 14 21:36:31 2023 +0200
Removes unnecessary raw data in `tools/minimizeCURL.py`
does not help much.
Not simplified Protobuf typedef with simplified message:
{
"2": {
"2": "test"
}
}
{
"2": {
"field_order": [
"2",
"3",
"18"
],
"message_typedef": {
"2": {
"type": "string"
},
"3": {
"type": "string"
},
"18": {
"type": "int"
}
},
"type": "message"
},
"3": {
"type": "int"
},
"4": {
"type": "string"
}
}
Simplified Protobuf typedef:
{
"2": {
"message_typedef": {
"2": {
"type": "string"
}
},
"type": "message"
}
}
Hence, simplified Python script:
import requests
import json
import base64
import blackboxprotobuf
def getBase64Protobuf(message, typedef):
    """Encode `message` according to `typedef` with `blackboxprotobuf` and return the result as a Base64 ASCII string."""
    protobufBytes = blackboxprotobuf.encode_message(message, typedef)
    base64Bytes = base64.b64encode(protobufBytes)
    return base64Bytes.decode('ascii')
URL = 'https://www.youtube.com/youtubei/v1/search'
HEADERS = {
'Content-Type': 'application/json',
}
message = {
'2': {
'2': 'test'
}
}
typedef = {
'2': {
'message_typedef': {
'2': {
'type': 'string'
}
},
'type': 'message'
}
}
continuation = getBase64Protobuf(message, typedef)
data = {
'context': {
'client': {
'clientName': 'WEB',
'clientVersion': '2.20240909.02.00',
}
},
'continuation': continuation,
}
##
response = requests.post(URL, headers = HEADERS, json = data)
#print(response.text)
data = response.json()
dataStr = json.dumps(data, indent = 4)
#print(dataStr)
print('ADHD Test' in dataStr)
PARAMS = {
'prettyPrint': 'false',
}
can be useful to check first item to be given one.
Note that there are 2 paths, one inside data and the other within message.
It does not seem possible to easily simplify recursively by hand, by calling the minimizer again with different arguments.