YouTube-operational-API icon indicating copy to clipboard operation
YouTube-operational-API copied to clipboard

Extend `minimizeCURL.py` to minimize Protobuf

Open Benjamin-Loison opened this issue 1 year ago • 17 comments

Related to #190, #69, #255 and Benjamin-Loison/cpython/issues/16.

+8

Benjamin-Loison avatar Mar 18 '24 13:03 Benjamin-Loison

Let us try to proceed by hand first for a web-scraping usage:

https://www.youtube.com/playlist?list=UUWeg2Pkate69NFdBeuRFTAw (with a private Firefox window)

minimizeCURL curl.sh 'XrruklOv8X0'

Note that minimizing on the field name playlistVideoRenderer makes us remove pagination-related data in what follows.

curl https://www.youtube.com/youtubei/v1/browse -H 'Content-Type: application/json' --data-raw '{"context": {"client": {"clientName": "WEB", "clientVersion": "2.20240313.05.00"}}, "continuation": "4qmFsgLmARIaVkxVVVdlZzJQa2F0ZTY5TkZkQmV1UkZUQXcarAFDQUY2ZlZCVU9rTkhVV2xGUkU1RVQwUlpNRTlWVlhoT1ZGVXlUMFJyTVZFd1VXOUJWV3BvYkdaTFpEaFFkVVZCTVVGQ1YycG5hVkV5YUc5V2JGcFhXa2Q0WVdWcmNGSlpWRXBIVFVad1ZWZFVWbFZoTVhCeVZWY3hWMDFXVm5KWGJGWlNWMGRPVkZKRlJuQk5WRTAxWlZoYVEyRkZVblpOYlZFeVYwVkdia2xumgIYVVVXZWcyUGthdGU2OU5GZEJldVJGVEF3"}'
Python script using hardcoded continuation base64 encoded string:
import requests

url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
    'Content-Type': 'application/json'
}
data = {
    'context': {
        'client': {
            'clientName': 'WEB',
            'clientVersion': '2.20240313.05.00'
        }
    },
    'continuation': '4qmFsgLmARIaVkxVVVdlZzJQa2F0ZTY5TkZkQmV1UkZUQXcarAFDQUY2ZlZCVU9rTkhVV2xGUkU1RVQwUlpNRTlWVlhoT1ZGVXlUMFJyTVZFd1VXOUJWV3BvYkdaTFpEaFFkVVZCTVVGQ1YycG5hVkV5YUc5V2JGcFhXa2Q0WVdWcmNGSlpWRXBIVFVad1ZWZFVWbFZoTVhCeVZWY3hWMDFXVm5KWGJGWlNWMGRPVkZKRlJuQk5WRTAxWlZoYVEyRkZVblpOYlZFeVYwVkdia2xumgIYVVVXZWcyUGthdGU2OU5GZEJldVJGVEF3'
}

data = requests.post(url, headers = headers, json = data).json()
print('XrruklOv8X0' in str(data))
Python script first level decoded Protobuf:
import requests
import blackboxprotobuf
import base64

def getBase64Protobuf(message, typedef):
    """Encode `message` as Protobuf according to `typedef` and return it as a standard Base64 ASCII string."""
    encoded = base64.b64encode(blackboxprotobuf.encode_message(message, typedef))
    return encoded.decode('ascii')

message = {
    '80226972': {
        '2': 'VLUUWeg2Pkate69NFdBeuRFTAw',
        '3': 'CAF6fVBUOkNHUWlFRE5ET0RZME9VVXhOVFUyT0RrMVEwUW9BVWpobGZLZDhQdUVBMUFCV2pnaVEyaG9WbFpXWkd4YWVrcFJZVEpHTUZwVVdUVlVhMXByVVcxV01WVnJXbFZSV0dOVFJFRnBNVE01ZVhaQ2FFUnZNbVEyV0VGbkln',
        '35': 'UUWeg2Pkate69NFdBeuRFTAw'
    }
}

typedef = {
    '80226972': {
        'type': 'message',
        'message_typedef': {
            '2': {
                'type': 'string'
            },
            '3': {
                'type': 'string'
            },
            '35': {
                'type': 'string'
            }
        },
        'field_order': [
            '2',
            '3',
            '35'
        ]
    }
}

continuation = getBase64Protobuf(message, typedef)

url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
    'Content-Type': 'application/json'
}
data = {
    'context': {
        'client': {
            'clientName': 'WEB',
            'clientVersion': '2.20240313.05.00'
        }
    },
    'continuation': continuation
}

data = requests.post(url, headers = headers, json = data).json()
print('XrruklOv8X0' in str(data))

Just modifying message values seems to show that only fields 2 and 3 are actually needed, so typedef can be simplified.

So get:

Python script first level decoded Protobuf simplified:
import requests
import blackboxprotobuf
import base64

def getBase64Protobuf(message, typedef):
    """Encode `message` as Protobuf according to `typedef` and return it as a standard Base64 ASCII string."""
    encoded = base64.b64encode(blackboxprotobuf.encode_message(message, typedef))
    return encoded.decode('ascii')

message = {
    '80226972': {
        '2': 'VLUUWeg2Pkate69NFdBeuRFTAw',
        '3': 'CAF6fVBUOkNHUWlFRE5ET0RZME9VVXhOVFUyT0RrMVEwUW9BVWpobGZLZDhQdUVBMUFCV2pnaVEyaG9WbFpXWkd4YWVrcFJZVEpHTUZwVVdUVlVhMXByVVcxV01WVnJXbFZSV0dOVFJFRnBNVE01ZVhaQ2FFUnZNbVEyV0VGbkln',
    }
}

typedef = {
    '80226972': {
        'type': 'message',
        'message_typedef': {
            '2': {
                'type': 'string'
            },
            '3': {
                'type': 'string'
            },
        },
        'field_order': [
            '2',
            '3',
        ]
    }
}

continuation = getBase64Protobuf(message, typedef)

url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
    'Content-Type': 'application/json'
}
data = {
    'context': {
        'client': {
            'clientName': 'WEB',
            'clientVersion': '2.20240313.05.00'
        }
    },
    'continuation': continuation
}

data = requests.post(url, headers = headers, json = data).json()
print('XrruklOv8X0' in str(data))

But in my case field 3 is the pagination Protobuf, so we have to proceed recursively:

Python script second level decoded Protobuf:
import requests
import blackboxprotobuf
import base64

def getBase64Protobuf(message, typedef):
    """Encode `message` as Protobuf according to `typedef` and return it as a standard Base64 ASCII string."""
    encoded = base64.b64encode(blackboxprotobuf.encode_message(message, typedef))
    return encoded.decode('ascii')

message = {
    '1': 1,
    '15': 'PT:CGQiEDNDODY0OUUxNTU2ODk1Q0QoAUjhlfKd8PuEA1ABWjgiQ2hoVlZWZGxaekpRYTJGMFpUWTVUa1prUW1WMVVrWlVRWGNTREFpMTM5eXZCaERvMmQ2WEFnIg'
}

typedef = {
    '1': {
        'type': 'int'
    },
    '15': {
        'type': 'string'
    }
}

three = getBase64Protobuf(message, typedef)

message = {
    '80226972': {
        '2': 'VLUUWeg2Pkate69NFdBeuRFTAw',
        '3': three,
    }
}

typedef = {
    '80226972': {
        'type': 'message',
        'message_typedef': {
            '2': {
                'type': 'string'
            },
            '3': {
                'type': 'string'
            },
        },
        'field_order': [
            '2',
            '3',
        ]
    }
}

continuation = getBase64Protobuf(message, typedef)

url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
    'Content-Type': 'application/json'
}
data = {
    'context': {
        'client': {
            'clientName': 'WEB',
            'clientVersion': '2.20240313.05.00'
        }
    },
    'continuation': continuation
}

data = requests.post(url, headers = headers, json = data).json()
print('XrruklOv8X0' in str(data))

Field 1 is useless, hence we get:

Python script second level decoded Protobuf simplified:
import requests
import blackboxprotobuf
import base64

def getBase64Protobuf(message, typedef):
    """Encode `message` as Protobuf according to `typedef` and return it as a standard Base64 ASCII string."""
    encoded = base64.b64encode(blackboxprotobuf.encode_message(message, typedef))
    return encoded.decode('ascii')

message = {
    '15': 'PT:CGQiEDNDODY0OUUxNTU2ODk1Q0QoAUjhlfKd8PuEA1ABWjgiQ2hoVlZWZGxaekpRYTJGMFpUWTVUa1prUW1WMVVrWlVRWGNTREFpMTM5eXZCaERvMmQ2WEFnIg'
}

typedef = {
    '15': {
        'type': 'string'
    }
}

three = getBase64Protobuf(message, typedef)

message = {
    '80226972': {
        '2': 'VLUUWeg2Pkate69NFdBeuRFTAw',
        '3': three,
    }
}

typedef = {
    '80226972': {
        'type': 'message',
        'message_typedef': {
            '2': {
                'type': 'string'
            },
            '3': {
                'type': 'string'
            },
        },
        'field_order': [
            '2',
            '3',
        ]
    }
}

continuation = getBase64Protobuf(message, typedef)

url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
    'Content-Type': 'application/json'
}
data = {
    'context': {
        'client': {
            'clientName': 'WEB',
            'clientVersion': '2.20240313.05.00'
        }
    },
    'continuation': continuation
}

data = requests.post(url, headers = headers, json = data).json()
print('XrruklOv8X0' in str(data))

Field 15 is another Protobuf, hence we use:

Python script third level decoded Protobuf:
import requests
import blackboxprotobuf
import base64

def getBase64Protobuf(message, typedef):
    """Encode `message` as Protobuf according to `typedef` and return it as a standard Base64 ASCII string."""
    encoded = base64.b64encode(blackboxprotobuf.encode_message(message, typedef))
    return encoded.decode('ascii')

message = {
    '1': 100,
    '4': '3C8649E1556895CD',
    '5': 1,
    '9': 1710698421586657,
    '10': 1,
    '11': '"ChhVVVdlZzJQa2F0ZTY5TkZkQmV1UkZUQXcSDAi139yvBhDo2d6XAg"'
}

typedef = {
    '1': {
        'type': 'int'
    },
    '4': {
        'type': 'string'
    },
    '5': {
        'type': 'int'
    },
    '9': {
        'type': 'int'
    },
    '10': {
        'type': 'int'
    },
    '11': {
        'type': 'string'
    }
}

fifteen = getBase64Protobuf(message, typedef)

message = {
    '15': f'PT:{fifteen}'
}

typedef = {
    '15': {
        'type': 'string'
    }
}

three = getBase64Protobuf(message, typedef)

message = {
    '80226972': {
        '2': 'VLUUWeg2Pkate69NFdBeuRFTAw',
        '3': three,
    }
}

typedef = {
    '80226972': {
        'type': 'message',
        'message_typedef': {
            '2': {
                'type': 'string'
            },
            '3': {
                'type': 'string'
            },
        },
        'field_order': [
            '2',
            '3',
        ]
    }
}

continuation = getBase64Protobuf(message, typedef)

url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
    'Content-Type': 'application/json'
}
data = {
    'context': {
        'client': {
            'clientName': 'WEB',
            'clientVersion': '2.20240313.05.00'
        }
    },
    'continuation': continuation
}

data = requests.post(url, headers = headers, json = data).json()
print('XrruklOv8X0' in str(data))

After simplifying, we get:

Python script third level decoded Protobuf simplified:
import requests
import blackboxprotobuf
import base64

def getBase64Protobuf(message, typedef):
    """Encode `message` as Protobuf according to `typedef` and return it as a standard Base64 ASCII string."""
    encoded = base64.b64encode(blackboxprotobuf.encode_message(message, typedef))
    return encoded.decode('ascii')

# Innermost Protobuf: after minimizing, only field 1 is left.
fifteen = getBase64Protobuf({'1': 100}, {'1': {'type': 'int'}})

# Middle Protobuf: field 15 embeds the innermost Protobuf prefixed with `PT:`.
three = getBase64Protobuf({'15': f'PT:{fifteen}'}, {'15': {'type': 'string'}})

# Outermost Protobuf: field 2 is the playlist id prefixed with `VL` and field 3 embeds the middle Protobuf.
message = {
    '80226972': {
        '2': 'VLUUWeg2Pkate69NFdBeuRFTAw',
        '3': three,
    },
}
typedef = {
    '80226972': {
        'type': 'message',
        'message_typedef': {
            '2': {'type': 'string'},
            '3': {'type': 'string'},
        },
        'field_order': ['2', '3'],
    },
}
continuation = getBase64Protobuf(message, typedef)

url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {'Content-Type': 'application/json'}
data = {
    'context': {
        'client': {
            'clientName': 'WEB',
            'clientVersion': '2.20240313.05.00',
        },
    },
    'continuation': continuation,
}

# Check that the wanted video id is still part of the response.
response = requests.post(url, headers = headers, json = data)
data = response.json()
print('XrruklOv8X0' in str(data))

Benjamin-Loison avatar Mar 18 '24 13:03 Benjamin-Loison

Let us proceed by hand for a YouTube Data API v3 usage now:

https://www.youtube.com/playlist?list=UUWeg2Pkate69NFdBeuRFTAw Looking for entry 75: rntZOALPknU.

Retrieve first YouTube Data API v3 playlistItems nextPageToken:
import requests

CHANNEL_ID = 'UCWeg2Pkate69NFdBeuRFTAw'
# The playlist id is the channel id with its first two characters replaced by `UU`.
PLAYLIST_ID = f'UU{CHANNEL_ID[2:]}'

URL = 'https://yt.lemnoslife.com/noKey/playlistItems'
# Only `nextPageToken` is requested, as it is the only value printed below.
params = {
    'part': 'snippet',
    'playlistId': PLAYLIST_ID,
    'maxResults': 50,
    'fields': 'nextPageToken',
}

response = requests.get(URL, params = params)
response = response.json()
print(response['nextPageToken'])
Check if rntZOALPknU is part of the second page:
import requests

CHANNEL_ID = 'UCWeg2Pkate69NFdBeuRFTAw'
PLAYLIST_ID = 'UU' + CHANNEL_ID[2:]

URL = 'https://yt.lemnoslife.com/noKey/playlistItems'
params = {
    'part': ','.join(['snippet']),
    'maxResults' : 50,
    'fields': 'items/snippet/resourceId/videoId',
    'pageToken': 'EAAafVBUOkNESWlFRVEyTjBSRk1EazFRMEkxT0RWRU1UZ29BVWpobGZLZDhQdUVBMUFCV2pnaVEyaG9WbFpXWkd4YWVrcFJZVEpHTUZwVVdUVlVhMXByVVcxV01WVnJXbFZSV0dOVFJFRnBNVE01ZVhaQ2FFUnZNbVEyV0VGbkln'
}

response = requests.get(URL, params = params).json()
print('rntZOALPknU' in str(response))

Note that pageToken does not enable us to remove any of the fields among part, maxResults and fields.

Expanding Protobuf:

Python script first level decoded Protobuf:
import requests
import blackboxprotobuf
import base64

def getBase64Protobuf(message, typedef):
    """Encode `message` as Protobuf according to `typedef` and return it as a standard Base64 ASCII string."""
    encoded = base64.b64encode(blackboxprotobuf.encode_message(message, typedef))
    return encoded.decode('ascii')

message = {
    '2': 0,
    '3': 'PT:CDIiEEQ2N0RFMDk1Q0I1ODVEMTgoAUjhlfKd8PuEA1ABWjgiQ2hoVlZWZGxaekpRYTJGMFpUWTVUa1prUW1WMVVrWlVRWGNTREFpMTM5eXZCaERvMmQ2WEFnIg'
}

typedef = {
    '2': {
        'type': 'int'
    },
    '3': {
        'type': 'string'
    }
}

pageToken = getBase64Protobuf(message, typedef)

CHANNEL_ID = 'UCWeg2Pkate69NFdBeuRFTAw'
PLAYLIST_ID = 'UU' + CHANNEL_ID[2:]

URL = 'https://yt.lemnoslife.com/noKey/playlistItems'
params = {
    'part': ','.join(['snippet']),
    'playlistId': PLAYLIST_ID,
    'maxResults' : 50,
    'fields': 'items/snippet/resourceId/videoId',
    'pageToken': pageToken
}

response = requests.get(URL, params = params).json()
print('rntZOALPknU' in str(response))

I verified that no Protobuf field can be removed; now let us expand field 3:

Python script second level decoded Protobuf:

import requests
import blackboxprotobuf
import base64

def getBase64Protobuf(message, typedef):
    """Encode `message` as Protobuf according to `typedef` and return it as a standard Base64 ASCII string."""
    encoded = base64.b64encode(blackboxprotobuf.encode_message(message, typedef))
    return encoded.decode('ascii')

message = {
    '1': 50,
    '4': 'D67DE095CB585D18',
    '5': 1,
    '9': 1710698421586657,
    '10': 1,
    '11': '"ChhVVVdlZzJQa2F0ZTY5TkZkQmV1UkZUQXcSDAi139yvBhDo2d6XAg"'
}

typedef = {
    '1': {
        'type': 'int'
    },
    '4': {
        'type': 'string'
    },
    '5': {
        'type': 'int'
    },
    '9': {
        'type': 'int'
    },
    '10': {
        'type': 'int'
    },
    '11': {
        'type': 'string'
    }
}

three = getBase64Protobuf(message, typedef)

message = {
    '2': 0,
    '3': f'PT:{three}'
}

typedef = {
    '2': {
        'type': 'int'
    },
    '3': {
        'type': 'string'
    }
}

pageToken = getBase64Protobuf(message, typedef)

CHANNEL_ID = 'UCWeg2Pkate69NFdBeuRFTAw'
PLAYLIST_ID = 'UU' + CHANNEL_ID[2:]

URL = 'https://yt.lemnoslife.com/noKey/playlistItems'
params = {
    'part': ','.join(['snippet']),
    'playlistId': PLAYLIST_ID,
    'maxResults' : 50,
    'fields': 'items/snippet/resourceId/videoId',
    'pageToken': pageToken
}

response = requests.get(URL, params = params).json()
print('rntZOALPknU' in str(response))

Now let us minimize:

Python script second level decoded Protobuf simplified:

import requests
import blackboxprotobuf
import base64

def getBase64Protobuf(message, typedef):
    """Encode `message` as Protobuf according to `typedef` and return it as a standard Base64 ASCII string."""
    encoded = base64.b64encode(blackboxprotobuf.encode_message(message, typedef))
    return encoded.decode('ascii')

# Inner Protobuf: after minimizing, only field 1 is left.
three = getBase64Protobuf({'1': 50}, {'1': {'type': 'int'}})

# Page token Protobuf: field 3 embeds the inner Protobuf prefixed with `PT:`.
message = {'2': 0, '3': f'PT:{three}'}
typedef = {
    '2': {'type': 'int'},
    '3': {'type': 'string'},
}
pageToken = getBase64Protobuf(message, typedef)

CHANNEL_ID = 'UCWeg2Pkate69NFdBeuRFTAw'
# The playlist id is the channel id with its first two characters replaced by `UU`.
PLAYLIST_ID = f'UU{CHANNEL_ID[2:]}'

URL = 'https://yt.lemnoslife.com/noKey/playlistItems'
params = {
    'part': 'snippet',
    'playlistId': PLAYLIST_ID,
    'maxResults': 50,
    'fields': 'items/snippet/resourceId/videoId',
    'pageToken': pageToken,
}

# Check that the wanted video id is still part of the response.
response = requests.get(URL, params = params)
response = response.json()
print('rntZOALPknU' in str(response))

Benjamin-Loison avatar Mar 18 '24 14:03 Benjamin-Loison

Can we avoid multiple getBase64Protobuf calls? While concatenation seems more difficult, a simple usage, as this script would need, would be nice.

Benjamin-Loison avatar Mar 18 '24 20:03 Benjamin-Loison

If there is a Base64 decoding error, do not pay attention to it and Protobuf-decode anyway thanks to base64.urlsafe_b64decode. Should investigate the deep reason for this. Specifying altchars = '-/' solves the issue, but -_ should probably be used, as indicated in the Stack Overflow answer 1638761. urlsafe_b64decode is maybe more appropriate than using altchars, as it seems equivalent to specifying altchars = '-_'.

In fact:

0ofMyAOUAhpeQ2lrcUp3b1lWVU4wTlZWVFdYQjZlazFEV1docmFYSldVVWRJZDB0UkVnczVNWEV6Ym5Ob1JWVjRjeG9UNnFqZHVRRU5DZ3M1TVhFemJuTm9SVlY0Y3lBQ01BQSUzRCjluNHtlIiGAzAAQAJKbggAGAAgAEoKCAEQABgAIAAwAFDJxOSKl4iGA1gDeACiAQCqAQwQABoAIgAqBAgAEACwAQDAAQDIAcnE5IqXiIYD4gEMCO39grIGEI7Y058D6AEA8AEA-AEAiAIAkAIAmgIMCIP-grIGEL2N15ACULCX_u2UiIYDWOPLlOyUiIYDggEECAQYAYgBAJoBAggAoAGLm_eZl4iGA7oBAggK0AH4_YKyBg==

requires -_ altchars.

Unclear how to minimize the following, as even with urlsafe_b64decode, I get for blackboxprotobuf.decode_message:

blackboxprotobuf.lib.exceptions.DecoderException: Encountered error decoding field 6: GROUP wire types not supported:
Traceback (most recent call last):
  File "<tmp 7>", line 6, in <module>
    message, typedef = blackboxprotobuf.decode_message(data)
  File "/home/benjamin/.local/lib/python3.10/site-packages/blackboxprotobuf/lib/api.py", line 86, in decode_message
    value, typedef, _, _ = blackboxprotobuf.lib.types.length_delim.decode_message(
  File "/home/benjamin/.local/lib/python3.10/site-packages/blackboxprotobuf/lib/types/length_delim.py", line 320, in decode_message
    grouped_fields, field_order, pos = _group_by_number(buf, pos, end, path)
  File "/home/benjamin/.local/lib/python3.10/site-packages/blackboxprotobuf/lib/types/length_delim.py", line 439, in _group_by_number
    raise DecoderException("GROUP wire types not supported", path=field_path)
blackboxprotobuf.lib.exceptions.DecoderException: Encountered error decoding field 6: GROUP wire types not supported

Maybe it is a blackboxprotobuf issue.

Unclear array of bytes:
b'\x01\xf8\xea\xd2\xf3M\xcdA\xde\xc25cH^\x823\x8ad\x85\\\xfc\x10\x9b\x08\xa1\x06
\x8b\xeb\xb9\nE\xb7\x94\xca\xd0NA\x90epa\x18\xce\x00@\xbc\tx\xd0dY\xc6\x1fPT\xd7
\x0c\x96\xe4\xdf\x0f\x04\xe2\xa4\xe4\xb2j\x8e\xe0)\x97v\xa1\x83\x15\xdd\xd1\xc8\
x07@_\xd8\xdb$\x8aNL\x0e<\xc9\xb4\xa2\xa7\xd8\x88\xbfO\xe0\nS_\xf2\xd6\n\\\x99U\
xe78+\x913>I\x13\xe9\xf7\xcf\x07\xf2\x9bI`\x82+FW\xec\xae\xccPU\x0f\xf5%\xe0\xf4
m\xd7o h\x9b\x87Y\x19;\xd4\xc5\xce\xca\xc0@\x9c\xac-\x11a\x16\xc1<\xe9\xf2c\x8c%
O\x89\xaa\x07\xd2\xc5{\x0ct\xf0\x0c\xbe\x94\r%9\xab\x89\x03\xe1\xf1\x08\xc1E\xe3
\x8d\xd2Y\x13r\xe5U\x87\xa5\x93\xff\xc3\x14P\x81\xb1\x05q\xeb\xa6\x1b\xd5\xd9\x0
4(\xd2\xcbT\xcd\x02>J\r\xef9t$)k\xee\xb8\x0e\x9bl\x81\xce\xd6\x13=5\x9f\xb9\xbb\
xbd%\xc9\x93=\x11\x94\xcc\x1cN\x04\x13\xa2&\xa7\xca\xf7\x03o\x0f\x7fZ6\'\x94N[\r
\x13\xde\xbf\xaa\x08_t\xec\xe5\xf2Tv\x88\x87\xdf\x80\x00|\x02\r\x11\xe8\xd5\x03+
\xce\xf2[\xb7G\xdc\xc5:\x8a\x04+\xca\x9d\xf9C\xa8\xc9\x92u\xbcv7\xfc\x92T\xd1\x1
dx\xba\\"\xc8\x04e\xf8\xb6u\xdd\xff+0\xc0\x92f\xc0y\xcc)\x9fk\xe5|8\x0c\x96\xf7\
xd7\xd2\xf1k\x0c\xc9\x80\x03\xb9x(\xf8\xbe\xbf\n[\x15\x01%\xd6.X\xa4\xb8\xd8\xe0
/\x8f\xda\xa7\xa2{O)6Nw5\xb7a\x17\xa3\xee0\x8c\xfe\xf1\xf9\xd1\xf2b\xfa6\xdf\xe7
\xe0\xa2\x89\x1c\x8f\'\xe8\xeb\x98\xc9o\xc5\xc8z\x055\x13\xcb\xac\xfa\xd4\xe1\xf
a#\xc7\xeb}f4\x96:\xca\x03\xf3\xd9\x025\xb4D\x18\xc2c\xeaB\\\xd3\xddc\xed\x9e\x8
b\x9c2>\xff\xda\xc0\xe5Q\xa5;\x1bw>)i\x93\xc9O\xba{\xcb\x83\xc2W\xcd1\xb7\xbb\xa
1\xe5"\\\xf2\x15\xd0AdI#\x9b\xb8y\xaf\xc0\x85\xf6\xc3\xe4\xff5\xffT=w{R\xc7u\xe3
f^n\xe0\xdcjY\xa6W\xc9\x01\x7fG\xacs\x05$oor\xf5\xaf'

does not seem to help, and neither does CyberChef.

Unclear purpose Python script:
import requests
import blackboxprotobuf
import base64

def getBase64Protobuf(message, typedef):
    """Encode `message` as Protobuf according to `typedef` and return it as a URL-safe Base64 ASCII string.

    The `-_` alternative characters replace `+/` of the standard Base64 alphabet.
    """
    data = blackboxprotobuf.encode_message(message, typedef)
    # `base64.b64encode` returns bytes; decode them, as the result ends up in the JSON request body, which cannot hold bytes.
    return base64.b64encode(data, altchars = b'-_').decode('ascii')

message = {
    '110': {
        '3': {
            '15': {
                '1': {
                    '1': 'Afjq0vNNzUHewjVjSF6CM4pkhVz8EJsIoQaL67kKRbeUytBOQZBlcGEYzgBAvAl40GRZxh9QVNcMluTfDwTipOSyao7gKZd2oYMV3dHIB0Bf2Nskik5MDjzJtKKn2Ii_T-AKU1_y1gpcmVXnOCuRMz5JE-n3zwfym0lggitGV-yuzFBVD_Ul4PRt128gaJuHWRk71MXOysBAnKwtEWEWwTzp8mOMJU-JqgfSxXsMdPAMvpQNJTmriQPh8QjBReON0lkTcuVVh6WT_8MUUIGxBXHrphvV2QQo0stUzQI-Sg3vOXQkKWvuuA6bbIHO1hM9NZ-5u70lyZM9EZTMHE4EE6Imp8r3A28Pf1o2J5ROWw0T3r-qCF907OXyVHaIh9-AAHwCDRHo1QMrzvJbt0fcxTqKBCvKnflDqMmSdbx2N_ySVNEdeLpcIsgEZfi2dd3_KzDAkmbAecwpn2vlfDgMlvfX0vFrDMmAA7l4KPi-vwpbFQEl1i5YpLjY4C-P2qeie08pNk53NbdhF6PuMIz-8fnR8mL6Nt_n4KKJHI8n6OuYyW_FyHoFNRPLrPrU4fojx-t9ZjSWOsoD89kCNbREGMJj6kJc091j7Z6LnDI-_9rA5VGlOxt3Pilpk8lPunvLg8JXzTG3u6HlIlzyFdBBZEkjm7h5r8CF9sPk_zX_VD13e1LHdeNmXm7g3GpZplfJAX9HrHMFJG9vcvWv',
                    '2': '661ecf48-0000-2e08-9784-d4f547fd991c'
                },
                '3': 1
            }
        }
    }
}

typedef = {
    '110': {
        'type': 'message',
        'message_typedef': {
            '3': {
                'type': 'message',
                'message_typedef': {
                    '15': {
                        'type': 'message',
                        'message_typedef': {
                            '1': {
                                'type': 'message',
                                'message_typedef': {
                                    '1': {
                                        'type': 'string'
                                    },
                                    '2': {
                                        'type': 'string'
                                    }
                                },
                                'field_order': [
                                    '1',
                                    '2'
                                ]
                            },
                            '3': {
                                'type': 'int'
                            }
                        },
                        'field_order': [
                            '1',
                            '3'
                        ]
                    }
                },
                'field_order': [
                    '15'
                ]
            }
        },
        'field_order': [
            '3'
        ]
    }
}

three = getBase64Protobuf(message, typedef)

message = {
    '80226972': {
        '2': 'UCWeg2Pkate69NFdBeuRFTAw',
        '3': three
    }
}

typedef = {
    '80226972': {
        'type': 'message',
        'message_typedef': {
            '2': {
                'type': 'string'
            },
            '3': {
                'type': 'string'
            }
        },
        'field_order': [
            '2',
            '3'
        ]
    }
}

continuation = getBase64Protobuf(message, typedef)

url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
    'Content-Type': 'application/json'
}
data = {
    'context': {
        'client': {
            'clientName': 'WEB',
            'clientVersion': '2.20240313.05.00'
        }
    },
    'continuation': continuation
}

data = requests.post(url, headers = headers, json = data).json()
print('Ce film était une très mauvaise idée' in str(data))

Benjamin-Loison avatar Mar 18 '24 20:03 Benjamin-Loison

Note that .decode('ascii') in base64.b64encode(data).decode('ascii') does not seem necessary but removing it is debatable in my opinion, I prefer strings over arrays of bytes.

Benjamin-Loison avatar Mar 18 '24 20:03 Benjamin-Loison

Even when considering double base64 encoding, I am unable to make progress.

Benjamin-Loison avatar Mar 25 '24 23:03 Benjamin-Loison

Could integrate this into minimizeCURL.py by automatically detecting fields that are Protobuf encoded.

Benjamin-Loison avatar Apr 08 '24 13:04 Benjamin-Loison

Maybe first having an explicit request generator not relying on Base64 black box would be a good start.

Benjamin-Loison avatar Apr 10 '24 15:04 Benjamin-Loison

minimizeCURL curl.sh 'ADHD Test'
...
curl https://www.youtube.com/youtubei/v1/search -H 'Content-Type: application/json' --data-raw '{"context": {"client": {"clientName": "WEB", "clientVersion": "2.20240909.02.00"}}, "continuation": "EugNEgR0ZXN0GtwNRXBzS2tnR1hDanFWQVJLT0FTaGhJSGx2ZFhSMVltVmZjMmh2Y25SelgyVnNhV2RwWW14bElEcDBlWEJsT25JZ0tHNGdlVzkxZEhWaVpWOW1iR0ZuWDJoaGMxOXdjbVZ0YVdWeVpWOTJhV1JsYjE5dFpYUmhaR0YwWVQweElEcDBlWEJsT25JcElDaHVJSGx2ZFhSMVltVmZabXhoWjE5b1lYTmZiR2wyWlY5emRISmxZVzFmYldWMFlXUmhkR0U5TVNBNmRIbHdaVHB5S1NrNEFXQWFTdndJQ2lZS0JIUmxjM1RxQVE4S0RWb0xDZ2NJaGdFU0FCZ0xHR0h5QVFVS0EwRnNiTmdDQWJnRFlRcTFBZW9CRHdvTldnc0tCd2lHQVJJQUdBc1lHdklCQ0FvR1UyaHZjblJ6d2dLT0FTaGhJSGx2ZFhSMVltVmZjMmh2Y25SelgyVnNhV2RwWW14bElEcDBlWEJsT25JZ0tHNGdlVzkxZEhWaVpWOW1iR0ZuWDJoaGMxOXdjbVZ0YVdWeVpWOTJhV1JsYjE5dFpYUmhaR0YwWVQweElEcDBlWEJsT25JcElDaHVJSGx2ZFhSMVltVmZabXhoWjE5b1lYTmZiR2wyWlY5emRISmxZVzFmYldWMFlXUmhkR0U5TVNBNmRIbHdaVHB5S1NuNEFnRzRBeG9LVE9vQkR3b05XZ3NLQndpR0FSSUFHQXNZU1BJQkNBb0dWbWxrWlc5endnSWFlVzkxZEhWaVpWOTJhV1JsYjE5d1lXZGxJRHAwZVhCbE9uTG9BZ0dvQXdHNEEwakFBd0hJQXdIUUF3RUtLT29CRHdvTldnc0tCd2lHQVJJQUdBc1lIUElCQ3dvSlZXNTNZWFJqYUdWa3lnSUNDQUc0QXh3S0p1b0JEd29OV2dzS0J3aUdBUklBR0FzWVN2SUJDUW9IVjJGMFkyaGxaTW9DQWhnQ3VBTktDaTFnQi1vQkR3b05XZ3NLQndpR0FSSUFHQXNZQl9JQkV3b1JVbVZqWlc1MGJIa2dkWEJzYjJGa1pXUzRBd2NLUy1vQkR3b05XZ3NLQndpR0FSSUFHQXNZQlBJQkJnb0VUR2wyWmNJQ0ozbHZkWFIxWW1WZmJHbDJaVjlpY205aFpHTmhjM1JmYzNSaGRIVnpQVEFnT25SNWNHVTZjdWdDQWJnREJBb19DaEIyYjJ4MmJ5QmpjbUZ6YUNCMFpYTjA2Z0VhQ2hoYUZnb0hDSVlCRWdBWUN4SUxkbTlzZG05ZlkzSmhjMmp5QVEwS0MxWnZiSFp2SUdOeVlYTm9DaTBLQ25SbGMzUWdiWFZ6YVdQcUFSUUtFbG9RQ2djSWhnRVNBQmdMRWdWdGRYTnBZX0lCQndvRlRYVnphV01LT1FvT1kyRnlJR055WVhOb0lIUmxjM1RxQVJnS0Zsb1VDZ2NJaGdFU0FCZ0xFZ2xqWVhKZlkzSmhjMmp5QVFzS0NVTmhjaUJqY21GemFBb2tDZ2RwY1NCMFpYTjA2Z0VSQ2c5YURRb0hDSVlCRWdBWUN4SUNhWEh5QVFRS0FrbHhDaTBLQ25SbGMzUWdZWFZrYVdfcUFSUUtFbG9RQ2djSWhnRVNBQmdMRWdWaGRXUnBiX0lCQndvRlFYVmthVzhLUHdvUWNHVnljMjl1WVd4cGRIa2dkR1Z6ZE9vQkdnb1lXaFlLQndpR0FSSUFHQXNTQzNCbGNuTnZibUZzYVhSNThnRU5DZ3RRWlhKemIyNWhiR2wwZVFvb0NnVjBaWE4wYi1vQkZBb1N
XaEFLQndpR0FSSUFHQXNTQlhSbGMzUnY4Z0VIQ2dWVVpYTjBid29rQ2dkMFpYTjBJRzFsNmdFUkNnOWFEUW9IQ0lZQkVnQVlDeElDYldYeUFRUUtBazFsQ2pRS0NYUmxjM1FnZEdWemRPb0JHQW9XV2hRS0J3aUdBUklBR0FzU0NYUmxjM1JmZEdWemRQSUJDd29KVkdWemRDQjBaWE4wQ2ljS0NHVjVaU0IwWlhOMDZnRVNDaEJhRGdvSENJWUJFZ0FZQ3hJRFpYbGw4Z0VGQ2dORmVXVUtQQW9QYldWdWRHRnNJR0ZuWlNCMFpYTjA2Z0VaQ2hkYUZRb0hDSVlCRWdBWUN4SUtiV1Z1ZEdGc1gyRm5aZklCREFvS1RXVnVkR0ZzSUdGblpRb19DaEJqYjJ4dmNpQmliR2x1WkNCMFpYTjA2Z0VhQ2hoYUZnb0hDSVlCRWdBWUN4SUxZMjlzYjNKZllteHBibVR5QVEwS0MwTnZiRzl5SUdKc2FXNWtHQXRhRFFvTENBUXFCd2lHQVJJQUdBdDRBQSUzRCUzRJABARiB4OgYIgtzZWFyY2gtcGFnZQ%3D%3D"}'
Python script with hardcoded continuation:
import requests
import json

URL = 'https://www.youtube.com/youtubei/v1/search'
HEADERS = {
    'Content-Type': 'application/json',
}

data = {
    "context": {
        "client": {
            "clientName": "WEB",
            "clientVersion": "2.20240909.02.00",
        }
    },
    "continuation": "EugNEgR0ZXN0GtwNRXBzS2tnR1hDanFWQVJLT0FTaGhJSGx2ZFhSMVltVmZjMmh2Y25SelgyVnNhV2RwWW14bElEcDBlWEJsT25JZ0tHNGdlVzkxZEhWaVpWOW1iR0ZuWDJoaGMxOXdjbVZ0YVdWeVpWOTJhV1JsYjE5dFpYUmhaR0YwWVQweElEcDBlWEJsT25JcElDaHVJSGx2ZFhSMVltVmZabXhoWjE5b1lYTmZiR2wyWlY5emRISmxZVzFmYldWMFlXUmhkR0U5TVNBNmRIbHdaVHB5S1NrNEFXQWFTdndJQ2lZS0JIUmxjM1RxQVE4S0RWb0xDZ2NJaGdFU0FCZ0xHR0h5QVFVS0EwRnNiTmdDQWJnRFlRcTFBZW9CRHdvTldnc0tCd2lHQVJJQUdBc1lHdklCQ0FvR1UyaHZjblJ6d2dLT0FTaGhJSGx2ZFhSMVltVmZjMmh2Y25SelgyVnNhV2RwWW14bElEcDBlWEJsT25JZ0tHNGdlVzkxZEhWaVpWOW1iR0ZuWDJoaGMxOXdjbVZ0YVdWeVpWOTJhV1JsYjE5dFpYUmhaR0YwWVQweElEcDBlWEJsT25JcElDaHVJSGx2ZFhSMVltVmZabXhoWjE5b1lYTmZiR2wyWlY5emRISmxZVzFmYldWMFlXUmhkR0U5TVNBNmRIbHdaVHB5S1NuNEFnRzRBeG9LVE9vQkR3b05XZ3NLQndpR0FSSUFHQXNZU1BJQkNBb0dWbWxrWlc5endnSWFlVzkxZEhWaVpWOTJhV1JsYjE5d1lXZGxJRHAwZVhCbE9uTG9BZ0dvQXdHNEEwakFBd0hJQXdIUUF3RUtLT29CRHdvTldnc0tCd2lHQVJJQUdBc1lIUElCQ3dvSlZXNTNZWFJqYUdWa3lnSUNDQUc0QXh3S0p1b0JEd29OV2dzS0J3aUdBUklBR0FzWVN2SUJDUW9IVjJGMFkyaGxaTW9DQWhnQ3VBTktDaTFnQi1vQkR3b05XZ3NLQndpR0FSSUFHQXNZQl9JQkV3b1JVbVZqWlc1MGJIa2dkWEJzYjJGa1pXUzRBd2NLUy1vQkR3b05XZ3NLQndpR0FSSUFHQXNZQlBJQkJnb0VUR2wyWmNJQ0ozbHZkWFIxWW1WZmJHbDJaVjlpY205aFpHTmhjM1JmYzNSaGRIVnpQVEFnT25SNWNHVTZjdWdDQWJnREJBb19DaEIyYjJ4MmJ5QmpjbUZ6YUNCMFpYTjA2Z0VhQ2hoYUZnb0hDSVlCRWdBWUN4SUxkbTlzZG05ZlkzSmhjMmp5QVEwS0MxWnZiSFp2SUdOeVlYTm9DaTBLQ25SbGMzUWdiWFZ6YVdQcUFSUUtFbG9RQ2djSWhnRVNBQmdMRWdWdGRYTnBZX0lCQndvRlRYVnphV01LT1FvT1kyRnlJR055WVhOb0lIUmxjM1RxQVJnS0Zsb1VDZ2NJaGdFU0FCZ0xFZ2xqWVhKZlkzSmhjMmp5QVFzS0NVTmhjaUJqY21GemFBb2tDZ2RwY1NCMFpYTjA2Z0VSQ2c5YURRb0hDSVlCRWdBWUN4SUNhWEh5QVFRS0FrbHhDaTBLQ25SbGMzUWdZWFZrYVdfcUFSUUtFbG9RQ2djSWhnRVNBQmdMRWdWaGRXUnBiX0lCQndvRlFYVmthVzhLUHdvUWNHVnljMjl1WVd4cGRIa2dkR1Z6ZE9vQkdnb1lXaFlLQndpR0FSSUFHQXNTQzNCbGNuTnZibUZzYVhSNThnRU5DZ3RRWlhKemIyNWhiR2wwZVFvb0NnVjBaWE4wYi1vQkZBb1NXaEFLQndpR0FSSUFHQXNTQlhSbGMzUnY4Z0VIQ2dWVVpYTjBid29rQ2dkMFpYTjBJRzFsNmdFUkNnOWFEUW9IQ0lZQkVnQVlDeElDYldYeUFRUUtBazFsQ2pRS0NYUmxjM1FnZEdWemRPb0JHQW9XV2hRS0J3aUdBUklBR0FzU0NYUmx
jM1JmZEdWemRQSUJDd29KVkdWemRDQjBaWE4wQ2ljS0NHVjVaU0IwWlhOMDZnRVNDaEJhRGdvSENJWUJFZ0FZQ3hJRFpYbGw4Z0VGQ2dORmVXVUtQQW9QYldWdWRHRnNJR0ZuWlNCMFpYTjA2Z0VaQ2hkYUZRb0hDSVlCRWdBWUN4SUtiV1Z1ZEdGc1gyRm5aZklCREFvS1RXVnVkR0ZzSUdGblpRb19DaEJqYjJ4dmNpQmliR2x1WkNCMFpYTjA2Z0VhQ2hoYUZnb0hDSVlCRWdBWUN4SUxZMjlzYjNKZllteHBibVR5QVEwS0MwTnZiRzl5SUdKc2FXNWtHQXRhRFFvTENBUXFCd2lHQVJJQUdBdDRBQSUzRCUzRJABARiB4OgYIgtzZWFyY2gtcGFnZQ%3D%3D",
}

##

response = requests.post(URL, headers = HEADERS, json = data)
#print(response.text)
data = response.json()
dataStr = json.dumps(data, indent = 4)
#print(dataStr)
print('ADHD Test' in dataStr)

Benjamin-Loison avatar Sep 11 '24 13:09 Benjamin-Loison

import base64
import blackboxprotobuf
import urllib.parse as ul
import copy
import binascii

def getBase64Protobuf(message, typedef):
    """Encode `message` as Protobuf according to `typedef` and return it as a standard Base64 ASCII string."""
    encoded = base64.b64encode(blackboxprotobuf.encode_message(message, typedef))
    return encoded.decode('ascii')

def isRequestStillFine(httpMethod, url, params, headers, data, needle):
    """Send the request and return the result of `isDataOnlyContainingShorts` on its JSON response.

    `httpMethod` is called as `httpMethod(url, params = ..., headers = ..., json = ...)` and its result must provide `.json()`.

    NOTE(review): `needle` is accepted but not used, as the check is delegated to `isDataOnlyContainingShorts` - confirm this is intended.
    """
    # The response is no longer serialized with `json.dumps`: its result was only used by a commented out debugging `print`.
    response = httpMethod(url, params = params, headers = headers, json = data)
    return isDataOnlyContainingShorts(response.json())

def isRequestStillFineExplicit(httpMethod, url, params, headers, data, dataPath, message, typedef, needle):
    # Encode the candidate Protobuf, hand it to `setDataFromPath` together with `data` and `dataPath`, then check the resulting request.
    encodedProtobuf = getBase64Protobuf(message, typedef)
    setDataFromPath(data, dataPath, encodedProtobuf)
    stillFine = isRequestStillFine(httpMethod, url, params, headers, data, needle)
    return stillFine

# Will need to proceed recursively
def minimizeProtobuf(httpMethod, url, params, headers, data, dataPath, needle, messages = None, typedefs = None):
    """
    Minimize the Base64 encoded Protobuf stored at `dataPath` in `data`.

    The entry is URL-unquoted, Base64 decoded (with `-` and `_` as alternative characters) and decoded with `blackboxprotobuf`. Entries of the decoded message are then removed one at a time, a removal being kept whenever `isRequestStillFineExplicit` still accepts the request.

    Parameters:
    - `httpMethod`, `url`, `params`, `headers`, `needle`: forwarded as is to `isRequestStillFineExplicit`.
    - `data`: request payload, modified in place by every attempt as `isRequestStillFineExplicit` writes the candidate Protobuf at `dataPath`.
    - `dataPath`: `/` separated path of the Protobuf entry within `data`.
    - `messages`, `typedefs`: optional lists extended in place with the results, new lists are created when omitted.

    Returns the `(messages, typedefs)` lists, containing the minimized message and its (not minimized) typedef, followed by the results of the recursive calls.
    """
    # Create the accumulators per call. Mutable default arguments would be shared between calls, notably with the recursive call below which would return the very same list and make `messages += messagesRecursive` duplicate its content.
    if messages is None:
        messages = []
    if typedefs is None:
        typedefs = []
    print(dataPath)
    print(json.dumps(data, indent = 4))
    rawEntry = base64.b64decode(ul.unquote_plus(getDataFromPath(data, dataPath)), altchars = '-_')
    message, typedef = blackboxprotobuf.decode_message(rawEntry)
    #print(json.dumps(message, indent = 4))
    print(json.dumps(typedef, indent = 4))
    # Based on [YouTube-operational-API/blob/11566147f4d54b8d8d8481709fd5bf6b1329f4de/tools/minimizeCURL.py](https://github.com/Benjamin-Loison/YouTube-operational-API/blob/11566147f4d54b8d8d8481709fd5bf6b1329f4de/tools/minimizeCURL.py) `isJson`.
    def getPaths(d):
        # Yield the path of every key of the nested dictionaries, each parent before its children.
        if isinstance(d, dict):
            for key, value in d.items():
                yield f'/{key}'
                yield from (f'/{key}{p}' for p in getPaths(value))

    # If a single unknown entry is necessary, then this algorithm seems to most efficiently go from parents to children, if necessary, to remove the other entries. Hence, it seems to proceed in a linear number of HTTPS requests and not a quadratic one.
    # Try to remove unnecessary entries until nothing changes anymore. Assuming the logical behavior just mentioned, would a single loop iteration not be enough? Not with the current design, see (1).
    while True:
        changedSomething = False
        # Note that the paths go from parents to children, which is quite a wanted behavior to quickly remove useless chunks.
        # For all entries, copy the current `message` and try to remove an entry.
        for path in getPaths(message):
            # Copy the current `message`.
            messageCopy = copy.deepcopy(message)
            # Remove an entry.
            # Pay attention that path parts are always strings here, so this only works with string keys.
            parent = messageCopy
            pathParts = path[1:].split('/')
            for pathPart in pathParts[:-1]:
                parent = parent[pathPart]
            del parent[pathParts[-1]]
            # Test if the removed entry was necessary.
            # (1) If it was unnecessary, then reconsider the paths, excluding the possible children paths of this unnecessary entry, ensuring optimized complexity it seems.
            if isRequestStillFineExplicit(httpMethod, url, params, headers, data, dataPath, messageCopy, typedef, needle):
                print(len(json.dumps(data)), 'still fine')
                changedSomething = True
                message = messageCopy
                break
            # If it was necessary, we consider the possible children paths of this necessary entry and the other paths.
        # If a loop iteration considering all paths does not change anything, then the request cannot be minimized further.
        if not changedSomething:
            break
    # Maybe minimize `typedef` once `message` has been minimized. Especially as `field_order` can be removed if we know that the other entries are not needed.
    # However, implementing such a minimization can be postponed, as minimizing `typedef` by hand once `message` has been minimized is quick.
    messages += [message]
    typedefs += [typedef]
    for path in getPaths(message):
        leaf = getDataFromPath(message, path)
        # To avoid intermediary nodes.
        if isinstance(leaf, str):
            try:
                # Only used as a probe: `binascii.Error` means that `leaf` is not Base64, so it is skipped.
                base64.b64decode(ul.unquote_plus(leaf))
                print(path)
                # Replace the leaf with a placeholder named after its own key, instead of relying on the `pathPart` variable left over (or left unbound) by the removal loop above.
                leafKey = path.rsplit('/', 1)[-1]
                setDataFromPath(message, path, f'_{leafKey}')
                # Forward this call's own arguments instead of module-level globals.
                # NOTE(review): `leaf` is a string passed as `data` together with a path rooted in the outer `data`, while `getDataFromPath` indexes with string keys, so this recursion looks unfinished - confirm the intended arguments.
                messagesRecursive, typedefsRecursive = minimizeProtobuf(httpMethod, url, params, headers, leaf, dataPath + path, needle)
                messages += messagesRecursive
                typedefs += typedefsRecursive
            except binascii.Error:
                pass
    return messages, typedefs

def getDataFromPath(data, path):
    """
    Return the value reached in the nested container `data` by following `path`.

    The first character of `path` (expected to be `/`) is dropped and the remainder is split on `/` to get the successive keys, for instance `/a/b` returns `data['a']['b']`.
    """
    currentNode = data
    keys = path[1:].split('/')
    for key in keys:
        currentNode = currentNode[key]
    return currentNode

def setDataFromPath(data, path, value):
    """
    Assign `value` in the nested container `data` at the location described by `path`.

    The first character of `path` (expected to be `/`) is dropped and the remainder is split on `/`: all keys but the last one are followed, and the last one is assigned in place, for instance `/a/b` performs `data['a']['b'] = value`.
    """
    *parentKeys, lastKey = path[1:].split('/')
    container = data
    for parentKey in parentKeys:
        container = container[parentKey]
    container[lastKey] = value

# `requests` function used to send every attempt.
HTTP_METHOD = requests.post
# Path, within `DATA`, of the Base64 encoded Protobuf to minimize.
DATA_PATH = '/continuation'
# `URL`, `PARAMS`, `HEADERS`, `DATA` and `NEEDLE` are not defined in this snippet.
messages, typedefs = minimizeProtobuf(HTTP_METHOD, URL, PARAMS, HEADERS, DATA, DATA_PATH, NEEDLE)
# Print the returned messages.
print(json.dumps(messages, indent = 4))
#print(json.dumps(typedef, indent = 4))

Benjamin-Loison avatar Sep 11 '24 13:09 Benjamin-Loison

Similar to:

https://github.com/Benjamin-Loison/YouTube-operational-API/blob/0e4168e9ed307cc760f1fe4d69aa8143a4a66ba1/tools/minimizeCURL.py#L151-L228 isJson part.

Commenting this code seems to make sense.

git blame may help:
commit bb31362388beba3aa97b25d40dcf924e2c2bc931
Author: Benjamin Loison <[email protected]>
Date:   Wed Jun 14 21:36:31 2023 +0200

    Removes unnecessary raw data in `tools/minimizeCURL.py`

does not help much.

Benjamin-Loison avatar Sep 11 '24 14:09 Benjamin-Loison

Non-simplified Protobuf typedef with simplified message:
{
    "2": {
        "2": "test"
    }
}
{
    "2": {
        "field_order": [
            "2",
            "3",
            "18"
        ],
        "message_typedef": {
            "2": {
                "type": "string"
            },
            "3": {
                "type": "string"
            },
            "18": {
                "type": "int"
            }
        },
        "type": "message"
    },
    "3": {
        "type": "int"
    },
    "4": {
        "type": "string"
    }
}

Benjamin-Loison avatar Sep 11 '24 21:09 Benjamin-Loison

Simplified Protobuf typedef:
{
    "2": {
        "message_typedef": {
            "2": {
                "type": "string"
            }
        },
        "type": "message"
    }
}
Hence, simplified Python script:
import requests
import json
import base64
import blackboxprotobuf

def getBase64Protobuf(message, typedef):
    """
    Encode `message` following `typedef` with `blackboxprotobuf` and return the result as a standard Base64 ASCII string.
    """
    protobufBytes = blackboxprotobuf.encode_message(message, typedef)
    base64Bytes = base64.b64encode(protobufBytes)
    return base64Bytes.decode('ascii')

# Endpoint queried by this script.
URL = 'https://www.youtube.com/youtubei/v1/search'
HEADERS = {
    'Content-Type': 'application/json',
}

# Minimized Protobuf message: field `2` is a nested message whose field `2` is the string `test`.
message = {
    '2': {
        '2': 'test'
    }
}

# Minimized `blackboxprotobuf` typedef describing only the fields used in `message`.
typedef = {
    '2': {
        'message_typedef': {
            '2': {
                'type': 'string'
            }
        },
        'type': 'message'
    }
}

# Base64 encoded Protobuf sent as the `continuation` value.
continuation = getBase64Protobuf(message, typedef)

# JSON body of the request.
data = {
    'context': {
        'client': {
            'clientName': 'WEB',
            'clientVersion': '2.20240909.02.00',
        }
    },
    'continuation': continuation,
}

##

response = requests.post(URL, headers = HEADERS, json = data)
#print(response.text)
# From here on, `data` is rebound from the request payload to the decoded JSON response.
data = response.json()
# Serialize the response so that it can be searched with a plain substring test.
dataStr = json.dumps(data, indent = 4)
#print(dataStr)
# Print whether the string `ADHD Test` appears anywhere in the serialized response.
print('ADHD Test' in dataStr)

Benjamin-Loison avatar Sep 11 '24 21:09 Benjamin-Loison

PARAMS = {
    'prettyPrint': 'false',
}

can be useful to check that the first item is a given one.

Benjamin-Loison avatar Sep 15 '24 15:09 Benjamin-Loison

Note that there are 2 paths, one inside data and the other within message.

Benjamin-Loison avatar Sep 15 '24 16:09 Benjamin-Loison

It does not seem possible to easily simplify recursively by hand by calling the minimizer again with different arguments.

Benjamin-Loison avatar Sep 15 '24 16:09 Benjamin-Loison