requests
requests copied to clipboard
Unicode surrogates in POST data lead to exception
In requests 2.26.0 with Python 3, when passing Unicode strings containing surrogate characters as POST data (either directly as a str or as part of a dict), sending the request raises an exception.
Expected Result
The library should not raise an exception, even when fed unvalidated user input.
Actual Result / Reproduction Steps
>>> import requests
>>> requests.__version__
'2.26.0'
>>> requests.post('https://example.invalid', data={'name': 'test\udced\udcb3\udc83.pdf'})
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/user/.local/lib/python3.7/site-packages/requests/api.py", line 117, in post
return request('post', url, data=data, json=json, **kwargs)
File "/home/user/.local/lib/python3.7/site-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/home/user/.local/lib/python3.7/site-packages/requests/sessions.py", line 528, in request
prep = self.prepare_request(req)
File "/home/user/.local/lib/python3.7/site-packages/requests/sessions.py", line 466, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "/home/user/.local/lib/python3.7/site-packages/requests/models.py", line 319, in prepare
self.prepare_body(data, files, json)
File "/home/user/.local/lib/python3.7/site-packages/requests/models.py", line 515, in prepare_body
body = self._encode_params(data)
File "/home/user/.local/lib/python3.7/site-packages/requests/models.py", line 104, in _encode_params
v.encode('utf-8') if isinstance(v, str) else v))
UnicodeEncodeError: 'utf-8' codec can't encode characters in position 4-6: surrogates not allowed
>>> requests.post('https://example.com/', 'test\udced\udcb3\udc83.pdf')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/user/.local/lib/python3.7/site-packages/requests/api.py", line 117, in post
return request('post', url, data=data, json=json, **kwargs)
File "/home/user/.local/lib/python3.7/site-packages/requests/api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "/home/user/.local/lib/python3.7/site-packages/requests/sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "/home/user/.local/lib/python3.7/site-packages/requests/sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "/home/user/.local/lib/python3.7/site-packages/requests/adapters.py", line 449, in send
timeout=timeout
File "/home/user/.local/lib/python3.7/site-packages/urllib3/connectionpool.py", line 706, in urlopen
chunked=chunked,
File "/home/user/.local/lib/python3.7/site-packages/urllib3/connectionpool.py", line 394, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/home/user/.local/lib/python3.7/site-packages/urllib3/connection.py", line 239, in request
super(HTTPConnection, self).request(method, url, body=body, headers=headers)
File "/usr/lib/python3.7/http/client.py", line 1260, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/usr/lib/python3.7/http/client.py", line 1305, in _send_request
body = _encode(body, 'body')
File "/usr/lib/python3.7/http/client.py", line 174, in _encode
(name.title(), data[err.start:err.end], name)) from None
UnicodeEncodeError: 'latin-1' codec can't encode characters in position 4-6: Body ('\udced\udcb3\udc83') is not valid Latin-1. Use body.encode('utf-8') if you want to send it encoded in UTF-8.
System Information
$ python -m requests.help
{
"chardet": {
"version": "3.0.4"
},
"charset_normalizer": {
"version": "2.0.9"
},
"cryptography": {
"version": ""
},
"idna": {
"version": "3.3"
},
"implementation": {
"name": "CPython",
"version": "3.7.3"
},
"platform": {
"release": "4.19.0-17-amd64",
"system": "Linux"
},
"pyOpenSSL": {
"openssl_version": "",
"version": null
},
"requests": {
"version": "2.26.0"
},
"system_ssl": {
"version": "1010104f"
},
"urllib3": {
"version": "1.26.7"
},
"using_charset_normalizer": false,
"using_pyopenssl": false
}
In requests/models.py, there are several calls like some_string.encode('utf-8'). I suppose changing them to some_string.encode('utf-8', 'surrogatepass') would at least fix the first reproduction case. I am not sure about the second case, since there the exception is raised inside http.client.