requests
requests copied to clipboard
respect no_proxy environment variable and proxies['no'] parameter
make requests respect no_proxy settings
bugfix attached
Expected Result
http requests to 'white listed urls' should bypass all proxies
white listed urls, as defined in the no_proxy env var
Actual Result
proxies are not bypassed
the sample script will raise
requests.exceptions.ConnectionError: SOCKSHTTPConnectionPool ....: Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.contrib.socks.SOCKSConnection object at ....>: Failed to establish a new connection: 0x01: General SOCKS server failure',))
Reproduction Steps
use case: torify python requests, but also allow requests to localhost etc.
sample script
#!/usr/bin/python2
# license = public domain
import os
import random
import time
import requests
import BaseHTTPServer
import thread
import bs4
tor_host = '127.0.0.1'
#tor_port = 9050 # system-wide tor
tor_port = 9150 # torbrowser tor
# do not use tor to connect to local or private hosts
# see https://en.wikipedia.org/wiki/Reserved_IP_addresses
no_proxy_list = [
# hostnames are not resolved locally with socks5h proxy
'localhost',
'localhost.localdomain',
# IPv4
'127.0.0.0/8', # localhost
# subnets
'169.254.0.0/16',
'255.255.255.255',
# LAN aka private networks
'10.0.0.0/8',
'100.64.0.0/10',
'172.16.0.0/12',
'192.0.0.0/24',
'192.168.0.0/16',
'198.18.0.0/15',
# IPv6
'::1/128', # localhost
'fc00::/7', # LAN
'fe80::/10', # link-local
]
# variant 1
os.environ['no_proxy'] = ','.join(no_proxy_list)
def get_tor_session(tor_host='127.0.0.1', tor_port=9050,
torbrowser_headers=[], no_proxy_list=[]):
session = requests.session()
# variant 1
session.trust_env = True
#session.trust_env = False # ignore environment variables
# socks5h scheme = remote DNS = no DNS leaks
p = 'socks5h://{0}:{1}'.format(tor_host, tor_port)
session.proxies = {
'http' : p,
'https': p,
# variant 2
'no': ','.join(no_proxy_list)
}
if torbrowser_headers == []:
print('warning. got no torbrowser_headers')
# at least imitate torbrowser from year 2018
torbrowser_headers = [
('accept-language', 'en-US,en;q=0.5'),
('accept', 'text/html,application/xhtml+xml,' \
+ 'application/xml;q=0.9,*/*;q=0.8'),
('user-agent', 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) ' \
+ 'Gecko/20100101 Firefox/60.0'),
('upgrade-insecure-requests', '1'),
]
for k, v in torbrowser_headers:
# header 'host' is dynamic
# header 'connection' = 'keep-alive' is set internally
if k not in ['host', 'connection']:
session.headers[k] = v
return session
tor = get_tor_session(tor_host, tor_port, [], no_proxy_list)
test_host = '127.0.0.1'
test_port = random.randint(8000, 16000)
test_url = 'http://{0}:{1}/'.format(test_host, test_port)
def test_tor_get(test_url):
time.sleep(2) # wait for http server to start
tor.get(test_url)
thread.start_new_thread(test_tor_get, (test_url,))
test_headers = [] # global
class test_handler(BaseHTTPServer.BaseHTTPRequestHandler):
def do_GET(self): # handle GET request
global test_headers
test_headers = self.headers.items()
self.send_response(204, 'No Content')
self.end_headers()
serv = BaseHTTPServer.HTTPServer((test_host, test_port), test_handler)
serv.handle_request() # handle one request
del serv
print('tor.get headers')
for k, v in test_headers:
print('header %s: %s' % (k, v))
#print('tor ip '+tor.get("http://httpbin.org/ip").text)
print('tor check ' + \
bs4.BeautifulSoup(
tor.get("https://check.torproject.org/").text, 'html.parser'
).title.string.strip())
System Information
- python2
- current git-version of requests
~~Bugfix~~ Quickfix
the bug is in sessions.py
proxies = merge_setting(proxies, self.proxies)
where [request_]proxies was set to {} by utils.get_environ_proxies * but proxies is set to session_proxies
* with os.environ['no_proxy'] = '127.0.0.1'
this bugfix will respect both
- no_proxy environment variable aka os.environ['no_proxy']
- proxies['no'] parameter for requests.get and requests.session
patch
--- a/utils.py
+++ b/utils.py
@@ -757,7 +757,7 @@
:rtype: dict
"""
if should_bypass_proxies(url, no_proxy=no_proxy):
- return {}
+ return {'__bypass_proxies': True}
else:
return getproxies()
--- a/sessions.py
+++ b/sessions.py
@@ -698,8 +698,15 @@
verify = (os.environ.get('REQUESTS_CA_BUNDLE') or
os.environ.get('CURL_CA_BUNDLE'))
+ if 'no' in self.proxies:
+ if should_bypass_proxies(url, no_proxy=self.proxies['no']):
+ proxies = {'__bypass_proxies': True}
+
# Merge all the kwargs.
- proxies = merge_setting(proxies, self.proxies)
+ if '__bypass_proxies' in proxies:
+ proxies = {} # bypass proxies for this request
+ else:
+ proxies = merge_setting(proxies, self.proxies)
stream = merge_setting(stream, self.stream)
verify = merge_setting(verify, self.verify)
cert = merge_setting(cert, self.cert)
no interest in fixing this bug?
Hello?
Any update on this topic?
I am unsure how contributing to Python has changed over the years, but there is an issue for this matter documented here from 2017: https://bugs.python.org/issue29142
Note: I've stumbled upon this issue as part of debugging some unrelated problem.
Looking into it, I noticed that this issue focuses on adding IP addresses to the no_proxy setting.
From the available documentation I could find, no_proxy is supposed to be a comma-separated list of domain names; IP addresses aren't supported:
- Curl: https://curl.haxx.se/docs/manpage.html#NOPROXY
- Wget: https://www.gnu.org/software/wget/manual/html_node/Proxies.html
When running with a domain name, no_proxy is properly honoured by requests:
>>> import os, requests
# Invalid proxy address; exemption for example.com
>>> os.environ['no_proxy'], os.environ['https_proxy']
('example.com', 'http://localhost:1/')
# Requests to example.com DO bypass the proxy
>>> requests.get('http://example.com')
<Response [200]>
# Requests to example.org DON'T bypass the proxy, and fail.
>>> requests.get('http://example.org')
Traceback (most recent call last):
[...]
requests.exceptions.ProxyError: HTTPConnectionPool(host='localhost', port=1): Max retries exceeded with url: http://example.org/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fec85a53c10>: Failed to establish a new connection: [Errno 111] Connection refused')))
As far as I can tell, there is no issue in requests here: the library's behaviour is consistent with other HTTP clients in its handling of no_proxy.
For the original use case (bypassing a tor proxy for some IPs), it might be useful to add an additional local proxy that connects directly for those IPs, and chains to Tor for other addresses.
no_proxy is supposed to be a comma-separated list of domain names; IP addresses aren't supported
no, this just makes no sense. network nodes always have a numeric address, and only sometimes have a hostname. under the hood, hostnames are always resolved to numeric addresses.
curl docs - https://curl.haxx.se/docs/manpage.html#NOPROXY
NO_PROXY <comma-separated list of hosts/domains> .... The list of host names can also include numerical IP addresses, and IPv6 versions should then be given without enclosing brackets.
wget docs - https://www.gnu.org/software/wget/manual/html_node/Proxies.html
https_proxy If set, the http_proxy and https_proxy variables should contain the URLs of the proxies for HTTP and HTTPS connections respectively.
no_proxy This variable should contain a comma-separated list of domain extensions proxy should not be used for. For instance, if the value of no_proxy is ‘.mit.edu’, proxy will not be used to retrieve documents from MIT.
this is misleading. URL hosts can be numeric-address or hostname. *_proxy values should be consistent, so no_proxy also should accept any valid URL, where subnetworks also are valid resources - blocking of resources should allow for "fuzzy" IDs / wildcard IDs
just wanted to leave this comment here, i have lost interest in fixing the issue
@nateprewitt can you check if the patch in OP's can be patched into requests? Because as requests is now, it does not honor the no_proxy
requests.get('http://10.0.0.200:4454/abc.txt', proxies={'http': 'http://broken-ass-proxy.com', 'https': 'https://broken-ass-proxy.com'})
Will error out with requests.exceptions.ProxyError, as expected.
requests.get('http://10.0.0.200:4454/abc.txt', proxies={'no_proxy': '10.0.0.200', 'http': 'http://broken-ass-proxy.com', 'https': 'https://broken-ass-proxy.com'})
Will error out with requests.exceptions.ProxyError. This should not happen, as the no_proxy should take effect before the http and https. The request should have been sent directly.
But with the patch that @milahu provided, the no_proxy is honored and works as intended.
>>WITH OP's PATCH<<
import requests
s = requests.Session()
s.proxies = {'no_proxy':'10.0.0.200', 'http': 'http://broken-ass-proxy.com'}
s.get('http://10.0.0.200:4454/abc.txt')
Will end up with requests.exceptions.ProxyError: HTTPConnectionPool(host='broken-ass-proxy.com', port=80): Max retries exceeded with url
Creating a session and assigning it some proxies seems to fail in this case.
Seems that the s.proxies is never called in s.get, meaning that the call https://github.com/psf/requests/blob/967a05bfffcb68f97296eda197b062221c2ebc0d/requests/sessions.py#L530-L534 will always get an empty proxy var. Which in turn will mess with the following logic and prevent the no_proxy from working as intended. Followed by get_environ_proxies > should_bypass_proxies def, which needs the no_proxy to be extracted, to determine if the proxy should be bypassed or not https://github.com/psf/requests/blob/02eb5a2cd34d36548ebb08528c73ca66c2a398d9/requests/sessions.py#L708-L713
@nateprewitt can you check if the patch in OP's can be patched into requests?
not the original patch, because it breaks a function interface by adding a hidden property to the return object (a quick-and-dirty fix), which makes a test fail
if you wanna fix this, you will have to change the function interface (return nested object with proxy-map and optional parameters) and update the test
The longer I look at it, it starts making more sense to call should_bypass_proxies(url, no_proxy) inside merge_environment_settings just before https://github.com/psf/requests/blob/967a05bfffcb68f97296eda197b062221c2ebc0d/requests/sessions.py#L722
And decide if the proxy is to be forced to {} or allowed to merge proxies.
Coming across this now, is there any way in python to no_proxy an ip range (like a 10.0.0.0/8)? According to this, no: https://about.gitlab.com/blog/2021/01/27/we-need-to-talk-no-proxy/#no_proxy. Article mentions curl discussion which is interesting to look at too: https://github.com/curl/curl/issues/1208
Code that works as expected:
import os
import sys
import requests
import traceback
url = "https://github.com"
# When only environment variables are passed, everything works as expected
# requests module will take them into account and go through the defined proxies
# unless no_proxy matches.
os.environ.update({
"http_proxy": "http://dummy-proxy.com:443",
"https_proxy": "http://dummy-proxy.com:443",
# "no_proxy": ".whatever.com" # Uncomment to fail with env proxies
"no_proxy": ".github.com" # Uncomment to pass with env proxies
})
with requests.Session() as session:
req = requests.Request(method="GET", url=url)
prep = session.prepare_request(req)
settings = session.merge_environment_settings(prep.url, None, None, None, None)
try:
print(session.request(method="GET", url=url, timeout=1))
except requests.exceptions.ProxyError:
print(traceback.format_exc().strip().splitlines()[-1], file=sys.stderr)
finally:
print("\nProxies:", settings.get("proxies"))
Code that DOESN'T work as expected:
import os
import sys
import requests
import traceback
url = "https://github.com"
# When only environment variables are passed, everything works as expected
# requests module will take them into account and go through the defined proxies
# unless no_proxy matches.
os.environ.update({
"http_proxy": "http://dummy-proxy.com:443",
"https_proxy": "http://dummy-proxy.com:443",
# "no_proxy": ".whatever.com" # Uncomment to fail with env proxies
"no_proxy": ".github.com" # Uncomment to pass with env proxies
})
# Doesn't work even if adding "no": ".github.com" or "no_proxy": ".github.com" inside the proxies kwarg
proxies = {"http": "http://dummy-proxy.com:443", "https": "http://dummy-proxy.com:443"}
with requests.Session() as session:
req = requests.Request(method="GET", url=url)
prep = session.prepare_request(req)
settings = session.merge_environment_settings(prep.url, proxies, None, None, None)
try:
print(session.request(method="GET", proxies=proxies, url=url, timeout=1))
except requests.exceptions.ProxyError:
print(traceback.format_exc().strip().splitlines()[-1], file=sys.stderr)
finally:
print("\nProxies:", settings.get("proxies"))
Like @milahu said, the issue comes from utils.get_environ_proxies and sessions.merge_environment_settings
https://github.com/psf/requests/pull/5596#issuecomment-1961184126