requests icon indicating copy to clipboard operation
requests copied to clipboard

respect no_proxy environment variable and proxies['no'] parameter

Open milahu opened this issue 7 years ago • 13 comments

make requests respect no_proxy settings

bugfix attached

Expected Result

HTTP requests to whitelisted URLs should bypass all proxies

whitelisted URLs, as defined in the no_proxy env var

Actual Result

proxies are not bypassed

the sample script will raise

requests.exceptions.ConnectionError: SOCKSHTTPConnectionPool ....: Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.contrib.socks.SOCKSConnection object at ....>: Failed to establish a new connection: 0x01: General SOCKS server failure',))

Reproduction Steps

use case: torify python requests, but also allow requests to localhost etc.

sample script
#!/usr/bin/python2

# license = public domain

import os
import random
import time
import requests
import BaseHTTPServer
import thread
import bs4

tor_host = '127.0.0.1'
#tor_port = 9050 # system-wide tor
tor_port = 9150 # torbrowser tor

# do not use tor to connect to local or private hosts
# see https://en.wikipedia.org/wiki/Reserved_IP_addresses
no_proxy_list = [
        # hostnames are not resolved locally with socks5h proxy
        'localhost',
        'localhost.localdomain',
        # IPv4
        '127.0.0.0/8', # localhost
        # subnets
        '169.254.0.0/16',
        '255.255.255.255',
        # LAN aka private networks
        '10.0.0.0/8',
        '100.64.0.0/10',
        '172.16.0.0/12',
        '192.0.0.0/24',
        '192.168.0.0/16',
        '198.18.0.0/15',
        # IPv6
        '::1/128', # localhost
        'fc00::/7', # LAN
        'fe80::/10', # link-local
]

# variant 1
os.environ['no_proxy'] = ','.join(no_proxy_list)

def get_tor_session(tor_host='127.0.0.1', tor_port=9050,
        torbrowser_headers=[], no_proxy_list=[]):

        session = requests.session()

        # variant 1
        session.trust_env = True
        #session.trust_env = False # ignore environment variables

        # socks5h scheme = remote DNS = no DNS leaks
        p = 'socks5h://{0}:{1}'.format(tor_host, tor_port)
        session.proxies = {
                'http' : p,
                'https': p,

                # variant 2
                'no': ','.join(no_proxy_list)
        }

        if torbrowser_headers == []:
                print('warning. got no torbrowser_headers')
                # at least imitate torbrowser from year 2018
                torbrowser_headers = [
                        ('accept-language', 'en-US,en;q=0.5'),
                        ('accept', 'text/html,application/xhtml+xml,' \
                                + 'application/xml;q=0.9,*/*;q=0.8'),
                        ('user-agent', 'Mozilla/5.0 (Windows NT 6.1; rv:60.0) ' \
                                + 'Gecko/20100101 Firefox/60.0'),
                        ('upgrade-insecure-requests', '1'),
                ]

        for k, v in torbrowser_headers:
                # header 'host' is dynamic
                # header 'connection' = 'keep-alive' is set internally
                if k not in ['host', 'connection']:
                        session.headers[k] = v

        return session



tor = get_tor_session(tor_host, tor_port, [], no_proxy_list)



test_host = '127.0.0.1'
test_port = random.randint(8000, 16000)
test_url = 'http://{0}:{1}/'.format(test_host, test_port)

def test_tor_get(test_url):
        time.sleep(2) # wait for http server to start
        tor.get(test_url)
thread.start_new_thread(test_tor_get, (test_url,))

test_headers = [] # global
class test_handler(BaseHTTPServer.BaseHTTPRequestHandler):
        def do_GET(self): # handle GET request
                global test_headers
                test_headers = self.headers.items()
                self.send_response(204, 'No Content')
                self.end_headers()

serv = BaseHTTPServer.HTTPServer((test_host, test_port), test_handler)
serv.handle_request() # handle one request
del serv

print('tor.get headers')
for k, v in test_headers:
        print('header %s: %s' % (k, v))

#print('tor ip '+tor.get("http://httpbin.org/ip").text)

print('tor check ' + \
bs4.BeautifulSoup(
tor.get("https://check.torproject.org/").text, 'html.parser'
).title.string.strip())

System Information

  • python2
  • current git-version of requests

~~Bugfix~~ Quickfix

the bug is in sessions.py

proxies = merge_setting(proxies, self.proxies)

where the request-level proxies dict was set to {} by utils.get_environ_proxies *, but is then overwritten when merged with the session proxies

* with os.environ['no_proxy'] = '127.0.0.1'

this bugfix will respect both

  • no_proxy environment variable aka os.environ['no_proxy']
  • proxies['no'] parameter for requests.get and requests.session
patch
--- a/utils.py
+++ b/utils.py
@@ -757,7 +757,7 @@
     :rtype: dict
     """
     if should_bypass_proxies(url, no_proxy=no_proxy):
-        return {}
+        return {'__bypass_proxies': True}
     else:
         return getproxies()


--- a/sessions.py
+++ b/sessions.py
@@ -698,8 +698,15 @@
                 verify = (os.environ.get('REQUESTS_CA_BUNDLE') or
                           os.environ.get('CURL_CA_BUNDLE'))

+        if 'no' in self.proxies:
+            if should_bypass_proxies(url, no_proxy=self.proxies['no']):
+                proxies = {'__bypass_proxies': True}
+
         # Merge all the kwargs.
-        proxies = merge_setting(proxies, self.proxies)
+        if '__bypass_proxies' in proxies:
+            proxies = {} # bypass proxies for this request
+        else:
+            proxies = merge_setting(proxies, self.proxies)
         stream = merge_setting(stream, self.stream)
         verify = merge_setting(verify, self.verify)
         cert = merge_setting(cert, self.cert)

milahu avatar Nov 13 '18 14:11 milahu

no interest in fixing this bug?

milahu avatar Nov 27 '18 15:11 milahu

heloo?

milahu avatar Jan 15 '19 00:01 milahu

Any update on this topic?

eruvanos avatar Jun 03 '19 09:06 eruvanos

I am unsure how contributing to Python has changed over the years, but there is an issue for this matter documented here from 2017: https://bugs.python.org/issue29142

Halkcyon avatar Dec 04 '19 20:12 Halkcyon

Note: I've stumbled upon this issue as part of debugging some unrelated problem.

Looking into it, I noticed that this issue focuses on adding IP addresses to the no_proxy setting. From the available documentation I could find, no_proxy is supposed to be a comma-separated list of domain names; IP addresses aren't supported:

  • Curl: https://curl.haxx.se/docs/manpage.html#NOPROXY
  • Wget: https://www.gnu.org/software/wget/manual/html_node/Proxies.html

When running with a domain name, no_proxy is properly honoured by requests:

>>> import os, requests

# Invalid proxy address; exemption for example.com
>>> os.environ['no_proxy'], os.environ['https_proxy']
('example.com', 'http://localhost:1/')

# Requests to example.com DO bypass the proxy
>>> requests.get('http://example.com')
<Response [200]>

# Requests to example.org DON'T bypass the proxy, and fail.
>>> requests.get('http://example.org')
Traceback (most recent call last):
[...]
requests.exceptions.ProxyError: HTTPConnectionPool(host='localhost', port=1): Max retries exceeded with url: http://example.org/ (Caused by ProxyError('Cannot connect to proxy.', NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fec85a53c10>: Failed to establish a new connection: [Errno 111] Connection refused')))

As far as I can tell, there is no issue in requests here: the library's behaviour is consistent with other HTTP clients in its handling of no_proxy.

For the original use case (bypassing a tor proxy for some IPs), it might be useful to add an additional local proxy that connects directly for those IPs, and chains to Tor for other addresses.

rbarrois avatar Jun 16 '20 15:06 rbarrois

no_proxy is supposed to be a comma-separated list of domain names; IP addresses aren't supported

no, this just makes no sense. network nodes always have a numeric address, and only sometimes have a hostname. under the hood, hostnames are always resolved to numeric addresses.

curl docs - https://curl.haxx.se/docs/manpage.html#NOPROXY

NO_PROXY <comma-separated list of hosts/domains> .... The list of host names can also include numerical IP addresses, and IPv6 versions should then be given without enclosing brackets.

wget docs - https://www.gnu.org/software/wget/manual/html_node/Proxies.html

https_proxy If set, the http_proxy and https_proxy variables should contain the URLs of the proxies for HTTP and HTTPS connections respectively.

no_proxy This variable should contain a comma-separated list of domain extensions proxy should not be used for. For instance, if the value of no_proxy is ‘.mit.edu’, proxy will not be used to retrieve documents from MIT.

this is misleading. URL hosts can be a numeric address or a hostname. *_proxy values should be consistent, so no_proxy should also accept any valid URL host, where subnetworks are valid entries as well - blocking of resources should allow for "fuzzy" IDs / wildcard IDs

just wanted to leave this comment here, i have lost interest in fixing the issue

milahu avatar Jun 17 '20 09:06 milahu

@nateprewitt can you check if the patch in the OP can be merged into requests? Because as requests is now, it does not honor the no_proxy

requests.get('http://10.0.0.200:4454/abc.txt', proxies={'http': 'http://broken-ass-proxy.com', 'https': 'https://broken-ass-proxy.com'}) Will error out with requests.exceptions.ProxyError, as expected.

requests.get('http://10.0.0.200:4454/abc.txt', proxies={'no_proxy': '10.0.0.200', 'http': 'http://broken-ass-proxy.com', 'https': 'https://broken-ass-proxy.com'}) Will error out with requests.exceptions.ProxyError. This should not happen, as the no_proxy should take effect, before the http and https. The request should have been sent directly.

But with the patch that @milahu provided, the no_proxy is honored and works as intended.

Suika avatar Sep 23 '20 23:09 Suika

>>WITH OP's PATCH<<

import requests
s = requests.Session()
s.proxies = {'no_proxy':'10.0.0.200', 'http': 'http://broken-ass-proxy.com'}
s.get('http://10.0.0.200:4454/abc.txt')

Will end up with requests.exceptions.ProxyError: HTTPConnectionPool(host='broken-ass-proxy.com', port=80): Max retries exceeded with url Creating a session and assigning it some proxies seems to fail in this case.

Seems that the s.proxies is never called in s.get, meaning that the call https://github.com/psf/requests/blob/967a05bfffcb68f97296eda197b062221c2ebc0d/requests/sessions.py#L530-L534 will always get an empty proxy var. Which in turn will mess with the following logic and prevent the no_proxy from working as intended. Followed by get_environ_proxies > should_bypass_proxies def, which needs the no_proxy to be extracted, to determine if the proxy should be bypassed or not https://github.com/psf/requests/blob/02eb5a2cd34d36548ebb08528c73ca66c2a398d9/requests/sessions.py#L708-L713

Suika avatar Sep 24 '20 03:09 Suika

@nateprewitt can you check if the patch in the OP can be merged into requests?

not the original patch, because it breaks a function interface by adding a hidden property to the return object (a quick-and-dirty fix), which makes a test fail

if you wanna fix this, you will have to change the function interface (return nested object with proxy-map and optional parameters) and update the test

milahu avatar Sep 24 '20 06:09 milahu

The longer I look at it, it starts making more sense to call should_bypass_proxies(url, no_proxy) inside merge_environment_settings just before https://github.com/psf/requests/blob/967a05bfffcb68f97296eda197b062221c2ebc0d/requests/sessions.py#L722 And decide if the proxy is to be forced to {} or allowed to merge proxies.

Suika avatar Sep 24 '20 10:09 Suika

Coming across this now, is there any way in python to no_proxy an ip range (like a 10.0.0.0/8)? According to this, no: https://about.gitlab.com/blog/2021/01/27/we-need-to-talk-no-proxy/#no_proxy. Article mentions curl discussion which is interesting to look at too: https://github.com/curl/curl/issues/1208

bendem avatar May 12 '23 11:05 bendem

Code that works as expected:

import os
import sys
import requests
import traceback

url = "https://github.com"

# When only environment variables are passed, everything works as expected
# requests module will take them into account and go through the defined proxies
# unless no_proxy matches.
os.environ.update({
    "http_proxy": "http://dummy-proxy.com:443",
    "https_proxy": "http://dummy-proxy.com:443",
   # "no_proxy": ".whatever.com"  # Uncomment to fail with env proxies
    "no_proxy": ".github.com"  # Uncomment to pass with env proxies
})

with requests.Session() as session:
    req = requests.Request(method="GET", url=url)
    prep = session.prepare_request(req)
    settings = session.merge_environment_settings(prep.url, None, None, None, None)
    try:
        print(session.request(method="GET", url=url, timeout=1))
    except requests.exceptions.ProxyError:
        print(traceback.format_exc().strip().splitlines()[-1], file=sys.stderr)
    finally:
        print("\nProxies:", settings.get("proxies"))

Code that DOESN'T work as expected:

import os
import sys
import requests
import traceback

url = "https://github.com"

# When only environment variables are passed, everything works as expected
# requests module will take them into account and go through the defined proxies
# unless no_proxy matches.
os.environ.update({
    "http_proxy": "http://dummy-proxy.com:443",
    "https_proxy": "http://dummy-proxy.com:443",
   # "no_proxy": ".whatever.com"  # Uncomment to fail with env proxies
    "no_proxy": ".github.com"  # Uncomment to pass with env proxies
})

# Doesn't work even if adding "no": ".github.com" or "no_proxy": ".github.com" inside the proxies kwarg
proxies = {"http": "http://dummy-proxy.com:443", "https": "http://dummy-proxy.com:443"}

with requests.Session() as session:
    req = requests.Request(method="GET", url=url)
    prep = session.prepare_request(req)
    settings = session.merge_environment_settings(prep.url, proxies, None, None, None)
    try:
        print(session.request(method="GET", proxies=proxies, url=url, timeout=1))
    except requests.exceptions.ProxyError:
        print(traceback.format_exc().strip().splitlines()[-1], file=sys.stderr)
    finally:
        print("\nProxies:", settings.get("proxies"))

Like @milahu said, the issue comes from utils.get_environ_proxies and sessions.merge_environment_settings

pBogey avatar Feb 23 '24 07:02 pBogey

https://github.com/psf/requests/pull/5596#issuecomment-1961184126

milahu avatar Feb 23 '24 11:02 milahu