I'm developing a program that scrapes news data with the requests library, but while it runs I get the following error, even though my proxy list is working correctly:

Proxy: 103.21.244.100:80 - Error: HTTPSConnectionPool(host='www.nytimes.com', port=443): Max retries exceeded with url: /section/business/media (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 400 Bad Request')))
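The failure can be isolated outside the GUI. The sketch below is a minimal reproduction built from the proxy and URL in the log above (it is not part of the original program). The list entries are bare host:port strings, so requests falls back to http:// for the proxy and must open a CONNECT tunnel to reach the https:// target:

```python
import requests

# Bare "host:port" proxy entry, as it appears in the error log above.
# requests treats a scheme-less proxy as http://; reaching an https://
# target then requires a CONNECT tunnel through the proxy, and a proxy
# that rejects CONNECT answers 400 Bad Request, raising ProxyError.
proxy = "103.21.244.100:80"

try:
    response = requests.get(
        "https://www.nytimes.com/section/business/media",
        proxies={"http": proxy, "https": proxy},
        timeout=10,
    )
    print(response.status_code)
except requests.exceptions.ProxyError as e:
    print(f"Proxy: {proxy} - Error: {e}")
```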
Reproduction Steps
import sys
import time
import psycopg2
import newspaper
from PyQt5.QtWidgets import QApplication, QMainWindow, QPushButton
from threading import Thread
import requests


class NewsScraperGUI(QMainWindow):
    def __init__(self):
        super().__init__()
        self.initUI()
        self.scraping = False  # Flag to control scraping process

    def initUI(self):
        self.setWindowTitle('News Scraper')
        self.setGeometry(100, 100, 400, 200)
        self.start_button = QPushButton('Start Scraping', self)
        self.start_button.clicked.connect(self.startScraping)
        self.start_button.setGeometry(50, 50, 150, 30)
        self.stop_button = QPushButton('Stop Scraping', self)
        self.stop_button.clicked.connect(self.stopScraping)
        self.stop_button.setGeometry(200, 50, 150, 30)
        self.stop_button.setEnabled(False)  # Initially disabled

    def create_table(self):
        conn = psycopg2.connect(
            dbname='postgres',
            user='postgres',
            password='12Tilak34##',
            host='localhost'
        )
        cur = conn.cursor()
        cur.execute('''
            CREATE TABLE IF NOT EXISTS news (
                id SERIAL PRIMARY KEY,
                source TEXT,
                title TEXT,
                text TEXT,
                url TEXT
            );
        ''')
        conn.commit()
        conn.close()

    def insert_news(self, source, title, text, url):
        conn = psycopg2.connect(
            dbname='postgres',
            user='postgres',
            password='12Tilak34##',
            host='localhost'
        )
        cur = conn.cursor()
        cur.execute('''
            INSERT INTO news (source, title, text, url) VALUES (%s, %s, %s, %s);
        ''', (source, title, text, url))
        conn.commit()
        conn.close()

    def read_proxies(self, filename):
        try:
            with open(filename, 'r') as file:
                proxies = file.readlines()
            # Remove newline characters and any extra whitespace
            proxies = [proxy.strip() for proxy in proxies if proxy.strip()]
            return proxies
        except Exception as e:
            print(f"Error reading proxies from file: {e}")
            return []

    def scrape_news(self, source_url, num_articles, proxies_filename):
        proxies = self.read_proxies(proxies_filename)
        try:
            source = newspaper.build(source_url, memoize_articles=False)
            source.download()
            source.parse()
            articles_scraped = 0
            for article in source.articles:
                if articles_scraped >= num_articles:
                    break
                proxy = proxies.pop(0) if proxies else None  # Get the first proxy from the list
                if proxy:
                    proxies.append(proxy)  # Rotate proxies
                article_url = article.url
                article_title = article.title
                article_text = article.text
                try:
                    if proxy:
                        # Use the requests library to send the request with the selected proxy
                        response = requests.get(article_url, proxies={"http": proxy, "https": proxy}, timeout=10)
                        if response.status_code == 200:
                            # If the request was successful, insert the news item and print the proxy used
                            self.insert_news(source_url, article_title, article_text, article_url)
                            print(f"Proxy: {proxy} - Success: {article_url}")
                        else:
                            print(f"Proxy: {proxy} - Failed: {article_url}")
                    else:
                        print("No proxy available for request.")
                except Exception as e:
                    print(f"Proxy: {proxy} - Error: {e}")
                articles_scraped += 1
        except Exception as e:
            print(f"Error scraping from {source_url}: {e}")

    def startScraping(self):
        self.scraping = True
        self.start_button.setEnabled(False)
        self.stop_button.setEnabled(True)
        self.create_table()
        proxies_filename = "proxylist.txt"
        # Define websites and the number of articles to scrape from each
        websites = [
            {'url': 'http://ft.com', 'num_articles': 5},
            {'url': 'http://nytimes.com', 'num_articles': 5},
            {'url': 'http://www.bloomberg.com/economics', 'num_articles': 5},
            {'url': 'http://economictimes.indiatimes.com', 'num_articles': 5},
        ]
        self.threads = []
        for site in websites:
            thread = Thread(target=self.scrape_news, args=(site['url'], site['num_articles'], proxies_filename))
            self.threads.append(thread)
            thread.start()

    def stopScraping(self):
        self.scraping = False
        self.start_button.setEnabled(True)
        self.stop_button.setEnabled(False)
        for thread in self.threads:
            thread.join()


def main():
    app = QApplication(sys.argv)
    window = NewsScraperGUI()
    window.show()
    sys.exit(app.exec_())


if __name__ == '__main__':
    main()
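A pre-flight check that weeds out proxies unable to tunnel HTTPS can narrow down whether the 400 Bad Request comes from specific entries or from every proxy in the list. This is a minimal sketch, not part of the program above; the helper name is hypothetical and the test URL is taken from the error log:

```python
import requests

def filter_working_proxies(proxies, test_url="https://www.nytimes.com", timeout=10):
    """Keep only proxies that can tunnel an HTTPS request."""
    working = []
    for proxy in proxies:
        # Normalize bare "host:port" entries to the http:// scheme that
        # requests would assume for them anyway.
        proxy_url = proxy if "://" in proxy else f"http://{proxy}"
        try:
            # HEAD keeps the check cheap; an https:// target forces a
            # CONNECT tunnel, so proxies that reject CONNECT fail here.
            requests.head(test_url,
                          proxies={"http": proxy_url, "https": proxy_url},
                          timeout=timeout)
            working.append(proxy_url)
        except requests.exceptions.RequestException as e:
            print(f"Dropping proxy {proxy}: {e}")
    return working
```

Running the result of read_proxies("proxylist.txt") through this filter before starting the scraper threads would surface the entries that fail the tunnel handshake instead of failing mid-scrape.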
System Information
$ python -m requests.help
C:\Python\Lib\site-packages\requests\__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.15) or chardet (5.2.0)/charset_normalizer (2.0.12) doesn't match a supported version!
warnings.warn("urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported "
C:\Python\Lib\site-packages\requests\help.py:25: DeprecationWarning: 'urllib3.contrib.pyopenssl' module is deprecated and will be removed in a future release of urllib3 2.x. Read more in this issue: https://github.com/urllib3/urllib3/issues/2680
from urllib3.contrib import pyopenssl
{
  "chardet": {
    "version": "5.2.0"
  },
  "charset_normalizer": {
    "version": "2.0.12"
  },
  "cryptography": {
    "version": "41.0.1"
  },
  "idna": {
    "version": "3.4"
  },
  "implementation": {
    "name": "CPython",
    "version": "3.11.3"
  },
  "platform": {
    "release": "10",
    "system": "Windows"
  },
  "pyOpenSSL": {
    "openssl_version": "30100010",
    "version": "23.2.0"
  },
  "requests": {
    "version": "2.26.0"
  },
  "system_ssl": {
    "version": "1010114f"
  },
  "urllib3": {
    "version": "1.26.15"
  },
  "using_charset_normalizer": false,
  "using_pyopenssl": true
}
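The RequestsDependencyWarning at the top of that output means one of the installed packages, most likely chardet 5.2.0, falls outside the range requests 2.26.0 was tested with; it is probably unrelated to the proxy tunnel failure, but worth clearing. A small diagnostic sketch (standard library only, Python 3.8+) prints the versions that compatibility check looks at:

```python
# Print the versions requests' import-time compatibility check inspects,
# so the package behind RequestsDependencyWarning is easy to spot.
from importlib.metadata import version, PackageNotFoundError

for package in ("requests", "urllib3", "chardet", "charset_normalizer", "idna"):
    try:
        print(f"{package}: {version(package)}")
    except PackageNotFoundError:
        print(f"{package}: not installed")
```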