requests icon indicating copy to clipboard operation
requests copied to clipboard

(Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 400 Bad Request')))

Open tilakpatel22 opened this issue 1 year ago • 0 comments
trafficstars

I'm developing a program that scrapes news data using the requests library, but while running it I get the error below. My proxy list is working correctly.

error: Proxy: 103.21.244.100:80 - Error: HTTPSConnectionPool(host='www.nytimes.com', port=443): Max retries exceeded with url: /section/business/media (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 400 Bad Request')))

proxylist.txt

Reproduction Steps

import sys
import time
import psycopg2
import newspaper
from PyQt5.QtWidgets import QApplication, QMainWindow, QPushButton
from threading import Thread
import requests


class NewsScraperGUI(QMainWindow):
    """Main window for a threaded news scraper.

    Discovers article listings with ``newspaper``, fetches each article
    through a rotating HTTP proxy using ``requests``, and persists the
    results into a local PostgreSQL ``news`` table.
    """

    def __init__(self):
        super().__init__()
        self.initUI()
        self.scraping = False  # Flag checked by worker threads to stop early
        # Fix: initialize so stopScraping() called before startScraping()
        # does not raise AttributeError on self.threads.
        self.threads = []

    def initUI(self):
        """Build the two-button (Start/Stop) window layout."""
        self.setWindowTitle('News Scraper')
        self.setGeometry(100, 100, 400, 200)

        self.start_button = QPushButton('Start Scraping', self)
        self.start_button.clicked.connect(self.startScraping)
        self.start_button.setGeometry(50, 50, 150, 30)

        self.stop_button = QPushButton('Stop Scraping', self)
        self.stop_button.clicked.connect(self.stopScraping)
        self.stop_button.setGeometry(200, 50, 150, 30)
        self.stop_button.setEnabled(False)  # Enabled once scraping starts

    def _connect_db(self):
        """Open a new PostgreSQL connection.

        NOTE(review): credentials are hard-coded; consider moving them to
        environment variables or a config file.
        """
        return psycopg2.connect(
            dbname='postgres',
            user='postgres',
            password='12Tilak34##',
            host='localhost'
        )

    def create_table(self):
        """Create the ``news`` table if it does not already exist."""
        conn = self._connect_db()
        try:
            cur = conn.cursor()
            cur.execute('''
                CREATE TABLE IF NOT EXISTS news (
                    id SERIAL PRIMARY KEY,
                    source TEXT,
                    title TEXT,
                    text TEXT,
                    url TEXT
                );
            ''')
            conn.commit()
            cur.close()
        finally:
            # Fix: always release the connection, even if execute() raises.
            conn.close()

    def insert_news(self, source, title, text, url):
        """Insert one scraped article row (parameterized query — SQL-safe)."""
        conn = self._connect_db()
        try:
            cur = conn.cursor()
            cur.execute('''
                INSERT INTO news (source, title, text, url) VALUES (%s, %s, %s, %s);
            ''', (source, title, text, url))
            conn.commit()
            cur.close()
        finally:
            conn.close()

    def read_proxies(self, filename):
        """Return a list of proxy URLs read from *filename*, one per line.

        Lines without a scheme (bare ``ip:port``) are normalized to
        ``http://ip:port`` — requests/urllib3 need an explicit scheme, and
        a missing one is a common cause of the reported
        ``Tunnel connection failed: 400 Bad Request`` errors.
        """
        try:
            with open(filename, 'r') as file:
                proxies = []
                for line in file:
                    proxy = line.strip()
                    if not proxy:
                        continue
                    if '://' not in proxy:
                        proxy = 'http://' + proxy
                    proxies.append(proxy)
                return proxies
        except OSError as e:  # Fix: catch only file errors, not everything
            print(f"Error reading proxies from file: {e}")
            return []

    def scrape_news(self, source_url, num_articles, proxies_filename):
        """Worker: scrape up to *num_articles* articles from *source_url*.

        Runs in a background thread; honors ``self.scraping`` so the Stop
        button can interrupt it between articles.
        """
        proxies = self.read_proxies(proxies_filename)

        try:
            source = newspaper.build(source_url, memoize_articles=False)
            source.download()
            source.parse()

            articles_scraped = 0
            for article in source.articles:
                # Fix: honor the stop flag so stopScraping() actually stops us.
                if not self.scraping or articles_scraped >= num_articles:
                    break

                # Round-robin proxy rotation: take from the front, push back.
                proxy = proxies.pop(0) if proxies else None
                if proxy:
                    proxies.append(proxy)

                article_url = article.url

                try:
                    if proxy:
                        # Same proxy is used for plain HTTP and HTTPS CONNECT
                        # tunnelling. NOTE(review): many free proxies reject
                        # CONNECT to port 443, which also produces
                        # "Tunnel connection failed: 400" — verify the list
                        # supports HTTPS.
                        response = requests.get(
                            article_url,
                            proxies={"http": proxy, "https": proxy},
                            timeout=10,
                        )

                        if response.status_code == 200:
                            # Fix: parse the article from the HTML we just
                            # fetched — articles from source.articles are not
                            # downloaded yet, so .title/.text were empty
                            # strings in the original code.
                            article.download(input_html=response.text)
                            article.parse()
                            self.insert_news(source_url, article.title,
                                             article.text, article_url)
                            print(f"Proxy: {proxy} - Success: {article_url}")
                        else:
                            print(f"Proxy: {proxy} - Failed: {article_url}")

                    else:
                        print("No proxy available for request.")

                except Exception as e:
                    # Best-effort: log the failure and move on to the next
                    # article/proxy rather than killing the worker thread.
                    print(f"Proxy: {proxy} - Error: {e}")

                articles_scraped += 1

        except Exception as e:
            print(f"Error scraping from {source_url}: {e}")

    def startScraping(self):
        """Start one background scraper thread per configured website."""
        self.scraping = True
        self.start_button.setEnabled(False)
        self.stop_button.setEnabled(True)

        self.create_table()
        proxies_filename = "proxylist.txt"

        # Websites and the number of articles to scrape from each.
        websites = [
            {'url': 'http://ft.com', 'num_articles': 5},
            {'url': 'http://nytimes.com', 'num_articles': 5},
            {'url': 'http://www.bloomberg.com/economics', 'num_articles': 5},
            {'url': 'http://economictimes.indiatimes.com', 'num_articles': 5},
        ]

        self.threads = []
        for site in websites:
            thread = Thread(
                target=self.scrape_news,
                args=(site['url'], site['num_articles'], proxies_filename),
                daemon=True,  # Fix: don't block interpreter exit on hung workers
            )
            self.threads.append(thread)
            thread.start()

    def stopScraping(self):
        """Signal workers to stop and wait briefly for them to finish."""
        self.scraping = False
        self.start_button.setEnabled(True)
        self.stop_button.setEnabled(False)

        for thread in self.threads:
            # Fix: bounded join — an unbounded join here would freeze the
            # GUI event loop if a worker is stuck in a network call.
            thread.join(timeout=15)


def main():
    """Application entry point: build the Qt app, show the window, run."""
    qt_app = QApplication(sys.argv)
    scraper_window = NewsScraperGUI()
    scraper_window.show()
    sys.exit(qt_app.exec_())


if __name__ == '__main__':
    main()

System Information

$ python -m requests.help
C:\Python\Lib\site-packages\requests\__init__.py:102: RequestsDependencyWarning: urllib3 (1.26.15) or chardet (5.2.0)/charset_normalizer (2.0.12) doesn't match a supported version!
  warnings.warn("urllib3 ({}) or chardet ({})/charset_normalizer ({}) doesn't match a supported "
C:\Python\Lib\site-packages\requests\help.py:25: DeprecationWarning: 'urllib3.contrib.pyopenssl' module is deprecated and will be removed in a future release of urllib3 2.x. Read more in this issue: https://github.com/urllib3/urllib3/issues/2680
  from urllib3.contrib import pyopenssl
{
  "chardet": {
    "version": "5.2.0"
  },
  "charset_normalizer": {
    "version": "2.0.12"
  },
  "cryptography": {
    "version": "41.0.1"
  },
  "idna": {
    "version": "3.4"
  },
  "implementation": {
    "name": "CPython",
    "version": "3.11.3"
  },
  "platform": {
    "release": "10",
    "system": "Windows"
  },
  "pyOpenSSL": {
    "openssl_version": "30100010",
    "version": "23.2.0"
  },
  "requests": {
    "version": "2.26.0"
  },
  "system_ssl": {
    "version": "1010114f"
  },
  "urllib3": {
    "version": "1.26.15"
  },
  "using_charset_normalizer": false,
  "using_pyopenssl": true
}

tilakpatel22 avatar Nov 26 '23 06:11 tilakpatel22