scrapy-playwright icon indicating copy to clipboard operation
scrapy-playwright copied to clipboard

Can't log in to ti.com when using scrapy-playwright

Open yswtrue opened this issue 1 year ago • 2 comments

This is pure Playwright code, and it logs in to ti.com without problems.

from playwright_stealth import StealthConfig
from playwright_stealth import stealth_async

async def playwright_ti_jiaocha():
    """Log in to ti.com using plain (stealth-patched) Playwright.

    Launches a headed Chrome, applies playwright-stealth, overrides
    ``navigator.languages``, performs TI's two-step username/password
    login, then keeps the browser open indefinitely for inspection.
    """
    import asyncio  # local import: the snippet header never imported asyncio

    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,
            executable_path="/usr/bin/google-chrome-stable",
            chromium_sandbox=False,
        )
        context = await browser.new_context(
            locale="zh-CN",
            ignore_https_errors=True,
        )
        page = await context.new_page()
        # navigator_languages=False: stop stealth from setting languages
        # itself; we install our own override below.
        await stealth_async(
            page,
            StealthConfig(
                chrome_load_times=False,
                languages=["zh-Hans", "zh"],
                navigator_languages=False,
            ),
        )
        # BUG FIX: the original script referenced an undefined ``opts``
        # object, so every read of navigator.languages raised a
        # ReferenceError inside the page (easily detected by the site).
        # Return the intended language list directly.
        await page.add_init_script(
            """
        Object.defineProperty(Object.getPrototypeOf(navigator), 'languages', {
        get: () => ['zh-Hans', 'zh']
    })
    """
        )
        await page.goto("https://ti.com/")
        # Check whether we are already logged in (the link is only shown
        # when logged out).
        await page.locator(':text("Login / Register"), :text("登录/注册")').click()
        await page.wait_for_load_state("domcontentloaded")
        # NOTE(review): hard-coded credentials — move to config/env before reuse.
        account_info = {
            "username": "[email protected]",
            "password": "Aa2238117",
        }
        username = account_info["username"]
        password = account_info["password"]
        await page.click('input[name="username"]')
        await page.fill('input[name="username"]', username)
        # Two-step login: submit the username first.
        await page.locator(':text("Next")').click()
        await page.wait_for_timeout(2000)
        await page.click('input[name="password"]')
        await page.fill('input[name="password"]', password)
        await page.wait_for_timeout(10000)
        async with page.expect_navigation():
            await page.locator("ti-button:has-text('Log in')").click()
        # Keep the browser open so the result can be inspected manually.
        while True:
            await asyncio.sleep(10)

But with Scrapy it can't log in. It opens the login page and fills in the correct account info, but after clicking "Log in" it returns to the login page again — with both Chrome and Firefox. This is the code:

from playwright_stealth import StealthConfig
from playwright_stealth import stealth_async
from scrapy.http import HtmlResponse

class TiJiaoCha(scrapy.Spider):
    """Spider that attempts to log in to ti.com through scrapy-playwright.

    It first requests a throwaway page (baidu.com) just to obtain a
    Playwright page object, then drives the real login flow manually in
    :meth:`check_login`.
    """

    name = "ti_jiaocha_spyder"
    # NOTE(review): "ti.com" already allows every subdomain in Scrapy's
    # offsite filtering, and "*.ti.com" is not valid (wildcards are not
    # supported) — entries kept to preserve the original behaviour.
    allowed_domains = [
        "ti.com",
        "www.ti.com",
        "login.ti.com",
        "*.ti.com",
        "www.tij.co.jp",
        "www.ti.com.cn",
    ]
    headers = {}
    proxy: str = None

    def start_requests(self):
        # Local imports: these helpers were referenced but never imported
        # in the original snippet (NameError at runtime).
        from urllib.parse import urlunparse
        from urllib.request import _parse_proxy  # same private helper Scrapy uses

        self.headers = {
            "User-Agent": None,
            "Connection": "close",
        }
        # BUG FIX: the original used the bare names ``proxy``/``logger``/
        # ``account_info``, none of which existed at this scope.
        self.logger.info("proxy: %s", self.proxy)
        # NOTE(review): hard-coded credentials — move to settings before reuse.
        account_info = {
            "username": "[email protected]",
            "password": "Aa2238117",
        }
        url = "https://www.baidu.com/"
        proxy_url = proxy_type = user = password = hostport = None
        if self.proxy:
            proxy_type, user, password, hostport = _parse_proxy(self.proxy)
            proxy_url = urlunparse((proxy_type or "", hostport, "", "", "", ""))
        yield scrapy.Request(
            url=url,
            dont_filter=True,
            callback=self.check_login,
            headers=self.headers,
            meta={
                "account_info": account_info,
                "playwright": True,
                "playwright_page_goto_kwargs": {},
                # Hand the live page object to the callback.
                "playwright_include_page": True,
                "playwright_context_kwargs": {
                    "locale": "zh-CN",
                    "ignore_https_errors": True,
                    "proxy": {
                        "server": proxy_url,
                        "username": user or "",
                        "password": password or "",
                    }
                    if self.proxy
                    else {
                        "server": "localhost:8080",
                        "username": "",
                        "password": "",
                    },
                },
            },
        )

    async def check_login(self, response: HtmlResponse):
        """Drive the interactive two-step ti.com login on the Playwright page."""
        import asyncio  # local import: missing from the original snippet

        page = response.meta["playwright_page"]
        await stealth_async(
            page,
            StealthConfig(
                languages=["zh-Hans", "zh"],
                navigator_languages=False,
                chrome_load_times=False,
            ),
        )
        # BUG FIX: the original init script referenced an undefined ``opts``
        # object, so any read of navigator.languages raised a ReferenceError
        # inside the page.  Return the intended language list directly.
        await page.add_init_script(
            """
        Object.defineProperty(Object.getPrototypeOf(navigator), 'languages', {
        get: () => ['zh-Hans', 'zh']
    })
    """
        )
        await page.goto("https://ti.com/")
        # Check whether we are already logged in.
        await page.locator(':text("Login / Register"), :text("登录/注册")').click()
        await page.wait_for_load_state("domcontentloaded")
        account_info = {
            "username": "[email protected]",
            "password": "Aa2238117",
        }
        username = account_info["username"]
        password = account_info["password"]
        await page.click('input[name="username"]')
        await page.fill('input[name="username"]', username)
        # Two-step login: submit the username first.
        await page.locator(':text("Next")').click()
        await page.wait_for_timeout(2000)
        await page.click('input[name="password"]')
        await page.fill('input[name="password"]', password)
        await page.wait_for_timeout(10000)
        async with page.expect_navigation():
            await page.locator("ti-button:has-text('Log in')").click()
        # Keep the page (and browser) open for manual inspection.
        while True:
            await asyncio.sleep(10)


import scrapy
from billiard import Pool, Process
from scrapy import signals
from scrapy.crawler import CrawlerProcess, CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.reactor import install_reactor
from twisted.internet import asyncioreactor


class UrlCrawlerScript(Process):
    """Run one spider inside a dedicated child process (``billiard.Process``).

    Settings and the :class:`CrawlerRunner` are built in ``__init__``; the
    reactor work happens in :meth:`run`, which executes in the child
    process, keeping each crawl's reactor isolated.
    """

    def __init__(self, spider, accounts=None, proxy_provider="net_not"):
        Process.__init__(self)
        settings = get_project_settings()
        settings["TELNETCONSOLE_ENABLED"] = False
        settings["PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"] = 90000
        # None -> do not override Playwright's own request headers.
        settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] = None
        settings["DOWNLOAD_HANDLERS"] = {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        }
        settings["CONCURRENT_REQUESTS"] = 40
        settings["ROBOTSTXT_OBEY"] = False
        settings["PLAYWRIGHT_BROWSER_TYPE"] = "firefox"
        settings["PLAYWRIGHT_LAUNCH_OPTIONS"] = {
            "headless": False,
        }
        settings["DELTAFETCH_ENABLED"] = False
        # Tracker/analytics hosts that could be aborted to speed up page
        # loads; currently unused because should_abort_request is disabled.
        # (Duplicate "https://try.abtasty.com" entry from the original removed.)
        block_list = [
            "https://www.ti.com/akam",
            "https://www.ti.com.cn/akam",
            "https://cm.g.doubleclick.net",
            "https://try.abtasty.com",
            "https://connect.facebook.net",
            "https://www.googletagmanager.com",
            "https://img.en25.com",
            "https://collect.tealiumiq.com",
            "https://cdn.decibelinsight.net",
            "https://s.adroll.com",
            "https://analytics.supplyframe.com",
            "https://www.tij.co.jp/akam",
            "https://visitor-service.tealiumiq.com",
            "https://metrics.brightcove.com",
            "https://t.supplyframe.com",
            "https://maps.googleapis.com/maps/api/mapsjs",
            "https://js-agent.newrelic.com",
        ]

        def should_abort_request(req):
            # Request blocking is disabled for now.  The original version
            # contained unreachable image/font filtering code after this
            # ``return`` (which also referenced an undefined ``logger``);
            # that dead code has been removed.
            return False

        settings["PLAYWRIGHT_ABORT_REQUEST"] = should_abort_request

        settings[
            "TWISTED_REACTOR"
        ] = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"

        self.accounts = accounts
        self.proxy_provider = proxy_provider
        self.crawler = CrawlerRunner(settings)
        self.spider = spider

    def run(self):
        """Child-process entry point: install the asyncio reactor and crawl."""
        scrapy.utils.reactor.install_reactor(
            "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
        )
        is_asyncio_reactor_installed = (
            scrapy.utils.reactor.is_asyncio_reactor_installed()
        )
        print(f"Is asyncio reactor installed: {is_asyncio_reactor_installed}")

        # Import after install_reactor so we get the asyncio reactor.
        from twisted.internet import reactor

        params = {}
        self.crawler.crawl(self.spider, params=params)
        # join() returns a Deferred that fires when all crawls finish;
        # stop the reactor then so run() can return.
        d = self.crawler.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()  # blocks here until all crawling jobs are finished
# NOTE(review): module-level driver — ``accounts`` and ``proxy_provider``
# are not defined anywhere in this snippet; they presumably come from the
# surrounding script.  Verify before running.
crawler = UrlCrawlerScript(
    TiJiaoCha, accounts=accounts, proxy_provider=proxy_provider
)
crawler.start()  # billiard.Process.start() -> runs run() in a child process
crawler.join()  # block until the crawl process finishes

yswtrue avatar Aug 11 '22 19:08 yswtrue

This is a support question, not a bug report or feature request. Please provide a minimal, reproducible example (without proxies, additional scripts, or processes).

elacuesta avatar Aug 11 '22 21:08 elacuesta

This is the minimal code. Thank you very much.

import scrapy
from playwright.async_api import Page
from scrapy.http import HtmlResponse
import asyncio
class TiJiaoCha(scrapy.Spider):
    """Minimal reproduction spider: log in to ti.com via scrapy-playwright.

    Requests a throwaway page (baidu.com) just to obtain a Playwright
    page object, then drives the login flow manually in
    :meth:`check_login`.
    """

    name = "ti_jiaocha_spyder"
    # BUG FIX: removed "*.ti.com" — Scrapy's allowed_domains does not
    # support wildcards, so that entry never matched anything; the plain
    # "ti.com" entry already covers every *.ti.com subdomain.
    allowed_domains = [
        "ti.com",
        "www.ti.com",
        "login.ti.com",
        "www.tij.co.jp",
        "www.ti.com.cn",
        "baidu.com",
    ]
    headers = {}

    def start_requests(self):
        yield scrapy.Request(
            url="http://baidu.com",
            dont_filter=True,
            callback=self.check_login,
            headers=self.headers,
            meta={
                "playwright": True,
                "playwright_page_goto_kwargs": {},
                # Hand the live page object to the callback.
                "playwright_include_page": True,
                "playwright_context_kwargs": {
                    "locale": "zh-CN",
                    "ignore_https_errors": True,
                },
            },
        )

    async def check_login(self, response: HtmlResponse):
        """Drive the interactive two-step ti.com login on the raw page."""
        page: Page = response.meta["playwright_page"]
        await page.goto("https://ti.com/")
        # Check whether we are already logged in (the link is only shown
        # when logged out).
        await page.locator(':text("Login / Register"), :text("登录/注册")').click()
        await page.wait_for_load_state("domcontentloaded")
        # NOTE(review): hard-coded credentials — replace before reuse.
        account_info = {
            "username": "[email protected]",
            "password": "Aa2238117",
        }
        username = account_info["username"]
        password = account_info["password"]
        await page.click('input[name="username"]')
        await page.fill('input[name="username"]', username)
        # Two-step login: submit the username first.
        await page.locator(':text("Next")').click()
        await page.wait_for_timeout(2000)
        await page.click('input[name="password"]')
        await page.fill('input[name="password"]', password)
        await page.wait_for_timeout(10000)
        async with page.expect_navigation():
            await page.locator("ti-button:has-text('Log in')").click()
        # Keep the browser open so the result can be inspected manually.
        while True:
            await asyncio.sleep(10)

yswtrue avatar Aug 11 '22 23:08 yswtrue