scrapy-playwright
scrapy-playwright copied to clipboard
Using scrapy-playwright, I can't log in to ti.com.
This is pure Playwright code, and it logs in to ti.com successfully:
import asyncio

from playwright_stealth import StealthConfig
from playwright_stealth import stealth_async


async def playwright_ti_jiaocha():
    """Log in to ti.com with plain (non-Scrapy) Playwright.

    Launches a headed Chrome, applies playwright-stealth, completes the
    two-step username/password login form, clicks "Log in", then idles so
    the browser window stays open for manual inspection.
    """
    from playwright.async_api import async_playwright

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,
            executable_path="/usr/bin/google-chrome-stable",
            chromium_sandbox=False,
        )
        context = await browser.new_context(
            locale="zh-CN",
            ignore_https_errors=True,
        )
        page = await context.new_page()
        await stealth_async(
            page,
            StealthConfig(
                chrome_load_times=False,
                languages=["zh-Hans", "zh"],
                navigator_languages=False,
            ),
        )
        # BUG FIX: the original init script referenced an undefined JS
        # variable `opts`, so it raised a ReferenceError and never patched
        # navigator.languages. Inline the intended language list instead.
        await page.add_init_script(
            """
            Object.defineProperty(Object.getPrototypeOf(navigator), 'languages', {
                get: () => ['zh-Hans', 'zh']
            })
            """
        )
        await page.goto("https://ti.com/")
        # Check whether we are already logged in.
        await page.locator(':text("Login / Register"), :text("登录/注册")').click()
        await page.wait_for_load_state("domcontentloaded")
        # NOTE(review): plaintext credentials in source — load from the
        # environment instead.
        account_info = {
            "username": "[email protected]",
            "password": "Aa2238117",
        }
        username = account_info["username"]
        password = account_info["password"]
        await page.click('input[name="username"]')
        await page.fill('input[name="username"]', username)
        # Two-step login: submit the username first via "Next".
        await page.locator(':text("Next")').click()
        await page.wait_for_timeout(2000)
        await page.click('input[name="password"]')
        await page.fill('input[name="password"]', password)
        await page.wait_for_timeout(10000)
        async with page.expect_navigation():
            await page.locator("ti-button:has-text('Log in')").click()
        # Keep the browser open indefinitely for manual inspection.
        # (The original snippet used `asyncio` here without importing it.)
        while True:
            await asyncio.sleep(10)
But with Scrapy it can't log in. It opens the login page and fills in the correct account info, but after clicking "Log in" it returns to the login page again — even when using Chrome or Firefox. This is the code:
from playwright_stealth import StealthConfig
from playwright_stealth import stealth_async
from scrapy.http import HtmlResponse
class TiJiaoCha(scrapy.Spider):
    """Spider that reproduces the ti.com login flow via scrapy-playwright.

    It requests a throwaway page first (Baidu) just to obtain a Playwright
    page object, then drives the entire interactive TI login inside
    ``check_login``.
    """

    name = "ti_jiaocha_spyder"
    allowed_domains = [
        "ti.com",
        "www.ti.com",
        "login.ti.com",
        # NOTE(review): Scrapy's OffsiteMiddleware does not support
        # wildcard entries; "*.ti.com" has no effect here.
        "*.ti.com",
        "www.tij.co.jp",
        "www.ti.com.cn",
    ]
    headers = {}
    proxy: str = None  # optional "scheme://user:pass@host:port" proxy URL

    def start_requests(self):
        """Yield the bootstrap request that carries all the Playwright meta."""
        # Local imports keep this snippet self-contained; the original code
        # called these names without any visible import (NameError).
        from urllib.parse import urlunparse
        from urllib.request import _parse_proxy

        self.headers = {
            "User-Agent": None,
            "Connection": "close",
        }
        # BUG FIX: the original read the bare names `proxy`, `logger`, and
        # `account_info`, none of which are defined in this scope.
        proxy = self.proxy
        self.logger.info("proxy: %s", proxy)
        # NOTE(review): plaintext credentials in source — load from the
        # environment or spider settings instead.
        account_info = {
            "username": "[email protected]",
            "password": "Aa2238117",
        }
        url = "https://www.baidu.com/"
        proxy_url = proxy_type = user = password = hostport = None
        if proxy:
            proxy_type, user, password, hostport = _parse_proxy(proxy)
            proxy_url = urlunparse((proxy_type or "", hostport, "", "", "", ""))
        yield scrapy.Request(
            url=url,
            dont_filter=True,
            callback=self.check_login,
            headers=self.headers,
            meta={
                "account_info": account_info,
                "playwright": True,
                "playwright_page_goto_kwargs": {},
                # Hand the page object to the callback so we can keep
                # driving the browser there.
                "playwright_include_page": True,
                "playwright_context_kwargs": {
                    "locale": "zh-CN",
                    "ignore_https_errors": True,
                    "proxy": {
                        "server": proxy_url,
                        "username": user or "",
                        "password": password or "",
                    }
                    if proxy
                    # NOTE(review): this fallback still routes all traffic
                    # through localhost:8080 — confirm that proxy exists,
                    # or it may be the reason the login never completes.
                    else {
                        "server": "localhost:8080",
                        "username": "",
                        "password": "",
                    },
                },
            },
        )

    async def check_login(self, response: HtmlResponse):
        """Drive the interactive TI login on the Playwright page."""
        import asyncio  # local import: not guaranteed at module level in this snippet

        page: Page = response.meta["playwright_page"]
        await stealth_async(
            page,
            StealthConfig(
                languages=["zh-Hans", "zh"],
                navigator_languages=False,
                chrome_load_times=False,
            ),
        )
        # BUG FIX: the original init script referenced an undefined JS
        # variable `opts`, so it raised a ReferenceError and never patched
        # navigator.languages. Inline the intended language list instead.
        await page.add_init_script(
            """
            Object.defineProperty(Object.getPrototypeOf(navigator), 'languages', {
                get: () => ['zh-Hans', 'zh']
            })
            """
        )
        await page.goto("https://ti.com/")
        # Check whether we are already logged in.
        await page.locator(':text("Login / Register"), :text("登录/注册")').click()
        await page.wait_for_load_state("domcontentloaded")
        # Reuse the credentials that start_requests put into the meta
        # instead of re-declaring them here.
        account_info = response.meta["account_info"]
        username = account_info["username"]
        password = account_info["password"]
        await page.click('input[name="username"]')
        await page.fill('input[name="username"]', username)
        # Two-step login: submit the username first via "Next".
        await page.locator(':text("Next")').click()
        await page.wait_for_timeout(2000)
        await page.click('input[name="password"]')
        await page.fill('input[name="password"]', password)
        await page.wait_for_timeout(10000)
        async with page.expect_navigation():
            await page.locator("ti-button:has-text('Log in')").click()
        # Keep the browser open for manual inspection. NOTE(review): while
        # this loop runs, the request never completes and the page is never
        # released — only use it while debugging.
        while True:
            await asyncio.sleep(10)
import scrapy
from billiard import Pool, Process
from scrapy import signals
from scrapy.crawler import CrawlerProcess, CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.reactor import install_reactor
from twisted.internet import asyncioreactor
class UrlCrawlerScript(Process):
    """Run a spider in a dedicated process with the asyncio Twisted reactor.

    The constructor builds the full scrapy-playwright settings object; the
    ``run`` method (the Process entry point) installs the asyncio reactor,
    starts the crawl, and blocks until it finishes.
    """

    def __init__(self, spider, accounts=None, proxy_provider="net_not"):
        Process.__init__(self)
        settings = get_project_settings()
        settings["TELNETCONSOLE_ENABLED"] = False
        settings["PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"] = 90000
        settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"] = None
        # Route both schemes through the scrapy-playwright download handler.
        settings["DOWNLOAD_HANDLERS"] = {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        }
        settings["CONCURRENT_REQUESTS"] = 40
        settings["ROBOTSTXT_OBEY"] = False
        settings["PLAYWRIGHT_BROWSER_TYPE"] = "firefox"
        settings["PLAYWRIGHT_LAUNCH_OPTIONS"] = {
            "headless": False,
        }
        settings["DELTAFETCH_ENABLED"] = False
        # Tracking/analytics endpoints that could be blocked to speed up
        # page loads. NOTE(review): currently unused by
        # should_abort_request — wire it in or remove it.
        block_list = [
            "https://www.ti.com/akam",
            "https://www.ti.com.cn/akam",
            "https://cm.g.doubleclick.net",
            "https://try.abtasty.com",
            "https://connect.facebook.net",
            "https://www.googletagmanager.com",
            "https://img.en25.com",
            "https://collect.tealiumiq.com",
            "https://cdn.decibelinsight.net",
            "https://s.adroll.com",
            "https://analytics.supplyframe.com",
            "https://www.tij.co.jp/akam",
            "https://visitor-service.tealiumiq.com",
            "https://try.abtasty.com",
            "https://metrics.brightcove.com",
            "https://t.supplyframe.com",
            # "https://www.gstatic.cn/recaptcha",
            "https://maps.googleapis.com/maps/api/mapsjs",
            "https://js-agent.newrelic.com",
        ]

        def should_abort_request(req):
            # BUG FIX: the original returned False unconditionally before
            # this check, making the image/font filtering below unreachable
            # dead code (which also referenced an undefined `logger`).
            if req.resource_type in ("image", "font"):
                return True
            return False

        settings["PLAYWRIGHT_ABORT_REQUEST"] = should_abort_request
        settings["TWISTED_REACTOR"] = (
            "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
        )
        self.accounts = accounts
        self.proxy_provider = proxy_provider
        self.crawler = CrawlerRunner(settings)
        self.spider = spider

    def run(self):
        """Process entry point: install the asyncio reactor and crawl."""
        # The reactor must be installed inside the child process, before
        # anything else touches twisted.internet.reactor.
        install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
        is_asyncio_reactor_installed = (
            scrapy.utils.reactor.is_asyncio_reactor_installed()
        )
        print(f"Is asyncio reactor installed: {is_asyncio_reactor_installed}")
        from twisted.internet import reactor

        params = {}
        self.crawler.crawl(self.spider, params=params)
        # Stop the reactor once every crawl job has finished.
        d = self.crawler.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()  # blocks here until all crawling jobs are finished
# Launch the crawl in a child process and wait for it to finish.
# NOTE(review): `accounts` and `proxy_provider` are not defined anywhere in
# this snippet — presumably they come from surrounding code; confirm they
# exist before running, or this raises NameError.
crawler = UrlCrawlerScript(
    TiJiaoCha, accounts=accounts, proxy_provider=proxy_provider
)
crawler.start()
crawler.join()
This is a support question, not a bug report or feature request. Please provide a minimal, reproducible example (without proxies, additional scripts, or processes).
This is minimal code. Thank you very much.
import scrapy
from playwright.async_api import Page
from scrapy.http import HtmlResponse
import asyncio
class TiJiaoCha(scrapy.Spider):
    """Minimal reproduction spider: log in to ti.com via scrapy-playwright.

    Requests a throwaway page (Baidu) only to obtain a Playwright page
    object, then drives the whole TI login interaction in ``check_login``.
    """

    name = "ti_jiaocha_spyder"
    allowed_domains = [
        "ti.com",
        "www.ti.com",
        "login.ti.com",
        # NOTE(review): Scrapy's OffsiteMiddleware does not support
        # wildcard entries; "*.ti.com" has no effect here.
        "*.ti.com",
        "www.tij.co.jp",
        "www.ti.com.cn",
        "baidu.com",
    ]
    headers = {}

    def start_requests(self):
        """Yield the single bootstrap request carrying the Playwright meta."""
        yield scrapy.Request(
            url="http://baidu.com",
            dont_filter=True,
            callback=self.check_login,
            headers=self.headers,
            meta={
                "playwright": True,
                "playwright_page_goto_kwargs": {},
                # Hand the page object to the callback so we can keep
                # driving the browser there.
                "playwright_include_page": True,
                "playwright_context_kwargs": {
                    "locale": "zh-CN",
                    "ignore_https_errors": True,
                },
            },
        )

    async def check_login(self, response: HtmlResponse):
        """Drive the interactive TI login flow on the Playwright page."""
        page: Page = response.meta["playwright_page"]
        try:
            await page.goto("https://ti.com/")
            # Check whether we are already logged in.
            await page.locator(':text("Login / Register"), :text("登录/注册")').click()
            await page.wait_for_load_state("domcontentloaded")
            # NOTE(review): plaintext credentials in source — load from the
            # environment instead.
            account_info = {
                "username": "[email protected]",
                "password": "Aa2238117",
            }
            username = account_info["username"]
            password = account_info["password"]
            await page.click('input[name="username"]')
            await page.fill('input[name="username"]', username)
            # Two-step login: submit the username first via "Next".
            await page.locator(':text("Next")').click()
            await page.wait_for_timeout(2000)
            await page.click('input[name="password"]')
            await page.fill('input[name="password"]', password)
            await page.wait_for_timeout(10000)
            async with page.expect_navigation():
                await page.locator("ti-button:has-text('Log in')").click()
            # BUG FIX: the original ended with `while True: await
            # asyncio.sleep(10)`, so this callback never returned, the
            # request never completed, and the page was never released.
            # Pause briefly for manual inspection instead, then finish.
            await asyncio.sleep(10)
        finally:
            # Always release the Playwright page so the crawl can terminate.
            await page.close()