scrapy-playwright
scrapy-playwright copied to clipboard
JavaScript not triggering in ASPX web page (works in regular Playwright).
After clicking two radio buttons, the page should post back and display a form. Unfortunately, this does not happen under scrapy-playwright, although the same steps work in plain Playwright. I can't understand why.
This is the broken code:
import scrapy
class EmpenhosSpider(scrapy.Spider):
    """Drive the ConsultaDespesaAno.aspx form through scrapy-playwright.

    The spider requests the page with ``playwright_include_page`` enabled,
    selects the 'Empenhado' and 'Ordem Bancária' options, searches for a
    creditor by CNPJ and opens the creditor link from the result grid.
    """

    name = "empenhos.py"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    }

    def start_requests(self):
        """Yield the single start request, asking for the Playwright page object."""
        url = "https://www.fazenda.sp.gov.br/SigeoLei131/Paginas/ConsultaDespesaAno.aspx?orgao="
        yield scrapy.Request(
            url,
            meta={
                "playwright": True,
                "playwright_include_page": True,
            },
            # Without an errback the page stays open if the request fails
            # before ``parse`` is ever called.
            errback=self.errback_close_page,
        )

    async def parse(self, response):
        """Interact with the loaded page and always close it afterwards.

        Returns:
            An empty dict (no data is extracted yet).
        """
        page = response.meta["playwright_page"]
        try:
            await page.get_by_label('Empenhado').click()
            await page.get_by_label('Ordem Bancária').click()
            # NOTE(review): wait_for_load_state resolves immediately when the
            # current document has already reached 'load'. If the post-back
            # triggered by the click has not started navigating yet, this wait
            # may not cover it -- confirm, and consider waiting for the form
            # field itself instead.
            await page.wait_for_load_state(state='load')
            await self.enter_cnpj(page)
            await page.wait_for_timeout(10000)
        finally:
            # Close the page even when a step above raises, so a failed
            # interaction does not leak the browser page.
            await page.close()
        return {}

    async def errback_close_page(self, failure):
        """Close the Playwright page attached to a failed request, if any."""
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()

    async def enter_cnpj(self, page, cnpj='07797967000195'):
        """Fill in the CNPJ field, run the search and open the creditor link.

        Args:
            page: The Playwright page showing the search form.
            cnpj: CNPJ digits to search for; defaults to the value that was
                previously hard-coded.
        """
        input_cnpj = page.locator('#ctl00_ContentPlaceHolder1_txtCPF')
        await input_cnpj.fill(cnpj)  # TODO: run for every known CNPJ
        await page.locator('#ctl00_ContentPlaceHolder1_btnPesquisar').click()
        credor = page.locator('#ctl00_ContentPlaceHolder1_gdvCredor td a')
        await credor.click()
        await page.wait_for_load_state(state="load")
And this is the pure Playwright working code:
import asyncio
import json
from playwright.async_api import async_playwright
# Sequence number used in screenshot file names (results/tela<count>.png);
# incremented after each screenshot is taken.
count = 1
# Accumulates one dict per scraped document row; main() dumps it to
# results/empenhos.json at the end of the run.
empenhos = []
async def main():
    """Run the whole scrape in headless Chromium and dump the rows to JSON.

    Opens the ConsultaDespesaAno.aspx page, selects the 'Empenhado' and
    'Ordem Bancária' options, searches the creditor, walks the result tables
    and finally writes the accumulated ``empenhos`` list to
    ``results/empenhos.json``.
    """
    # Local import keeps this function self-contained without touching the
    # file's import block.
    import os

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://www.fazenda.sp.gov.br/SigeoLei131/Paginas/ConsultaDespesaAno.aspx?orgao=")
        print(await page.title())
        await page.get_by_label('Empenhado').click()
        await page.get_by_label('Ordem Bancária').click()
        await page.wait_for_load_state(state="load")
        await entra_cnpj(page)
        await processa_orgaos(page)
        await browser.close()

    # The output directory is not created anywhere else in this function; it
    # may be missing when no screenshot was written, which would make open()
    # fail with FileNotFoundError.
    os.makedirs('results', exist_ok=True)
    # Write all collected rows to the JSON file, overwriting previous output.
    with open('results/empenhos.json', 'w', encoding='utf-8') as json_file:
        json.dump(empenhos, json_file, ensure_ascii=False)
async def entra_cnpj(page, cnpj='07797967000195'):
    """Fill in the CNPJ field, run the search and open the creditor link.

    Args:
        page: The Playwright page showing the search form.
        cnpj: CNPJ digits to search for. Defaults to the value that was
            previously hard-coded, so existing callers behave the same.
    """
    input_cnpj = page.locator('#ctl00_ContentPlaceHolder1_txtCPF')
    await input_cnpj.fill(cnpj)  # TODO: run for every known CNPJ
    await page.locator('#ctl00_ContentPlaceHolder1_btnPesquisar').click()
    credor = page.locator('#ctl00_ContentPlaceHolder1_gdvCredor td a')
    await credor.click()
    # NOTE(review): this resolves immediately if the current document is
    # already in the 'load' state when called -- confirm it really waits for
    # the navigation triggered by the click above.
    await page.wait_for_load_state(state="load")
async def processa_orgaos(page):
    """Click each link in the agency table and process its managing units.

    The number of links is read once before the loop; each link is then
    addressed by position, clicked, and handed to
    ``processa_unidades_gestoras`` once the page reports the 'load' state.
    """
    orgao_links = page.locator('#ctl00_ContentPlaceHolder1_gdvListaOrgao').locator('td a')
    total = await orgao_links.count()
    for posicao in range(total):
        await orgao_links.nth(posicao).click()
        await page.wait_for_load_state(state="load")
        await processa_unidades_gestoras(page)
async def processa_unidades_gestoras(page):
    """Open each managing-unit link, screenshot the page and collect its rows.

    For every link in the managing-unit table: click it, wait for 'load',
    save a numbered screenshot (advancing the module-level ``count``), let
    ``processa_empenhos`` extract the rows, then navigate back.
    """
    global count
    ug_links = page.locator('#ctl00_ContentPlaceHolder1_gdvListaUg').locator('td a')
    for ug_link in await ug_links.all():
        await ug_link.click()
        await page.wait_for_load_state(state="load")
        await page.screenshot(path=f'results/tela{count}.png')
        count = count + 1
        await processa_empenhos(page)
        # Going back is required to keep processing the remaining agencies
        # and managing units.
        await page.go_back(wait_until='load')
async def processa_empenhos(page):
    """Extract the document rows of the current page into ``empenhos``.

    Reads four header labels (year, creditor, agency, managing unit) once,
    then builds one dict per ``tr.linha_grid_alt`` row of the documents
    table by combining those header values with the text of the row's first
    seven cells. The new dicts are appended to the module-level ``empenhos``
    list only after every row has been read.
    """
    # Header values shared by every row; read in the same order as the keys.
    cabecalho = {
        'Exercicio': await page.locator('#ctl00_ContentPlaceHolder1_lblAno').inner_text(),
        'Credor': await page.locator('#ctl00_ContentPlaceHolder1_lblCgcCpfNomeCredor').inner_text(),
        'Orgao': await page.locator('#ctl00_ContentPlaceHolder1_lblCodNomeOrgao').inner_text(),
        'UnidadeGestora': await page.locator('#ctl00_ContentPlaceHolder1_lblCodNomeUgResponsavel').inner_text(),
    }

    # Output key -> positional selector of the cell that provides its value.
    campos_por_coluna = (
        ('Data', "nth=0"),
        ('NumeroDoc', "nth=1"),
        ('PagamentoReferente', "nth=2"),
        ('Descricao', "nth=3"),
        ('NotaDeEmpenhoOrigem', "nth=4"),
        ('FonteRecurso', "nth=5"),
        ('ValorDocumento', "nth=6"),
    )

    linhas_tabela = page.locator('#ctl00_ContentPlaceHolder1_gdvDocumento').locator('tr.linha_grid_alt')
    registros = []
    for linha_atual in await linhas_tabela.all():
        celulas = linha_atual.locator('td')
        # Start from the header values, then add the per-row cell texts.
        registro = dict(cabecalho)
        for campo, seletor in campos_por_coluna:
            registro[campo] = await celulas.locator(seletor).inner_text()
        registros.append(registro)

    empenhos.extend(registros)
# Script entry point: run the main() coroutine to completion.
asyncio.run(main())