scrapy-playwright icon indicating copy to clipboard operation
scrapy-playwright copied to clipboard

JavaScript not triggering on an ASPX web page (works in plain Playwright).

Open rasert opened this issue 1 month ago • 7 comments

After clicking two radio buttons, the page should post back and display a form. Under scrapy-playwright this does not happen, although the same steps work in plain Playwright. I can't understand why.

This is the broken code:

import scrapy

class EmpenhosSpider(scrapy.Spider):
    """Spider that drives the ConsultaDespesaAno.aspx form through Playwright.

    The start request asks scrapy-playwright to hand the live ``Page`` object
    to the callback (``playwright_include_page``), so the callback owns the
    page and is responsible for closing it on every code path.
    """

    name = "empenhos.py"
    custom_settings = {
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "DOWNLOAD_HANDLERS": {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
    }

    def start_requests(self):
        """Yield the single Playwright-backed request for the query page."""
        url = "https://www.fazenda.sp.gov.br/SigeoLei131/Paginas/ConsultaDespesaAno.aspx?orgao="
        yield scrapy.Request(
            url,
            meta={
                "playwright": True,
                "playwright_include_page": True,
            },
            # Without an errback the page would stay open if the download fails.
            errback=self.errback_close_page,
        )

    async def parse(self, response):
        """Select both radio buttons, run the CNPJ search, then close the page.

        Returns an empty dict, exactly as before; no data is extracted yet.
        """
        page = response.meta["playwright_page"]
        try:
            await page.get_by_label('Empenhado').click()
            await page.get_by_label('Ordem Bancária').click()

            await page.wait_for_load_state(state='load')

            await self.enter_cnpj(page)

            await page.wait_for_timeout(10000)
        finally:
            # Close the page even when a click or locator times out; otherwise
            # the page is leaked for the rest of the crawl.
            await page.close()
        return {}

    async def errback_close_page(self, failure):
        """Close the Playwright page attached to a failed request, if any."""
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()

    async def enter_cnpj(self, page):
        """Fill the CNPJ field, submit the search and open the creditor link."""
        input_cnpj = page.locator('#ctl00_ContentPlaceHolder1_txtCPF')
        await input_cnpj.fill('07797967000195')  # TODO: run for every known CNPJ
        await page.locator('#ctl00_ContentPlaceHolder1_btnPesquisar').click()
        credor = page.locator('#ctl00_ContentPlaceHolder1_gdvCredor td a')
        await credor.click()
        await page.wait_for_load_state(state="load")

And this is the pure Playwright working code:

import asyncio
import json
from playwright.async_api import async_playwright

# Running index used to number the screenshot files (results/tela<count>.png).
count = 1
# Accumulates every scraped record; main() dumps it to results/empenhos.json.
empenhos = []

async def main():
    """Run the whole scrape in headless Chromium and dump the results to JSON.

    Opens the query page, selects the two radio buttons, searches for the
    CNPJ, walks the listed links via ``processa_orgaos`` and finally writes
    the module-level ``empenhos`` list to ``results/empenhos.json``.
    """
    import os  # local import so this function carries its own dependency

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://www.fazenda.sp.gov.br/SigeoLei131/Paginas/ConsultaDespesaAno.aspx?orgao=")
        print(await page.title())

        await page.get_by_label('Empenhado').click()
        await page.get_by_label('Ordem Bancária').click()

        await page.wait_for_load_state(state="load")

        await entra_cnpj(page)
        await processa_orgaos(page)

        await browser.close()

        # open(..., 'w') does not create missing parent directories, so make
        # sure the output folder exists before writing.
        os.makedirs('results', exist_ok=True)

        # Write all collected records to the JSON file again.
        with open('results/empenhos.json', 'w', encoding='utf-8') as json_file:
            json.dump(empenhos, json_file, ensure_ascii=False)

async def entra_cnpj(page):
    """Type the CNPJ, submit the search and follow the creditor link.

    Waits for the page's ``load`` state after the creditor link is clicked.
    """
    # TODO: run for every known CNPJ
    await page.locator('#ctl00_ContentPlaceHolder1_txtCPF').fill('07797967000195')
    await page.locator('#ctl00_ContentPlaceHolder1_btnPesquisar').click()
    await page.locator('#ctl00_ContentPlaceHolder1_gdvCredor td a').click()
    await page.wait_for_load_state(state="load")

async def processa_orgaos(page):
    """Click each link in the gdvListaOrgao table and process what it opens.

    After every click the page's ``load`` state is awaited, then
    ``processa_unidades_gestoras`` is run on the same page.
    """
    grid = page.locator('#ctl00_ContentPlaceHolder1_gdvListaOrgao')
    for orgao_link in await grid.locator('td a').all():
        await orgao_link.click()
        await page.wait_for_load_state(state="load")
        await processa_unidades_gestoras(page)

async def processa_unidades_gestoras(page):
    """Click each link in the gdvListaUg table, screenshot and scrape it.

    For every link: click, wait for ``load``, save a numbered screenshot
    (advancing the module-level ``count``), run ``processa_empenhos`` and
    navigate back.
    """
    global count
    grid = page.locator('#ctl00_ContentPlaceHolder1_gdvListaUg')
    ug_links = await grid.locator('td a').all()
    for ug_link in ug_links:
        await ug_link.click()
        await page.wait_for_load_state(state="load")
        await page.screenshot(path=f'results/tela{count}.png')
        count = count + 1
        await processa_empenhos(page)
        # Go back so the remaining links in the listing can still be processed.
        await page.go_back(wait_until='load')

async def processa_empenhos(page):
    """Scrape the gdvDocumento rows of the current page into ``empenhos``.

    Reads four header labels once, then builds one dict per
    ``tr.linha_grid_alt`` row from its first seven ``td`` cells. The rows are
    collected locally and appended to the module-level ``empenhos`` list only
    after every row has been read.
    """

    async def read_text(selector):
        # Inner text of the element matched by `selector` on the page.
        return await page.locator(selector).inner_text()

    # Values shared by every row on this page, in output-key order.
    header = {
        'Exercicio': await read_text('#ctl00_ContentPlaceHolder1_lblAno'),
        'Credor': await read_text('#ctl00_ContentPlaceHolder1_lblCgcCpfNomeCredor'),
        'Orgao': await read_text('#ctl00_ContentPlaceHolder1_lblCodNomeOrgao'),
        'UnidadeGestora': await read_text('#ctl00_ContentPlaceHolder1_lblCodNomeUgResponsavel'),
    }

    # Output key for each table column, indexed by column position.
    column_keys = (
        'Data',
        'NumeroDoc',
        'PagamentoReferente',
        'Descricao',
        'NotaDeEmpenhoOrigem',
        'FonteRecurso',
        'ValorDocumento',
    )

    documents = page.locator('#ctl00_ContentPlaceHolder1_gdvDocumento')
    records = []
    for row in await documents.locator('tr.linha_grid_alt').all():
        cells = row.locator('td')
        record = dict(header)
        for position, key in enumerate(column_keys):
            record[key] = await cells.locator(f"nth={position}").inner_text()
        records.append(record)

    empenhos.extend(records)

# Script entry point: run the main() coroutine to completion.
asyncio.run(main())

rasert avatar May 09 '24 06:05 rasert