BrowserGym icon indicating copy to clipboard operation
BrowserGym copied to clipboard

Problem in click action/PDF interaction

Open DsDastgheib opened this issue 8 months ago • 1 comment

It seems the click action does not work when the agent wants to click on "View PDF" for an arXiv article. Here is some sample code (parse_content_to_elements and find_matching_anchor are borrowed from here):

import re
import browsergym.core  # register the openended task as a gym environment
from browsergym.utils.obs import flatten_axtree_to_str
from dataclasses import dataclass, field

def parse_content_to_elements(content: str):
    """Build a mapping from anchor IDs to element descriptions.

    The content is split on newlines, and each line is stripped; blank lines
    are ignored. A line of the form ``[<digits>]<text>`` starts a new element
    whose anchor is the digit string. Every following non-anchor line is
    appended to that element's description, joined with single spaces. Lines
    that appear before the first anchor are discarded. If an anchor occurs
    more than once, the last description wins.
    """
    anchor_pattern = re.compile(r'\[(\d+)\](.*)')

    # Each record is (anchor, [description fragments]) in order of appearance.
    records = []
    for raw_line in content.split('\n'):
        text = raw_line.strip()
        if not text:
            continue

        found = anchor_pattern.match(text)
        if found is not None:
            records.append((found.group(1), [found.group(2).strip()]))
        elif records:
            # Continuation line: belongs to the most recently opened element.
            records[-1][1].append(text)

    return {anchor: ' '.join(fragments) for anchor, fragments in records}

def find_matching_anchor(content: str, selector: str):
    """Find the anchor ID whose description contains the given selector.

    Matching is a case-insensitive substring test against each element
    description returned by ``parse_content_to_elements``; the first match in
    iteration order is returned.

    Args:
        content: Observation text containing ``[<digits>] description`` lines.
        selector: Text to look for in a description, e.g. ``"link 'View PDF',"``.

    Returns:
        The anchor ID (a digit string) of the first matching element, or
        ``None`` when nothing matches or the selector is empty or
        whitespace-only.
    """
    needle = selector.lower().strip()

    # An empty needle is a substring of every description, which would make
    # the first element "match"; treat it as no match instead.
    if not needle:
        return None

    elements = parse_content_to_elements(content)
    for anchor, description in elements.items():
        if needle in description.lower().strip():
            return anchor

    return None


if __name__ == '__main__':
    # Reproduction script: open an arXiv abstract page, locate the "View PDF"
    # link in the flattened accessibility tree, click it, and print the URL
    # after each step.

    # NOTE(review): `gym` is not imported anywhere in the visible snippet;
    # presumably `import gymnasium as gym` is required -- confirm.
    env = gym.make(
        "browsergym/openended",
        task_kwargs={"start_url": "https://www.google.com/"},  # starting URL
        wait_for_user_message=False,  # wait for a user message after each agent message sent to the chat
    )

    try:
        # run the environment <> agent steps
        obs, info = env.reset()

        action0 = 'goto("https://arxiv.org/abs/1706.03762")'
        obs, reward, terminated, truncated, info = env.step(action0)
        print(obs["url"])

        action1 = "noop(2000)"
        obs, reward, terminated, truncated, info = env.step(action1)
        print(obs["url"])

        # NOTE(review): an empty dict is passed as extra_properties while
        # with_clickable / filter_visible_only are enabled; presumably the
        # observation's own element properties were intended -- confirm.
        extra_element_properties = {}
        axtree_text = flatten_axtree_to_str(
            obs["axtree_object"],
            extra_properties=extra_element_properties,
            with_clickable=True,
            skip_generic=True,
            filter_visible_only=True,
        )
        select = find_matching_anchor(axtree_text, "link 'View PDF',")

        # Without this guard a missing link would be formatted into the
        # action string as click("None", "left").
        if select is None:
            raise RuntimeError("Could not find a 'View PDF' link in the accessibility tree")

        action2 = f'click("{select}", "left")'
        print(action2)
        obs, reward, terminated, truncated, info = env.step(action2)
        print(obs["url"])
    finally:
        # release the environment, even if one of the steps above raised
        env.close()

The output is as follows:

https://arxiv.org/abs/1706.03762
https://arxiv.org/abs/1706.03762
https://arxiv.org/abs/1706.03762

We can see that after the click action the URL doesn't change, whereas we expect to be redirected to https://arxiv.org/pdf/1706.03762. I've tested this on a couple of arXiv articles and it did not work for any of them.

DsDastgheib avatar Feb 16 '25 10:02 DsDastgheib