onyx
onyx copied to clipboard
Confluence Connector doesn't support Short URLs for Single Page indexing
Currently, the confluence connector is unable to parse pageIds for Short URLs, in the format /display/<space>/<title>
This kind of URL requires an extra call to the Confluence API in order to resolve the pageId.
Here is a possible fix (incomplete, as it also requires changes to the confluence_client definitions):
def _extract_confluence_keys_from_datacenter_url(wiki_url: str, confluence_client: Confluence) -> tuple[str, str, str]:
"""
Extracts wiki base, space key, and page ID from a Confluence URL.
Only attempts to resolve page ID if content is provided after the space name.
"""
parsed_url = urlparse(wiki_url)
wiki_base = f"{parsed_url.scheme}://{parsed_url.netloc}"
# Try to extract space key and page ID from URL
space_match = re.search(r'/display/([^/]+)', parsed_url.path)
page_id_match = re.search(r'pageId=(\d+)', parsed_url.query)
space_key = space_match.group(1) if space_match else ""
page_id = page_id_match.group(1) if page_id_match else ""
# Check if there's content after the space name
path_parts = parsed_url.path.split('/')
space_index = path_parts.index(space_key) if space_key in path_parts else -1
# Only try to resolve page ID if there's content after the space name
if space_index != -1 and not page_id:
try:
# Extract the page title from the URL
title = path_parts[-1]
# Use the Confluence API to get the page ID
page = confluence_client.get_page_by_title(space=space_key, title=title)
page_id = page['id']
except Exception as e:
print(f"Error retrieving page ID: {e}")
return wiki_base, space_key, page_id