firecrawl icon indicating copy to clipboard operation
firecrawl copied to clipboard

[Feat] Add ability/option to transform relative to absolute urls in page

Open oliviermills opened this issue 10 months ago • 1 comments

When scraping, and especially when crawling, provide the ability to convert all relative URLs to absolute URLs (for further processing or link extraction).

E.g. [The PDF file](/assets/file.pdf) => [The PDF file](https://site.com/assets/file.pdf)

Sample solution as a Markdown post-processor hook:

import re
from urllib.parse import urljoin

def convert_relative_urls(text, base_url):
    """Rewrite relative Markdown link targets in *text* as absolute URLs.

    Every inline link target of the form ``](target)`` that is not already
    an absolute ``http://`` or ``https://`` URL is resolved against
    *base_url* with ``urllib.parse.urljoin``, which takes care of
    root-relative paths and ``../`` segments.

    Args:
        text: Markdown source to process.
        base_url: URL the links are relative to. It is always treated as a
            directory, whether or not it ends with a slash.

    Returns:
        A copy of *text* with the matched link targets replaced.
    """
    # Skip only targets that really are absolute http(s) URLs. A bare
    # ``(?!http)`` lookahead would also skip relative paths that merely
    # begin with those letters, such as ``httpdocs/a.pdf``.
    regex = r'\]\((?!https?://)([^)]+)\)'

    # Use exactly one trailing slash: the base's last path segment is kept
    # as a directory, and a base that already ends in '/' does not leak a
    # doubled slash into the result.
    directory_url = base_url.rstrip('/') + '/'

    def replace_func(match):
        # urljoin resolves the captured target against the directory URL.
        full_url = urljoin(directory_url, match.group(1))
        return f"]({full_url})"

    return re.sub(regex, replace_func, text)

# Example usage: one root-relative link and one parent-relative ('../') link
base_url = "https://site.com/subdir/"
markdown_text = (
    "[The PDF file](/assets/file.pdf), "
    "[other file](../page/thing.pdf)"
)
converted_text = convert_relative_urls(markdown_text, base_url)

Oops... noticed we are in TypeScript 😝:

/**
 * Rewrite relative Markdown link targets in `text` as absolute URLs.
 *
 * Every inline link target of the form `](target)` that is not already an
 * absolute `http://` or `https://` URL is resolved against `baseUrl` with the
 * `URL` constructor, which handles root-relative paths and `../` segments.
 *
 * @param text    Markdown source to process.
 * @param baseUrl Absolute URL the relative targets are resolved against.
 * @returns A copy of `text` with the matched link targets replaced.
 * @throws {TypeError} Propagated from `new URL(...)` when a target cannot be
 *   resolved against `baseUrl` (for example, when `baseUrl` is not a valid
 *   absolute URL).
 */
function convertRelativeUrls(text: string, baseUrl: string): string {
  // Skip only targets that really are absolute http(s) URLs. A bare
  // `(?!http)` lookahead would also skip relative paths that merely begin
  // with those letters, such as `httpdocs/a.pdf`.
  const regex = /\]\((?!https?:\/\/)([^)]+)\)/g;

  // Resolve the captured target against the base URL and rebuild the
  // closing part of the Markdown link.
  const replaceFunc = (match: string, group1: string): string => {
    const fullUrl = new URL(group1, baseUrl).toString();
    return `](${fullUrl})`;
  };

  return text.replace(regex, replaceFunc);
}

// Example usage: one root-relative link and one parent-relative ('../') link
const baseUrl = "https://site.com/subdir/";
const markdownText =
  "[The PDF file](/assets/file.pdf), [other file](../page/thing.pdf)";
const convertedText = convertRelativeUrls(markdownText, baseUrl);

oliviermills avatar Apr 16 '24 18:04 oliviermills

#4 In the works.

calebpeffer avatar Apr 16 '24 19:04 calebpeffer