firecrawl
firecrawl copied to clipboard
[Feat] Add ability/option to transform relative to absolute urls in page
When scraping, and mostly crawling, provide the ability to have all relative urls changed to absolute urls (for further processing or link extraction).
E.g. [The PDF file](/assets/file.pdf)
=> [The PDF file](https://site.com/assets/file.pdf)
Sample solution: a Markdown post-processor
hook:
import re
from urllib.parse import urljoin
def convert_relative_urls(text: str, base_url: str) -> str:
    """Rewrite relative Markdown link targets in *text* as absolute URLs.

    Every inline link target of the form ``](target)`` is resolved against
    *base_url* with ``urljoin``.  *base_url* is treated as a directory: a
    trailing ``/`` is appended when it is missing.  Targets that already
    carry a URL scheme (``https:``, ``mailto:``, ...) are left untouched.

    Args:
        text: Markdown source to post-process.
        base_url: URL that relative targets are resolved against.

    Returns:
        *text* with relative link targets replaced by absolute URLs.
    """
    # Only add the directory slash when it is missing: a doubled slash would
    # leak into fragment-only and query-only links ("#top" -> ".../dir//#top").
    base_dir = base_url if base_url.endswith("/") else base_url + "/"

    def replace_func(match):
        target = match.group(1)
        # Test for a real scheme instead of the bare "http" prefix, so that
        # relative paths such as "httpdocs/file.pdf" are still converted.
        if re.match(r"[A-Za-z][A-Za-z0-9+.-]*:", target):
            return match.group(0)
        # urljoin resolves "../" segments and root-relative paths.
        return f"]({urljoin(base_dir, target)})"

    # Visit every inline link target; the callback decides what to rewrite.
    return re.sub(r"\]\(([^)]+)\)", replace_func, text)
# Example usage: resolve a root-relative and a parent-relative link.
base_url = "https://site.com/subdir/"
markdown_text = "[The PDF file](/assets/file.pdf), [other file](../page/thing.pdf)"
converted_text = convert_relative_urls(text=markdown_text, base_url=base_url)
Oops... I noticed we are in TypeScript 😝:
/**
 * Rewrites relative Markdown link targets in `text` as absolute URLs.
 *
 * Every inline link target of the form `](target)` is resolved against
 * `baseUrl` with the `URL` constructor. Targets that already carry a URL
 * scheme (`https:`, `mailto:`, ...) are left untouched, as are targets the
 * `URL` constructor rejects.
 *
 * @param text    Markdown source to post-process.
 * @param baseUrl URL that relative targets are resolved against.
 * @returns `text` with relative link targets replaced by absolute URLs.
 */
function convertRelativeUrls(text: string, baseUrl: string): string {
  const linkPattern = /\]\(([^)]+)\)/g;
  // A real scheme test instead of the bare "http" prefix, so that relative
  // paths such as "httpdocs/file.pdf" are still converted.
  const schemePattern = /^[a-zA-Z][a-zA-Z\d+.-]*:/;
  // Splits `target "Title"` so the title is not percent-encoded into the URL.
  const titledPattern = /^(\S+)(\s+(?:"[^"]*"|'[^']*'))$/;

  return text.replace(linkPattern, (match: string, inner: string): string => {
    const titled = titledPattern.exec(inner);
    const target = titled ? titled[1] : inner;
    const title = titled ? titled[2] : "";

    if (schemePattern.test(target)) {
      return match;
    }

    try {
      // URL resolves "../" segments and root-relative paths against the base.
      return `](${new URL(target, baseUrl).toString()}${title})`;
    } catch (_error) {
      // One unparsable link must not abort the whole document; keep it as is.
      return match;
    }
  });
}
// Example usage: resolve a root-relative and a parent-relative link.
const baseUrl: string = "https://site.com/subdir/";
const markdownText: string = "[The PDF file](/assets/file.pdf), [other file](../page/thing.pdf)";
const convertedText: string = convertRelativeUrls(markdownText, baseUrl);
#4 In the works.