crawl4ai
crawl4ai copied to clipboard
Issue with the LLMExtractionStrategy
I've given the instruction as "Extract everything, but leave off the links."
from pydantic import BaseModel
from typing import List
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import AsyncWebCrawler
import asyncio
class KnowledgeGraph(BaseModel):
    # Pydantic model with two list-of-dict fields.
    # NOTE(review): nothing else in the visible snippet references this
    # model; in particular it is not handed to the extraction strategy.

    # Entity records; the dict layout is not constrained here.
    entities: List[dict]
    # Relationship records; the dict layout is not constrained here.
    relationships: List[dict]
# LLM-backed extraction strategy, configured only with a provider string and
# a free-form instruction.
# NOTE(review): no schema argument is supplied, so KnowledgeGraph plays no
# part in this configuration. Presumably the strategy then falls back to its
# default output format -- confirm against the crawl4ai documentation.
strategy = LLMExtractionStrategy(
    provider="ollama/llama3.1",
    instruction="Extract everything, but leave off the links.",
)
async def main():
    """Crawl the NBC News business page once and print the extracted content."""
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Single crawl of the target page using the module-level `strategy`.
        crawl_result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            bypass_cache=True,
            simulate_user=True,
            extraction_strategy=strategy,
        )
        # Print whatever the extraction strategy produced for this page.
        print(crawl_result.extracted_content)


# Script entry point: run the coroutine to completion.
asyncio.run(main())
But the output contains almost nothing except links. Am I missing something? @unclecode
[
{
"index": 0,
"tags": [
"header"
],
"content": [
"www.nbcnews.com/news/world/australia-police-arrest-13-seize-23-tons-cocaine-boat-rcna182393"
],
"error": false
},
{
"index": 1,
"tags": [
"see-all"
],
"content": [
"[See All](https://www.nbcnews.com/latest-stories)"
],
"error": false
},
{
"index": 2,
"tags": [
"about"
],
"content": [
"About (https://www.nbcnews.com/information/nbc-news-info/about-nbc-news-digital-n1232178)"
],
"error": false
},
{
"index": 3,
"tags": [
"contact"
],
"content": [
"Contact (https://www.nbcnews.com/information/nbc-news-info/contact-us-n1232521)"
],
"error": false
},
{
"index": 4,
"tags": [
"help"
],
"content": [
"Help (https://nbcnews.zendesk.com/hc/en-us)"
],
"error": false
},
{
"index": 5,
"tags": [
"careers"
],
"content": [
"Careers (https://www.nbcunicareers.com/)"
],
"error": false
},
{
"index": 6,
"tags": [
"ad-choices"
],
"content": [
"Ad Choices (https://www.nbcuniversal.com/privacy/cookies#accordionheader2)"
],
"error": false
},
{
"index": 7,
"tags": [
"privacy-policy"
],
"content": [
"Privacy Policy (https://www.nbcuniversal.com/privacy?intake=NBC_News)"
],
"error": false
},
{
"index": 8,
"tags": [
"your-privacy-choices"
],
"content": [
"Your Privacy Choices (https://www.nbcuniversal.com/privacy/notrtoo/?intake=NBC_News)"
],
"error": false
},
{
"index": 9,
"tags": [
"ca-notice"
],
"content": [
"CA Notice (https://www.nbcuniversal.com/privacy/california-consumer-privacy-act?intake=NBC_News)"
],
"error": false
},
{
"index": 10,
"tags": [
"terms-of-service"
],
"content": [
"Terms of Service (Updated JULY 7, 2023) (https://www.nbcuniversal.com/terms)"
],
"error": false
},
{
"index": 11,
"tags": [
"sitemap"
],
"content": [
"NBC News Sitemap (https://www.nbcnews.com/archive)"
],
"error": false
},
{
"index": 12,
"tags": [
"closed-captioning"
],
"content": [
"Closed Captioning (https://www.nbcnews.com/information/nbc-news-info/closed-captioning-n1307063)"
],
"error": false
},
{
"index": 13,
"tags": [
"advertise"
],
"content": [
"Advertise (https://together.nbcuni.com/advertise/?utm_source=nbc_news&utm_medium=referral&utm_campaign=property_ad_pages)"
],
"error": false
},
{
"index": 14,
"tags": [
"select-shopping"
],
"content": [
"Select Shopping (https://www.nbcnews.com/select)"
],
"error": false
},
{
"index": 15,
"tags": [
"select-personal-finance"
],
"content": [
"Select Personal Finance (https://www.cnbc.com/select/)"
],
"error": false
},
{
"index": 16,
"tags": [
"copyright"
],
"content": [
"© 2024 NBCUniversal Media, LLC [NBC News Logo](https://www.nbcnews.com)[MSNBC Logo](https://www.msnbc.com)[Today Logo](https://www.today.com)"
],
"error": false
},
{
"index": 17,
"tags": [
"tracking"
],
"content": [
"data:image/s3,"s3://crabby-images/ae163/ae163925eb850e062b256ab7bf54f7fd6ccf5d21" alt="""
],
"error": false
},
{
"index": 18,
"tags": [
"article-header"
],
"content": [
"Australia Police Arrest 13, Seize 23 Tons of Cocaine on Boat"
],
"error": false
}
]