crawl4ai icon indicating copy to clipboard operation
crawl4ai copied to clipboard

Issue with the LLMExtractionStrategy

Open stsfaroz opened this issue 2 months ago • 0 comments

I've given the instruction as "Extract everything, but leave off the links."

from pydantic import BaseModel
from typing import List
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import AsyncWebCrawler
import asyncio

# Pydantic model with two list-of-dict fields. It is declared here but is not
# referenced anywhere else in this snippet.
class KnowledgeGraph(BaseModel):
    entities: List[dict]  # list of dicts; the dict contents are not constrained here
    relationships: List[dict]  # list of dicts; the dict contents are not constrained here

# Module-level extraction strategy used by main(). Only a provider string and a
# free-text instruction are supplied; the KnowledgeGraph model is not passed in.
strategy = LLMExtractionStrategy(
    instruction="Extract everything, but leave off the links.",
    provider="ollama/llama3.1",
)

async def main():
    """Crawl the NBC News business page and print the extracted content.

    Opens an ``AsyncWebCrawler`` in verbose mode, runs a single crawl using
    the module-level ``strategy``, and prints ``extracted_content`` from the
    returned result.
    """
    target_url = "https://www.nbcnews.com/business"
    async with AsyncWebCrawler(verbose=True) as crawler:
        crawl_result = await crawler.arun(
            url=target_url,
            bypass_cache=True,
            simulate_user=True,
            extraction_strategy=strategy,
        )
        # Print while the crawler context is still open, as the original did.
        print(crawl_result.extracted_content)

# Script entry point: run the main() coroutine to completion.
asyncio.run(main())

But it returned essentially only links. Am I missing something, @unclecode?

[
    {
        "index": 0,
        "tags": [
            "header"
        ],
        "content": [
            "www.nbcnews.com/news/world/australia-police-arrest-13-seize-23-tons-cocaine-boat-rcna182393"
        ],
        "error": false
    },
    {
        "index": 1,
        "tags": [
            "see-all"
        ],
        "content": [
            "[See All](https://www.nbcnews.com/latest-stories)"
        ],
        "error": false
    },
    {
        "index": 2,
        "tags": [
            "about"
        ],
        "content": [
            "About (https://www.nbcnews.com/information/nbc-news-info/about-nbc-news-digital-n1232178)"
        ],
        "error": false
    },
    {
        "index": 3,
        "tags": [
            "contact"
        ],
        "content": [
            "Contact (https://www.nbcnews.com/information/nbc-news-info/contact-us-n1232521)"
        ],
        "error": false
    },
    {
        "index": 4,
        "tags": [
            "help"
        ],
        "content": [
            "Help (https://nbcnews.zendesk.com/hc/en-us)"
        ],
        "error": false
    },
    {
        "index": 5,
        "tags": [
            "careers"
        ],
        "content": [
            "Careers (https://www.nbcunicareers.com/)"
        ],
        "error": false
    },
    {
        "index": 6,
        "tags": [
            "ad-choices"
        ],
        "content": [
            "Ad Choices (https://www.nbcuniversal.com/privacy/cookies#accordionheader2)"
        ],
        "error": false
    },
    {
        "index": 7,
        "tags": [
            "privacy-policy"
        ],
        "content": [
            "Privacy Policy (https://www.nbcuniversal.com/privacy?intake=NBC_News)"
        ],
        "error": false
    },
    {
        "index": 8,
        "tags": [
            "your-privacy-choices"
        ],
        "content": [
            "Your Privacy Choices (https://www.nbcuniversal.com/privacy/notrtoo/?intake=NBC_News)"
        ],
        "error": false
    },
    {
        "index": 9,
        "tags": [
            "ca-notice"
        ],
        "content": [
            "CA Notice (https://www.nbcuniversal.com/privacy/california-consumer-privacy-act?intake=NBC_News)"      
        ],
        "error": false
    },
    {
        "index": 10,
        "tags": [
            "terms-of-service"
        ],
        "content": [
            "Terms of Service (Updated JULY 7, 2023) (https://www.nbcuniversal.com/terms)"
        ],
        "error": false
    },
    {
        "index": 11,
        "tags": [
            "sitemap"
        ],
        "content": [
            "NBC News Sitemap (https://www.nbcnews.com/archive)"
        ],
        "error": false
    },
    {
        "index": 12,
        "tags": [
            "closed-captioning"
        ],
        "content": [
            "Closed Captioning (https://www.nbcnews.com/information/nbc-news-info/closed-captioning-n1307063)"      
        ],
        "error": false
    },
    {
        "index": 13,
        "tags": [
            "advertise"
        ],
        "content": [
            "Advertise (https://together.nbcuni.com/advertise/?utm_source=nbc_news&utm_medium=referral&utm_campaign=property_ad_pages)"
        ],
        "error": false
    },
    {
        "index": 14,
        "tags": [
            "select-shopping"
        ],
        "content": [
            "Select Shopping (https://www.nbcnews.com/select)"
        ],
        "error": false
    },
    {
        "index": 15,
        "tags": [
            "select-personal-finance"
        ],
        "content": [
            "Select Personal Finance (https://www.cnbc.com/select/)"
        ],
        "error": false
    },
    {
        "index": 16,
        "tags": [
            "copyright"
        ],
        "content": [
            "© 2024 NBCUniversal Media, LLC [NBC News Logo](https://www.nbcnews.com)[MSNBC Logo](https://www.msnbc.com)[Today Logo](https://www.today.com)"
        ],
        "error": false
    },
    {
        "index": 17,
        "tags": [
            "tracking"
        ],
        "content": [
            "![](https://tag.researchnow.com/t/beacon?adn=13&ca=direct&pl=https%3A%2F%2Fwww.nbcnews.com%2Fbusiness&pr=284801&si=NBCNEWS)"
        ],
        "error": false
    },
    {
        "index": 18,
        "tags": [
            "article-header"
        ],
        "content": [
            "Australia Police Arrest 13, Seize 23 Tons of Cocaine on Boat"
        ],
        "error": false
    }
]

stsfaroz avatar Dec 02 '24 18:12 stsfaroz