croissant icon indicating copy to clipboard operation
croissant copied to clipboard

Extended croissant draft for CCF crawls

Open handecelikkanat opened this issue 2 months ago • 3 comments

Here is an extended croissant draft for CCF crawls. Please give feedback.

@benjelloun @wumpus FYI.

What we don't have syntax for at the moment, and the related issues:

  • Lineage: https://github.com/mlcommons/croissant/issues/738
  • Size information for FileSets: We have size information for whole FileSets (i.e. all of the WARC files together), but as far as I understand, FileSet currently has no contentSize property (FileObject does).
  • Parquet schemas for the columnar index: As far as I understand, Parquet schemas are not supported yet. Will they be?

Draft Croissant

EDITS:

  • Corrected version to include build-version (2025-10-16)
  • Updated the Croissant spec version to http://mlcommons.org/croissant/1.1. I noticed some datasets already use it, though the spec is not available at this link yet. (2025-10-16)
  • Added base IRI: "https://data.commoncrawl.org/crawl-data/CC-MAIN-YYYY-WW/" (2025-10-16)
{
  "@context": {
    "@base": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/",
    "@language": "en",
    "@vocab": "https://schema.org/",
    "sc": "https://schema.org/",
    "cr": "http://mlcommons.org/croissant/",
    "rai": "http://mlcommons.org/croissant/RAI/",
    "dct": "http://purl.org/dc/terms/",
    "citeAs": "cr:citeAs",
    "column": "cr:column",
    "conformsTo": "dct:conformsTo",
    "data": {
      "@id": "cr:data",
      "@type": "@json"
    },
    "dataType": {
      "@id": "cr:dataType",
      "@type": "@vocab"
    },
    "examples": {
      "@id": "cr:examples",
      "@type": "@json"
    },
    "extract": "cr:extract",
    "field": "cr:field",
    "fileProperty": "cr:fileProperty",
    "fileObject": "cr:fileObject",
    "fileSet": "cr:fileSet",
    "format": "cr:format",
    "includes": "cr:includes",
    "isLiveDataset": "cr:isLiveDataset",
    "jsonPath": "cr:jsonPath",
    "key": "cr:key",
    "md5": "cr:md5",
    "parentField": "cr:parentField",
    "path": "cr:path",
    "recordSet": "cr:recordSet",
    "references": "cr:references",
    "regex": "cr:regex",
    "repeated": "cr:repeated",
    "replace": "cr:replace",
    "separator": "cr:separator",
    "source": "cr:source",
    "subField": "cr:subField",
    "transform": "cr:transform"
  },
  "@type": "sc:Dataset",
  "conformsTo": "http://mlcommons.org/croissant/1.1",
  "name": "CC-MAIN-2022-05",
  "description": "Common Crawl January 2022 Crawl Archive",
  "license": "https://commoncrawl.org/terms-of-use",
  "url": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/index.html",
  "creator": {
    "@type": "Organization",
    "name": "The Common Crawl Foundation",
    "url": "https://commoncrawl.org/"
  },
  "citeAs": "https://commoncrawl.org/",
  "version": "1.0.0+20251015",
  "datePublished": "2022-01-29T15:24:05Z",
  "temporalCoverage": "2022-01-16T09:31:37Z/2022-01-29T15:24:05Z",
  "distribution": [
    {
      "@type": "cr:FileObject",
      "@id": "warc.paths.gz",
      "contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/warc.paths.gz",
      "encodingFormat": "application/gzip",
      "sha256": "e0131030d08e07d93362feae152f3854e23093f623edd6757bb270a92f91b55b"
    },
    {
      "@type": "cr:FileSet",
      "@id": "warc-paths",
      "containedIn": {
        "@id": "warc.paths.gz"
      },
      "encodingFormat": "application/warc",
      "includes": "*.warc.gz"
    },
    {
      "@type": "cr:FileObject",
      "@id": "wat.paths.gz",
      "contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/wat.paths.gz",
      "encodingFormat": "application/gzip",
      "sha256": "c61ecb0b91767f334621da59a84ecf2b858018e5cb0c3f36789a6ac77cb6ca54"
    },
    {
      "@type": "cr:FileSet",
      "@id": "wat-paths",
      "containedIn": {
        "@id": "wat.paths.gz"
      },
      "encodingFormat": "application/warc",
      "includes": "*.warc.wat.gz"
    },
    {
      "@type": "cr:FileObject",
      "@id": "wet.paths.gz",
      "contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/wet.paths.gz",
      "encodingFormat": "application/gzip",
      "sha256": "36650ea4871ee5ed5da0a5d23ab4d6050dc2d2792c86980d76e1167d8e354a25"
    },
    {
      "@type": "cr:FileSet",
      "@id": "wet-paths",
      "containedIn": {
        "@id": "wet.paths.gz"
      },
      "encodingFormat": "application/warc",
      "includes": "*.warc.wet.gz"
    },
    {
      "@type": "cr:FileObject",
      "@id": "robotstxt.paths.gz",
      "contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/robotstxt.paths.gz",
      "encodingFormat": "application/gzip",
      "sha256": "239c08056c67673e53d1e55d1903669c8f0f5ce3d867ba56357c7f87f5387621"
    },
    {
      "@type": "cr:FileSet",
      "@id": "robotstxt-paths",
      "containedIn": {
        "@id": "robotstxt.paths.gz"
      },
      "encodingFormat": "application/warc",
      "includes": "*.warc.gz"
    },
    {
      "@type": "cr:FileObject",
      "@id": "non200responses.paths.gz",
      "contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/non200responses.paths.gz",
      "encodingFormat": "application/gzip",
      "sha256": "69a973bdc7c9f6ed60f80286fdc881db4265208adc6f0ba05488e8d52f4d9738"
    },
    {
      "@type": "cr:FileSet",
      "@id": "non200responses-paths",
      "containedIn": {
        "@id": "non200responses.paths.gz"
      },
      "encodingFormat": "application/warc",
      "includes": "*.warc.gz"
    },
    {
      "@type": "cr:FileObject",
      "@id": "cc-index.paths.gz",
      "contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/cc-index.paths.gz",
      "encodingFormat": "application/gzip",
      "sha256": "29e5ad77d5be5c2e0c8adc25231174966d82257a78d4e3d9441a10e55c0086e0"
    },
    {
      "@type": "cr:FileSet",
      "@id": "cc-index-paths",
      "containedIn": {
        "@id": "cc-index.paths.gz"
      },
      "encodingFormat": "application/gzip",
      "includes": "*.gz"
    },
    {
      "@type": "cr:FileObject",
      "@id": "cc-index-table.paths.gz",
      "contentUrl": "https://data.commoncrawl.org/crawl-data/CC-MAIN-2022-05/cc-index-table.paths.gz",
      "encodingFormat": "application/gzip",
      "sha256": "e38cdc860dfb831d9f69ef7d16ce5bcdb7e862d57611335ad2b689c9a4cf5436"
    },
    {
      "@type": "cr:FileSet",
      "@id": "cc-index-table-paths",
      "containedIn": {
        "@id": "cc-index-table.paths.gz"
      },
      "encodingFormat": "application/parquet",
      "includes": "*.gz.parquet"
    }
  ]
}

handecelikkanat avatar Oct 15 '25 16:10 handecelikkanat

@benjelloun We are good to add this draft to the MLCroissant examples in the repo, but we will need to update it frequently, as it's still very much evolving. Is this OK?

handecelikkanat avatar Oct 16 '25 08:10 handecelikkanat

Apologies for the very late reply. The dataset looks good to me. I'm considering adding a more explicit representation of the manifest mechanism to the spec, but that can be added later to this dataset. I will post about that in #920.

Adding the example to the Croissant repository is a good idea. It's totally okay if it changes frequently.

benjelloun avatar Nov 21 '25 13:11 benjelloun

Please take a look at https://github.com/mlcommons/croissant/issues/920#issuecomment-3563971887

I'd love to hear if this mechanism makes sense to you, and if you can adopt it in CCF crawls.

benjelloun avatar Nov 21 '25 17:11 benjelloun