GenomicDataCommons icon indicating copy to clipboard operation
GenomicDataCommons copied to clipboard

size parameter in `manifest` not working

Open LiNk-NY opened this issue 1 year ago • 3 comments

Hi Sean (@seandavi), I can't quite put my finger on it, but recently the `size` parameter in the request has stopped working. Here is a reprex:

suppressPackageStartupMessages({
    library(GenomicDataCommons)
})
files() |>
    filter(~ cases.project.project_id == "TCGA-COAD" &
               data_type == "Copy Number Segment") |>
    manifest(size = 1)
#> # A tibble: 976 × 5
#>    id                                   filename               md5    size state
#>  * <chr>                                <chr>                  <chr> <dbl> <chr>
#>  1 e137a35c-282e-45e5-8ea5-e101cc653279 HELVE_p_TCGA_b139_154… 5f4f… 44635 rele…
#>  2 97e9235b-2050-4190-969b-439e4c0c19df VENUE_p_TCGAb28_SNP_N… a82d… 73454 rele…
#>  3 5db10469-5914-4516-bdf7-f1a61de95f01 VENUE_p_TCGAb28_SNP_N… 24dc… 50568 rele…
#>  4 0ee34fc9-3bde-4cad-9b80-5432a38c5b3c BAIZE_p_TCGA_b138_SNP… dd41… 30525 rele…
#>  5 7a604078-50e4-425d-8b25-e1fc8d06b1bb VENUE_p_TCGAb28_SNP_N… 1db5… 57800 rele…
#>  6 36b468a0-5a44-46e8-8df0-5b68b7b111fe SONGS_p_TCGAb36_SNP_N… 84a6… 58002 rele…
#>  7 ab13a04d-22a5-43b0-b9d6-4d6326e8e6cc SONGS_p_TCGAb36_SNP_N… a6d8… 28838 rele…
#>  8 174da332-f213-4a66-b5ec-add480af21dc GRIPS_p_TCGA_b116_SNP… 9ecc… 45921 rele…
#>  9 08c8091d-71ee-4d67-8ab5-9476d6b34d2a HELVE_p_TCGA_b139_154… 5401… 36633 rele…
#> 10 7ac16098-8206-4d0f-9b84-cd53d1756728 KEYED_p_TCGAb41_SNP_N… b3ff… 51794 rele…
#> # … with 966 more rows

Created on 2022-07-13 by the reprex package (v2.0.1)

This returns a tibble with 976 rows instead of 1. Any ideas?

I'm seeing the effects here: https://bioconductor.org/checkResults/devel/bioc-LATEST/TCGAutils/nebbiolo2-checksrc.html and here https://bioconductor.org/checkResults/devel/bioc-LATEST/GenomicDataCommons/nebbiolo2-checksrc.html

LiNk-NY avatar Jul 13 '22 18:07 LiNk-NY

Update: I've contacted the GDC folks as this seems to be an issue with the API responses:

curl --remote-name --remote-header-name 'https://api.gdc.cancer.gov/files?filters=%7B%22op%22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%22%3A%22%3D%22%2C%22content%22%3A%7B%22field%22%3A%22experimental_strategy%22%2C%22value%22%3A%5B%22RNA-Seq%22%5D%7D%7D%2C%7B%22op%22%3A%22%3D%22%2C%22content%22%3A%7B%22field%22%3A%22cases.project.project_id%22%2C%22value%22%3A%5B%22TCGA-KIRC%22%5D%7D%7D%2C%7B%22op%22%3A%22%3D%22%2C%22content%22%3A%7B%22field%22%3A%22cases.samples.sample_type%22%2C%22value%22%3A%5B%22Solid+Tissue+Normal%22%5D%7D%7D%5D%7D&size=3&return_type=manifest'
tmp <- readr::read_tsv("gdc_manifest.2022-07-22.txt", col_types = "cccdc")
dim(tmp)
#' [1] 648   5

LiNk-NY avatar Jul 22 '22 20:07 LiNk-NY

Update 2: Requests via curl return well-formed JSON that can be parsed:

curl 'https://api.gdc.cancer.gov/files?filters=%7B%22op%22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%22%3A%22%3D%22%2C%22content%22%3A%7B%22field%22%3A%22experimental_strategy%22%2C%22value%22%3A%5B%22RNA-Seq%22%5D%7D%7D%2C%7B%22op%22%3A%22%3D%22%2C%22content%22%3A%7B%22field%22%3A%22cases.project.project_id%22%2C%22value%22%3A%5B%22TCGA-KIRC%22%5D%7D%7D%2C%7B%22op%22%3A%22%3D%22%2C%22content%22%3A%7B%22field%22%3A%22cases.samples.sample_type%22%2C%22value%22%3A%5B%22Solid+Tissue+Normal%22%5D%7D%7D%5D%7D&size=3'
{"data": {"hits": [{"id": "fc69f03c-9e3a-4983-8d96-a59eba9b7748", "data_format": "TSV", "access": "controlled", "file_name": "TCGA-KIRC.3d77d6b5-6626-4cc9-84fe-2c79a0b572bf.arriba.rna_fusion.tsv", "submitter_id": "51afb64a-7100-45ff-95f7-c35453ba23ed", "data_category": "Structural Variation", "acl": ["phs000178"], "type": "structural_variation", "file_size": 4077, "created_datetime": "2021-12-13T21:35:29.534183-06:00", "md5sum": "6b9aa4d85f738485e791810e385a34fb", "updated_datetime": "2022-01-19T12:49:49.189936-06:00", "file_id": "fc69f03c-9e3a-4983-8d96-a59eba9b7748", "data_type": "Transcript Fusion", "state": "released", "experimental_strategy": "RNA-Seq", "version": "1", "data_release": "32.0 - 34.0"}, {"id": "45b9d64c-8705-4850-b136-c6eb36e945f9", "data_format": "TSV", "access": "controlled", "file_name": "TCGA-KIRC.add60c10-6d0b-45ca-b799-a7d938c1e65f.star_fusion.rna_fusion.tsv", "submitter_id": "bc2d7a02-5f81-43c6-8f92-64b506cf05be", "data_category": "Structural Variation", "acl": ["phs000178"], "type": "structural_variation", "file_size": 234, "created_datetime": "2021-12-13T21:35:26.060934-06:00", "md5sum": "b1afeba0fb7f76577dd6942a72137537", "updated_datetime": "2022-01-19T12:49:49.189936-06:00", "file_id": "45b9d64c-8705-4850-b136-c6eb36e945f9", "data_type": "Transcript Fusion", "state": "released", "experimental_strategy": "RNA-Seq", "version": "1", "data_release": "32.0 - 34.0"}, {"id": "14f44050-a0d2-461b-a29d-1c273d5c05bc", "proportion_reads_mapped": 0.9705836986111223, "data_format": "BAM", "total_reads": 134200148, "access": "controlled", "file_name": "29c9d4b8-b879-49d8-82e6-fd5ac79c2e6c.rna_seq.genomic.gdc_realn.bam", "proportion_base_mismatch": 0.002389675, "proportion_reads_duplicated": 0, "submitter_id": "cf5d418d-b8e2-4030-8d39-f1f271259e58", "data_category": "Sequencing Reads", "acl": ["phs000178"], "type": "aligned_reads", "platform": "Illumina", "created_datetime": "2021-12-13T18:05:50.844590-06:00", "file_size": 5987704359, 
"average_base_quality": 38, "md5sum": "a77d7ff58322b72bc27be872d3b406d8", "updated_datetime": "2022-01-19T12:49:49.189936-06:00", "pairs_on_diff_chr": 257196, "file_id": "14f44050-a0d2-461b-a29d-1c273d5c05bc", "data_type": "Aligned Reads", "average_insert_size": 1313, "average_read_length": 50, "state": "released", "experimental_strategy": "RNA-Seq", "version": "2", "data_release": "32.0 - 34.0"}], "pagination": {"count": 3, "total": 648, "size": 3, "from": 0, "sort": "", "page": 1, "pages": 216}}, "warnings": {}}

Unfortunately, a similar request via httr returns XML instead of JSON.

content(POST("https://api.gdc.cancer.gov/files?filters=%7B%22op%22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%22%3A%22%3D%22%2C%22content%22%3A%7B%22field%22%3A%22experimental_strategy%22%2C%22value%22%3A%5B%22RNA-Seq%22%5D%7D%7D%2C%7B%22op%22%3A%22%3D%22%2C%22content%22%3A%7B%22field%22%3A%22cases.project.project_id%22%2C%22value%22%3A%5B%22TCGA-KIRC%22%5D%7D%7D%2C%7B%22op%22%3A%22%3D%22%2C%22content%22%3A%7B%22field%22%3A%22cases.samples.sample_type%22%2C%22value%22%3A%5B%22Solid+Tissue+Normal%22%5D%7D%7D%5D%7D&size=3", encode = "json", add_headers(`Content-Type` = "application/json")), as = "text")
No encoding supplied: defaulting to UTF-8.
[1] "<?xml version=\"1.0\" encoding=\"UTF-8\" ?><response><data><hits><item><id>fc69f03c-9e3a-4983-8d96-a59eba9b7748</id><data_format>TSV</data_format><access>controlled</access><file_name>TCGA-KIRC.3d77d6b5-6626-4cc9-84fe-2c79a0b572bf.arriba.rna_fusion.tsv</file_name><submitter_id>51afb64a-7100-45ff-95f7-c35453ba23ed</submitter_id><data_category>Structural Variation</data_category><acl><item>phs000178</item></acl><type>structural_variation</type><file_size>4077</file_size><created_datetime>2021-12-13T21:35:29.534183-06:00</created_datetime><md5sum>6b9aa4d85f738485e791810e385a34fb</md5sum><updated_datetime>2022-01-19T12:49:49.189936-06:00</updated_datetime><file_id>fc69f03c-9e3a-4983-8d96-a59eba9b7748</file_id><data_type>Transcript Fusion</data_type><state>released</state><experimental_strategy>RNA-Seq</experimental_strategy><version>1</version><data_release>32.0 - 34.0</data_release></item><item><id>45b9d64c-8705-4850-b136-c6eb36e945f9</id><data_format>TSV</data_format><access>controlled</access><file_name>TCGA-KIRC.add60c10-6d0b-45ca-b799-a7d938c1e65f.star_fusion.rna_fusion.tsv</file_name><submitter_id>bc2d7a02-5f81-43c6-8f92-64b506cf05be</submitter_id><data_category>Structural Variation</data_category><acl><item>phs000178</item></acl><type>structural_variation</type><file_size>234</file_size><created_datetime>2021-12-13T21:35:26.060934-06:00</created_datetime><md5sum>b1afeba0fb7f76577dd6942a72137537</md5sum><updated_datetime>2022-01-19T12:49:49.189936-06:00</updated_datetime><file_id>45b9d64c-8705-4850-b136-c6eb36e945f9</file_id><data_type>Transcript Fusion</data_type><state>released</state><experimental_strategy>RNA-Seq</experimental_strategy><version>1</version><data_release>32.0 - 
34.0</data_release></item><item><id>14f44050-a0d2-461b-a29d-1c273d5c05bc</id><proportion_reads_mapped>0.9705836986111223</proportion_reads_mapped><data_format>BAM</data_format><total_reads>134200148</total_reads><access>controlled</access><file_name>29c9d4b8-b879-49d8-82e6-fd5ac79c2e6c.rna_seq.genomic.gdc_realn.bam</file_name><proportion_base_mismatch>0.002389675</proportion_base_mismatch><proportion_reads_duplicated>0</proportion_reads_duplicated><submitter_id>cf5d418d-b8e2-4030-8d39-f1f271259e58</submitter_id><data_category>Sequencing Reads</data_category><acl><item>phs000178</item></acl><type>aligned_reads</type><platform>Illumina</platform><created_datetime>2021-12-13T18:05:50.844590-06:00</created_datetime><file_size>5987704359</file_size><average_base_quality>38</average_base_quality><md5sum>a77d7ff58322b72bc27be872d3b406d8</md5sum><updated_datetime>2022-01-19T12:49:49.189936-06:00</updated_datetime><pairs_on_diff_chr>257196</pairs_on_diff_chr><file_id>14f44050-a0d2-461b-a29d-1c273d5c05bc</file_id><data_type>Aligned Reads</data_type><average_insert_size>1313</average_insert_size><average_read_length>50</average_read_length><state>released</state><experimental_strategy>RNA-Seq</experimental_strategy><version>2</version><data_release>32.0 - 34.0</data_release></item></hits><pagination><count>3</count><total>648</total><size>3</size><from>0</from><sort></sort><page>1</page><pages>216</pages></pagination></data><warnings></warnings></response>"

LiNk-NY avatar Aug 02 '22 15:08 LiNk-NY

FWIW, updates to the API should show up at https://docs.gdc.cancer.gov/API/Release_Notes/API_Release_Notes/

LiNk-NY avatar Oct 21 '22 15:10 LiNk-NY