dataRetrieval
dataRetrieval copied to clipboard
use POST instead of GET for downloading data from the Water Quality Portal
There are a variety of ways that GETs can end up breaking, such as URLs being too long, problems escaping characters, firewalls interpreting queries as attacks and blocking them, or Apache routing getting confused. Many of these problems can be avoided by POSTing a JSON payload instead.
The syntax is pretty reasonable, and is documented in the Swagger docs and also below.
You do need to send the headers and include the mimeType in the URL. Here is an example curl:
curl -X POST --header 'Content-Type: application/json' --header 'Accept: application/zip' -d '{"statecode":["US:55"],"countycode":["US:55:025","US:55:049"],"siteType":["Lake, Reservoir, Impoundment","Stream"],"sampleMedia":["Air","Water"]}' 'https://www.waterqualitydata.us/data/Station/search?mimeType=csv&zip=yes'
{
"analyticalmethod": [
"string"
],
"assemblage": [
"string"
],
"bBox": "string",
"characteristicName": [
"string"
],
"characteristicType": [
"string"
],
"countrycode": [
"string"
],
"countycode": [
"string"
],
"dataProfile": "string",
"huc": [
"string"
],
"lat": "string",
"long": "string",
"mimeType": "string",
"minactivities": "string",
"minresults": "string",
"nldiurl": "string",
"organization": [
"string"
],
"pCode": [
"string"
],
"project": [
"string"
],
"providers": [
"string"
],
"sampleMedia": [
"string"
],
"siteType": [
"string"
],
"siteid": [
"string"
],
"startDateHi": "string",
"startDateLo": "string",
"statecode": [
"string"
],
"subjectTaxonomicName": [
"string"
],
"within": "string",
"zip": "string"
}
See @limnoliver issue:
readWQPdata(siteid = 'COE/ISU_WQX-17630001', characteristicName = 'Temperature, water')
What I've learned.... We are close to being able to do this, but needs special care because it will affect ALL the WQP functions. Here's my "playing_with_post" script:
# Original failing GET URL: ~100 monitoring-site ids joined by url-encoded
# semicolons ("%3B"), far past typical URL-length limits — this is the query
# that triggers the 406 below.
# NOTE(review): the issue formatter had wrapped this literal across two lines
# (splitting "...4995600%3B"); rejoined here so the assignment is valid R.
obs_url_orig <- "https://www.waterqualitydata.us/Result/search?siteid=KENAI_WQX-10000349%3BWIDNR_WQX-10012578%3BQIN_WQX-TM2G%3BWIDNR_WQX-133548%3BWIDNR_WQX-10039277%3BQIN_WQX-TM7-a%3BQUILEUTE_WQX-025%3BCBP_WQX-LE2.3%3BWIDNR_WQX-10039686%3BWIDNR_WQX-10016481%3BCBP_WQX-LE1.2%3BWIDNR_WQX-10038078%3BWIDNR_WQX-10011948%3BCBP_WQX-CB3.3W%3BWIDNR_WQX-10034036%3BTCEQMAIN-17017%3BMNPCA-10-0045-00-202%3BWIDNR_WQX-683125%3BCBP_WQX-PMS29%3B21VASWCB-6APNR001.82%3BWIDNR_WQX-10037203%3B21AWIC-199%3BMNPCA-82-0104-00-201%3BLRBOI_WQX-TDam%3B21AWIC-893%3BTCEQMAIN-11935%3BMNPCA-27-0133-03-202%3BGNLK01_WQX-BTLK%3BOREGONDEQ-11005-ORDEQ%3BCBP_WQX-903A%3BUSGS-01477050%3BTCEQMAIN-17016%3BUSGS-06428500%3BOREGONDEQ-30501-ORDEQ%3BUSGS-08317300%3B21IOWA_WQX-15220006%3B11NPSWRD_WQX-CONG_USGS_11%3BTCEQMAIN-11977%3B21SCSANT-SC-046%3B11NPSWRD_WQX-BICA_MFG_E%3BUSGS-07056000%3BUSGS-06644085%3BMNPCA-70-0120-01-401%3BWIDNR_WQX-213136%3BWIDNR_WQX-583067%3BOREGONDEQ-34755-ORDEQ%3BOREGONDEQ-24447-ORDEQ%3B11NPSWRD_WQX-CONG_USGS_01%3BUSGS-50043000%3B1VTDECWQ-504155%3BUSGS-04122030%3B21KAN001_WQX-SC207%3BSDDENR_WQX-460815%3B21FLSFWM_WQX-G342C%3B21FLEECO_WQX-10MIGR80%3B21AWIC-2101%3BIL_EPA_WQX-HCCC-02%3BWIDNR_WQX-363312%3BOREGONDEQ-36865-ORDEQ%3B21NC03WQ-CPF0884A%3BOREGONDEQ-31405-ORDEQ%3BUSGS-03383000%3B21GAEPD_WQX-LK_11_3520%3BMNPCA-82-0031-00-451%3BUSGS-05529000%3B21MSWQ_WQX-303ARK03%3BCOQUILLE_WA-29918-ORDEQ%3BUSGS-03049652%3B21AWIC-10328%3B21FLSFWM-G108%3BMNPCA-21-0216-00-201%3BUSGS-09144200%3BTCEQMAIN-10222%3BMNPCA-S000-173%3B21IOWA-21910001%3BAZDEQ_SW-663%3BUSGS-05320270%3BMNPCA-27-0133-02-402%3B21AWIC-50%3B21FLEECO_WQX-20A-11GR%3B1VTDECWQ-503001%3B21NCMONITORING-J6450000%3BUSGS-08079600%3BMDE_FIELDSERVICES_WQX-LDR0012%3BLADEQWPD-103%3BUSGS-15389000%3BUSGS-07211500%3BMENOM_WQX-LSKI%3BTCEQMAIN-10576%3BUSGS-01454700%3BUSGS-475301102220100%3BUSACOEND-3MAR20004%3BTSWQC_WQX-CFRPO-29%3B21DELAWQ_WQX-202021%3BUSGS-08037000%3BUSGS-02464360%3BUSGS-02297100%3BUSGS-05388250%3B21VASWCB-5ARDC007.30%3BUTAHDWQ_WQX-4995600%3B"
# Reproduce the failure: importWQP() issues a GET for the full URL and the
# server answers 406, so the retry logic kicks in (console output kept below).
library(dataRetrieval)
x <- importWQP(obs_url_orig)
Request failed [406]. Retrying in 2 seconds...
Request failed [406]. Retrying in 3 seconds...
# Split the failing GET URL into the base endpoint and its query string.
split <- strsplit(obs_url_orig, "?", fixed=TRUE)
obs_url <- split[[1]][1]
query <- split[[1]][2]
# Break "siteid=<value>" into the parameter name and the joined value.
json_request_stuff <- strsplit(query, "=", fixed=TRUE)
things_to_request <- json_request_stuff[[1]][1]
# Individual site ids are separated by url-encoded semicolons ("%3B").
things <- strsplit(json_request_stuff[[1]][2], "%3B", fixed=TRUE)
names(things) <- things_to_request
# NOTE(review): this strsplit() repeats the one above and discards the names()
# just assigned; only the first two site ids are kept for the demo payload.
things <- strsplit(json_request_stuff[[1]][2], "%3B", fixed=TRUE)
things <- list(things[[1]][1:2])
names(things) <- things_to_request
# Serialize the named list into the JSON body the WQP POST endpoint expects,
# e.g. {"siteid": ["KENAI_WQX-10000349", "WIDNR_WQX-10012578"]}.
things_json <- jsonlite::toJSON(things, pretty = TRUE)
####################################################
library(httr)
library(readr)
# Timing run 1: POST the site-id JSON and stream the TSV response back as text.
system.time({
# mimeType still travels on the URL; the site list goes in the JSON body.
x <- POST(paste0(obs_url,"?mimeType=tsv"),
body = things_json,
content_type("application/json"),
accept("text/tsv"),
user_agent(dataRetrieval:::default_ua()))
# Pull the entire TSV payload into one string.
returnedDoc <- content(x,
type="text",
encoding = "UTF-8")
# Parse with explicit column overrides (mirroring importWQP) so time/code
# columns are not mis-guessed as numeric from the first rows.
retval <- suppressWarnings(read_delim(returnedDoc,
col_types = cols(`ActivityStartTime/Time` = col_character(),
`ActivityEndTime/Time` = col_character(),
USGSPCode = col_character(),
ResultCommentText=col_character(),
`ActivityDepthHeightMeasure/MeasureValue` = col_number(),
`DetectionQuantitationLimitMeasure/MeasureValue` = col_number(),
ResultMeasureValue = col_number(),
`WellDepthMeasure/MeasureValue` = col_number(),
`WellHoleDepthMeasure/MeasureValue` = col_number(),
`HUCEightDigitCode` = col_character(),
`ActivityEndTime/TimeZoneCode` = col_character()),
quote = "", delim = "\t"))
})
user system elapsed
4.49 0.42 47.09
####################################################
# Timing run 2: same POST but request a zipped response written to disk —
# roughly twice as fast as the plain-text run above (timings kept below).
system.time({
temp <- tempfile()
temp <- paste0(temp,".zip")
# zip=yes asks the server to compress; the zip is streamed straight to temp.
x <- POST(paste0(obs_url,"?mimeType=tsv&zip=yes"),
body = things_json,
content_type("application/json"),
accept("application/zip"),
httr::write_disk(temp),
user_agent(dataRetrieval:::default_ua()))
headerInfo <- httr::headers(x)
# Unpack the TSV out of the zip, then discard the archive.
file1 <- tempdir()
doc <- utils::unzip(temp, exdir=file1)
unlink(temp)
# Same explicit column overrides as the text-based run, for a fair comparison.
retval1 <- suppressWarnings(read_delim(doc,
col_types = cols(`ActivityStartTime/Time` = col_character(),
`ActivityEndTime/Time` = col_character(),
USGSPCode = col_character(),
ResultCommentText=col_character(),
`ActivityDepthHeightMeasure/MeasureValue` = col_number(),
`DetectionQuantitationLimitMeasure/MeasureValue` = col_number(),
ResultMeasureValue = col_number(),
`WellDepthMeasure/MeasureValue` = col_number(),
`WellHoleDepthMeasure/MeasureValue` = col_number(),
`HUCEightDigitCode` = col_character(),
`ActivityEndTime/TimeZoneCode` = col_character()),
quote = "", delim = "\t"))
unlink(doc)
})
user system elapsed
0.78 0.33 21.06
So, we're really close to being able to do this with this sort of framework:
# Sketch of how the WQP functions could switch to POST (post_url/drURL are
# package-internal helpers; readWQPdots collects the user's query arguments):
values <- readWQPdots(...)
# The zip flag stays on the URL, not in the JSON body...
zip <- values[["zip"]] == "yes"
# ...so drop it from the argument list before serializing.
values <- values[names(values)[names(values) != "zip"]]
# Everything remaining becomes the JSON request body.
post_body <- jsonlite::toJSON(values, pretty = TRUE)
urlCall <- post_url(drURL("wqpSiteSummary"), zip=zip, csv = TRUE)
where readWQPdots
loses this line:
values <- sapply(matchReturn, function(x) as.character(paste(eval(x),collapse=";",sep="")))
(ie...keeping it a list instead of a character vector)
The issue is we need to know when WQP wants a single string vs an array.
This works:
{
"siteid": ["USGS-07144100"],
"summaryYears": "5",
"dataProfile": "periodOfRecord"
}
but this is what the code currently does:
{
"siteid": ["USGS-07144100"],
"summaryYears": ["5"],
"dataProfile": ["periodOfRecord"]
}
So, we'll need a way to figure out what parts of the JSON are single values, and what are arrays (and since everything in R trickles down to being a list...that's going to be tough to do programmatically...we might need to hard-code it.)
If we do hard-code it....we should consider tying it together with #115
Is this currently a WIP, or is there a branch that has this functionality on it?
Nope, no working branch. The scripts I pasted above were the exploratory work I did.
There's a Swagger link that describes the JSON required for each field in a POST. (@jkreft-usgs could you paste that here if it's public ?) It's kind of a mish-mash of lists and single values (so, depending on the argument, it may or may not be required to be surrounded by square brackets).
jsonlite
by default makes everything vectors (like "summaryYears": ["5"]
instead of "summaryYears": "5"
...and WQP requires some to be vectors/lists, some to be single values. In my ideal world...WQP would be more flexible (allowing square brackets around single values...)....but if that never happens, we need to figure out a way to map out which arguments are required vectors, and which ones are required single values. This task became too complicated for the time/resources we had at the time of working on it.
You might try and see if the openAPI generator miraculously works for this. It's autogenerated code, so it might be a mess, but it might also do a lot of the annoying bits for you. https://openapi-generator.tech/docs/generators/r
I'm looking at the above and trying to use it for the funky monitoring IDs that exist (like Sam's example of COE/ISU_WQX-17630001
)
The example does this:
# (Quoted from the script above) keep only the first two site ids, name the
# one-element list with the parameter name ("siteid"), and serialize to JSON.
things <- list(things[[1]][1:2])
names(things) <- things_to_request
things_json <- jsonlite::toJSON(things, pretty = TRUE)
which subsets the site ids down to the first two, which are kind of normal:
things[[1]][1:2]
[1] "KENAI_WQX-10000349" "WIDNR_WQX-10012578"
The request works w/
{
"siteid": ["KENAI_WQX-10000349", "WIDNR_WQX-10012578"]
}
But fails when I add the funky site name:
{
"siteid": ["KENAI_WQX-10000349", "COE/ISU_WQX-17630001", "WIDNR_WQX-10012578"]
}
(I get Error: '' does not exist in current working directory
, which I think is because no file was built?)
http error 400:
<- HTTP/1.1 400 Bad Request
<- Date: Sat, 17 Aug 2019 11:28:58 GMT
<- Content-Length: 0
<- Warning: 299 WQP "The value of siteid=COE/ISU_WQX-17630001 must match the format [\w]+\-.*\S"
<- Strict-Transport-Security: max-age=31536000
<-
Maybe I am misinterpreting the code. But if I am not, I wonder if there is a way to get data from these sites out of the WQP through the API, or if these sites are kind of locked in and either need their IDs fixed or their data need to be pulled using a different query (e.g., using a combination of Agency IDs and other things...which sounds like a pain to get to unique sites...).
How common is this issue?
# How widespread is the slash-in-siteid problem? Tally, per organization, the
# number of monitoring location ids containing "/" (results table below).
wqp_sites <- dataRetrieval::whatWQPsites(CountryCode = 'US')
library(dplyr)
wqp_sites %>% dplyr::select(OrganizationIdentifier, OrganizationFormalName, MonitoringLocationIdentifier) %>%
group_by(OrganizationIdentifier, OrganizationFormalName) %>%
# str_detect() yields TRUE/FALSE (NA for NA ids) directly; the original's
# str_extract(x, "/") == '/' built the same logical vector indirectly.
summarize(bad_mon_ID_cnt = sum(stringr::str_detect(MonitoringLocationIdentifier, "/"), na.rm = TRUE)) %>%
filter(bad_mon_ID_cnt > 0) %>% arrange(desc(bad_mon_ID_cnt)) %>%
# Put the count next to the org id for readability.
dplyr::select(OrganizationIdentifier, bad_mon_ID_cnt, everything()) %>% data.frame()
OrganizationIdentifier bad_mon_ID_cnt OrganizationFormalName
1 CBP_WQX 5612 Chesapeake Bay Program (CBP)
2 MDNR 3963 Missouri Dept. of Natural Resources
3 MDC 421 Missouri Dept. of Conservation
4 UMC 388 Univ. of Missouri, Columbia
5 21COL001_WQX 178 Colorado Dept. of Public Health & Environment-WQCD
6 MEC 136 Midwest Envir. Consultants
7 MSU 48 Missouri State University
8 FWC/FWRI 43 Fish Wildlife Conservation / Wildlife Research Institute(FL)
9 31DRBCSP 36 Delaware River Basin Commission
10 JCPH_WQX 28 Jefferson County Public Health
11 CC 26 Crowder College, Neosho, Mo.
12 COE/ISU 24 Des Moines River - Corp of Engineers (IOWA)
13 COE/ISU_WQX 24 Des Moines River - Corp of Engineers (IOWA)
14 PCSWD 21 Perry Co. Soil and Water District
15 URS 18 URS Corporation
16 11NPSWRD_WQX 17 National Park Service Water Resources Division
17 VERSAR 16 Versar Inc.
18 WMAT_WQX 14 White Mountain Apache Tribe of the Fort Apache Reservation, Arizona
19 21HI 11 Hawaii Department of Health Clean Water Branch
20 CEDEN 11 California State Water Resources Control Board
21 WU 11 Washington University, St. Louis
22 EPA_R7_WQX 9 EPA R7
23 0801505 6 French Gulch Superfund site (US EPA Region 8)
24 NJDEPVMG 5 NJDEP Volunteer Monitoring Group
25 TYSON 5 Tyson Foods
26 UMR 5 Univ of Missouri Rolla
27 EPA_GLNPO 4 EPA Great Lakes National Program
28 NJDEP_AMERICORPS 3 NJDEP Americorps Program
29 UTEMTN 3 Ute Mountain Utes Tribe (Colorado)
30 WHITEMOUNTAIN_WQX 3 White Mountain Apache Tribe of the Fort Apache Reservation, Arizona
31 21COL001 2 Colorado Dept. of Public Health & Environment
32 21FLCOT_WQX 2 City of Tallahassee Stormwater
33 21FLPOLK 2 Polk County Water Resources (Florida)
34 ANIMASWP 2 Animas Watershed Partnership
35 CCWF 2 Clear Creek Watershed Foundation (CCWF)
36 MONT_PPL_WQX 2 MONT_PPL_WQX - Montana PPL Corporation
37 NJDEPNJWAP 2 NJDEP, NJ Watershed Ambassador Prog
38 OKDEQ 2 Oklahoma Dept. of Environmental Quality
39 R10SUMASMOUNTAIN 2 EPA Region 10 Superfund Sumas Mountain Asbestos Site
40 USEPA_REGION8 2 USEPA Region 8
41 YUROKTEP 2 The Yurok Tribe Environemtal Program (CALIFORNIA)
42 0801194 1 Summitville Superfund site (US EPA Region 8)
43 0801600 1 Captain Jack Mine (Colorado)
44 21CABCH 1 Southern California Coastal Water Research Project
45 21NEV1_WQX 1 Nevada Division of Environmental Protection
46 21VASWCB 1 VIRGINIA DEPARTMENT OF ENVIRONMENTAL QUALITY
47 BUNKER_WQX 1 Bunker Hill Mining and Metallurgical Complex
48 CABEACH_WQX 1 California State Water Resource Control Board
49 CALWR_WQX 1 California Department Of Water Resources
50 CDATRUST 1 CDA TRUST
51 MDE_FIELDSERVICES_WQX 1 Maryland Dept. of the Environment In House Water Data
52 OVIWC 1 Owens Valley Indian Water Commission
53 R10BOOMSNUB 1 EPA Region 10 Boomsnub Superfund Site Data 1987-2013
54 R10BUNKER 1 EPA Region 10 Superfund Bunker Hill Mining and Metallurgical Complex
55 R10SUPERFUNDHISTORICAL 1 EPA Region 10 Superfund Historical Data
56 SCI 1 SCI Engineering
57 SRPMIC_WQX 1 Salt River Pima-Maricopa Indian Community of the Salt River Reservation, Arizona
Note 👆there are some org IDs that have the same issue in there (e.g., COE/ISU_WQX
)