shot-scraper
shot-scraper copied to clipboard
Validate YAML mini-language using Pydantic
Since PDF will accept a different set of options from regular screenshots, I think this will be the point at which I add much more robust validation of the YAML mini-language, probably using Pydantic: https://pydantic-docs.helpmanual.io/
Originally posted by @simonw in https://github.com/simonw/shot-scraper/issues/27#issuecomment-1065949072
from typing import List, Optional
from pydantic import BaseModel, StrictStr, validator
import pathlib
class Shot(BaseModel):
    # One screenshot entry: a required ``url`` plus an optional ``output`` name.
    url: StrictStr
    output: Optional[StrictStr]

    class Config:
        # Reject any key that is not declared as a field on this model.
        extra = "forbid"

    @validator("url")
    def url_must_be_file_or_start_with_protocol(cls, url):
        """Normalize ``url`` into a ``file:`` or ``http(s)://`` address.

        A value that names an existing filesystem path becomes a ``file:``
        URL built from its absolute path. Anything else is returned as-is
        when it already starts with ``http://`` or ``https://``, and gets an
        ``http://`` prefix otherwise.
        """
        candidate = pathlib.Path(url)
        # An existing local path takes priority over treating the value as a host.
        if candidate.exists():
            return "file:{}".format(candidate.absolute())
        if url.startswith(("https://", "http://")):
            return url
        return "http://{}".format(url)
class Shots(BaseModel):
    # Top-level container: each list item is validated as a ``Shot``.
    shots: List[Shot]
>>> Shots(shots=[{"url":"simonwillison.net/", "output": "o"}])
Shots(shots=[Shot(url='http://simonwillison.net/', output='o')])
>>> Shots(shots=[{"url":"/tmp/index.html"}])
Shots(shots=[Shot(url='file:/tmp/index.html', output=None)])
Next step: figure out how to do conditional validation, e.g. if `type = 'pdf'` then `landscape` is allowed.
https://stackoverflow.com/a/65128634/6083
Can do this:
@validator("some_date", always=True)
def validate_date(cls, value, values):
    """Derive ``some_date`` from the first entry of ``some_list``.

    Returns the first item when ``some_list`` has at least two entries,
    otherwise ``None``. The incoming ``value`` is ignored.
    """
    # ``values`` carries the fields that were validated before this one.
    candidates = values["some_list"]
    if len(candidates) >= 2:
        return candidates[0]
    return None
Or this:
@root_validator
def validate_date(cls, values):
    """Set ``some_date`` to the first ``some_list`` entry when the list has 2+ items.

    Works on the whole ``values`` dict and returns it, modified or not.
    """
    some_list = values["some_list"]
    if len(some_list) >= 2:
        values["some_date"] = some_list[0]
    return values
See https://pydantic-docs.helpmanual.io/usage/validators/#root-validators
Looks like I don't need `always=True` - I can define that optional second `values` argument and use that to see the values for other fields, which will have been processed in the order that they were defined.
Here's a full validator based on the options in `take_shot()` at the moment:
from typing import List, Optional
from pydantic import BaseModel, StrictStr, validator, conint
import pathlib
class Shot(BaseModel):
    # One screenshot entry. Field order matters: ``selector`` is declared
    # before ``selectors`` so the ``selectors`` validator can read it.
    url: StrictStr
    output: Optional[StrictStr]
    # Strict integer from 1 to 100 inclusive.
    quality: Optional[conint(strict=True, gt=0, lt=101)]
    # Strict positive integer.
    wait: Optional[conint(strict=True, gt=0)]
    # Strict integer, zero or greater.
    padding: Optional[conint(strict=True, gt=-1)]
    # Strict positive integers.
    width: Optional[conint(strict=True, gt=0)]
    height: Optional[conint(strict=True, gt=0)]
    selector: Optional[StrictStr]
    selectors: Optional[List[StrictStr]]
    javascript: Optional[StrictStr]

    class Config:
        # Unknown properties are a validation error.
        extra = "forbid"

    @validator("url")
    def url_must_be_file_or_start_with_protocol(cls, url):
        """Normalize ``url`` into a ``file:`` or ``http(s)://`` address.

        A value that names an existing filesystem path becomes a ``file:``
        URL built from its absolute path. Anything else is returned as-is
        when it already starts with ``http://`` or ``https://``, and gets an
        ``http://`` prefix otherwise.
        """
        candidate = pathlib.Path(url)
        # An existing local path takes priority over treating the value as a host.
        if candidate.exists():
            return "file:{}".format(candidate.absolute())
        if url.startswith(("https://", "http://")):
            return url
        return "http://{}".format(url)

    @validator("selectors", always=True)
    def add_selector_to_selectors(cls, selectors, values):
        """Fold the single ``selector`` field into the ``selectors`` list.

        Always returns a list: the supplied ``selectors`` (or an empty list
        when none were given), with ``selector`` appended if it is set.
        """
        merged = selectors or []
        # ``.get`` because ``selector`` may be absent from ``values``.
        single = values.get("selector")
        if single:
            merged.append(single)
        return merged
class Shots(BaseModel):
    # Top-level container: each list item is validated as a ``Shot``.
    shots: List[Shot]