stac-pydantic icon indicating copy to clipboard operation
stac-pydantic copied to clipboard

ItemProperties modifies input data in place

Open eseglem opened this issue 1 year ago • 4 comments

While trying to do some benchmarking I noticed that somehow the input datetime string was being modified in place when validating the model. Which was extremely confusing to me. I did some more testing and found this: https://github.com/stac-utils/stac-pydantic/blob/main/stac_pydantic/item.py#L38

It is modifying the input data before it gets to the pydantic validators, and doing extra validation within python when pydantic can handle it better / faster.

I will submit a related PR shortly.

eseglem avatar Dec 07 '23 13:12 eseglem

This is what I used for testing purposes:

import copy
from datetime import datetime
from pydantic import BaseModel
from stac_pydantic import Item, ItemProperties


class MyModel(BaseModel):
    date: datetime
    num: int
    text: str


MY_MODEL_DICT = {
    "date": "2022-11-21T19:09:07.587000Z",
    "num": "12345",
    "text": "Some Text",
}


def test_mymodel_unpack():
    test_dict = copy.deepcopy(MY_MODEL_DICT)
    _ = MyModel(**test_dict)
    assert test_dict == MY_MODEL_DICT


def test_mymodel_validate():
    test_dict = copy.deepcopy(MY_MODEL_DICT)
    _ = MyModel.model_validate(test_dict)
    assert test_dict == MY_MODEL_DICT


class NestedMyModel(BaseModel):
    my_model: MyModel


NESTED_MY_MODEL_DICT = {
    "my_model": {
        "date": "2022-11-21T19:09:07.587000Z",
        "num": "12345",
        "text": "Some Text",
    }
}


def test_nested_mymodel_unpack():
    test_dict = copy.deepcopy(NESTED_MY_MODEL_DICT)
    _ = NestedMyModel(**test_dict)
    assert test_dict == NESTED_MY_MODEL_DICT


def test_nested_mymodel_validate():
    test_dict = copy.deepcopy(NESTED_MY_MODEL_DICT)
    _ = NestedMyModel.model_validate(test_dict)
    assert test_dict == NESTED_MY_MODEL_DICT


ITEM_DICT = {
    "type": "Feature",
    "stac_version": "1.0.0",
    "id": "my_item",
    "collection": "mycollection",
    "geometry": {
        "coordinates": [
            [
                [149.19940003830305, -36.428645762574625],
                [149.05104951177714, -39.259123256881225],
                [152.81379005469256, -39.32599372427865],
                [152.82080633518694, -36.489064495233244],
                [149.19940003830305, -36.428645762574625],
            ]
        ],
        "type": "Polygon",
    },
    "bbox": [
        149.05104951177714,
        -39.32599372427865,
        152.82080633518694,
        -36.428645762574625,
    ],
    "properties": {
        "datetime": "2022-11-21T19:09:07.587000Z",
        "created": "2023-08-03T00:31:23.346946Z",
        "updated": "2023-08-03T00:31:23.346946Z",
    },
    "stac_extensions": [],
    "links": [
        {
            "rel": "self",
            "type": "application/geo+json",
            "href": "https://my.stac/v0/collections/mycollection/items/my_item",
        },
        {
            "rel": "collection",
            "type": "application/json",
            "href": "https://my.stac/v0/collections/mycollection",
        },
        {
            "rel": "parent",
            "type": "application/json",
            "href": "https://my.stac/v0/collections/mycollection",
        },
    ],
    "assets": {
        "data": {
            "href": "https://my.stac/v0/collections/mycollection/items/my_item/data.tif",
            "title": "Data",
            "media_type": "image/tiff; application=geotiff",
            "roles": ["data"],
        },
        "thumbnail": {
            "href": "https://my.stac/v0/collections/mycollection/items/my_item/thumbnail.png",
            "media_type": "image/png",
            "roles": ["thumbnail"],
        },
        "metadata": {
            "href": "https://my.stac/v0/collections/mycollection/items/my_item/metadata.json",
            "media_type": "application/json",
            "roles": ["metadata"],
        },
    },
}


def test_item_unpack():
    item_dict = copy.deepcopy(ITEM_DICT)
    _ = Item(**item_dict)
    assert item_dict == ITEM_DICT


def test_item_validate():
    item_dict = copy.deepcopy(ITEM_DICT)
    _ = Item.model_validate(item_dict)
    assert item_dict == ITEM_DICT


ITEM_PROPERTIES_DICT = {
    "datetime": "2022-11-21T19:09:07.587000Z",
    "created": "2023-08-03T00:31:23.346946Z",
    "updated": "2023-08-03T00:31:23.346946Z",
}


def test_item_properties_unpack():
    item_dict = copy.deepcopy(ITEM_PROPERTIES_DICT)
    _ = ItemProperties(**item_dict)
    assert item_dict == ITEM_PROPERTIES_DICT


def test_item_properties_validate():
    item_dict = copy.deepcopy(ITEM_PROPERTIES_DICT)
    _ = ItemProperties.model_validate(item_dict)
    assert item_dict == ITEM_PROPERTIES_DICT

Three of them fail with current main, and they all pass with my PR.

eseglem avatar Dec 07 '23 14:12 eseglem

@eseglem I think the real issue here is that stac-pydantic will serialized the datetime using https://github.com/stac-utils/stac-pydantic/blob/c03272e7e1a9730d9d7dd18794bd660005c3ed0b/stac_pydantic/shared.py#L20

while your input data was in form of %Y-%m-%dT%H:%M:%S.%fZ which is still a valid RFC3339 format

vincentsarago avatar Feb 08 '24 17:02 vincentsarago

I can't find a straight forward path to make sure the serialization respect the input date other than doing something like

    @field_serializer("datetime")
    def serialize_datetime(self, v: dt, _info: Any) -> str:
        if str(v)[19] == ".":
            return v.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        else:
            return v.strftime("%Y-%m-%dT%H:%M:%SZ")

vincentsarago avatar Feb 08 '24 19:02 vincentsarago

Oo I see the value do get converted to datetime object during the validation step (pre) and thus the value in the input dictionary is changed 😓

so I guess #131 is 👍

vincentsarago avatar Feb 08 '24 20:02 vincentsarago