clickhouse-docs icon indicating copy to clipboard operation
clickhouse-docs copied to clipboard

Protobuf default values

Open BorisTyshkevich opened this issue 1 year ago • 1 comments

The doc said that:

Default values defined in a protobuf schema like this

syntax = "proto2";

message MessageType {
  optional int32 result_per_page = 3 [default = 10];
}

are not applied; the table defaults are used instead of them.

But on recent Clickhouse versions (23-24) the defaults applied from Protobuf schema, not from create table statement.

BorisTyshkevich avatar Mar 14 '24 14:03 BorisTyshkevich

This did bit us and is still valid for clickhouse version 24.5.3.5 and probably many others. Feel free to use our reproduction example with the collapsed sections below.

Click Here to see Reproduction Summary
========================= PROTO SCHEMA =========================
   ┌─c1─────────────────────────────────────────┐
1. │ syntax = "proto2";                         │
2. │ package P;                                 │
3. │ message M {                                │
4. │ required uint64 b  = 100;                  │
5. │ required string s  = 200;                  │
6. │ optional uint64 u  = 300; // no default    │
7. │ optional uint64 v  = 400 [default = 8008]; │
8. │ }                                          │
   └────────────────────────────────────────────┘
========================= TABLE SCHEMA =========================
   ┌─statement─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
1. │ CREATE TABLE default.test
(
    `b` UInt64,
    `s` String,
    `u` UInt64 DEFAULT b + 42,
    `v` UInt64 DEFAULT 8383
)
ENGINE = Memory │
   └───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
========================= INSERTION SHAPES =====================
   ┌────b─┬─s───────────────────┐
1. │ 1100 │ json-1000-single    │
2. │ 5100 │ json-5000-delimited │
3. │ 6100 │ json-6000-delimited │
   └──────┴─────────────────────┘
========================= INSERTION RESULTS ====================
   ┌────b─┬─s────────────────────┬────u─┬────v─┐
1. │ 1100 │ json-1000-single     │ 1142 │ 8383 │
2. │ 5100 │ json-5000-delimited  │ 5142 │ 8383 │
3. │ 6100 │ json-6000-delimited  │ 6142 │ 8383 │
4. │ 1100 │ proto-1000-single    │    0 │ 8008 │
5. │ 5100 │ proto-5000-delimited │    0 │ 8008 │
6. │ 6100 │ proto-6000-delimited │    0 │ 8008 │
   └──────┴──────────────────────┴──────┴──────┘
========================= DOCUMENTATION MISMATCH ? =============
Click Here to see Reproduction Steps
# docker run --rm -it $(docker build -q .)
# ===========================================================================================
FROM scratch AS data
COPY <<EOF /schema.proto
syntax = "proto2";
package P;
message M {
    required uint64 b  = 100;
    required string s  = 200;
    optional uint64 u  = 300; // no default
    optional uint64 v  = 400 [default = 8008];
}
EOF

COPY <<1000.json /msgs/
{"b": 1100, "s": "json-1000-single"}
1000.json

COPY <<5000.ndjson /msgs/
{"b": 5100, "s": "json-5000-delimited"}
{"b": 6100, "s": "json-6000-delimited"}
5000.ndjson

# ===========================================================================================
FROM python:3.12.8-slim-bullseye AS compiled
RUN apt update && apt install -y protobuf-compiler
RUN pip3 install protobuf==5.29.3
ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

COPY --from=data /schema.proto /proto/schema.proto
COPY --from=data /msgs /msgs

WORKDIR /app
RUN protoc --python_out=. --proto_path=/proto schema.proto

COPY <<EOF /app/app.py
import os
import json
import struct
from schema_pb2 import M

def encode_varint(value):
    result = b''
    while value >= 0x80:
        result += struct.pack('B', (value & 0x7f) | 0x80)
        value >>= 7
    result += struct.pack('B', value & 0x7f)
    return result

def json_to_protobuf(directory):
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as f:
            if filename.endswith('.json'):
                data = json.loads(f.read().strip().replace('json', 'proto'))
                message = M(**data)
                with open(file_path.replace('.json', '.protosingle'), 'wb') as out:
                    out.write(message.SerializeToString())
            elif filename.endswith('.ndjson'):
                with open(file_path.replace('.ndjson', '.protomulti'), 'wb') as out:
                    for line in f:
                        data = json.loads(line.replace('json', 'proto'))
                        message = M(**data)
                        serialized_message = message.SerializeToString()
                        out.write(encode_varint(len(serialized_message)))
                        out.write(serialized_message)

json_to_protobuf('/msgs')
EOF

RUN python3 /app/app.py

# ===========================================================================================
FROM clickhouse/clickhouse-server:24.5.3.5

COPY --from=compiled /msgs /msgs
COPY --from=data /schema.proto /schema.proto

COPY <<EOF /commands.sql

CREATE TABLE test (
    b UInt64,
    s String,
    u UInt64 DEFAULT b + 42,
    v UInt64 DEFAULT 8383
) ENGINE = Memory ;

INSERT INTO test FROM INFILE '/msgs/*.json' ;
INSERT INTO test FROM INFILE '/msgs/*.ndjson' ;
INSERT INTO test FROM INFILE '/msgs/*.protosingle' SETTINGS format_schema = '/schema.proto:M' FORMAT ProtobufSingle;
INSERT INTO test FROM INFILE '/msgs/*.protomulti' SETTINGS format_schema = '/schema.proto:M' FORMAT Protobuf ;

SELECT '========================= PROTO SCHEMA =========================' ;
SELECT * FROM file('/schema.proto') FORMAT PrettyCompactMonoBlock ; 
SELECT '========================= TABLE SCHEMA =========================' ;
SHOW CREATE TABLE test FORMAT PrettyCompactMonoBlock ; 
SELECT '========================= INSERTION SHAPES =====================' ;
SELECT * FROM file('/msgs/*.*json') ORDER BY s FORMAT PrettyCompactMonoBlock ; 
SELECT '========================= INSERTION RESULTS ====================' ;
SELECT b, s, u, v
FROM test ORDER BY s FORMAT PrettyCompactMonoBlock ; 
SELECT '========================= DOCUMENTATION MISMATCH ? =============' ;
EOF

# -------------------------------------------------------------------------------------------
RUN clickhouse-local --queries-file /commands.sql 2>&1 > /out.txt
COPY <<EOF /run.bash
cat /out.txt
EOF

# -------------------------------------------------------------------------------------------
RUN chmod a+rx /run.bash
ENTRYPOINT [ "bash", "-c" ]
CMD [ "/run.bash"]

hileef avatar Jan 30 '25 22:01 hileef