[C++][Parquet] Cannot read encrypted parquet datasets via _metadata file
Describe the bug, including details regarding any error messages, version, and platform.
Fails with:
Cannot decrypt ColumnMetadata. FileDecryption is not setup correctly
This is using a plaintext footer.
Reproducer:
import os
import pyarrow.parquet.encryption as pe
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pyarrow as pa
import base64
import polars as pl


class KmsClient(pe.KmsClient):
    def unwrap_key(self, wrapped_key, master_key_identifier):
        return base64.b64decode(wrapped_key)

    def wrap_key(self, key_bytes, master_key_identifier):
        return base64.b64encode(key_bytes)
def write(location):
    cf = pe.CryptoFactory(lambda *a, **k: KmsClient())
    df = pl.DataFrame({
        "col1": [1, 2, 3],
        "col2": [1, 2, 3],
        "year": [2020, 2020, 2021]
    })
    ecfg = pe.EncryptionConfiguration(
        footer_key="TEST",
        column_keys={
            "TEST": ["col2"]
        },
        double_wrapping=False,
        plaintext_footer=False,
    )
    table = df.to_arrow()
    parquet_encryption_cfg = ds.ParquetEncryptionConfig(
        cf, pe.KmsConnectionConfig(), ecfg
    )
    metadata_collector = []
    pq.write_to_dataset(
        table,
        location,
        partitioning=ds.partitioning(
            schema=pa.schema([
                pa.field("year", pa.int16())
            ]),
            flavor="hive"
        ),
        encryption_config=parquet_encryption_cfg,
        metadata_collector=metadata_collector
    )
    pq.write_metadata(
        pa.schema(
            field
            for field in table.schema
            if field.name != "year"
        ),
        os.path.join(location, "_metadata"),
        metadata_collector
    )
def read(location):
    decryption_config = pe.DecryptionConfiguration(cache_lifetime=300)
    kms_connection_config = pe.KmsConnectionConfig()
    cf = pe.CryptoFactory(lambda *a, **k: KmsClient())
    parquet_decryption_cfg = ds.ParquetDecryptionConfig(
        cf, kms_connection_config, decryption_config
    )
    decryption_properties = cf.file_decryption_properties(
        kms_connection_config, decryption_config)
    pq_scan_opts = ds.ParquetFragmentScanOptions(
        decryption_config=parquet_decryption_cfg,
        # If using build from master
        # decryption_properties=decryption_properties
    )
    pformat = ds.ParquetFileFormat(default_fragment_scan_options=pq_scan_opts)
    dataset = ds.parquet_dataset(
        os.path.join(location, "_metadata"),
        format=pformat,
        partitioning=ds.partitioning(
            schema=pa.schema([
                pa.field("year", pa.int16())
            ]),
            flavor="hive"
        )
    )
    print(dataset.to_table())
if __name__ == '__main__':
    location = r"/tmp/dataset-test"
    os.makedirs(location, exist_ok=True)
    write(location)
    read(location)
Presumably the metadata read out of the _metadata file is not decrypted, or the footer incorrectly indicates whether it is encrypted.
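One way to test that hypothesis (a diagnostic sketch of mine, not part of the reproducer): per the Parquet modular encryption spec, a file written with an encrypted footer ends with the magic bytes b"PARE", while a plaintext footer ends with b"PAR1", so the trailing bytes of _metadata show which mode it was written in:

def footer_magic(path):
    # Last four bytes of a Parquet file: b"PARE" for an encrypted footer,
    # b"PAR1" for a plaintext one.
    with open(path, "rb") as f:
        f.seek(-4, os.SEEK_END)
        return f.read()

print(footer_magic(os.path.join(location, "_metadata")))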
Tried with the latest master, which contains: https://github.com/apache/arrow/commit/bd444106af494b3d4c6cce0af88f6ce2a6a327eb
Component(s)
C++, Python
It seems you can rebuild the dataset from what parquet_dataset returned:
from pyarrow import fs

filesystem = fs.LocalFileSystem()
remade_dataset = ds.FileSystemDataset(
    [
        pformat.make_fragment(
            fragment.path,
            filesystem,
            fragment.partition_expression,
            [rg.id for rg in fragment.row_groups]
        )
        for fragment in dataset.get_fragments()
    ],
    dataset.schema,
    pformat,
)
print(remade_dataset.to_table())
but I assume this re-fetches the metadata (instead of using it from the _metadata file), defeating the purpose of having the _metadata file in the first place.
Actually, I think the issue might be on the write side: the _metadata file has no encryption algorithm set, so the reader doesn't even attempt to decrypt the metadata.
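If that's right, the failure should be reproducible without the dataset API at all. A minimal check (my sketch; column index 1 is assumed to be the encrypted col2):

md = pq.read_metadata(os.path.join(location, "_metadata"))
print(md.num_row_groups)   # the footer itself parses, i.e. it is plaintext
md.row_group(0).column(1)  # expected to raise: Cannot decrypt ColumnMetadata...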
I think we'd need an equivalent of:
https://github.com/apache/arrow/blob/5e1a4fd8a4ed3630c9549c611222d2d6c32357ca/cpp/src/parquet/file_writer.cc#L546
but based on:
https://github.com/apache/arrow/blob/5e1a4fd8a4ed3630c9549c611222d2d6c32357ca/cpp/src/parquet/file_writer.cc#L551
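For what it's worth, at the Python level such a fix might eventually be exercised like this (a hedged, untested sketch: it assumes that write_metadata's extra kwargs, which are forwarded to ParquetWriter, would be enough to get the aggregated footer written with an encryption algorithm once the C++ side supports it):

file_encryption_props = cf.file_encryption_properties(
    pe.KmsConnectionConfig(), ecfg)
pq.write_metadata(
    pa.schema(f for f in table.schema if f.name != "year"),
    os.path.join(location, "_metadata"),
    metadata_collector,
    # Hypothetical: kwargs are forwarded to ParquetWriter; untested whether
    # this is honored for the aggregated metadata today.
    encryption_properties=file_encryption_props,
)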
Hi @rok, as it looks like the PR is still ongoing, would it be alright to bump this to the next milestone (20.0.0)?
Sure, it will not make it through review in time.